```python
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import json
import redis
import random
from .useragent import agents
from .cookies import init_cookie, remove_cookie, update_cookie
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from scrapy.downloadermiddlewares.retry import RetryMiddleware
import logging
```
```python
from scrapy.spiders import CrawlSpider, Rule  ## CrawlSpider plus Rule lets you walk the whole site
from scrapy.http import Request  ## Request needs no introduction
from scrapy.linkextractors import LinkExtractor  ## used together with Rule for URL pattern matching
from haoduofuli.items import HaoduofuliItem  ## no explanation needed
from scrapy import FormRequest  ## the Scrapy class used for logging in

account = 'your account'
password = 'your password'

class myspider(CrawlSpider):

    name = 'haoduofuli'
    allowed_domains = ['haoduofuli.wang']
    start_urls = ['http://www.haoduofuli.wang/wp-login.php']
```
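The spider still has to submit the login form before it can crawl anything. A minimal sketch of that step, assuming the standard WordPress field names log and pwd for wp-login.php (the field names and the after_login callback are illustrative, not taken from the original):

```python
    ## methods added inside the myspider class above:
    def start_requests(self):
        ## fetch the login page first so FormRequest can pick up any hidden fields
        return [Request(self.start_urls[0], callback=self.post_login)]

    def post_login(self, response):
        ## fill in and submit the form; 'log'/'pwd' are WordPress's default input names (assumed)
        return FormRequest.from_response(
            response,
            formdata={'log': account, 'pwd': password},
            callback=self.after_login
        )

    def after_login(self, response):
        ## logged in; start crawling from here
        pass
```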
```python
from subprocess import Popen, PIPE  # needed by getIfconfig/getDmi below

''' capture the output of the ifconfig command '''
def getIfconfig():
    p = Popen(['ifconfig'], stdout=PIPE)
    data = p.stdout.read()
    return data

''' capture the output of the dmidecode command '''
def getDmi():
    p = Popen(['dmidecode'], stdout=PIPE)
    data = p.stdout.read()
    return data

''' split the output into paragraphs on blank lines and return the list '''
def parseData(data):
    parsed_data = []
    new_line = ''
    data = [i for i in data.split('\n') if i]
    for line in data:
        if line[0].strip():
            parsed_data.append(new_line)
            new_line = line + '\n'
        else:
            new_line += line + '\n'
    parsed_data.append(new_line)
    return [i for i in parsed_data if i]

''' extract each NIC's IP information from the ifconfig paragraphs '''
def parseIfconfig(parsed_data):
    dic = {}
    parsed_data = [i for i in parsed_data if not i.startswith('lo')]
    for lines in parsed_data:
        line_list = lines.split('\n')
        devname = line_list[0].split()[0]
        macaddr = line_list[0].split()[-1]
        ipaddr = line_list[1].split()[1].split(':')[1]
        break
    dic['ip'] = ipaddr
    return dic

''' extract the requested fields from the dmidecode paragraphs '''
def parseDmi(parsed_data):
    dic = {}
    parsed_data = [i for i in parsed_data if i.startswith('System Information')]
    parsed_data = [i for i in parsed_data[0].split('\n')[1:] if i]
    dmi_dic = dict([i.strip().split(':') for i in parsed_data])
    dic['vender'] = dmi_dic['Manufacturer'].strip()
    dic['product'] = dmi_dic['Product Name'].strip()
    dic['sn'] = dmi_dic['Serial Number'].strip()
    return dic

''' get the Linux hostname '''
def getHostname():
    with open('/etc/sysconfig/network') as fd:
        for line in fd:
            if line.startswith('HOSTNAME'):
                hostname = line.split('=')[1].strip()
                break
    return {'hostname': hostname}

''' get the Linux distribution version '''
def getOsVersion():
    with open('/etc/issue') as fd:
        for line in fd:
            osver = line.strip()
            break
    return {'osver': osver}

''' get the CPU model and core count '''
def getCpu():
    num = 0
    with open('/proc/cpuinfo') as fd:
        for line in fd:
            if line.startswith('processor'):
                num += 1
            if line.startswith('model name'):
                cpu_model = line.split(':')[1].strip().split()
                cpu_model = cpu_model[0] + ' ' + cpu_model[2] + ' ' + cpu_model[-1]
    return {'cpu_num': num, 'cpu_model': cpu_model}

''' get the total physical memory '''
def getMemory():
    with open('/proc/meminfo') as fd:
        for line in fd:
            if line.startswith('MemTotal'):
                mem = int(line.split()[1].strip())
                break
    mem = '%.f' % (mem / 1024.0) + ' MB'
    return {'Memory': mem}
```
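Putting the helpers together, a small driver could assemble everything into one dict (a sketch; run it as root so dmidecode works):

```python
if __name__ == '__main__':
    dic = {}
    # raw command output -> paragraphs -> parsed fields
    dic.update(parseIfconfig(parseData(getIfconfig())))
    dic.update(parseDmi(parseData(getDmi())))
    dic.update(getHostname())
    dic.update(getOsVersion())
    dic.update(getCpu())
    dic.update(getMemory())

    for k, v in dic.items():
        print('%s: %s' % (k, v))
```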
```python
import re
import scrapy  ## import the scrapy package
from bs4 import BeautifulSoup
from scrapy.http import Request  ## the standalone Request module, needed whenever we follow a URL
from dingdian.items import DingdianItem  ## the fields I defined to save (the DingdianItem class from the items file of the dingdian project)
```
```
$ composer require monolog/monolog
Using version ^1.19 for monolog/monolog
./composer.json has been updated
Loading composer repositories with package information
Updating dependencies (including require-dev)
  - Installing psr/log (1.0.0)
    Downloading: 100%
```
ADSL (Asymmetric Digital Subscriber Line) is a data transmission technology. Because its upstream and downstream bandwidths differ, it is called "asymmetric". It uses frequency-division multiplexing to split an ordinary telephone line into three relatively independent channels (voice, upstream and downstream), which avoids interference between them.
It has one uniquely useful property: every time you redial, you get a new IP. In other words, its IP is not fixed, and since it is dial-up, the speed is still guaranteed. Build a proxy on top of it and you get something both reliable and fully under your control: just redial to switch IPs.
If you are already on ADSL, there is not much to set up: have your machine invoke a dial command, the IP changes automatically, and IP bans stop being a problem.
But you may say: I'm on home broadband, or the office Wi-Fi, or a network I'm borrowing! What then? In that case you need a dial-up VPS.
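The "invoke a dial command" part can be wrapped in a tiny helper. A minimal sketch, assuming a CentOS-style VPS where the PPPoE scripts are named adsl-stop and adsl-start (the command names vary by distro and are an assumption here):

```python
import os
import time

def change_ip():
    # hang up the current PPPoE session, then redial to get a fresh IP
    # ('adsl-stop'/'adsl-start' are the usual CentOS pppoe script names; adjust for your VPS)
    os.popen('adsl-stop')
    time.sleep(1)
    os.popen('adsl-start')

if __name__ == '__main__':
    change_ip()
```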
```python
import os
import time
import threading
import multiprocessing
from mongodb_queue import MogoQueue
from Download import request
from bs4 import BeautifulSoup
```
```python
threads = []
while threads or crawl_queue:
    """
    This is where crawl_queue comes in: our __bool__ method makes it truthy while
    the MongoDB queue still holds data. As long as either threads or crawl_queue
    is truthy, the download isn't finished and the loop keeps running.
    """
    for thread in threads:
        if not thread.is_alive():  ## is_alive tells us whether the thread is still running; drop finished ones from the list
            threads.remove(thread)
    while len(threads) < max_threads or crawl_queue.peek():  ## while the pool holds fewer than max_threads, or crawl_queue still has items
        thread = threading.Thread(target=pageurl_crawler)  ## create a thread
        thread.setDaemon(True)  ## mark it as a daemon thread
        thread.start()  ## start the thread
        threads.append(thread)  ## add it to the thread list
    time.sleep(SLEEP_TIME)
```
```python
def process_crawler():
    process = []
    num_cpus = multiprocessing.cpu_count()
    print('Number of processes to start:', num_cpus)
    for i in range(num_cpus):
        p = multiprocessing.Process(target=mzitu_crawler)  ## create a process
        p.start()  ## start the process
        process.append(p)  ## add it to the process list
    for p in process:
        p.join()  ## wait for every process in the list to finish
```
```python
if __name__ == '__main__':
    for i in range(5):
        p = multiprocessing.Process(target=process, args=(i,))
        p.start()
```
Creating a Process in its simplest form looks like the above: target takes the function, and args takes the function's arguments as a tuple (a single argument means a tuple of length 1). Then just call start() to launch the processes.
You can also use cpu_count() to get the number of CPU cores on the current machine, and active_children() to get all currently running child processes.
Let's get a feel for it with an example:
```python
import multiprocessing
import time

def process(num):
    time.sleep(num)
    print 'Process:', num

if __name__ == '__main__':
    for i in range(5):
        p = multiprocessing.Process(target=process, args=(i,))
        p.start()

    print('CPU number:' + str(multiprocessing.cpu_count()))
    for p in multiprocessing.active_children():
        print('Child process name: ' + p.name + ' id: ' + str(p.pid))

    print('Process Ended')
```
Output:
```
Process: 0
CPU number:8
Child process name: Process-2 id: 9641
Child process name: Process-4 id: 9643
Child process name: Process-5 id: 9644
Child process name: Process-3 id: 9642
Process Ended
Process: 1
Process: 2
Process: 3
Process: 4
```
Custom Process Class
You can also subclass Process to define your own process class; just implement the run method. Let's try an example:
```python
from multiprocessing import Process
import time

class MyProcess(Process):
    def __init__(self, loop):
        Process.__init__(self)
        self.loop = loop

    def run(self):
        for count in range(self.loop):
            time.sleep(1)
            print('Pid: ' + str(self.pid) + ' LoopCount: ' + str(count))

if __name__ == '__main__':
    for i in range(2, 5):
        p = MyProcess(i)
        p.start()
```
In the example above we subclass Process and implement the run method, which prints the process id and the loop counter. Output:
```python
class Producer(Process):
    def run(self):
        global buffer, empty, full, lock
        while True:
            empty.acquire()
            lock.acquire()
            num = random()
            print 'Producer put ', num
            buffer.put(num)
            time.sleep(1)
            lock.release()
            full.release()

if __name__ == '__main__':
    p = Producer()
    c = Consumer()
    p.daemon = c.daemon = True
    p.start()
    c.start()
    p.join()
    c.join()
    print 'Ended!'
```
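The Consumer class and the shared objects (buffer, empty, full, lock) referenced above aren't shown; a minimal sketch of the missing half, assuming a buffer bounded at two items by a pair of semaphores (the sizes are an assumption consistent with the output below):

```python
from multiprocessing import Process, Queue, Lock, Semaphore
import time
from random import random

buffer = Queue(10)      # the shared buffer
empty = Semaphore(2)    # free slots: bounds the buffer at 2 items
full = Semaphore(0)     # filled slots
lock = Lock()           # protects buffer access

class Consumer(Process):
    def run(self):
        global buffer, empty, full, lock
        while True:
            full.acquire()
            lock.acquire()
            print 'Consumer get ', buffer.get()
            time.sleep(1)
            lock.release()
            empty.release()
```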
Output:
```
Producer put 0.719213647437
Producer put 0.44287326683
Consumer get 0.719213647437
Consumer get 0.44287326683
Producer put 0.722859424381
Producer put 0.525321338921
Consumer get 0.722859424381
Consumer get 0.525321338921
```
You can see the producer puts data into the queue and the consumer takes it back out.
The get method takes two arguments, blocked and timeout, i.e. whether to block and for how long. blocked defaults to true, i.e. blocking mode: calling get on an empty queue then blocks. If you set blocked to false (non-blocking mode) it effectively calls get_nowait(); combined with a timeout, it raises a Queue.Empty exception if no element is retrieved within that time.
Likewise, calling put on a full queue blocks; with blocked set to false it effectively calls put_nowait(), and if the element cannot be inserted within the timeout it raises a Queue.Full exception.
Another common queue method is Queue.qsize(), which returns the size of the queue, but it doesn't work on Mac OS. Reason:
```python
def qsize(self):
    # Raises NotImplementedError on Mac OSX because of broken sem_getvalue()
    return self._maxsize - self._sem._semlock._get_value()
```
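To make those blocking semantics concrete, here is a small sketch (the queue size is arbitrary; the Empty/Full exceptions live in the Queue module on Python 2 and the queue module on Python 3):

```python
from multiprocessing import Queue
from Queue import Empty, Full  # Python 2; on Python 3: from queue import Empty, Full

q = Queue(maxsize=1)
q.put('a')
try:
    q.put('b', False)        # full queue, non-blocking -> raises Full at once
except Full:
    print('Queue.Full raised')

q.get()
try:
    q.get(True, 2)           # empty queue, blocking -> waits 2s, then raises Empty
except Empty:
    print('Queue.Empty raised')
```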
```
Producer Received: Consumer Words
Consumer Received: Producer Words
Ended!
```
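The Pipe example that produced this output isn't included above; a minimal sketch that would reproduce it (each end of a duplex Pipe is handed to one process, which sends a message and prints what it receives):

```python
from multiprocessing import Process, Pipe

class Consumer(Process):
    def __init__(self, pipe):
        Process.__init__(self)
        self.pipe = pipe

    def run(self):
        self.pipe.send('Consumer Words')
        print 'Consumer Received:', self.pipe.recv()

class Producer(Process):
    def __init__(self, pipe):
        Process.__init__(self)
        self.pipe = pipe

    def run(self):
        print 'Producer Received:', self.pipe.recv()
        self.pipe.send('Producer Words')

if __name__ == '__main__':
    pipe = Pipe()  # returns the two connection ends
    p = Producer(pipe[0])
    c = Consumer(pipe[1])
    p.daemon = c.daemon = True
    p.start()
    c.start()
    p.join()
    c.join()
    print 'Ended!'
```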
That's a brief introduction to Pipe.
Pool
When using Python for system administration, especially operating on many file directories at once or controlling many hosts remotely, parallelism saves a lot of time. If the number of targets is small you can spawn processes directly with multiprocessing's Process: a dozen or so is fine, but with hundreds or thousands of targets, limiting the number of processes by hand becomes far too tedious. That's where a process pool shines.
Pool provides a fixed number of processes for the user to draw on. When a new request is submitted to the pool and the pool isn't full, a new process is created to handle it; if the pool has already reached its configured maximum, the request waits until a process in the pool finishes, and only then is a process freed up to handle it.
Here it helps to understand blocking versus non-blocking. They describe the program's state while waiting for a call's result (a message or return value): blocking means the current process is suspended until the result arrives; non-blocking means it isn't. Pool can be used in both styles. Non-blocking means that after submitting a task you don't have to wait for it to finish before submitting more; blocking is the opposite.
Let's get a feel for the non-blocking usage with an example:
```python
from multiprocessing import Lock, Pool
import time

def function(index):
    print 'Start process: ', index
    time.sleep(3)
    print 'End process', index

if __name__ == '__main__':
    pool = Pool(processes=3)
    for i in xrange(4):
        pool.apply_async(function, (i,))

    # close the pool and wait; otherwise the main process exits
    # before the workers get a chance to print anything
    print "Started processes"
    pool.close()
    pool.join()
    print "Subprocess done."
```
```python
from multiprocessing import Lock, Pool
import time

def function(index):
    print 'Start process: ', index
    time.sleep(3)
    print 'End process', index
    return index

if __name__ == '__main__':
    pool = Pool(processes=3)
    for i in xrange(4):
        result = pool.apply_async(function, (i,))
        print result.get()
    print "Started processes"
    pool.close()
    pool.join()
    print "Subprocess done."
```
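Note that calling result.get() inside the submission loop blocks until that particular task finishes, so the tasks effectively run one after another. To keep the pool busy, one option is to collect the AsyncResult objects first and fetch the values afterwards (a sketch reusing the function above):

```python
if __name__ == '__main__':
    pool = Pool(processes=3)
    # submit everything first; call get() only after all tasks are queued
    results = [pool.apply_async(function, (i,)) for i in xrange(4)]
    pool.close()
    pool.join()
    print [r.get() for r in results]
```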
```
Thread-1: Thu Nov 3 16:43:01 2016
Thread-2: Thu Nov 3 16:43:03 2016
Thread-1: Thu Nov 3 16:43:03 2016
Thread-1: Thu Nov 3 16:43:05 2016
Thread-2: Thu Nov 3 16:43:07 2016
Thread-1: Thu Nov 3 16:43:07 2016
Thread-1: Thu Nov 3 16:43:09 2016
Thread-2: Thu Nov 3 16:43:11 2016
Thread-2: Thu Nov 3 16:43:15 2016
Thread-2: Thu Nov 3 16:43:19 2016
```
```
Starting Thread-1
Starting Thread-2
Exiting Main Thread
Thread-1: Thu Nov 3 18:42:19 2016
Thread-2: Thu Nov 3 18:42:20 2016
Thread-1: Thu Nov 3 18:42:20 2016
Thread-1: Thu Nov 3 18:42:21 2016
Thread-2: Thu Nov 3 18:42:22 2016
Thread-1: Thu Nov 3 18:42:22 2016
Thread-1: Thu Nov 3 18:42:23 2016
Exiting Thread-1
Thread-2: Thu Nov 3 18:42:24 2016
Thread-2: Thu Nov 3 18:42:26 2016
Thread-2: Thu Nov 3 18:42:28 2016
Exiting Thread-2
```
```
Starting Thread-1
Starting Thread-2
Thread-1: Thu Nov 3 18:56:49 2016
Thread-1: Thu Nov 3 18:56:50 2016
Thread-1: Thu Nov 3 18:56:51 2016
Thread-2: Thu Nov 3 18:56:53 2016
Thread-2: Thu Nov 3 18:56:55 2016
Thread-2: Thu Nov 3 18:56:57 2016
Exiting Main Thread
```
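The thread class that produces these timestamp traces isn't reproduced above; a minimal sketch of the classic demo, matching the joined variant directly above (print_time, the delays and the loop count are the usual tutorial choices, assumed here):

```python
import threading
import time

def print_time(thread_name, delay, counter):
    # print a timestamp every `delay` seconds, `counter` times
    while counter:
        time.sleep(delay)
        print '%s: %s' % (thread_name, time.ctime(time.time()))
        counter -= 1

class myThread(threading.Thread):
    def __init__(self, name, delay):
        threading.Thread.__init__(self)
        self.name = name
        self.delay = delay

    def run(self):
        print 'Starting ' + self.name
        print_time(self.name, self.delay, 3)
        print 'Exiting ' + self.name

thread1 = myThread('Thread-1', 1)
thread2 = myThread('Thread-2', 2)
thread1.start()
thread2.start()
thread1.join()   # with join, 'Exiting Main Thread' prints only after both threads finish
thread2.join()
print 'Exiting Main Thread'
```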
```python
# create the new threads
for tName in threadList:
    thread = myThread(threadID, tName, workQueue)
    thread.start()
    threads.append(thread)
    threadID += 1

# fill the queue
queueLock.acquire()
for word in nameList:
    workQueue.put(word)
queueLock.release()

# wait for the queue to empty
while not workQueue.empty():
    pass

# signal the threads that it's time to exit
exitFlag = 1

# wait for all threads to finish
for t in threads:
    t.join()
print "Exiting Main Thread"
```
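The snippet relies on a myThread class and shared state defined earlier; the missing pieces of this classic queue demo might look like this (a sketch; process_data and the variable names follow the usual tutorial version):

```python
import Queue
import threading
import time

exitFlag = 0
threadList = ['Thread-1', 'Thread-2', 'Thread-3']
nameList = ['One', 'Two', 'Three', 'Four', 'Five']
queueLock = threading.Lock()
workQueue = Queue.Queue(10)
threads = []
threadID = 1

def process_data(threadName, q):
    # pull words off the queue until exitFlag is raised
    while not exitFlag:
        queueLock.acquire()
        if not workQueue.empty():
            data = q.get()
            queueLock.release()
            print '%s processing %s' % (threadName, data)
        else:
            queueLock.release()
        time.sleep(1)

class myThread(threading.Thread):
    def __init__(self, threadID, name, q):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.q = q

    def run(self):
        print 'Starting ' + self.name
        process_data(self.name, self.q)
        print 'Exiting ' + self.name
```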
Output:
```
Starting Thread-1
Starting Thread-2
Starting Thread-3
Thread-3 processing One
Thread-1 processing Two
Thread-2 processing Three
Thread-3 processing Four
Thread-2 processing Five
Exiting Thread-2
Exiting Thread-3
Exiting Thread-1
Exiting Main Thread
```
Feel free to instantiate it yourself and check whether the headers actually change ε=ε=ε=(~ ̄▽ ̄)~
Okay, moving on, there is one point left to handle: anti-scraping based on IP rate limiting.
First we need a site that hands out proxy IPs, and I found this one: http://haoip.cc/tiqu.htm (I was originally going to teach you to maintain your own IP proxy pool, but that's a bit of a hassle! Luckily I found this proxy site with such a generous owner, so I can be lazy with a clear conscience ヾ(≧O≦)〃)
Let's scrape those IPs. I was going to let you write this yourselves, but it uses a regular expression, and simple as it is, some of you may not know how yet. So I've written it out:
```python
import re
import requests

iplist = []  ## initialize a list to hold the IPs we fetch
html = requests.get("http://haoip.cc/tiqu.htm")  ## no explanation needed by now
iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)  ## grab everything between r/> and <b in html.text; re.S makes the match include newlines; findall returns a list!
for ip in iplistn:
    i = re.sub('\n', '', ip)  ## re.sub is re's substitution method; here we replace \n with nothing
    iplist.append(i.strip())  ## append to the list we initialized above; i.strip() removes surrounding whitespace (if you didn't know that, your basics need work!)
    print(i.strip())

print(iplist)
```
Let's print it and take a look. The content inside [———————] below is what was appended to our initialized iplist. Perfect!! Now let's add this snippet to the code we wrote earlier, and check whether a proxy is being used:
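A sketch of what that check might look like: pick a random proxy from iplist for each request and fall back to another one if it fails (the function name and retry logic are illustrative, not from the original code):

```python
import random
import requests

def download(url, timeout=6, num_retries=3):
    ## route each attempt through a random proxy from the iplist we scraped above
    for _ in range(num_retries):
        IP = random.choice(iplist)  ## pick a random proxy
        proxies = {'http': 'http://' + IP}
        try:
            return requests.get(url, timeout=timeout, proxies=proxies)
        except requests.exceptions.RequestException:
            continue  ## this proxy failed; try another one
    return None
```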
If you’re familiar with modifying variables in Sass—or any other CSS preprocessor—you’ll be right at home to move into flexbox mode.
Open the _variables.scss file and find the $enable-flex variable.
Change it from false to true (that is, $enable-flex: true;).
Recompile, and done!
Alternatively, if you don’t need the source Sass files, you may swap the default Bootstrap compiled CSS with the compiled flexbox variation. Head to the download page for more information.
```html
<div class="container">
  <div class="row">
    <div class="col-xs">1 of 3</div>
    <div class="col-xs-6">2 of 3 (wider)</div>
    <div class="col-xs">3 of 3</div>
  </div>
  <div class="row">
    <div class="col-xs">1 of 3</div>
    <div class="col-xs-5">2 of 3 (wider)</div>
    <div class="col-xs">3 of 3</div>
  </div>
</div>
<div class="container">
  <div class="row flex-items-xs-top">
    <div class="col-xs">One of three columns</div>
    <div class="col-xs">One of three columns</div>
    <div class="col-xs">One of three columns</div>
  </div>
  <div class="row flex-items-xs-middle">
    <div class="col-xs">One of three columns</div>
    <div class="col-xs">One of three columns</div>
    <div class="col-xs">One of three columns</div>
  </div>
  <div class="row flex-items-xs-bottom">
    <div class="col-xs">One of three columns</div>
    <div class="col-xs">One of three columns</div>
    <div class="col-xs">One of three columns</div>
  </div>
</div>
<div class="container">
  <div class="row flex-items-xs-left">
    <div class="col-xs-4">One of two columns</div>
    <div class="col-xs-4">One of two columns</div>
  </div>
  <div class="row flex-items-xs-center">
    <div class="col-xs-4">One of two columns</div>
    <div class="col-xs-4">One of two columns</div>
  </div>
  <div class="row flex-items-xs-right">
    <div class="col-xs-4">One of two columns</div>
    <div class="col-xs-4">One of two columns</div>
  </div>
  <div class="row flex-items-xs-around">
    <div class="col-xs-4">One of two columns</div>
    <div class="col-xs-4">One of two columns</div>
  </div>
  <div class="row flex-items-xs-between">
    <div class="col-xs-4">One of two columns</div>
    <div class="col-xs-4">One of two columns</div>
  </div>
</div>
<div class="container">
  <div class="row">
    <div class="col-xs flex-xs-unordered">First, but unordered</div>
    <div class="col-xs flex-xs-last">Second, but last</div>
    <div class="col-xs flex-xs-first">Third, but first</div>
  </div>
</div>
<style>
  .row {
    margin-top: 1rem;
  }
  .row > [class^="col-"] {
    padding-top: .75rem;
    padding-bottom: .75rem;
    background-color: rgba(86, 61, 124, 0.15);
    border: 1px solid rgba(86, 61, 124, 0.2);
  }
  .flex-items-xs-top,
  .flex-items-xs-middle,
  .flex-items-xs-bottom {
    min-height: 6rem;
    background-color: rgba(255, 0, 0, 0.1);
  }
</style>
```
{ "name": "bootstrap-sass-demo", "authors": [ "Germey" ], "description": "bootstrap-sass is a Sass-powered version of Bootstrap, ready to drop right into your Sass powered applications.", "moduleType": "globals", "keywords": [ "twbs", "bootstrap", "sass" ], "license": "MIT", "dependencies": { "jquery": ">= 1.9.0" } }
```python
@config(age=10 * 24 * 60 * 60)
def index_page(self, response):
    for each in response.doc('a[href^="http"]').items():
        self.crawl(each.attr.href, callback=self.detail_page)
```
pyquery allows you to make jQuery queries on XML documents. The API is as similar to jQuery as possible. pyquery uses lxml for fast XML and HTML manipulation. This is not (or at least not yet) a library to produce or interact with JavaScript code. I just liked the jQuery API and missed it in Python, so I told myself "Hey let's make jQuery in Python". This is the result. It can be used for many purposes; one idea I might try in the future is to use it for templating with pure HTTP templates that you modify using pyquery. It can also be used for web scraping or for theming applications with Deliverance.
Initialization
Here are four ways to initialize it.
(1) Directly from a string
```python
from pyquery import PyQuery as pq
doc = pq("<html></html>")
```
The pq constructor can take HTML code directly, and doc now plays the role of jQuery's $ symbol.
(2) From lxml.etree
```python
from lxml import etree
doc = pq(etree.fromstring("<html></html>"))
```
You can first run the code through lxml's etree; that way, if your HTML is incomplete or sloppy, it is automatically normalized into well-formed, cleanly structured HTML.
(3) Passing a URL directly
```python
from pyquery import PyQuery as pq
doc = pq('http://www.baidu.com')
```
This behaves as if you requested the page directly, much like fetching the URL with urllib2 to get its HTML.
(4) From a file
```python
from pyquery import PyQuery as pq
doc = pq(filename='hello.html')
```
As elegant and confident as ever!
Notice that this is a chain of operations, with p changing on top of each previous result. So after the operations above, p itself has changed too.
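The chained example being referred to isn't shown above; a sketch of the kind of chain meant here (addClass, removeClass and css are real pyquery methods; the exact calls are illustrative):

```python
from pyquery import PyQuery as pq

p = pq('<p id="hello" class="hello"></p>')('p')
# each call mutates p and returns it, so the calls can be chained
print p.addClass('beauty')
print p.removeClass('hello')
print p.css('font-size', '16px')
print p.css({'background-color': 'yellow'})
```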
DOM Manipulation
The same authentic jQuery syntax:
```python
from pyquery import PyQuery as pq

p = pq('<p id="hello" class="hello"></p>')('p')
print p.append(' check out <a href="http://reddit.com/r/python"><span>reddit</span></a>')
print p.prepend('Oh yes!')
d = pq('<div class="wrap"><div id="test"><a href="http://cuiqingcai.com">Germy</a></div></div>')
p.prependTo(d('#test'))
print p
print d
d.empty()
print d
```
Output:
<pid="hello"class="hello"> check out <ahref="http://reddit.com/r/python"><span>reddit</span></a></p> <pid="hello"class="hello">Oh yes! check out <ahref="http://reddit.com/r/python"><span>reddit</span></a></p> <pid="hello"class="hello">Oh yes! check out <ahref="http://reddit.com/r/python"><span>reddit</span></a></p> <divclass="wrap"><divid="test"><pid="hello"class="hello">Oh yes! check out <ahref="http://reddit.com/r/python"><span>reddit</span></a></p><ahref="http://cuiqingcai.com">Germy</a></div></div> <divclass="wrap"/>
This hardly needs explanation; DOM manipulation works exactly like jQuery's.
Traversal
Traversal uses the items method, which returns a list of objects, or you can use a lambda:
```python
from pyquery import PyQuery as pq

doc = pq(filename='hello.html')
lis = doc('li')
for li in lis.items():
    print li.html()
```
```python
from lxml import etree

html = etree.parse('hello.html')
result = etree.tostring(html, pretty_print=True)
print(result)
```
This gives the same result.
XPath Example Tests
Still using the previous program as an example.
(1) Get all the <li> tags
```python
from lxml import etree

html = etree.parse('hello.html')
print type(html)
result = html.xpath('//li')
print result
print len(result)
print type(result)
print type(result[0])
```
Output:
```
<type 'lxml.etree._ElementTree'>
[<Element li at 0x1014e0e18>, <Element li at 0x1014e0ef0>, <Element li at 0x1014e0f38>, <Element li at 0x1014e0f80>, <Element li at 0x1014e0fc8>]
5
<type 'list'>
<type 'lxml.etree._Element'>
```
The driver.get method will navigate to a page given by the URL. WebDriver will wait until the page has fully loaded (that is, the “onload” event has fired) before returning control to your test or script. It’s worth noting that if your page uses a lot of AJAX on load then WebDriver may not know when it has completely loaded.
WebDriver offers a number of ways to find elements using one of the find_element_by_* methods. For example, the input text element can be located by its name attribute using the find_element_by_name method.
Next we are sending keys; this is similar to entering keys using your keyboard. Special keys can be sent using the Keys class imported from selenium.webdriver.common.keys.
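A short sketch of both steps, along the lines of the official getting-started example (the python.org search box, whose input is named "q", is just the usual demo target):

```python
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Firefox()
driver.get("http://www.python.org")
elem = driver.find_element_by_name("q")   # locate the search box by its name attribute
elem.send_keys("pycon")                   # type text, as if from the keyboard
elem.send_keys(Keys.RETURN)               # send a special key: Enter
driver.close()
```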
The test case class is inherited from unittest.TestCase. Inheriting from the TestCase class is the way to tell the unittest module that this is a test case. setUp is part of initialization; this method will get called before every test function you write in this test case class. Test case methods should always start with the characters test. The tearDown method will get called after every test method; this is the place for all cleanup actions. You can also call the quit method instead of close. quit will exit the entire browser, whereas close will close a tab; but if it is the only tab open, by default most browsers will exit entirely.
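Put together, a test case skeleton of the kind being described might look like this (modeled on the usual Selenium/unittest example; the class and test names are illustrative):

```python
import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

class PythonOrgSearch(unittest.TestCase):

    def setUp(self):
        # runs before every test_* method
        self.driver = webdriver.Firefox()

    def test_search_in_python_org(self):
        driver = self.driver
        driver.get("http://www.python.org")
        self.assertIn("Python", driver.title)
        elem = driver.find_element_by_name("q")
        elem.send_keys("pycon")
        elem.send_keys(Keys.RETURN)
        assert "No results found." not in driver.page_source

    def tearDown(self):
        # runs after every test_* method; close() closes the tab, quit() the whole browser
        self.driver.close()

if __name__ == "__main__":
    unittest.main()
```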
```python
element = driver.find_element_by_id("passwd-id")
element = driver.find_element_by_name("passwd")
element = driver.find_elements_by_tag_name("input")
element = driver.find_element_by_xpath("//input[@id='passwd-id']")
```
```python
element = driver.find_element_by_xpath("//select[@name='name']")
all_options = element.find_elements_by_tag_name("option")
for option in all_options:
    print("Value is: %s" % option.get_attribute("value"))
    option.click()
```
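Clicking every option works, but WebDriver also ships a Select wrapper that is usually more convenient (a sketch; the selected values are arbitrary):

```python
from selenium.webdriver.support.ui import Select

select = Select(driver.find_element_by_xpath("//select[@name='name']"))
select.select_by_index(0)                 # by position
select.select_by_visible_text("text")     # by the option's visible label
select.select_by_value("value")           # by the option's value attribute
```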
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
```
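These imports belong to Selenium's explicit waits; the canonical usage polls for up to a fixed number of seconds until a condition holds (the URL and element id below are the documentation's placeholders):

```python
driver = webdriver.Firefox()
driver.get("http://somedomain/url_that_delays_loading")
try:
    # poll for up to 10 seconds until the element appears, else raise TimeoutException
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "myDynamicElement"))
    )
finally:
    driver.quit()
```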
PhantomJS is a headless WebKit scriptable with a JavaScript API. It has fast and native support for various web standards: DOM handling, CSS selectors, JSON, Canvas, and SVG.
```
Loading http://cuiqingcai.com
Loading time 11678 msec
```
This time includes the JS rendering time and, of course, also depends on network speed.
Code Evaluation
To evaluate JavaScript code in the context of the web page, use the evaluate() function. The execution is "sandboxed": there is no way for the code to access any JavaScript objects and variables outside its own page context. An object can be returned from evaluate(); however, it is limited to simple objects and can't contain functions or closures.
```javascript
var url = 'http://www.baidu.com';
var page = require('webpage').create();
page.open(url, function(status) {
    var title = page.evaluate(function() {
        return document.title;
    });
    console.log('Page title is ' + title);
    phantom.exit();
});
```
Since PhantomJS is using WebKit, a real layout and rendering engine, it can capture a web page as a screenshot. Because PhantomJS can render anything on the web page, it can be used to convert contents not only in HTML and CSS, but also SVG and Canvas.
```javascript
var page = require('webpage').create();
// viewportSize being the actual size of the headless browser
page.viewportSize = { width: 1024, height: 768 };
// the clipRect is the portion of the page you are taking a screenshot of
page.clipRect = { top: 0, left: 0, width: 1024, height: 768 };
// the rest of the code is the same as the previous example
page.open('http://cuiqingcai.com/', function() {
    page.render('germy.png');
    phantom.exit();
});
```
Because PhantomJS permits the inspection of network traffic, it is suitable for building various analyses of network behavior and performance.
When a page requests a resource from a remote server, both the request and the response can be tracked via the onResourceRequested and onResourceReceived callbacks.
Because PhantomJS can load and manipulate a web page, it is perfect to carry out various page automations.
DOM Manipulation
Since the script is executed as if it is running on a web browser, standard DOM scripting and CSS selectors work just fine.
For example, the following script modifies the User-Agent and then returns the content of an element on the page.
```javascript
var page = require('webpage').create();
console.log('The default user agent is ' + page.settings.userAgent);
page.settings.userAgent = 'SpecialAgent';
page.open('http://www.httpuseragent.org', function(status) {
    if (status !== 'success') {
        console.log('Unable to access network');
    } else {
        var ua = page.evaluate(function() {
            return document.getElementById('myagent').textContent;
        });
        console.log(ua);
    }
    phantom.exit();
});
```
Output:
```
The default user agent is Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.0 Safari/538.1
Your Http User Agent string is: SpecialAgent
```
- --help or -h lists all possible command-line options. Halts immediately; will not run a script passed as argument.
- --version or -v prints out the version of PhantomJS. Halts immediately; will not run a script passed as argument.
- --cookies-file=/path/to/cookies.txt specifies the file name to store the persistent Cookies.
- --disk-cache=[true|false] enables disk cache (at desktop services cache storage location, default is false). Also accepted: [yes|no].
- --ignore-ssl-errors=[true|false] ignores SSL errors, such as expired or self-signed certificate errors (default is false). Also accepted: [yes|no].
- --load-images=[true|false] loads all inlined images (default is true). Also accepted: [yes|no].
- --local-storage-path=/some/path path to save LocalStorage content and WebSQL content.
- --local-storage-quota=number maximum size to allow for data.
- --local-to-remote-url-access=[true|false] allows local content to access remote URLs (default is false). Also accepted: [yes|no].
- --max-disk-cache-size=size limits the size of disk cache (in KB).
- --output-encoding=encoding sets the encoding used for terminal output (default is utf8).
- --remote-debugger-port starts the script in a debug harness and listens on the specified port.
- --remote-debugger-autorun runs the script in the debugger immediately: 'yes' or 'no' (default).
- --proxy=address:port specifies the proxy server to use (e.g. --proxy=192.168.1.42:8080).
- --proxy-type=[http|socks5|none] specifies the type of the proxy server (default is http).
- --proxy-auth specifies the authentication information for the proxy (e.g. --proxy-auth=username:password).
- --script-encoding=encoding sets the encoding used for the starting script (default is utf8).
- --ssl-protocol=[sslv3|sslv2|tlsv1|any] sets the SSL protocol for secure connections (default is SSLv3).
- --ssl-certificates-path=/path sets the location for custom CA certificates (if none set, uses system default).
- --web-security=[true|false] enables web security and forbids cross-domain XHR (default is true). Also accepted: [yes|no].
- --webdriver starts in 'Remote WebDriver mode' (embedded GhostDriver), optionally at a given [IP:]port (default '127.0.0.1:8910').
- --webdriver-selenium-grid-hub URL to the Selenium Grid HUB (default 'none'). (NOTE: works only together with '--webdriver'.)
- --config=/path/to/config.json can utilize a JavaScript Object Notation (JSON) configuration file instead of passing in multiple command-line options.
```python
r = requests.post("http://httpbin.org/post")
r = requests.put("http://httpbin.org/put")
r = requests.delete("http://httpbin.org/delete")
r = requests.head("http://httpbin.org/get")
r = requests.options("http://httpbin.org/get")
```
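Each of these calls returns a Response object; a quick sketch of inspecting one (httpbin.org simply echoes the request back, which makes it handy for experiments):

```python
import requests

r = requests.get("http://httpbin.org/get", params={'key': 'value'})
print(r.url)          # http://httpbin.org/get?key=value
print(r.status_code)  # 200
print(r.text)         # the response body as text
print(r.json())       # the body parsed as JSON
```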
PhantomJS is a server-side JavaScript API based on WebKit. It fully supports the web without needing a browser, and it is fast, with native support for various web standards: DOM handling, CSS selectors, JSON, Canvas and SVG. PhantomJS can be used for page automation, network monitoring, web page screenshots and headless testing.
Installation
The official installation instructions are attached above. If you are on Ubuntu or Mac OS X, you can install it directly with a command. Ubuntu: