下载完成后,将 PhantomJS 可执行文件所在的路径配置到环境变量里。比如在 Windows 下,将下载的文件解压之后并打开,会看到一个 bin 文件夹,里面包括一个可执行文件 phantomjs.exe,我们需要将它直接放在配置好环境变量的路径下或者将它所在的路径配置到环境变量里。比如,我们既可以将它直接复制到 Python 的 Scripts 文件夹,也可以将它所在的 bin 目录加入到环境变量。
Windows 下环境变量的配置可以参见 1.1 节,Linux 及 Mac 环境变量的配置可以参见 1.2.3 节,在此不再赘述,关键在于将 PhantomJS 的可执行文件所在路径配置到环境变量里。
配置成功后,可以在命令行下测试一下,输入:
1
phantomjs
如果可以进入到 PhantomJS 的命令行,那就证明配置完成了,如图 1-21 所示。
图 1-21 控制台
3. 验证安装
在 Selenium 中使用的话,我们只需要将 Chrome 切换为 PhantomJS 即可:
1 2 3 4
# Minimal Selenium + PhantomJS smoke test: open Baidu and print the URL.
# (Restored from an extraction-collapsed one-liner.)
from selenium import webdriver

browser = webdriver.PhantomJS()
browser.get('https://www.baidu.com')
print(browser.current_url)
# Build a vocabulary with frequency-ordered integer ids.
# NOTE(review): `words` (a list of token lists) is defined elsewhere in the
# original program — this excerpt uses it as a free variable.
from itertools import chain
import pandas as pd
import numpy as np

# Merge all words
all_words = list(chain(*words))
# All words to Series
all_words_sr = pd.Series(all_words)
# Get value count, index changed to set
all_words_counts = all_words_sr.value_counts()
# Get words set
all_words_set = all_words_counts.index
# Get words ids (1-based, most frequent word gets id 1)
all_words_ids = range(1, len(all_words_set) + 1)
from __future__ import (absolute_import, division, print_function)
__metaclass__ = type

import json
import pipes
import shlex

from ansible import constants as C
from ansible.compat.six import text_type, iteritems
from ansible.errors import AnsibleError
from ansible.release import __version__
try: from __main__ import display except ImportError: from ansible.utils.display import Display display = Display()
class MagicStackBase(object):
    """Mixin for Ansible action plugins.

    Adds the ability to mount an NFS share on the managed host around module
    execution (e.g. to expose Python packages to the module) and provides a
    customized ``_execute_module`` that performs the mount/umount steps.

    NOTE(review): restored from an extraction-mangled source (missing spaces
    such as ``ifnot``/``isNone``); logic preserved as visible.
    """

    def _mount_nfs(self, ansible_nfs_src, ansible_nfs_dest):
        # Mount the NFS export on the remote host. shlex.quote (the modern
        # name of pipes.quote) guards paths containing shell metacharacters.
        cmd = ['mount', ansible_nfs_src, ansible_nfs_dest]
        cmd = [shlex.quote(c) for c in cmd]
        cmd = ' '.join(cmd)
        result = self._low_level_execute_command(cmd=cmd, sudoable=True)
        return result

    def _umount_nfs(self, ansible_nfs_dest):
        # Unmount the NFS export previously mounted by _mount_nfs().
        cmd = ['umount', ansible_nfs_dest]
        cmd = [shlex.quote(c) for c in cmd]
        cmd = ' '.join(cmd)
        result = self._low_level_execute_command(cmd=cmd, sudoable=True)
        return result

    def _execute_module(self, module_name=None, module_args=None, tmp=None,
                        task_vars=None, persist_files=False,
                        delete_remote_tmp=True):
        '''Transfer and run a module along with its arguments.'''
        if task_vars is None:
            task_vars = dict()

        # if a module name was not specified for this execution, use
        # the action from the task
        if module_name is None:
            module_name = self._task.action
        if module_args is None:
            module_args = self._task.args

        # set check mode in the module arguments, if required
        if self._play_context.check_mode:
            if not self._supports_check_mode:
                raise AnsibleError("check mode is not supported for this operation")
            module_args['_ansible_check_mode'] = True
        else:
            module_args['_ansible_check_mode'] = False

        # Get the connection user for permission checks
        remote_user = task_vars.get('ansible_ssh_user') or self._play_context.remote_user

        # set no log in the module arguments, if required
        module_args['_ansible_no_log'] = self._play_context.no_log or C.DEFAULT_NO_TARGET_SYSLOG

        # set debug in the module arguments, if required
        module_args['_ansible_debug'] = C.DEFAULT_DEBUG

        # let module know we are in diff mode
        module_args['_ansible_diff'] = self._play_context.diff

        # let module know our verbosity
        module_args['_ansible_verbosity'] = display.verbosity

        # give the module information about the ansible version
        module_args['_ansible_version'] = __version__

        # set the syslog facility to be used in the module
        module_args['_ansible_syslog_facility'] = task_vars.get('ansible_syslog_facility', C.DEFAULT_SYSLOG_FACILITY)

        # let module know about filesystems that selinux treats specially
        module_args['_ansible_selinux_special_fs'] = C.DEFAULT_SELINUX_SPECIAL_FS

        # get nfs info for mount python packages
        ansible_nfs_src = task_vars.get("ansible_nfs_src", None)
        ansible_nfs_dest = task_vars.get("ansible_nfs_dest", None)

        # NOTE(review): module_style, module_data, environment_string, shebang
        # and remote_files are used below but their assignments are missing
        # from this excerpt (presumably a lost _configure_module /
        # _compute_environment_string section) — restore before use.

        # a remote tmp path may be necessary and not already created
        remote_module_path = None
        args_file_path = None
        if not tmp and self._late_needs_tmp_path(tmp, module_style):
            tmp = self._make_tmp_path(remote_user)

        if tmp:
            remote_module_filename = self._connection._shell.get_remote_filename(module_name)
            remote_module_path = self._connection._shell.join_path(tmp, remote_module_filename)
            if module_style in ['old', 'non_native_want_json']:
                # we'll also need a temp file to hold our module arguments
                args_file_path = self._connection._shell.join_path(tmp, 'args')

        if remote_module_path or module_style != 'new':
            display.debug("transferring module to remote")
            self._transfer_data(remote_module_path, module_data)
            if module_style == 'old':
                # we need to dump the module args to a k=v string in a file on
                # the remote system, which can be read and parsed by the module
                args_data = ""
                for k, v in iteritems(module_args):
                    args_data += '%s=%s ' % (k, shlex.quote(text_type(v)))
                self._transfer_data(args_file_path, args_data)
            elif module_style == 'non_native_want_json':
                self._transfer_data(args_file_path, json.dumps(module_args))
            display.debug("done transferring module to remote")

        # Fix permissions of the tmp path and tmp files. This should be
        # called after all files have been transferred.
        if remote_files:
            self._fixup_perms2(remote_files, remote_user)

        # mount nfs
        if ansible_nfs_src and ansible_nfs_dest:
            result = self._mount_nfs(ansible_nfs_src, ansible_nfs_dest)
            if result['rc'] != 0:
                raise AnsibleError("mount nfs failed!!! {0}".format(result['stderr']))

        cmd = ""
        in_data = None

        # With pipelining the module is fed over stdin instead of being
        # written to the remote tmp dir first.
        if self._connection.has_pipelining and self._play_context.pipelining and not C.DEFAULT_KEEP_REMOTE_FILES and module_style == 'new':
            in_data = module_data
        else:
            if remote_module_path:
                cmd = remote_module_path

        rm_tmp = None
        if tmp and "tmp" in tmp and not C.DEFAULT_KEEP_REMOTE_FILES and not persist_files and delete_remote_tmp:
            if not self._play_context.become or self._play_context.become_user == 'root':
                # not sudoing or sudoing to root, so can cleanup files in the same step
                rm_tmp = tmp

        cmd = self._connection._shell.build_module_command(environment_string, shebang, cmd, arg_path=args_file_path, rm_tmp=rm_tmp)
        cmd = cmd.strip()

        sudoable = True
        if module_name == "accelerate":
            # always run the accelerate module as the user
            # specified in the play, not the sudo_user
            sudoable = False

        res = self._low_level_execute_command(cmd, sudoable=sudoable, in_data=in_data)

        # umount nfs
        if ansible_nfs_src and ansible_nfs_dest:
            result = self._umount_nfs(ansible_nfs_dest)
            if result['rc'] != 0:
                raise AnsibleError("umount nfs failed!!! {0}".format(result['stderr']))

        if tmp and "tmp" in tmp and not C.DEFAULT_KEEP_REMOTE_FILES and not persist_files and delete_remote_tmp:
            if self._play_context.become and self._play_context.become_user != 'root':
                # not sudoing to root, so maybe can't delete files as that other user
                # have to clean up temp files as original user in a second step
                tmp_rm_cmd = self._connection._shell.remove(tmp, recurse=True)
                tmp_rm_res = self._low_level_execute_command(tmp_rm_cmd, sudoable=False)
                tmp_rm_data = self._parse_returned_data(tmp_rm_res)
                if tmp_rm_data.get('rc', 0) != 0:
                    display.warning('Error deleting remote temporary files (rc: {0}, stderr: {1})'.format(tmp_rm_res.get('rc'), tmp_rm_res.get('stderr', 'No error string available.')))

        # parse the main result
        data = self._parse_returned_data(res)

        # pre-split stdout into lines, if stdout is in the data and there
        # isn't already a stdout_lines value there
        if 'stdout' in data and 'stdout_lines' not in data:
            data['stdout_lines'] = data.get('stdout', u'').splitlines()

        display.debug("done with _execute_module (%s, %s)" % (module_name, module_args))
        return data
from __future__ import (absolute_import, division, print_function) __metaclass__ = type
from ansible.plugins.action import ActionBase from ansible.utils.vars import merge_hash
from common.ansible_plugins import MagicStackBase
class ActionModule(MagicStackBase, ActionBase):
    """'normal'-style action plugin: runs the task's module through
    MagicStackBase._execute_module (with optional NFS mounting)."""

    def run(self, tmp=None, task_vars=None):
        if task_vars is None:
            task_vars = dict()

        results = super(ActionModule, self).run(tmp, task_vars)
        # remove as modules might hide due to nolog
        del results['invocation']['module_args']
        results = merge_hash(results, self._execute_module(tmp=tmp, task_vars=task_vars))

        # Remove special fields from the result, which can only be set
        # internally by the executor engine. We do this only here in
        # the 'normal' action, as other action plugins may set this.
        #
        # We don't want modules to determine that running the module fires
        # notify handlers. That's for the playbook to decide.
        for field in ('_ansible_notify',):
            if field in results:
                results.pop(field)

        # BUG FIX(review): the excerpt dropped the trailing return; without it
        # run() returns None and the task result is lost.
        return results
@property
def state_size(self):
    """size(s) of state(s) used by this cell.

    It can be represented by an Integer, a TensorShape or a tuple of
    Integers or TensorShapes.
    """
    # Abstract: concrete cell classes must override this property.
    raise NotImplementedError("Abstract method")
@property
def output_size(self):
    """Integer or TensorShape: size of outputs produced by this cell."""
    # Abstract: concrete cell classes must override this property.
    raise NotImplementedError("Abstract method")
def __call__(self, inputs, state, scope=None):
    # Run the cell under an appropriate variable scope so weights are
    # created/reused via self._rnn_get_variable. (Restored from an
    # extraction-collapsed one-liner.)
    if scope is not None:
        with vs.variable_scope(scope, custom_getter=self._rnn_get_variable) as scope:
            return super(RNNCell, self).__call__(inputs, state, scope=scope)
    else:
        with vs.variable_scope(vs.get_variable_scope(), custom_getter=self._rnn_get_variable):
            return super(RNNCell, self).__call__(inputs, state)
class BasicRNNCell(RNNCell):
    """The most basic RNN cell.

    Args:
      num_units: int, The number of units in the RNN cell.
      activation: Nonlinearity to use. Default: `tanh`.
      reuse: (optional) Python boolean describing whether to reuse variables
        in an existing scope. If not `True`, and the existing scope already
        has the given variables, an error is raised.
    """
    # NOTE(review): only the class header and docstring are present in this
    # excerpt; the method bodies were lost in extraction.
def __call__(self, args):
    # Accept a single tensor when the layer was built from a non-sequence.
    if not self._is_sequence:
        args = [args]
    if len(args) == 1:
        res = math_ops.matmul(args[0], self._weights)
    else:
        # Concatenate along the feature axis so one matmul covers all inputs.
        res = math_ops.matmul(array_ops.concat(args, 1), self._weights)
    if self._build_bias:
        res = nn_ops.bias_add(res, self._biases)
    return res
def __init__(self, num_units, forget_bias=1.0, state_is_tuple=True,
             activation=None, reuse=None):
    # Initialize the LSTM cell's hyper-parameters; the actual weight
    # matrices (_linear) are built lazily on first call.
    super(BasicLSTMCell, self).__init__(_reuse=reuse)
    if not state_is_tuple:
        logging.warn("%s: Using a concatenated state is slower and will soon be "
                     "deprecated. Use state_is_tuple=True.", self)
    self._num_units = num_units
    self._forget_bias = forget_bias
    self._state_is_tuple = state_is_tuple
    self._activation = activation or math_ops.tanh
    self._linear = None
def call(self, inputs, state):
    """Long short-term memory cell (LSTM).

    Args:
      inputs: `2-D` tensor with shape `[batch_size x input_size]`.
      state: An `LSTMStateTuple` of state tensors, each shaped
        `[batch_size x self.state_size]`, if `state_is_tuple` has been set to
        `True`. Otherwise, a `Tensor` shaped
        `[batch_size x 2 * self.state_size]`.

    Returns:
      A pair containing the new hidden state, and the new state (either a
      `LSTMStateTuple` or a concatenated state, depending on `state_is_tuple`).
    """
    sigmoid = math_ops.sigmoid
    # Parameters of gates are concatenated into one multiply for efficiency.
    if self._state_is_tuple:
        c, h = state
    else:
        c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)

    if self._linear is None:
        self._linear = _Linear([inputs, h], 4 * self._num_units, True)
    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(
        value=self._linear([inputs, h]), num_or_size_splits=4, axis=1)
    # NOTE(review): the excerpt ends here — the gate arithmetic producing
    # new_c / new_h and the return statement are missing from this source.
class GRUCell(RNNCell):
    """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).

    Args:
      num_units: int, The number of units in the GRU cell.
      activation: Nonlinearity to use. Default: `tanh`.
      reuse: (optional) Python boolean describing whether to reuse variables
        in an existing scope. If not `True`, and the existing scope already
        has the given variables, an error is raised.
      kernel_initializer: (optional) The initializer to use for the weight
        and projection matrices.
      bias_initializer: (optional) The initializer to use for the bias.
    """
    # NOTE(review): only the class header and docstring are present in this
    # excerpt; the method bodies appear separately below.
def call(self, inputs, state):
    """Gated recurrent unit (GRU) with nunits cells."""
    if self._gate_linear is None:
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            # Bias of 1.0 keeps the gates open initially.
            bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype)
        with vs.variable_scope("gates"):  # Reset gate and update gate.
            self._gate_linear = _Linear(
                [inputs, state],
                2 * self._num_units,
                True,
                bias_initializer=bias_ones,
                kernel_initializer=self._kernel_initializer)

    value = math_ops.sigmoid(self._gate_linear([inputs, state]))
    r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)

    r_state = r * state
    if self._candidate_linear is None:
        with vs.variable_scope("candidate"):
            self._candidate_linear = _Linear(
                [inputs, r_state],
                self._num_units,
                True,
                bias_initializer=self._bias_initializer,
                kernel_initializer=self._kernel_initializer)
    c = self._activation(self._candidate_linear([inputs, r_state]))
    # GRU output doubles as its state.
    new_h = u * state + (1 - u) * c
    return new_h, new_h
Training Accuracy 0 0.05
Training Accuracy 100 0.7
Training Accuracy 200 0.85
Training Accuracy 300 0.9
Training Accuracy 400 0.93
Training Accuracy 500 0.91
Training Accuracy 600 0.94
Training Accuracy 700 0.95
Training Accuracy 800 0.95
Training Accuracy 900 0.95
Training Accuracy 1000 0.97
Training Accuracy 1100 0.95
Training Accuracy 1200 0.96
Training Accuracy 1300 0.99
Training Accuracy 1400 0.98
Training Accuracy 1500 0.95
Training Accuracy 1600 0.97
Training Accuracy 1700 1.0
Training Accuracy 1800 0.95
Training Accuracy 1900 0.95
Training Accuracy 2000 0.95
Training Accuracy 2100 0.96
Training Accuracy 2200 0.96
Training Accuracy 2300 0.98
Training Accuracy 2400 0.97
Training Accuracy 2500 0.96
Training Accuracy 2600 0.99
Training Accuracy 2700 0.96
Training Accuracy 2800 0.98
Training Accuracy 2900 0.95
Training Accuracy 3000 0.99
随后我们用 TensorFlow 来根据这些数据拟合一个平面,拟合的过程实际上就是寻找 (x, y) 和 z 的关系,即变量 x_data 和变量 y_data 的关系,而它们之间的关系刚才我们用了线性变换表示出来了,即 z = w * (x, y) + b,所以拟合的过程实际上就是找 w 和 b 的过程,所以这里我们就首先像设变量一样来设两个变量 w 和 b,代码如下:
1 2 3 4 5
# Placeholders for the 2-D inputs and their scalar targets, plus the
# trainable parameters of the plane z = w * (x, y) + b.
x = tf.placeholder(tf.float32, [2, 100])
y_label = tf.placeholder(tf.float32, [100])
b = tf.Variable(tf.zeros([1]))
w = tf.Variable(tf.random_uniform([2], -1.0, 1.0))
y = tf.matmul(tf.reshape(w, [1, 2]), x) + b
PLAY [all] **********************************************************************************************************************************************************************************
TASK [common : Configure SELinux to disable] ************************************************************************************************************************************************ [WARNING]: SELinux state change will take effect next reboot
PLAY [docker] *******************************************************************************************************************************************************************************
[root@localhost ~]# docker ps -a
CONTAINER ID        IMAGE                       COMMAND                  CREATED             STATUS              PORTS                              NAMES
1b34f7933095        scrapinghub/splash:master   "python3 /app/bin/..."   4 hours ago         Up 4 hours          5023/tcp, 0.0.0.0:8050->8050/tcp   splash
[root@localhost ~]#
# HAProxy 1.7 config for Splash. It assumes Splash instances are executed
# on the same machine and connected to HAProxy using Docker links.
global
    # raise it if necessary
    maxconn 512
    # required for stats page
    stats socket /tmp/haproxy
userlist users
    user user insecure-password userpass
defaults
    log global
    mode http
    # remove requests from a queue when clients disconnect;
    # see https://cbonte.github.io/haproxy-dconv/1.7/configuration.html#4.2-option%20abortonclose
    option abortonclose
    # gzip can save quite a lot of traffic with json, html or base64 data
    compression algo gzip
    compression type text/html text/plain application/json
    # increase these values if you want to
    # allow longer request queues in HAProxy
    timeout connect 3600s
    timeout client 3600s
    timeout server 3600s
# Build a walkable subgraph by combining nodes and relationships with `+`.
a = Node('Person', name='Alice')
b = Node('Person', name='Bob')
c = Node('Person', name='Mike')
ab = Relationship(a, "KNOWS", b)
ac = Relationship(a, "KNOWS", c)
w = ab + Relationship(b, "LIKES", c) + ac
print(w)
# Union nodes and a relationship with `|` and create them in one call.
a = Node('Person', name='Alice')
b = Node('Person', name='Bob')
r = Relationship(a, 'KNOWS', b)
s = a | b | r
graph = Graph(password='123456')
graph.create(s)
# Create nodes/relationships one at a time through the Graph handle.
graph = Graph(password='123456')
a = Node('Person', name='Alice')
graph.create(a)
b = Node('Person', name='Bob')
ab = Relationship(a, 'KNOWS', b)
graph.create(ab)
运行结果如下。
另外,还可以利用 data() 方法来获取查询结果:
1 2 3 4 5
from py2neo import Graph
# Query all Person nodes via Cypher and print the result list.
graph = Graph(password='123456')
data = graph.data('MATCH (p:Person) return p')
print(data)
import re import jieba import json from io import BytesIO from chinese_tokenizer.tokenizer import Tokenizer from sklearn.datasets import load_files from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer from sklearn.model_selection import train_test_split from sklearn.naive_bayes import MultinomialNB from sklearn.externals import joblib
import re
import jieba
import json
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.externals import joblib
def _requests_to_follow(self, response):
    """Overridden CrawlSpider helper: build follow-up Requests from a
    response according to self._rules.

    :param response: the response to extract links from
    :return: generator of Requests to follow
    """
    # NOTE: the original author deliberately left the type check disabled:
    # if not isinstance(response, HtmlResponse):
    #     return
    seen = set()
    # The response URL should be swapped for the real URL passed down via
    # response.meta. Response objects can only be changed through
    # response.replace(), whose RETURN VALUE must be bound to a new name:
    # newresponse = response.replace(url=response.meta.get('real_url'))
    for n, rule in enumerate(self._rules):
        # Author's reminder: switch to `newresponse` here once enabled!
        links = [lnk for lnk in rule.link_extractor.extract_links(response)
                 if lnk not in seen]
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            seen.add(link)
            r = self._build_request(n, link)
            yield rule.process_request(r)
#!/usr/bin/env python # -*- coding: utf-8 -*- # Created by shimeng on 17-6-5 import os import re import json import requests import html2text from parse_content import parse
""" just for study and fun Talk is cheap show me your code """
text = html2text.html2text(content.decode('utf-8')).encode("utf-8")
# 标题: strip whitespace inside markdown bold markers.
# BUG FIX(review): '*' must be escaped in the pattern — r'**(.*?)**' makes
# `re` raise "multiple repeat".
r = re.findall(r'\*\*(.*?)\*\*', text, re.S)
for i in r:
    if i != " ":
        text = text.replace(i, i.strip())
# Strip whitespace inside markdown italic markers, then drop empty pairs.
r = re.findall(r'_(.*)_', text)
for i in r:
    if i != " ":
        text = text.replace(i, i.strip())
text = text.replace('_ _', '')
# 图片: add blank lines after markdown image tags.
# BUG FIX(review): '[' and ']' must be escaped — r'![]\((?:.*?)\)' is an
# unterminated character set and re raises an error when compiling it.
r = re.findall(r'!\[\]\((?:.*?)\)', text)
for i in r:
    text = text.replace(i, i + "\n\n")
def get_ip(self, ifname=ADSL_IFNAME):
    """Return the IPv4 address bound to *ifname*, or None if not found.

    Parses the output of `ifconfig`; requires that command to succeed.
    """
    (status, output) = subprocess.getstatusoutput('ifconfig')
    if status == 0:
        # Raw string avoids invalid '\d' escape warnings on modern Python.
        pattern = re.compile(ifname + r'.*?inet.*?(\d+\.\d+\.\d+\.\d+).*?netmask', re.S)
        result = re.search(pattern, output)
        if result:
            ip = result.group(1)
            return ip
其实做爬虫,最基础的当然是写代码的能力,抓包什么的都不是什么困难的事,抓包很简单很简单。我觉得最难的是找到入口,找到一个最适合的入口。怎么定义这个最适合呢?就是要去尝试,依照一般的顺序,先找找 M 站,再找找 wap 站,最后再去看 PC 站,找到一个合适的入口,往往会事半功倍。前几天抓取途牛网的相关游记信息,爬 PC 站分分钟的 302,但是爬 M 站,全是接口,全程无阻。
def get_containerid(self, response):
    """Extract the weibo tab's containerid from a user-info API response
    and yield the follow-up weibo-list request.

    Uses print() (works under both Python 2 and 3, unlike the original
    py2-only print statement).
    """
    content = json.loads(response.body)
    # here, we can get containerid
    containerid = None
    for data in content.get('tabsInfo').get('tabs'):
        if data.get('tab_type') == 'weibo':
            containerid = data.get('containerid')
            print('weibo request url containerid is %s' % containerid)

    # construct the wei bo request url
    if containerid:
        weibo_url = response.url + '&containerid=%s' % containerid
        yield scrapy.Request(url=weibo_url, callback=self.get_weibo_id)
    else:
        print('sorry, do not get containerid')
def get_weibo_id(self, response):
    """Print the id of every weibo card (card_type == 9) in a weibo-list
    API response."""
    content = json.loads(response.body)
    # get weibo id, you can also save some other data if you need
    for data in content.get('cards'):
        if data.get('card_type') == 9:
            single_weibo_id = data.get('mblog').get('id')
            print(single_weibo_id)
            # here, if you want to get comment info, you can construct the
            # comment url just the same as the weibo url
from scrapy.cmdline import execute

# Launch the spider named 'mzitu' (must match the spider's `name` attribute).
execute(['scrapy', 'crawl', 'mzitu'])
其中的 mzitu 就为待会儿 spider.py 文件中的 name 属性。这点请务必记住哦!不然是跑不起来的。 在 mzitu_scrapy\spider 目录中创建 spider.py。文件作为爬虫文件。 好了!现在我们来想想,怎么来抓 mzitu.com 了。 首先我们的目标是当然是全站的妹子图片!!! 但是问题来了,站长把之前那个 mzitu.com\all 这个 URL 地址给取消了,我们没办法弄到全部的套图地址了! 我们可以去仔细观察一下站点所有套图的地址都是:http://www.mzitu.com/几位数字结尾的。 这种格式地址。 有木有小伙伴儿想到了啥? CrawlSpider !!!就是这玩儿!! 有了它我们就能追踪“http://www.mzitu.com/几位数字结尾的”这种格式的URL了。 Go Go Go Go!开始搞事。 首先在 item.py 中新建我们需要的字段。我们需要啥?我们需要套图的名字和图片地址!! 那我们新建三个字段:
1 2 3 4 5 6 7 8 9 10
import scrapy


class MzituScrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    image_urls = scrapy.Field()
    url = scrapy.Field()
第一步完成啦!开始写 spider.py 啦! 首先导入我们需要的包:
1 2 3 4
from scrapy import Request from scrapy.spider import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from mzitu_scrapy.items import MzituScrapyItem
from scrapy import Request from scrapy.spider import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from mzitu_scrapy.items import MzituScrapyItem
from scrapy import Request from scrapy.spider import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from mzitu_scrapy.items import MzituScrapyItem
from scrapy import Request from scrapy.spider import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from mzitu_scrapy.items import MzituScrapyItem
# Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html from scrapy import Request from scrapy.pipelines.images import ImagesPipeline from scrapy.exceptions import DropItem import re
def item_completed(self, results, item, info):
    """Image-pipeline hook: keep the item only if an image was stored.

    :param results: list of (success, info_or_failure) tuples per download
    :param item: the scraped item
    :param info: pipeline info (unused)
    :raises DropItem: when no image path was stored successfully
    :return: the item unchanged
    """
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        raise DropItem("Item contains no images")
    return item