Crawlers: Using the Scrapy Framework, Extensions, and Proxy Middleware
- Installation and basics
- Extensions (omitted)
- Proxies and middleware
- Custom scrapy commands (read the source)
Installation and Basics
Install the Scrapy environment:
pip3 install scrapy
Create a crawler project:
# project name: chouti
scrapy startproject chouti
Generate a spider from the default template:
# xx: spider name and domain
scrapy genspider xx xx.com
Run the spider:
# chouti: spider name
# --nolog: run without log output
scrapy crawl chouti --nolog
- Crawler workflow:
- Specify the initial URLs and download the pages
- Parse: item (formatting) + pipeline (persistence)
- Concurrency (see the settings sketch below)
- Depth (see the settings sketch below)
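Concurrency and depth are both controlled through settings.py. A minimal sketch (the values shown are illustrative, not from the original text):
# settings.py
CONCURRENT_REQUESTS = 16   # maximum number of requests processed in parallel (Scrapy's default)
DEPTH_LIMIT = 3            # maximum crawl depth; 0 (the default) means unlimited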
A simple example:
# Fetch the titles on chouti
import scrapy
from scrapy.selector import Selector

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        hxs = Selector(response=response).xpath(
            '//a[@class="link-title link-statistics"]'
        )
        for obj in hxs:
            title = obj.xpath('./text()').extract_first()
            # equivalent to: title = obj.xpath('./text()').extract()[0]
            print(title)
Scrapy selectors:
//                    all descendants
.//                   descendants of the current node
/                     direct children
div[@id="xx"]         the div whose id is "xx"
obj.extract()         serialize a selector object to string(s)
a/text()              the text inside an <a> tag
a/@href               the href attribute of an <a> tag (a/@xx for any attribute xx)
Direct children:
Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
Contains a given value:
Selector(response=response).xpath('//a[contains(@href, "link")]')
Starts with a given value:
Selector(response=response).xpath('//a[starts-with(@href, "link")]')
Regular expressions:
Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
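A self-contained sketch of the selector syntax above, run against an inline HTML snippet (the HTML and the variable names are made up for illustration):
from scrapy.selector import Selector

html = '''
<html><body>
  <ul>
    <li><a id="i1" href="link1.html">first</a></li>
    <li><a id="i2" href="link2.html">second</a></li>
  </ul>
</body></html>
'''
sel = Selector(text=html)
print(sel.xpath('//a/@href').extract())                                # ['link1.html', 'link2.html']
print(sel.xpath('//a[starts-with(@href, "link")]/text()').extract())  # ['first', 'second']
print(sel.xpath('//a[re:test(@id, "i\\d+")]/text()').extract())       # ['first', 'second']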
Add the URL to be visited to the scheduler:
yield Request(url=url,callback=self.parse)
# Recursively collect all page numbers of the 52pojie forum listing
import hashlib

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector

class A52pojieSpider(scrapy.Spider):
    name = '52pojie'
    allowed_domains = ['www.52pojie.cn']
    start_urls = ['https://www.52pojie.cn/forum-50-1.html']
    all_set = set()  # URLs seen across the whole crawl

    def md5(self, url):
        # hash the URL so the de-duplication sets hold short, fixed-length keys
        obj = hashlib.md5()
        obj.update(bytes(url, encoding='utf8'))
        return obj.hexdigest()

    def parse(self, response):
        set0 = set()  # URLs seen on the current page
        # Alternative: locate the pager block explicitly
        # hxs = Selector(response=response).xpath(
        #     '//div[@class="bm bw0 pgs cl"]/span[@id="fd_page_top"]/div[@class="pg"]'
        # ).xpath('./a/@href').extract()
        hxs = Selector(response=response).xpath(
            '//a[re:test(@href, "forum-50-\d+.html")]/@href'
        ).extract()
        for url in hxs:
            md5_url = self.md5(url)
            if md5_url in set0:
                pass
            else:
                if md5_url in self.all_set:
                    pass
                else:
                    self.all_set.add(md5_url)
                    print(url)
                set0.add(md5_url)
                url = 'https://www.52pojie.cn/' + url
                yield Request(url=url, callback=self.parse)
Because every listing page also links to the last page, a depth limit is not a suitable way to bound this crawl!
# Set a crawl depth limit in settings.py
DEPTH_LIMIT = 3
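Note that the manual md5 bookkeeping in the spider above is largely redundant: Scrapy's scheduler already drops duplicate requests through its default dupefilter (RFPDupeFilter) as long as dont_filter is left at False. A simplified parse body relying on that (my own sketch, not from the original):
    def parse(self, response):
        hxs = Selector(response=response).xpath(
            '//a[re:test(@href, "forum-50-\d+.html")]/@href'
        ).extract()
        for url in hxs:
            # duplicate URLs are silently dropped by the scheduler's dupefilter
            yield Request(url='https://www.52pojie.cn/' + url, callback=self.parse)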
# Fetch chouti titles and links and write them to a local file
# chouti.py
import scrapy
from scrapy.selector import Selector

from ..items import D1204Item

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        hxs = Selector(response=response).xpath(
            '//a[@class="link-title link-statistics"]'
        )
        for obj in hxs:
            title = obj.xpath('./text()').extract()[0]
            href = obj.xpath('./@href').extract()[0]
            item_obj = D1204Item(title=title, href=href)
            # hand the item off to the configured pipelines
            yield item_obj
# settings.py
# enable the pipeline
ITEM_PIPELINES = {
    'd1204.pipelines.D1204Pipeline': 300,
}
# items.py
import scrapy

class D1204Item(scrapy.Item):
    title = scrapy.Field()
    href = scrapy.Field()
# pipelines.py
class D1204Pipeline(object):
    def process_item(self, item, spider):
        # only handle items coming from the chouti spider
        if spider.name == 'chouti':
            dataFormat = '%s\n%s\n' % (item['title'], item['href'])
            with open('标题-链接.txt', 'a+', encoding='utf-8') as f:
                f.write(dataFormat)
        return item
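Opening the file for every item works, but a pipeline can also open it once per crawl using the optional open_spider/close_spider hooks. A sketch (same file name as above):
class D1204Pipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.f = open('标题-链接.txt', 'a+', encoding='utf-8')

    def close_spider(self, spider):
        # called once when the spider closes
        self.f.close()

    def process_item(self, item, spider):
        if spider.name == 'chouti':
            self.f.write('%s\n%s\n' % (item['title'], item['href']))
        return item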
Extensions
Omitted…
Proxies and Middleware
Proxies
Method 1: Scrapy's default proxy mechanism (not ideal)
# By default, Scrapy's HttpProxyMiddleware reads the proxy from environment variables (os.environ):
#   http_proxy = http://root:secret@ip:port/   (with authentication)
#   http_proxy = http://ip:port/               (without authentication)
Method 2: a custom downloader middleware
# The default mechanism depends on environment variables; a custom middleware
# can instead pick a proxy for each request.
import base64
import random

import six

def to_bytes(text, encoding=None, errors='strict'):
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(PROXIES)
        if proxy['user_pass']:
            # proxy that requires authentication: set the proxy address and a
            # Proxy-Authorization header (Basic auth, base64-encoded user:pass)
            request.meta['proxy'] = 'http://%s' % proxy['ip_port']
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass'])).decode('ascii')
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
            print("**************ProxyMiddleware have pass************" + proxy['ip_port'])
        else:
            # anonymous proxy: only the proxy address is needed
            print("**************ProxyMiddleware no pass************" + proxy['ip_port'])
            request.meta['proxy'] = 'http://%s' % proxy['ip_port']
# Register it in settings.py
DOWNLOADER_MIDDLEWARES = {
    'xxx.xxx.ProxyMiddleware': 500,
}
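The number is the priority: middlewares with smaller numbers have their process_request called earlier. A sketch that registers the middleware under an assumed d1204.middlewares module path and optionally disables the built-in proxy middleware so the two do not both touch request.meta['proxy'] (newer Scrapy module path shown; older versions use the scrapy.contrib paths listed later in this section):
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'd1204.middlewares.ProxyMiddleware': 500,  # assumed module path
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,  # disable the default
}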
Method 3: use a third-party proxy (via the requests library)
import requests

# target URL
targetUrl = "http://baidu.com"

# proxy server
proxyHost = "ip"
proxyPort = "port"

proxyMeta = "http://%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
}

# For a SOCKS5 proxy: pip install -U requests[socks]
# proxyMeta = "socks5://%(host)s:%(port)s" % {
#     "host": proxyHost,
#     "port": proxyPort,
# }

proxies = {
    "http": proxyMeta,
    # add "https": proxyMeta as well if the target is an https:// URL
}

resp = requests.get(targetUrl, proxies=proxies)
print(resp.status_code)
print(resp.text)
Addendum: accessing HTTPS sites
There are two cases when crawling over HTTPS:
1. The target site uses a trusted certificate (supported by default):
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
2. The target site uses a custom (e.g. self-signed) certificate:
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"
# https.py
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)

class MySSLFactory(ScrapyClientContextFactory):
    def getCertificateOptions(self):
        from OpenSSL import crypto
        v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
        v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
        return CertificateOptions(
            privateKey=v1,   # PKey object
            certificate=v2,  # X509 object
            verify=False,
            method=getattr(self, 'method', getattr(self, '_ssl_method', None))
        )
Middleware
Downloader middleware
class DownMiddleware1(object):
    def process_request(self, request, spider):
        '''
        Called for every request that needs to be downloaded, through each
        downloader middleware's process_request.
        :param request:
        :param spider:
        :return:
            None: continue to the following middlewares and download the request
            Response object: stop calling process_request and start calling process_response
            Request object: stop the middleware chain and send the Request back to the scheduler
            raise IgnoreRequest: stop process_request and start calling process_exception
        '''
        pass

    def process_response(self, request, response, spider):
        '''
        Called with the response on its way back from the downloader.
        :param request:
        :param response:
        :param spider:
        :return:
            Response object: passed on to the other middlewares' process_response
            Request object: stop the middleware chain; the request is rescheduled for download
            raise IgnoreRequest: Request.errback is called
        '''
        print('response1')
        return response

    def process_exception(self, request, exception, spider):
        '''
        Called when the download handler or a process_request() (downloader middleware)
        raises an exception.
        :param request:
        :param exception:
        :param spider:
        :return:
            None: keep passing the exception to the following middlewares
            Response object: stop calling further process_exception methods
            Request object: stop the middleware chain; the request is rescheduled for download
        '''
        return None
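A custom downloader middleware like this one is enabled the same way as the proxy middleware earlier, via DOWNLOADER_MIDDLEWARES (module path and priority below are placeholders of my own):
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.DownMiddleware1': 543,
}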
# settings.py
Built-in downloader middlewares enabled by default (older scrapy.contrib paths; newer Scrapy versions use scrapy.downloadermiddlewares.*):
{
'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
}
Spider middleware
class SpiderMiddleware(object):
    def process_spider_input(self, response, spider):
        '''
        Called after the download finishes, before the response is handed to parse().
        :param response:
        :param spider:
        :return:
        '''
        pass

    def process_spider_output(self, response, result, spider):
        '''
        Called with the results returned by the spider after it has processed the response.
        :param response:
        :param result:
        :param spider:
        :return: must return an iterable of Request and/or Item objects
        '''
        return result

    def process_spider_exception(self, response, exception, spider):
        '''
        Called when an exception is raised.
        :param response:
        :param exception:
        :param spider:
        :return: None to keep passing the exception to the following middlewares,
                 or an iterable of Request/Item objects handed to the scheduler or pipelines
        '''
        return None

    def process_start_requests(self, start_requests, spider):
        '''
        Called with the spider's start requests when the crawl starts.
        :param start_requests:
        :param spider:
        :return: an iterable of Request objects
        '''
        return start_requests
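Custom spider middlewares are enabled analogously through SPIDER_MIDDLEWARES (module path and priority below are placeholders of my own):
# settings.py
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.SpiderMiddleware': 543,
}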
# settings.py
Built-in spider middlewares enabled by default:
'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,
Custom scrapy commands
Steps:
Create a directory (any name, e.g. commands) at the same level as spiders, and create a crawlall.py file inside it (the file name becomes the custom command name).
# crawlall.py
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings

class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        # schedule every registered spider on the same crawler process, then start it
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()
Add COMMANDS_MODULE = 'project_name.directory_name' to settings.py, then run scrapy crawlall from the project directory.
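A concrete sketch, assuming the d1204 project from the earlier examples and a directory named commands (an empty __init__.py in that directory keeps it importable as a package):
# settings.py
COMMANDS_MODULE = 'd1204.commands'

# then, from the project directory:
#   scrapy crawlall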