Web Scraping with Scrapy: Usage, Extensions, and Proxy Middleware


  • Installation and basics
  • Extensions (omitted)
  • Proxies and middleware
  • Custom Scrapy commands (read the source)

Installation and Basics

Install the Scrapy environment:

pip3 install scrapy

Create a crawler project:

# project name: chouti
scrapy startproject chouti

Generate a spider from the default template:

# xx: spider name and allowed domain
scrapy genspider xx xx.com

Run the spider:

# chouti: spider name
# --nolog: start without logging
scrapy crawl chouti --nolog
  • Crawler workflow
    • Specify the start URLs, then download the pages
    • Parse: Item (structuring) + Pipeline (persistence)
    • Concurrency
    • Depth (both are plain settings; see the sketch after this list)
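
The concurrency and depth knobs mentioned above are controlled through settings.py. A minimal sketch of the relevant entries (the values here are illustrative, not from the original post):

# settings.py (illustrative values)
CONCURRENT_REQUESTS = 16   # how many requests Scrapy keeps in flight at once
DEPTH_LIMIT = 3            # maximum crawl depth relative to the start URLs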

A short example:

# Grab the post titles from chouti.com
import scrapy
from scrapy.selector import Selector


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        hxs = Selector(response=response).xpath(
            '//a[@class="link-title link-statistics"]'
        )
        for obj in hxs:
            title = obj.xpath('./text()').extract_first()
            # equivalent to: title = obj.xpath('./text()').extract()[0]
            print(title)

Scrapy selectors:

//                descendants (any depth)
.//               descendants of the current node
/                 direct children
div[@id="xx"]     the div whose id is "xx"
obj.extract()     turn selector objects into strings
a/text()          the text inside an <a> tag
a/@href           the href attribute of an <a> tag (a/@xx for any attribute xx)

# direct children only
Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# href containing "link"
Selector(response=response).xpath('//a[contains(@href, "link")]')
# href starting with "link"
Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# regular-expression match on an attribute
Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# hand the next URL back to the scheduler (from scrapy.http import Request)
yield Request(url=url, callback=self.parse)

# Recursively collect every page of the 52pojie forum board
import hashlib

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector


class A52pojieSpider(scrapy.Spider):
    name = '52pojie'
    allowed_domains = ['www.52pojie.cn']
    start_urls = ['https://www.52pojie.cn/forum-50-1.html']
    all_set = set()  # page URLs seen across the whole crawl (keyed by md5)

    def md5(self, url):
        obj = hashlib.md5()
        obj.update(bytes(url, encoding='utf8'))
        return obj.hexdigest()

    def parse(self, response):
        set0 = set()  # page URLs already handled within this response
        hxs = Selector(response=response).xpath(
            '//a[re:test(@href, "forum-50-\d+.html")]/@href'
        ).extract()
        for url in hxs:
            md5_url = self.md5(url)
            if md5_url in set0:
                continue
            if md5_url not in self.all_set:
                self.all_set.add(md5_url)
                print(url)
            set0.add(md5_url)
            url = 'https://www.52pojie.cn/' + url
            yield Request(url=url, callback=self.parse)

Because the pagination links on every page already include the last page, a depth limit is not a good fit for this crawl!

# Limit the crawl depth in settings.py
DEPTH_LIMIT = 3

# Grab chouti titles and links and write them to a local file
# chouti.py
import scrapy
from scrapy.selector import Selector

from ..items import D1204Item


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        hxs = Selector(response=response).xpath(
            '//a[@class="link-title link-statistics"]'
        )
        for obj in hxs:
            title = obj.xpath('./text()').extract()[0]
            href = obj.xpath('./@href').extract()[0]
            # hand the item to the configured pipelines
            item_obj = D1204Item(title=title, href=href)
            yield item_obj


#settings.py
# register the item pipeline
ITEM_PIPELINES = {
   'd1204.pipelines.D1204Pipeline': 300,
}


#items.py
class D1204Item(scrapy.Item):
    title=scrapy.Field()
    href=scrapy.Field()

# pipelines.py
class D1204Pipeline(object):
    def process_item(self, item, spider):
        # act differently depending on which spider produced the item
        if spider.name == 'chouti':
            dataFormat = '%s\n%s\n' % (item['title'], item['href'])
            with open('标题-链接.txt', 'a+') as f:
                f.write(dataFormat)
        return item
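
Opening the output file for every single item works, but Scrapy pipelines also expose open_spider/close_spider hooks, so the file handle can be held for the whole crawl. A minimal sketch of that variant (same file name as above):

# pipelines.py: variant using the spider lifecycle hooks
class D1204Pipeline(object):
    def open_spider(self, spider):
        # called once, when the spider is opened
        self.f = open('标题-链接.txt', 'a+', encoding='utf8')

    def process_item(self, item, spider):
        if spider.name == 'chouti':
            self.f.write('%s\n%s\n' % (item['title'], item['href']))
        return item

    def close_spider(self, spider):
        # called once, when the spider is closed
        self.f.close()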

Extensions

Omitted…

Proxies and Middleware

Proxies

Method 1: Scrapy's default proxy mechanism via environment variables (not ideal)

import os

# The built-in HttpProxyMiddleware reads its proxy from environment variables,
# in either of these forms:
os.environ['http_proxy'] = 'http://root:secret@ip:port/'  # with credentials
os.environ['http_proxy'] = 'http://ip:port/'              # without credentials
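
The same mechanism can be driven from the shell instead of Python, since HttpProxyMiddleware picks the proxy up from the environment at startup (a sketch, assuming a Unix shell):

export http_proxy=http://ip:port/
scrapy crawl chouti --nolog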

Method 2: a custom downloader middleware

# By default, proxy support relies on environment variables.
# Custom downloader middleware: pick a random proxy for every request.
import base64
import random

import six


def to_bytes(text, encoding=None, errors='strict'):
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        if proxy['user_pass']:
            # proxy requires authentication: attach a Proxy-Authorization header
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
            print("**************ProxyMiddleware have pass************" + proxy['ip_port'])
        else:
            print("**************ProxyMiddleware no pass************" + proxy['ip_port'])
# enable it in settings.py
DOWNLOADER_MIDDLEWARES = {
    'xxx.xxx.ProxyMiddleware': 500,
}

Method 3: use a third-party proxy (here via requests)

import requests

# target URL
targetUrl = "http://baidu.com"

# proxy server
proxyHost = "ip"
proxyPort = "port"
proxyMeta = "http://%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
}

# pip install -U requests[socks]  # for a SOCKS5 proxy
# proxyMeta = "socks5://%(host)s:%(port)s" % {
#     "host": proxyHost,
#     "port": proxyPort,
# }

proxies = {
    "http": proxyMeta,
}
resp = requests.get(targetUrl, proxies=proxies)
print(resp.status_code)
print(resp.text)
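
To use such a third-party proxy from inside a Scrapy spider instead of requests, the address can also be attached per request through request.meta, which the built-in HttpProxyMiddleware honors. A minimal sketch (the spider name and the proxy address are placeholders):

# sketch: per-request proxy inside a Scrapy spider
import scrapy
from scrapy.http import Request


class ProxiedSpider(scrapy.Spider):
    name = 'proxied'

    def start_requests(self):
        yield Request(
            url='http://baidu.com',
            callback=self.parse,
            meta={'proxy': 'http://ip:port'},  # picked up by HttpProxyMiddleware
        )

    def parse(self, response):
        print(response.status)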

Addendum: HTTPS access

There are two cases when crawling HTTPS sites:

1. The target site uses a trusted certificate (supported by default)
        DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
        DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
        
    2. The target site uses a custom (e.g. self-signed) certificate
        DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
        DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"
        
        # https.py
        from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
        from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)
        
        class MySSLFactory(ScrapyClientContextFactory):
            def getCertificateOptions(self):
                from OpenSSL import crypto
                v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
                v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
                return CertificateOptions(
                    privateKey=v1,  # PKey object
                    certificate=v2,  # X509 object
                    verify=False,
                    method=getattr(self, 'method', getattr(self, '_ssl_method', None))
                )

Middleware

Downloader middleware

class DownMiddleware1(object):
    def process_request(self, request, spider):
        '''
        Called for every request that needs to be downloaded, by each downloader middleware in turn.
        :param request:
        :param spider:
        :return:
            None: continue to the following middlewares and download as usual
            Response object: stop calling process_request and start calling process_response
            Request object: stop the middleware chain and send the Request back to the scheduler
            raise IgnoreRequest: stop process_request and start calling process_exception
        '''
        pass

    def process_response(self, request, response, spider):
        '''
        Called with the response on its way back to the engine.
        :param request:
        :param response:
        :param spider:
        :return:
            Response object: passed on to the other middlewares' process_response
            Request object: the middleware chain stops and the request is rescheduled for download
            raise IgnoreRequest: Request.errback is called
        '''
        print('response1')
        return response

    def process_exception(self, request, exception, spider):
        '''
        Called when the download handler or a process_request() (downloader middleware) raises an exception.
        :param request:
        :param exception:
        :param spider:
        :return:
            None: keep passing the exception to the following middlewares
            Response object: stop calling the remaining process_exception methods
            Request object: the middleware chain stops and the request is rescheduled for download
        '''
        return None
#settings.py
# Default downloader middlewares:
    {
        'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
        'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
        'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
        'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
        'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
        'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
        'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
        'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
        'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
        'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
        'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
        'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
        'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
        'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
    }
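
To slot the DownMiddleware1 class from above into this chain, register it in DOWNLOADER_MIDDLEWARES with a priority between the defaults (the module path and priority below are illustrative):

# settings.py (module path and priority are assumptions)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.DownMiddleware1': 543,
}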

Spider middleware

class SpiderMiddleware(object):

    def process_spider_input(self, response, spider):
        '''
        Called once the download has finished, before the response is handed to parse().
        :param response:
        :param spider:
        :return:
        '''
        pass

    def process_spider_output(self, response, result, spider):
        '''
        Called with the results the spider returns after processing a response.
        :param response:
        :param result:
        :param spider:
        :return: must return an iterable of Request and/or Item objects
        '''
        return result

    def process_spider_exception(self, response, exception, spider):
        '''
        Called when the spider raises an exception.
        :param response:
        :param exception:
        :param spider:
        :return: None to keep passing the exception to the following middlewares,
                 or an iterable of Response/Item objects handed to the scheduler or pipelines
        '''
        return None

    def process_start_requests(self, start_requests, spider):
        '''
        Called when the spider starts.
        :param start_requests:
        :param spider:
        :return: an iterable of Request objects
        '''
        return start_requests
#settings.py
# Built-in spider middlewares:
        'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
        'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
        'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
        'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
        'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,
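
Custom spider middlewares are wired in the same way as downloader middlewares, via SPIDER_MIDDLEWARES (the module path and priority below are illustrative):

# settings.py (module path and priority are assumptions)
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.SpiderMiddleware': 543,
}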

Custom Scrapy commands

Steps:

Create a directory at the same level as spiders, e.g. commands, and create a crawlall.py file inside it (the file name becomes the name of the custom command).

# crawlall.py
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        # look up every registered spider and schedule them all in one process
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()

Add COMMANDS_MODULE = 'project_name.directory_name' to settings.py, then run the command from the project directory: scrapy crawlall
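
With the names used earlier, that setting would look like the sketch below (assuming the project is called chouti and the directory commands; the directory also needs an __init__.py so it is importable as a package):

# settings.py
COMMANDS_MODULE = 'chouti.commands'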
