fangjack 2020-06-25
Scrapy is an asynchronous web-crawling framework.
Framework: a project template that comes with a full set of integrated features and is highly reusable.
Environment setup:
Linux:
pip3 install scrapy
Windows:
a. pip3 install wheel
b. Download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
c. cd into the download directory and run pip3 install Twisted-17.1.0-cp35-cp35m-win_amd64.whl (the exact wheel name depends on your Python version and architecture)
d. pip3 install pywin32
e. pip3 install scrapy
Basic usage
1. Create a new project: scrapy startproject proName
2. cd proName
3. Generate a spider file: scrapy genspider spiderName www.xxx.com
4. Run the project: scrapy crawl spiderName
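Using the names from the duanzi example further below, the sequence would be:

scrapy startproject DuanziPro
cd DuanziPro
scrapy genspider duanzi www.xxx.com
scrapy crawl duanzi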
settings.py:
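The settings typically adjusted right after creating the project, judging from the topics that come up below, are a sketch like this (the UA string is a placeholder):

# settings.py
ROBOTSTXT_OBEY = False          # ignore robots.txt (see "robots protocol" below)
USER_AGENT = 'Mozilla/5.0 ...'  # UA spoofing: paste a real browser UA string here
LOG_LEVEL = 'ERROR'             # see "Lower the log level" below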
Data parsing:
Persistent storage (two approaches, both shown below: via a terminal command, or via item pipelines)
# -*- coding: utf-8 -*-
import scrapy


class FirstSpider(scrapy.Spider):
    # Spider name: the unique identifier of this spider file
    name = 'first'
    # Allowed domains
    # allowed_domains = ['www.baidu.com']
    # List of start URLs: its elements may only be URLs
    # Effect: a request is sent for every URL in the list
    start_urls = ['http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/']

    # Data parsing
    # Called once per request that gets sent
    # def parse(self, response):
    #     article_list = response.xpath('/html/body/section/div/div/main/article')
    #     for article in article_list:
    #         # xpath no longer returns strings but Selector objects;
    #         # the wanted data sits in the Selector's data attribute
    #         # title = article.xpath('./div[1]/h1/a/text()')[0].extract()
    #         title = article.xpath('./div[1]/h1/a/text()').extract_first()
    #         content = article.xpath('./div[2]//text()').extract()
    #         content = ''.join(content)
    #         print(title, content)

    # == Terminal-command-based persistent storage ==
    def parse(self, response):
        all_data = []
        article_list = response.xpath('/html/body/section/div/div/main/article')
        for article in article_list:
            # xpath no longer returns strings but Selector objects;
            # the wanted data sits in the Selector's data attribute
            # title = article.xpath('./div[1]/h1/a/text()')[0].extract()
            title = article.xpath('./div[1]/h1/a/text()').extract_first()
            content = article.xpath('./div[2]//text()').extract()
            content = ''.join(content)
            dic = {
                'title': title,
                'content': content
            }
            all_data.append(dic)
        return all_data  # return the parsed data so the terminal command can store it
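The returned list of dicts is what the terminal command persists. A typical invocation (the output file name is my own choice; -o only accepts formats such as json, csv, and xml):

scrapy crawl first -o ./duanzi.csv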
Coding workflow for pipeline-based persistent storage
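In outline, this is the standard Scrapy flow, followed step by step in the duanzi example below:
1. Parse the data inside the spider.
2. Declare a matching scrapy.Field() for each value in items.py.
3. Store the parsed values in an item object.
4. yield the item, which submits it to the pipeline.
5. Receive the item in process_item of a pipeline class in pipelines.py and persist it there.
6. Enable the pipeline class in ITEM_PIPELINES in settings.py.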
Full-site data crawling: fetching every page of a paginated listing, done here by keeping a URL template plus a page counter and manually yielding follow-up requests from parse (see the duanzi spider below).
The five core components (objects)
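For reference, the five are: the engine (coordinates the data flow between all other components and triggers events), the scheduler (de-duplicates and queues requests), the downloader (fetches responses, built on Twisted's asynchronous model, which is where the "asynchronous framework" label comes from), the spider (generates requests and parses responses), and the item pipeline (persists the items).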
How to appropriately improve Scrapy's crawling efficiency
Increase concurrency:
By default Scrapy allows 16 concurrent requests, which can be raised. In settings.py set CONCURRENT_REQUESTS = 100 to allow 100 concurrent requests.
Lower the log level:
Scrapy prints a large amount of log output while running. To cut CPU usage, restrict logging to INFO or ERROR. In settings.py: LOG_LEVEL = 'ERROR'
Disable cookies:
If cookies are not actually needed, disable them to reduce CPU usage and speed up the crawl. In settings.py: COOKIES_ENABLED = False
Disable retries:
Re-requesting failed HTTP requests (retrying) slows the crawl down, so retries can be disabled. In settings.py: RETRY_ENABLED = False
Reduce the download timeout:
When crawling very slow links, a smaller timeout lets stuck requests be abandoned quickly, which improves throughput. In settings.py: DOWNLOAD_TIMEOUT = 10 (a 10-second timeout)
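Collected in one place, the five tweaks above amount to this settings.py sketch:

# settings.py
CONCURRENT_REQUESTS = 100   # raise concurrency (the default is 16)
LOG_LEVEL = 'ERROR'         # lower the log level
COOKIES_ENABLED = False     # disable cookies
RETRY_ENABLED = False       # disable retries
DOWNLOAD_TIMEOUT = 10       # give up on slow downloads after 10 s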
Passing data between requests (request meta): when the fields of a single item are spread over several pages (deep crawling), pass the partially filled item to the next callback through the meta dict of scrapy.Request; the callback reads it back via response.meta.
Spider file
# -*- coding: utf-8 -*-
import scrapy
from DuanziPro.items import DuanziproItem

# Pipeline-only version (commented out in the notes): identical to the spider
# below except that it stops after yielding the items, i.e. it has no URL
# template, no pageNum counter, and no manual follow-up requests.

# Full-site crawling version
class DuanziSpider(scrapy.Spider):
    name = 'duanzi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/']
    # Generic URL template
    url = 'http://duanziwang.com/category/经典段子/%d/'
    pageNum = 1

    # def start_requests(self):
    #     for url in self.start_urls:
    #         yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        article_list = response.xpath('/html/body/section/div/div/main/article')
        for article in article_list:
            # xpath no longer returns strings but Selector objects;
            # the wanted data sits in the Selector's data attribute
            # title = article.xpath('./div[1]/h1/a/text()')[0].extract()
            title = article.xpath('./div[1]/h1/a/text()').extract_first()
            content = article.xpath('./div[2]//text()').extract()
            content = ''.join(content)
            # Instantiate an item object and store the parsed fields in it
            item = DuanziproItem()
            item['title'] = title
            item['content'] = content

            yield item  # submit the item to the pipeline

        # Manually send requests for the remaining pages
        if self.pageNum < 5:
            self.pageNum += 1
            print('Downloading page:', self.pageNum)
            new_url = self.url % self.pageNum
            yield scrapy.Request(url=new_url, callback=self.parse)
Defining the item (items.py)
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DuanziproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
Pipeline storage (pipelines.py)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import json

import pymysql
from redis import Redis


# Write the data to a text file
class DuanziproPipeline(object):
    fp = None

    def open_spider(self, spider):
        print('Spider started......')
        self.fp = open('./duanzi.txt', 'w', encoding='utf-8')

    # Called once per item this pipeline receives
    def process_item(self, item, spider):
        # print(item)  # the item behaves like a dict
        self.fp.write(item['title'] + ':' + item['content'] + '\n')
        return item  # hand the item on to the next pipeline class in line

    def close_spider(self, spider):
        self.fp.close()
        print('Spider finished!!!')


# Write the data to MySQL
class MysqlPipeLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='222', db='spider', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        # A parameterized query would be safer; this mirrors the original notes
        sql = 'insert into duanzi values ("%s","%s")' % (item['title'], item['content'])
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()


# Write the data to Redis
class RedisPileLine(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)
        print(self.conn)

    def process_item(self, item, spider):
        # Item objects are not directly serializable by redis-py,
        # so dump them to JSON first
        self.conn.lpush('duanziData', json.dumps(dict(item)))
        return item
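None of these classes run unless they are registered in settings.py. A plausible registration for all three (the priority numbers are my own choice; lower numbers run first, and each process_item must return the item so the next class receives it):

# settings.py
ITEM_PIPELINES = {
    'DuanziPro.pipelines.DuanziproPipeline': 300,
    'DuanziPro.pipelines.MysqlPipeLine': 301,
    'DuanziPro.pipelines.RedisPileLine': 302,
}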
Spider file
# -*- coding: utf-8 -*-
import scrapy
from moviePro.items import MovieproItem

# Depth-crawling-only version (commented out in the notes): identical to the
# spider below minus the "full-site crawling" pagination block in parse.

# Depth crawling + full-site crawling
class MovieSpider(scrapy.Spider):
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/class/动作/id/1.html']
    url = 'https://www.4567tv.tv/index.php/vod/show/class/动作/id/1/page/%d.html'
    pageNum = 1

    def parse(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            title = li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/text()').extract_first()
            detail_url = 'https://www.4567tv.tv' + li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/@href').extract_first()

            item = MovieproItem()
            item['title'] = title

            # Manually send a request for the detail page.
            # Passing data between requests: meta is a dict that is
            # handed to the callback along with the response
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})

        # Full-site crawling
        if self.pageNum < 4:
            self.pageNum += 1
            new_url = self.url % self.pageNum
            yield scrapy.Request(new_url, callback=self.parse)

    # A second, custom parse method (it must take a response parameter)
    def parse_detail(self, response):
        # Receive the meta that was passed along
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc

        yield item
items
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MovieproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    desc = scrapy.Field()
pipelines
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class MovieproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item
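As in the duanzi project, this pipeline only runs once it is enabled in settings.py, e.g. (the priority number is arbitrary):

ITEM_PIPELINES = {
    'moviePro.pipelines.MovieproPipeline': 300,
}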
Anti-crawling mechanisms and related topics:
robots protocol
UA spoofing
dynamically changing request parameters
CAPTCHA
cookies
proxies
dynamically loaded data
JS encryption
JS obfuscation
image lazy loading