Catastrophe 2020-04-17
Spider (.py) file code:
# -*- coding: utf-8 -*-
import io
import re
import sys

import scrapy
from scrapy.selector import Selector

from ..items import Day96XiaohuaItem

# Force UTF-8 output so Chinese text prints correctly on Windows consoles
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua'
    # domain only; putting a path here breaks Scrapy's offsite filtering
    allowed_domains = ['www.xueshengmai.com']
    start_urls = ['http://www.xueshengmai.com/hua/']

    def parse(self, response):
        # ------------ persist the data ------------
        hxs = Selector(response=response).xpath(
            "//div[@class='item_t']/div[@class='img']/a/img").extract()
        for i in hxs:
            # each i is the serialized <img> tag; pull alt/src out with regex
            title = re.findall("alt=(.*) src=", i)[0].strip('"') + ".jpg"
            src = "http://www.xueshengmai.com%s" % re.findall("src=(.*)>", i)[0].strip('"')
            print(title, src)
            item_obj = Day96XiaohuaItem(title=title, src=src)
            yield item_obj
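Regexing the serialized <img> tag works, but XPath can read the attributes directly, which avoids the brittle string parsing. A minimal sketch of an alternative parse method, assuming the same page structure and Scrapy 1.8+ for .get() (response.urljoin is standard Scrapy, not specific to this site):

    def parse(self, response):
        # read alt/src straight from the attributes instead of regexing raw HTML
        for img in response.xpath("//div[@class='item_t']/div[@class='img']/a/img"):
            # @alt comes back without surrounding quotes, so no strip() is needed
            title = img.xpath("@alt").get() + ".jpg"
            src = response.urljoin(img.xpath("@src").get())
            yield Day96XiaohuaItem(title=title, src=src)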
items.py code:
import scrapy


class Day96XiaohuaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    src = scrapy.Field()
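A scrapy.Item behaves like a dict, which is what the pipeline below relies on; a quick illustration (the field values here are made up):

    item = Day96XiaohuaItem(title="example.jpg", src="http://www.xueshengmai.com/img/1.jpg")
    print(item["title"])   # dict-style access, as used in process_item
    print(dict(item))      # convert to a plain dict if needed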
pipelines.py code:
import os

import requests


class Day96XiaohuaPipeline(object):
    def process_item(self, item, spider):
        os.makedirs("imgs", exist_ok=True)  # make sure the target folder exists
        file_path = "imgs/%s" % item["title"]
        img_data = requests.get(item["src"])  # download the image bytes
        with open(file_path, "wb") as f:
            f.write(img_data.content)
        return item  # pass the item on to any later pipelines
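For the pipeline to actually run, it must be registered in settings.py. A sketch, assuming the Scrapy project is named day96 (inferred from the Day96XiaohuaItem class name; adjust the module path to your real project name):

    ITEM_PIPELINES = {
        "day96.pipelines.Day96XiaohuaPipeline": 300,  # lower numbers run earlier (0-1000)
    }

Then start the crawl from the project root with: scrapy crawl xiaohua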