winmeanyoung 2020-05-05
Note: openpyxl was used first to save the data to an Excel file, but the writes never took effect, so the script switched to xlsxwriter.
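For reference, creating and writing a brand-new workbook with openpyxl normally does work, as long as save() is called at the end; writes live only in memory until then. A minimal sketch, assuming the earlier failure was a missing save() call rather than a problem with openpyxl itself (file name mirrors the script below, SKU values are made up):

# Minimal openpyxl write sketch -- assumes the failure was a missing save() call.
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.cell(row=1, column=1, value='No_images_sku')
for row, sku in enumerate(['10001', '10002'], start=2):  # hypothetical SKU values
    ws.cell(row=row, column=1, value=sku)
wb.save('No_images.xlsx')  # nothing reaches disk until save() is called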
import os
import re

import requests
import xlsxwriter
from openpyxl import load_workbook
from multiprocessing.dummy import Pool as ThreadPool


def spider(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    # headers must be passed as a keyword argument; as the second positional argument
    # requests.get() would treat it as params and the User-Agent would never be sent
    html = requests.get(url, headers=headers, timeout=30)
    pic_url = re.findall(r'class="product-image">.*?<img src="(.*?)" height="', html.text, re.S)
    sku = re.findall(r'q=(\d+)', url, re.S)  # extract the SKU from the URL's query string
    if pic_url:
        print('Downloading image for SKU ' + sku[0] + ', image URL: ' + pic_url[0])
        pic = requests.get(pic_url[0], timeout=30)
        img_path = os.path.join(cwd, 'images', sku[0] + '.jpg')
        with open(img_path, 'wb') as file:
            file.write(pic.content)
    else:
        if sku:
            print('No product found for SKU ' + sku[0])
            No_images.append(sku[0])


# openpyxl was tried first for creating the new Excel file, but the writes never took
# effect, so xlsxwriter is used to save the data instead
def save_excel(sku):
    print(sku)
    wb1 = xlsxwriter.Workbook(os.path.join(cwd, 'No_images.xlsx'))
    ws1 = wb1.add_worksheet()
    ws1.write(0, 0, 'No_images_sku')
    for i in range(1, len(sku) + 1):
        ws1.write(i, 0, sku[i - 1])
    wb1.close()
    print('Saved the SKUs without images successfully!')


if __name__ == '__main__':
    cwd = os.getcwd()
    os.makedirs(os.path.join(cwd, 'images'), exist_ok=True)  # the download folder must exist
    path = os.path.join(cwd, '最近12个月没有销量产品(201711).xlsx')
    wb = load_workbook(path)
    ws = wb.worksheets[0]
    pool = ThreadPool(50)  # 50 worker threads (multiprocessing.dummy gives threads, not processes)
    urls = []
    No_images = []
    for i in range(1, ws.max_row + 1):  # read the SKUs out of the spreadsheet row by row
        sku = ws.cell(i, 2).value
        if sku is not None:
            print('Queueing image URL for the SKU in row ' + str(i))
            url = 'http://www.fulchic.com/catalogsearch/result/?q=' + str(sku)
            urls.append(url)
    pool.map(spider, urls)  # run the spider function over the URL list on the thread pool
    pool.close()
    pool.join()
    # print(No_images)
    save_excel(No_images)
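A note on the concurrency choices above: multiprocessing.dummy.Pool is a thread pool rather than a process pool, so all 50 workers run in the same process and share the module-level No_images list; appending to a list is safe under CPython's GIL. save_excel() only runs after pool.close() and pool.join(), so the list is complete by the time the xlsxwriter workbook is written.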