# liuweiq 2020-07-08
# 爬取素材库,直接存入 MySQL 数据库。
# 包含将 HTML 源码直接存入数据库所需的转义函数。
# 替换掉源码中的 HTML 注释语句。
import random
import re
import time

import pymysql
import requests
from bs4 import BeautifulSoup

# Replacement table for transferContent: only quotes and backslashes are
# rewritten, each by prefixing a backslash.
_ESCAPES = {'"': '\\"', "'": "\\'", "\\": "\\\\"}

# Strips HTML comments. The first alternative handles comments that contain
# ">" themselves (e.g. commented-out markup); the second keeps the original
# behaviour of removing any other "<!...>" declaration.
_COMMENT_RE = re.compile(r"<!--.*?-->|<![^>]*>", re.DOTALL)

_INSERT_SQL = "insert into sucaix (ye,rowcode,info,lablex) values (%s,%s,%s,%s)"


def transferContent(content):
    """Backslash-escape quotes and backslashes in HTML source.

    Args:
        content: Text to escape (iterated character by character), or None.

    Returns:
        None if *content* is None; otherwise a new string in which every
        ``"``, ``'`` and ``\\`` is preceded by a backslash and every other
        character is copied through unchanged.
    """
    if content is None:
        return None
    return "".join(_ESCAPES.get(c, str(c)) for c in content)


user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# One User-Agent is picked at random per run and reused for every request.
UA = random.choice(user_agent_list)
headers = {"User-Agent": UA}

# Listing URL prefix; the page number is appended to it.
url = "https://www.***.com/***_0_"


def _parse_item(item):
    """Extract ``(code, info, labels)`` from one ``material-item`` element.

    Returns None when any of the expected child nodes is missing, so the
    caller can skip the item instead of failing on ``None`` attribute access.
    """
    code_div = item.find("div", class_="item-code")
    bottom = item.find("div", class_="item-bottom")
    if code_div is None or bottom is None:
        return None

    info_div = bottom.find("div", class_="item-info")
    label_div = bottom.find("div", class_="item-label")
    if info_div is None or info_div.a is None or label_div is None:
        return None

    # Inner HTML of the code container (the container tag itself is excluded),
    # with comments removed and surrounding whitespace trimmed.
    code = _COMMENT_RE.sub("", code_div.decode_contents()).strip()
    # NOTE(review): the insert below binds values as query parameters, which
    # pymysql escapes on its own; this extra escaping therefore looks like it
    # ends up stored literally. Kept to preserve the stored format - confirm
    # whether it is still wanted.
    code = transferContent(code)

    info = info_div.a.get_text().strip()
    # Every label is followed by ";" (including the last one).
    labels = "".join(span.get_text() + ";" for span in label_div.find_all("span"))
    return code, info, labels


def main():
    """Crawl listing pages 26..723 and insert every item into ``sucaix``."""
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration or environment variables.
    conn = pymysql.connect(
        user="root", password="1234", host="127.0.0.1", database="sucai"
    )
    try:
        # A single cursor serves the whole run and is closed by the context
        # manager even if a request or insert raises.
        with conn.cursor() as cursor:
            for x in range(26, 724):
                # Pause 1-5 seconds before each page request.
                time.sleep(random.randint(1, 5))
                with requests.get(url + str(x), headers=headers, timeout=5) as response:
                    soup = BeautifulSoup(response.text, "lxml")

                for item in soup.find_all("div", class_="material-item"):
                    row = _parse_item(item)
                    if row is None:
                        print("page %s: skipped an item with missing fields" % x)
                        continue
                    cursor.execute(_INSERT_SQL, [str(x), *row])

                # Commit once per page so an interrupted run never leaves a
                # half-written page behind.
                conn.commit()
                print(x)
    finally:
        conn.close()


if __name__ == "__main__":
    main()