fanhuasijin 2020-05-16
This is a crawler I wrote a while back. Before Baidu Netdisk (百度网盘) was redesigned, many resource enthusiasts shared their files through the "dynamic share" feed of their Baidu Netdisk accounts. I followed a batch of accounts that shared film and TV resources, and the program periodically crawled their feeds and saved the Baidu Netdisk links they shared into my own database. Before a record was written, it was checked for duplicates and filtered against a list of blocked keywords; a separate web page or app then displayed the resources from the database. The early netdisk resource search sites on the market worked on exactly this principle. After Baidu Netdisk was redesigned and the dynamic share feed was removed, the program could no longer run, so this post is a record of the approach.
This is the program's main entry point. All of the logic for crawling Baidu Netdisk dynamic shares lives in this file, and it is also responsible for calling the functions in the other files; running this script keeps everything executing continuously.
# Main program
import requests, re, json, time
import random
import threading
from mysql_db import *
from aidy_pc import *
from yszx import *
from defs import *

header = {
    "Cookie": "",  # the logged-in Baidu account cookie goes here
    "Host": "pan.baidu.com",
    "Referer": "https://pan.baidu.com/pcloud/friendpage?type=follow&uk=2489863899&self=1",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}

# uk ids of the accounts this account follows
list_uk = ['2489863899']

def getShareUser():
    """Fetch the follow list of my account and collect every followed uk."""
    start = 0
    for _ in range(100):
        try:
            url = 'https://pan.baidu.com/pcloud/friend/getfollowlist?query_uk=2489863899&limit=24&start=%d' % start
            follows_json = requests.get(url, headers=header).json()
            if len(follows_json['follow_list']) == 0:  # no more data, stop paging
                break
            for i in follows_json['follow_list']:  # walk the followed accounts
                list_uk.append(i['follow_uk'])     # collect each account's uk
            start = start + 24
            time.sleep(random.randint(10, 25))
        except:
            continue
    return list_uk

def gethtml():
    """Crawl the dynamic shares of every followed account."""
    uks = getShareUser()  # the uk ids of the accounts I follow
    if not uks:           # exit if the follow list could not be fetched
        return
    for uk in uks:            # loop over followed accounts
        start = 0             # reset paging for each account
        for n in range(2):    # loop over pages
            url = "https://pan.baidu.com/pcloud/feed/getdynamiclist?auth_type=1&filter_types=11000&query_uk=%s&category=0&limit=25&start=%s&bdstoken=29b0093f2c23b7afd5f41c39f57be34e&channel=chunlei&clienttype=0&web=1" % (uk, start)
            filelist_json = requests.get(url, headers=header).json()
            if filelist_json['errno'] != 0:
                break
            list_records = filelist_json['records']  # all resources returned by this request
            for data_value in list_records:          # each record is a dict
                if data_value['category'] == 3:      # skip images
                    continue
                if gjc_gl(data_value['title']) == False:  # keyword filter
                    continue
                # print(data_value['title'])
                print(data_value)
                # mysql_into(data_value)  # write the record to the database
                # fields used: category (folder 6, video 1, image 3), shorturl, title, feed_time
            start = start + 25
            time.sleep(random.randint(10, 25))

if __name__ == '__main__':
    while True:
        try:
            gethtml()  # netdisk crawler
            t1 = threading.Thread(target=bdsl)  # dead-link detection
            # t2 = threading.Thread(target=aidy)     # crawler for the 爱电影 site
            # t3 = threading.Thread(target=main_ys)
            t1.start()
            # t2.start()
            # t3.start()
            time.sleep(10800)  # run every 3 hours, 4 times a day
        except:
            continue
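To make the loop above easier to follow, here is a rough sketch of a single entry in records (the data_value dict). Only the fields the crawler actually reads are shown, the values are invented, and the real getdynamiclist response carried more fields than this.

# Hypothetical shape of one entry in filelist_json['records']; values are made up,
# and only the fields read by gethtml() / mysql_into() are listed.
data_value = {
    "category": 1,                  # file type: 6 = folder, 1 = video, 3 = image
    "shorturl": "1AbCdEfG",         # appended to https://pan.baidu.com/s/ to rebuild the share link
    "title": "某电影 1080P",         # share title, used for de-duplication and keyword filtering
    "feed_time": 1589600000000,     # share time, milliseconds since the epoch
}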
Next come the function that writes data into the database and the function that detects and removes dead Baidu Netdisk links. Pass a crawled record into the first function and it is written to the database. Dead links are common, so the second function walks every link in the database and deletes the ones that are no longer accessible.
# Database connection and data insertion
import pymysql, time
import requests, re
import random

def pysql():
    try:
        mysql = pymysql.connect(host='127.0.0.1', user='bdwp', password='xDnwLnjSEXLbGJYa',
                                database='bdwp', charset='utf8')
        # mysql = pymysql.connect(host='127.0.0.1', user='root', password='root', database='bdwp', charset='utf8')
        return mysql
    except:
        print("数据库连接失败!")  # database connection failed
        exit()

def mysql_into(data_value):
    """Insert one Baidu share record into the database."""
    mysql = pysql()
    db = mysql.cursor()
    sqlcx = "select title from data_zy WHERE title='%s'" % data_value['title']
    db.execute(sqlcx)
    data = db.fetchall()
    if data:  # the title already exists in the database
        mysql.close()
        return False
    sqlcxid = "select max(id) from data_zy"
    db.execute(sqlcxid)
    dataid = db.fetchall()
    ids = int(dataid[0][0] or 0) + 1  # next id after the last inserted row
    time_time = time.strftime("%Y-%m-%d %H:%M", time.localtime())  # insertion time
    timeStamp = data_value['feed_time']   # share time, in milliseconds
    timeStamp = float(timeStamp / 1000)   # convert to seconds
    timeArray = time.localtime(timeStamp)
    otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    try:
        sqltj = "insert into data_zy (id,category,shorturl,title,feed_time,rk_time) VALUES ('%d','%d','%s','%s','%s','%s')" % (
            ids, data_value['category'], data_value['shorturl'], data_value['title'], otherStyleTime, time_time)
        db.execute(sqltj)
        mysql.commit()
    except:
        pass
    mysql.close()

# Baidu dead-link detection
def bdsl():
    header = {
        "Host": "pan.baidu.com",
        "Referer": "https://pan.baidu.com/pcloud/friendpage?type=follow&uk=2489863899&self=1",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    mysql = pysql()
    db = mysql.cursor()
    db.execute("select id,shorturl from data_zy")
    data = db.fetchall()  # all stored links
    for r in data:
        url = "https://pan.baidu.com/s/" + r[1]
        id = r[0]
        html = requests.get(url, headers=header).text.encode('iso-8859-1').decode('utf-8')
        srt = "此链接分享内容可能因为涉及侵权、色情、反动、低俗等信息,无法访问!"  # Baidu's "share removed" notice
        if srt in html:
            sqlde = "DELETE FROM data_zy WHERE id = %s" % id
            db.execute(sqlde)
            mysql.commit()  # persist the deletion
        time.sleep(random.randint(10, 25))
    mysql.close()
There is also a small helper function: if a crawled resource title contains a blocked keyword, the record is not written to the database. It is mainly used to filter out ads.
from mysql_db import pysql

def gjc_gl(title):
    """Return False when the title contains a blocked keyword, otherwise True."""
    mysql = pysql()
    db = mysql.cursor()
    sql = "select * from gjc_gl WHERE id=1"
    db.execute(sql)
    data = db.fetchall()[0][1]  # the blocked words live in one comma-separated string
    data = data.split(',')
    mysql.close()
    for trs in data:
        if trs in title:
            return False
    return True

# import os
# import binascii
# cats = {
#     u'video': u'视频',
#     u'image': u'图片',
#     u'document': u'书籍',
#     u'music': u'音乐',
#     u'package': u'压缩',
#     u'software': u'软件',
# }
#
# def get_label(name):
#     if name in cats:
#         return cats[name]
#     return u'其它'
#
# # Determine the file type from the file-name extension
# def get_category(ext):
#     ext = ext + '.'
#     cats = {
#         u'video': '.avi.mp4.rmvb.m2ts.wmv.mkv.flv.qmv.rm.mov.vob.asf.3gp.mpg.mpeg.m4v.f4v.',
#         u'image': '.jpg.bmp.jpeg.png.gif.tiff.',
#         u'document': '.pdf.isz.chm.txt.epub.bc!.doc.docx.xlsx.xls.pptx.ppt.',
#         u'music': '.mp3.wma.ape.wav.dts.mdf.flac.',
#         u'package': '.zip.rar.7z.tar.gz.iso.dmg.pkg.',
#         u'software': '.exe.app.msi.apk.',
#         u'torrent': '.torrent.'
#     }
#     for k, v in cats.items():
#         if ext in v:
#             return get_label(k)
#     return '其他'
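A quick usage sketch of the filter. It assumes the second column of row id=1 in gjc_gl holds the blocked words as one comma-separated string (for example '广告,加微信,免费领取'), and that this file is the defs.py the main script imports from; both are guesses, since neither the column name nor the file name appears in the code itself.

# Assumed: gjc_gl row id=1 stores something like '广告,加微信,免费领取' in its second column,
# and this module is the defs.py imported by the main script (both are assumptions).
from defs import gjc_gl

print(gjc_gl("某电影 1080P 国语中字"))      # True  -> no blocked word, the record may be stored
print(gjc_gl("某电影 加微信xxx免费领取"))    # False -> blocked word hit, the record is skipped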
Here is an extension: a function that crawls other websites. The resources collected from dynamic shares alone may not be enough, so crawling additional sites through other channels makes it possible to build a more complete Baidu Netdisk resource search.
import requests, re, time
import random
import pymysql
from mysql_db import pysql

def aidy():
    for i in range(11, 24):  # 1000
        for r in range(1, 6):
            try:
                url = "http://520.58801hn.com/%d/page/%d" % (i, r)
                header = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Mobile Safari/537.36"}
                html = requests.get(url, headers=header).text
                re_url = re.findall('<div class="entry-meta">.*?<a href="(.*?)" rel="bookmark">', html, re.S)
                times = re.findall('<div class="entry-meta">.*?itemprop="datePublished">(.*?)</time></a>', html, re.S)
                t = 0
                for for_url in re_url:
                    html_wp = requests.get(for_url, headers=header).text
                    re_wp = re.findall('<p>.*?href="https://pan.baidu.com/s/(.*?)">百度云盘</a>.*?:(.*?)</p>', html_wp, re.S)
                    if re_wp:
                        h1 = re.findall('<h1 class="entry-title" itemprop="name headline">(.*?)</h1>', html_wp, re.S)
                        # connect to the database
                        mysql = pysql()
                        db = mysql.cursor()
                        # check whether the title already exists
                        sqlcx = "select title from data_zy WHERE title='%s'" % h1[0]
                        db.execute(sqlcx)
                        data = db.fetchall()
                        # insert only when the title is new
                        if not data:
                            sqlcxid = "select max(id) from data_zy"
                            db.execute(sqlcxid)
                            dataid = db.fetchall()
                            ids = int(dataid[0][0] or 0) + 1  # next id after the last inserted row
                            time_time = time.strftime("%Y-%m-%d %H:%M", time.localtime())  # insertion time
                            try:
                                sqltj = "insert into data_zy (id,category,shorturl,title,feed_time,rk_time,wpmm) VALUES ('%d','6','%s','%s','%s','%s','%s')" % (
                                    ids, re_wp[0][0], h1[0], times[t], time_time, re_wp[0][1])
                                db.execute(sqltj)
                                mysql.commit()
                            except:
                                pass
                        mysql.close()
                    t = t + 1  # keep the publish-time index in step with the article list
                    time.sleep(random.randint(2, 10))
                time.sleep(random.randint(2, 10))
            except:
                time.sleep(60)
                continue

if __name__ == '__main__':
    while True:
        try:
            aidy()
            time.sleep(10800)  # run every 3 hours, 4 times a day
        except:
            continue
The database design is simple, with only two tables; the columns involved can be seen in the database-write function above.
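For reference, below is a minimal sketch of what those two tables could look like, reconstructed only from the INSERT and SELECT statements in the code above; the column types and the keyword column name gjc are assumptions, not something preserved from the original project.

import pymysql

# A guess at the schema, derived from the SQL strings used earlier;
# column types and the keyword column name `gjc` are assumptions.
mysql = pymysql.connect(host='127.0.0.1', user='bdwp', password='xDnwLnjSEXLbGJYa',
                        database='bdwp', charset='utf8')
db = mysql.cursor()
db.execute("""
    CREATE TABLE IF NOT EXISTS data_zy (
        id INT PRIMARY KEY,        -- assigned as max(id) + 1 by mysql_into()
        category INT,              -- 6 = folder, 1 = video, 3 = image
        shorturl VARCHAR(64),      -- the part after https://pan.baidu.com/s/
        title VARCHAR(255),        -- used for duplicate checks and keyword filtering
        feed_time DATETIME,        -- original share time
        rk_time DATETIME,          -- time the record was written to the database
        wpmm VARCHAR(32)           -- extraction password, only filled by the aidy() crawler
    )
""")
db.execute("""
    CREATE TABLE IF NOT EXISTS gjc_gl (
        id INT PRIMARY KEY,
        gjc TEXT                   -- one comma-separated string of blocked keywords
    )
""")
mysql.commit()
mysql.close()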