# fanhuasijin 2020-04-17
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@File: csdn_reads.py
@E-mail:
@Time: 2020/4/16 10:03 PM
@Author: Nobita
@Version: 1.0
@Description: Periodically request one CSDN article page through a randomly
    chosen proxy address and print one <span> of the returned HTML.
"""
import random
import re
import time
import urllib.request

import requests
from bs4 import BeautifulSoup

# Pool of browser identification strings; one is picked once at import time.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    # The original entry contained a literal ellipsis character (U+2026),
    # which cannot be encoded into an HTTP header; replaced with a complete
    # Firefox 61 platform token.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]

# Request headers shared by the urllib request and the requests session.
firefoxHead = {"User-Agent": random.choice(user_agent_list)}

# Dotted-quad IPv4 pattern. The dot is escaped so that only a literal "."
# separates the octets (the unescaped form also accepted e.g. "18-12-24 13").
IPRegular = r"(([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])\.){3}([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])"
_IP_PATTERN = re.compile(IPRegular)

host = "https://blog.csdn.net"
url = "https://blog.csdn.net/hz90s/article/details/{}"
code = ["105518260"]

# PV() samples its proxy from at most this many leading entries of the
# scraped list (the original indexed positions 0..40).
_PROXY_SAMPLE_SIZE = 41


def parseIPList(url="http://www.xicidaili.com/"):
    """Return the IPv4 address strings found in the ``<td>`` cells of *url*.

    The page is fetched with the module-level ``firefoxHead`` headers and
    parsed with BeautifulSoup. A cell is kept only when its whole (stripped)
    text is an IPv4 address.

    Args:
        url: Address of the page to scan for ``<td>`` cells.

    Returns:
        A list of address strings in document order; empty when none match.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` on fetch failure.
    """
    request = urllib.request.Request(url, headers=firefoxHead)
    # The context manager closes the HTTP response once parsing is done.
    with urllib.request.urlopen(request) as response:
        soup = BeautifulSoup(response, "html.parser")

    addresses = []
    for td in soup.find_all("td"):
        # ``td.string`` is None for cells that hold more than one child node.
        if td.string is None:
            continue
        text = str(td.string).strip()
        if _IP_PATTERN.fullmatch(text):
            addresses.append(text)
    return addresses


def PV(code):
    """Request the article identified by *code* in an endless loop.

    Each round re-scrapes the proxy list, picks one address at random, fetches
    ``host`` and then the article URL through a shared session, prints the
    text of the third ``<span>`` in the response, and sleeps 60-75 seconds.
    The loop has no exit condition; it ends only via an exception or
    interruption.

    Args:
        code: Article id substituted into the module-level ``url`` template.
    """
    session = requests.Session()
    # Merge into the session's header mapping; assigning the bare User-Agent
    # string to ``session.headers`` would replace the mapping with a str.
    session.headers.update(firefoxHead)

    count = 0
    while True:
        count += 1
        print("asking for {} times\t".format(count), end="\t")

        IPs = parseIPList()
        if not IPs:
            # Nothing to route through this round; wait and retry instead of
            # failing with an IndexError.
            print("no proxy addresses found, skipping this round")
            time.sleep(random.randint(60, 75))
            continue

        # NOTE(review): only the "http" scheme is mapped here while ``host``
        # and ``url`` are https, and the port is fixed at 8080 - confirm
        # this is what was intended.
        proxy_ip = random.choice(IPs[:_PROXY_SAMPLE_SIZE])
        session.proxies = {"http": "{}:8080".format(proxy_ip)}

        session.get(host)
        r = session.get(url.format(code))
        soup = BeautifulSoup(r.text, "html.parser")
        spans = soup.find_all("span")
        if len(spans) > 2:
            print(spans[2].string)
        else:
            print("fewer than 3 <span> elements in response")

        time.sleep(random.randint(60, 75))


def main():
    """Run PV() for the first article id in ``code``."""
    PV(code[0])


if __name__ == "__main__":
    main()