hilary0 2019-10-31
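A small crawler built on requests and BeautifulSoup that scrapes a novel from www.bjkgjlu.com: it reads the catalog page, collects every chapter's name and link, then downloads each chapter and writes its text into its own .txt file.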
import requests
import bs4


# Fetch the HTML source of a page.
def gethtml(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        # Guess the real encoding from the content so Chinese text decodes correctly.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return "Crawling this site is forbidden"


# Fetch one chapter page and save its text to <name>.txt.
def chapters(url, name):
    html = gethtml("http://www.bjkgjlu.com" + url)
    soup = bs4.BeautifulSoup(html, "html.parser")
    # Open the file once, so a second chapter_content block appends to the
    # file instead of overwriting the first.
    with open(name + ".txt", "wb") as f:
        for i in soup.find_all("div", attrs={"class": "chapter_content"}):
            f.write(i.text.split("<")[0].encode("utf-8"))
    print(name + " scraped and saved to file")


if __name__ == "__main__":
    url = "http://www.bjkgjlu.com/303618kyi/catalog"
    chapter_name_list = []
    chapter_url_list = []
    html = gethtml(url)
    soup = bs4.BeautifulSoup(html, "html.parser")
    # Each chapter link sits in a div with these Bootstrap-style classes.
    for i in soup.find_all("div", attrs={"class": "col-xs-120 col-sm-60 col-md-40 col-lg-30"}):
        for j in i.children:
            # Skip bare whitespace between tags; only Tag nodes carry an href.
            if not isinstance(j, bs4.element.Tag):
                continue
            chapter_name_list.append(j.text)
            chapter_url_list.append(j.get("href"))
    print(chapter_name_list)
    for j in range(len(chapter_name_list)):
        chapters(chapter_url_list[j], chapter_name_list[j])
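One caveat: the chapter name scraped from the catalog is used directly as a filename in open(name + ".txt", "wb"), and titles can contain characters such as ? or / that are illegal in filenames. A minimal guard, where safe_filename is a hypothetical helper and not part of the original script:

import re

# Hypothetical helper: replace characters that are invalid in filenames
# with underscores before using the chapter title as the .txt name.
def safe_filename(name):
    return re.sub(r'[\\/:*?"<>|]', "_", name)

Calling open(safe_filename(name) + ".txt", "wb") inside chapters() would keep the script from crashing on such a title.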