katyusha 2018-09-23
本次练习用到的知识点有
* Requests 库的使用
* BeautifulSoup 库的使用
* 正则表达式的使用
* pymongo 库的使用
有需要Python学习资料的小伙伴吗?小编整理【一套Python资料、源码和PDF】,感兴趣者可以关注小编后私信学习资料(是关注后私信哦)反正闲着也是闲着呢,不如学点东西啦
1、项目流程分析
2、中心调度
# Central dispatcher
def main(offset):
    """Crawl one page of search results and store every gallery found.

    Fetches the list page for ``offset``, follows each article URL it
    contains, parses the detail page and saves the parsed record.

    Args:
        offset: Result offset forwarded to ``get_page_index``. It may be an
            integer, so it is formatted rather than concatenated when logged.
    """
    # Fetch the list page.
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        print('offset:{0}异常'.format(offset))
        return
    # Parse the list page to obtain every detail-page URL.
    for url in parse_page_index(index_data):
        if not url:
            # The list page may contain entries without an article URL.
            continue
        # Fetch the detail page.
        detail_data = get_page_detail(url)
        if detail_data is None:
            print('url:{0}异常'.format(url))
            # Skip this article; there is nothing to parse.
            continue
        # Parse the detail page.
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)
3、请求和解析列表页
# Request the list page and return its response body
def get_page_index(offset, keywords, timeout=10):
    """Fetch one page of search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keywords: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None`` (also ``None`` when the request itself fails).
    """
    params = {
        'offset': offset,
        'format': 'json',
        # Use the argument, not the module-level constant, so callers can
        # actually choose what to search for.
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20,
    }
    try:
        response = requests.get(
            'http://www.toutiao.com/search_content/',
            params=params,
            timeout=timeout,
        )
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


# Parse the list page
def parse_page_index(text):
    """Yield the article URLs contained in a list-page JSON document.

    Args:
        text: JSON text as returned by ``get_page_index``.

    Yields:
        Each non-empty ``article_url`` found under the top-level ``data``
        key. Nothing is yielded when the text is not valid JSON, when the
        document is not an object, or when ``data`` is missing or null.
    """
    try:
        data = json.loads(text)
    except JSONDecodeError:
        print('解析异常')
        return
    if not isinstance(data, dict):
        return
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without a URL cannot be followed, so drop them here.
        if article_url:
            yield article_url
4、请求和解析详情页
# Captures the value assigned in an inline ``var gallery = ...;`` statement.
_GALLERY_PATTERN = re.compile(r'var gallery = (.*?);', re.S)


# Parse the detail page
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Every image URL found is also passed to ``download_image``.

    Args:
        html: Detail page markup.
        url: Address of the detail page; copied into the result.

    Returns:
        A dict with the keys ``title``, ``url`` and ``images``, or ``None``
        when the page is empty, has no gallery assignment, the gallery is
        not valid JSON, or it carries no ``sub_images`` list.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    # A page without a <title> element must not abort the whole crawl.
    title = soup.title.string if soup.title is not None else None
    match = _GALLERY_PATTERN.search(html)
    if match is None:
        return None
    try:
        data = json.loads(match.group(1))
    except JSONDecodeError:
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    sub_images = data['sub_images']
    if not isinstance(sub_images, list):
        return None
    # Collect every image URL, skipping entries that do not carry one.
    images = [
        item['url']
        for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        # Download the image.
        download_image(image)
    return {'title': title, 'url': url, 'images': images}
5、下载图片和保存至MongoDB
# Fetch the binary content of an image
def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Image address. Empty or ``None`` values are ignored.
        timeout: Seconds to wait for the server before giving up.
    """
    if not url:
        return
    print('图片' + url + '正在下载')
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('异常image:' + url)
        return
    if response.status_code == 200:
        # Save the image.
        save_image(response.content)


# Write the binary content to a file
def save_image(content):
    """Store image bytes under ``<cwd>/images/<md5-of-content>.jpg``.

    The target directory is created when missing. A file that already
    exists is left untouched, so identical images are written only once.

    Args:
        content: Raw image bytes.
    """
    image_dir = os.path.join(os.getcwd(), 'images')
    os.makedirs(image_dir, exist_ok=True)
    file_path = os.path.join(image_dir, md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        # The ``with`` block closes the file; no explicit close() needed.
        with open(file_path, 'wb') as image_file:
            image_file.write(content)


def save_to_mongo(data):
    """Insert one record into the ``MONGO_TABLE`` collection.

    Args:
        data: Document to store; its ``title`` is echoed on success.

    Returns:
        ``True`` when the insert succeeded, ``False`` when PyMongo raised
        an error.
    """
    try:
        # insert_one replaces Collection.insert, which newer PyMongo
        # releases no longer provide.
        db[MONGO_TABLE].insert_one(data)
    except pymongo.errors.PyMongoError as exc:
        print('保存失败:{0}'.format(exc))
        return False
    print('成功保存' + str(data.get('title')))
    return True
6、完整代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Crawl Toutiao search results, download gallery images, store metadata.

For each result offset the script fetches a list page, follows every
article URL, extracts the inline gallery JSON, downloads the images into
``./images`` and saves one record per article to MongoDB.
"""
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Expected to provide MONGO_URL, MONGO_DB, MONGO_TABLE, KEYWORDS,
# GROUP_START and GROUP_END, which are used below but not defined here.
from setting import *

# connect=False defers the connection until first use, so each worker
# process created by the pool opens its own connection after forking.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

# Captures the value assigned in an inline ``var gallery = ...;`` statement.
_GALLERY_PATTERN = re.compile(r'var gallery = (.*?);', re.S)


# Request the list page and return its response body
def get_page_index(offset, keywords, timeout=10):
    """Fetch one page of search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keywords: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None`` (also ``None`` when the request itself fails).
    """
    params = {
        'offset': offset,
        'format': 'json',
        # Use the argument, not the module-level constant, so callers can
        # actually choose what to search for.
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20,
    }
    try:
        response = requests.get(
            'http://www.toutiao.com/search_content/',
            params=params,
            timeout=timeout,
        )
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


# Parse the list page
def parse_page_index(text):
    """Yield the article URLs contained in a list-page JSON document.

    Args:
        text: JSON text as returned by ``get_page_index``.

    Yields:
        Each non-empty ``article_url`` found under the top-level ``data``
        key. Nothing is yielded when the text is not valid JSON, when the
        document is not an object, or when ``data`` is missing or null.
    """
    try:
        data = json.loads(text)
    except JSONDecodeError:
        print('解析异常')
        return
    if not isinstance(data, dict):
        return
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without a URL cannot be followed, so drop them here.
        if article_url:
            yield article_url


# Request the detail page and return its response body
def get_page_detail(url, timeout=10):
    """Fetch a detail page.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page markup as text when the status code is 200, otherwise
        ``None`` (also ``None`` when the request itself fails).
    """
    try:
        # The request must sit inside the try block; it is the call that
        # raises RequestException.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


# Parse the detail page
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Every image URL found is also passed to ``download_image``.

    Args:
        html: Detail page markup.
        url: Address of the detail page; copied into the result.

    Returns:
        A dict with the keys ``title``, ``url`` and ``images``, or ``None``
        when the page is empty, has no gallery assignment, the gallery is
        not valid JSON, or it carries no ``sub_images`` list.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    # A page without a <title> element must not abort the whole crawl.
    title = soup.title.string if soup.title is not None else None
    match = _GALLERY_PATTERN.search(html)
    if match is None:
        return None
    try:
        data = json.loads(match.group(1))
    except JSONDecodeError:
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    sub_images = data['sub_images']
    if not isinstance(sub_images, list):
        return None
    # Collect every image URL, skipping entries that do not carry one.
    images = [
        item['url']
        for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        # Download the image.
        download_image(image)
    return {'title': title, 'url': url, 'images': images}


# Fetch the binary content of an image
def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Image address. Empty or ``None`` values are ignored.
        timeout: Seconds to wait for the server before giving up.
    """
    if not url:
        return
    print('图片' + url + '正在下载')
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('异常image:' + url)
        return
    if response.status_code == 200:
        # Save the image.
        save_image(response.content)


# Write the binary content to a file
def save_image(content):
    """Store image bytes under ``<cwd>/images/<md5-of-content>.jpg``.

    The target directory is created when missing. A file that already
    exists is left untouched, so identical images are written only once.

    Args:
        content: Raw image bytes.
    """
    image_dir = os.path.join(os.getcwd(), 'images')
    os.makedirs(image_dir, exist_ok=True)
    file_path = os.path.join(image_dir, md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        # The ``with`` block closes the file; no explicit close() needed.
        with open(file_path, 'wb') as image_file:
            image_file.write(content)


def save_to_mongo(data):
    """Insert one record into the ``MONGO_TABLE`` collection.

    Args:
        data: Document to store; its ``title`` is echoed on success.

    Returns:
        ``True`` when the insert succeeded, ``False`` when PyMongo raised
        an error.
    """
    try:
        # insert_one replaces Collection.insert, which newer PyMongo
        # releases no longer provide.
        db[MONGO_TABLE].insert_one(data)
    except pymongo.errors.PyMongoError as exc:
        print('保存失败:{0}'.format(exc))
        return False
    print('成功保存' + str(data.get('title')))
    return True


# Central dispatcher
def main(offset):
    """Crawl one page of search results and store every gallery found.

    Args:
        offset: Result offset forwarded to ``get_page_index``; the entry
            point below supplies integer multiples of 20.
    """
    # Fetch the list page.
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        # offset is an integer, so format it instead of concatenating.
        print('offset:{0}异常'.format(offset))
        return
    # Parse the list page to obtain every detail-page URL.
    for url in parse_page_index(index_data):
        # Fetch the detail page.
        detail_data = get_page_detail(url)
        if detail_data is None:
            print('url:{0}异常'.format(url))
            # Skip this article; there is nothing to parse.
            continue
        # Parse the detail page.
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)


if __name__ == '__main__':
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # The context manager shuts the worker processes down once map returns.
    with Pool() as pool:
        pool.map(main, groups)
7、运行结果
有需要Python学习资料的小伙伴吗?小编整理【一套Python资料、源码和PDF】,感兴趣者可以关注小编后私信学习资料(是关注后私信哦)反正闲着也是闲着呢,不如学点东西啦