Software Design 2017-04-25
Crawl all the posts of a specified cnblogs (博客园) user, tidy them up, and save them to local disk.
First, define the crawler's module files:
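Judging from the import statements (from com.crawlers import ...), the project presumably uses a package layout along these lines; the __init__.py files and the output/ directory are assumptions inferred from the code, not shown in the original post:

com/
    __init__.py
    crawlers/
        __init__.py
        crawlers_main.py
        url_manager.py
        download_manager.py
        parser_manager.py
        output_manager.py
output/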
crawlers_main.py: the execution entry point
# coding:utf8
from com.crawlers import download_manager
from com.crawlers import output_manager
from com.crawlers import parser_manager
from com.crawlers import url_manager


class SpiderMain(object):

    def __init__(self):
        # Wire the four components together.
        self.urls = url_manager.UrlManager()
        self.downloader = download_manager.DownloadManager()
        self.parser = parser_manager.ParserManager()
        self.output = output_manager.OutputManager()

    def craw(self, root_url):
        # Fetch the index page and seed the URL queue with the article links.
        html_root = self.downloader.download(root_url)
        new_urls = self.parser.parseUrls(root_url, html_root)
        self.urls.add_new_urls(new_urls)
        count = 1
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_data = self.parser.parse(new_url, html_cont)
                self.output.collect_data(new_data)
                if count == 1000:  # hard cap on the number of pages crawled
                    break
                count += 1
            except:
                print('craw failed')
        self.output.output_html()


if __name__ == "__main__":
    root_url = "http://www.cnblogs.com/zhuyuliang/"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
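With that layout, the spider can also be driven from other code. A minimal sketch, assuming crawlers_main.py lives inside the com.crawlers package and is run from the project root:

from com.crawlers.crawlers_main import SpiderMain

spider = SpiderMain()                               # wires up the four components
spider.craw('http://www.cnblogs.com/zhuyuliang/')   # crawls up to 1000 posts, then writes output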
url_manager.py: the URL manager
# coding:utf8
class UrlManager(object):

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
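A quick illustration of the two-set bookkeeping, not part of the project (the URL is just an example):

manager = UrlManager()
manager.add_new_urls([
    'http://www.cnblogs.com/zhuyuliang/p/5218635.html',
    'http://www.cnblogs.com/zhuyuliang/p/5218635.html',  # duplicate, ignored
])
print(manager.has_new_url())   # True
url = manager.get_new_url()    # moved from new_urls to old_urls
manager.add_new_url(url)       # ignored: already crawled
print(manager.has_new_url())   # False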
download_manager.py: the download module
# coding:utf8
import urllib2


class DownloadManager(object):

    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.code != 200:
            return None
        return response.read()
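One caveat: urllib2 sends a default Python User-Agent that some servers reject, and urlopen blocks indefinitely when no timeout is given. A hedged variant; the function name and header value here are illustrative, not part of the original code:

# coding:utf8
import urllib2

def download_with_headers(url, timeout=10):
    # Hypothetical variant of DownloadManager.download: send a browser-like
    # User-Agent and fail fast on unresponsive servers.
    if url is None:
        return None
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urllib2.urlopen(request, timeout=timeout)
    if response.getcode() != 200:
        return None
    return response.read()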
parser_manager.py: the HTML parser (extracts the content we need from each page)
# coding:utf8
import re
from HTMLParser import HTMLParser
from bs4 import BeautifulSoup
import urlparse
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


class ParserManager(HTMLParser):

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        # Collect the URLs of static resources (images, scripts, stylesheets)
        # so output_manager can download them later.
        if tag == 'img' or tag == "script":
            for (variable, value) in attrs:
                if variable == "src" or variable == "href":
                    self.links.append(value)
        if tag == "link":
            dic = dict(attrs)
            if dic['rel'] == "stylesheet":
                self.links.append(dic['href'])

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_data = self._get_new_data(page_url, soup)
        return new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # e.g. href="http://www.cnblogs.com/zhuyuliang/p/5218635.html"
        links = soup.find_all('a', href=re.compile(r'http://www.cnblogs.com/zhuyuliang/p/...'))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # <a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/zhuyuliang/p/5218635.html">Android开发代码规范</a>
        title_node = soup.find('a', class_='postTitle2')
        res_data['title'] = title_node.get_text()
        # the article body lives in <div class="post">
        summary_node = soup.find('div', class_="post")
        res_data['summary'] = summary_node
        # Replace the page body with just the article, keeping <head> for styling.
        new_tag = soup.new_tag("body")
        new_tag.string = summary_node.encode('utf-8')
        soup.body.replace_with(new_tag)
        res_data['template'] = soup
        return res_data

    def parseUrls(self, root_url, html_cont):
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(root_url, soup)
        return new_urls
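Note that the unescaped dots in the regex match any character, so the pattern really only anchors on the http://www.cnblogs.com/zhuyuliang/p/ prefix. A self-contained illustration of the link-extraction step on a fabricated snippet:

# coding:utf8
import re
import urlparse
from bs4 import BeautifulSoup

html = '<a href="http://www.cnblogs.com/zhuyuliang/p/5218635.html">post</a>'
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a', href=re.compile(r'http://www.cnblogs.com/zhuyuliang/p/...')):
    # urljoin is a no-op for absolute URLs but resolves relative ones
    print(urlparse.urljoin('http://www.cnblogs.com/zhuyuliang/', link['href']))
# -> http://www.cnblogs.com/zhuyuliang/p/5218635.html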
output_manager.py: writes the complete HTML page to disk (including css, png, js, etc.)
# -*- coding:utf-8 -*-
import os
import urllib

from com.crawlers.parser_manager import ParserManager


class OutputManager(object):

    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        for data in self.datas:
            # Save the raw article body as a text file.
            fout = open('output/%s.txt' % data['title'].encode('utf-8'), 'w')
            fout.write("%s" % data['summary'].encode('utf-8'))
            fout.close()
            url = data['url'].encode('utf-8')
            pagename = data['title'].encode('utf-8')
            # Feed the trimmed page through ParserManager to collect the
            # static resources (css/js/images) it references.
            hp = ParserManager()
            html_code = data['template'].encode('utf-8')
            html_code = hp.unescape(html_code)
            hp.feed(html_code)
            hp.close()
            durl = url.rsplit('/', 1)[0]
            self.download(pagename, html_code, durl, hp.links)

    def download(self, pagename, html_code, durl, links):
        # Download every referenced resource into output/<pagename>_files/
        # and rewrite the links in the page to point at the local copies.
        if not os.path.exists('output/' + pagename + '_files'):
            os.mkdir('output/' + pagename + '_files')
        for link in links:
            fname = link.split('/')[-1]
            fname = fname.split('?')[0]
            localpath = '%s%s' % ('output/' + pagename + '_files/', fname)
            replacelocalpath = '%s%s' % (pagename + '_files/', fname)
            try:
                urllib.urlretrieve("http://www.cnblogs.com" + link, localpath)
            except Exception as error:
                print 'download error:', error
            else:
                print 'download ' + fname
                html_code = html_code.replace(link, replacelocalpath)
        open('output/' + pagename + '.html', 'w').write(html_code)
        return True
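One caveat: post titles go straight into file paths, so a title containing / or other special characters will break open(). A small, hypothetical safe_name helper (not in the original code) could sanitize them first, and the output/ directory should exist before anything is written:

# coding:utf8
import os
import re

def safe_name(title):
    # Replace characters that are illegal or troublesome in file names.
    return re.sub(r'[\\/:*?"<>|]', '_', title)

if not os.path.exists('output'):
    os.mkdir('output')
# output_html would then use safe_name(data['title']) when building paths.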
Finally, the output:
> The end