Software Design 2017-04-25
Crawl all posts of a specified cnblogs user, tidy them up, and save every one of them locally.
First, define the crawler's module files:
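
The imports in the modules below use the com.crawlers package prefix, so the on-disk layout is presumably something like this (the __init__.py files are my assumption; Python 2 needs them for the directories to be importable packages):

com/
    __init__.py
    crawlers/
        __init__.py
        crawlers_main.py     # execution entry point
        url_manager.py       # URL queue with de-duplication
        download_manager.py  # page downloader
        parser_manager.py    # HTML parsing
        output_manager.py    # writes pages and assets to output/
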
crawlers_main.py: the execution entry point


# coding:utf8
from com.crawlers import download_manager
from com.crawlers import output_manager
from com.crawlers import parser_manager
from com.crawlers import url_manager


class SpiderMain(object):

    def __init__(self):
        # Wire the four collaborators together.
        self.urls = url_manager.UrlManager()
        self.downloader = download_manager.DownloadManager()
        self.parser = parser_manager.ParserManager()
        self.output = output_manager.OutputManager()

    def craw(self, root_url):
        # Fetch the index page and seed the queue with the post URLs found on it.
        html_root = self.downloader.download(root_url)
        new_urls = self.parser.parseUrls(root_url, html_root)
        self.urls.add_new_urls(new_urls)
        count = 1
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_data = self.parser.parse(new_url, html_cont)
                self.output.collect_data(new_data)
                if count == 1000:   # hard cap on the number of pages crawled
                    break
                count += 1
            except Exception as e:
                print('craw failed: %s' % e)
        self.output.output_html()


if __name__ == '__main__':
    root_url = 'http://www.cnblogs.com/zhuyuliang/'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
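
One thing worth noting: the loop above fires requests back to back. Below is a minimal sketch of a politer variant, assuming the same SpiderMain collaborators; the craw_politely name and the delay_seconds parameter are mine, not part of the original code:

# coding:utf8
# Hypothetical rate-limited rewrite of SpiderMain.craw (sketch only):
# identical flow, plus a pause between requests.
import time

def craw_politely(spider, root_url, max_pages=1000, delay_seconds=1.0):
    html_root = spider.downloader.download(root_url)
    spider.urls.add_new_urls(spider.parser.parseUrls(root_url, html_root))
    count = 1
    while spider.urls.has_new_url() and count <= max_pages:
        new_url = spider.urls.get_new_url()
        print('craw %d : %s' % (count, new_url))
        html_cont = spider.downloader.download(new_url)
        spider.output.collect_data(spider.parser.parse(new_url, html_cont))
        count += 1
        time.sleep(delay_seconds)  # pause so the target server is not hammered
    spider.output.output_html()
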


url_manager.py: the URL manager


# coding:utf8
class UrlManager(object):

    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already handed out

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
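
A quick interactive check of the de-duplication behaviour (the example.com URLs are placeholders):

manager = UrlManager()
manager.add_new_urls(['http://example.com/a', 'http://example.com/a'])
print(manager.has_new_url())    # True: the duplicate was queued only once
url = manager.get_new_url()     # hands the URL out and records it in old_urls
manager.add_new_url(url)        # silently ignored: it is already in old_urls
print(manager.has_new_url())    # False
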
download_manager.py: the download module


# coding:utf8
import urllib2


class DownloadManager(object):

    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.code != 200:   # anything but HTTP 200 is treated as a miss
            return None
        return response.read()
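
urllib2.urlopen raises on network errors, and some sites reject the default urllib2 User-Agent, so a slightly hardened variant can help. This is a sketch under those assumptions, not part of the original module:

# coding:utf8
# Hypothetical hardened download: custom User-Agent, a timeout, and
# URLError handling. Returns None on any failure, like the original.
import urllib2

def download_with_headers(url, timeout=10):
    if url is None:
        return None
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib2.urlopen(request, timeout=timeout)
    except urllib2.URLError:
        return None
    if response.getcode() != 200:
        return None
    return response.read()
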


parser_manager.py: the HTML parser (extracts what we need from the HTML)


# coding:utf8
import re
import sys
import urlparse
from HTMLParser import HTMLParser

from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf-8')


class ParserManager(HTMLParser):

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []   # asset URLs (img/script/css) found while feeding HTML

    def handle_starttag(self, tag, attrs):
        # Collect the src/href of every image, script and stylesheet.
        if tag == 'img' or tag == 'script':
            for (variable, value) in attrs:
                if variable == 'src' or variable == 'href':
                    self.links.append(value)
        if tag == 'link':
            dic = dict(attrs)
            if dic.get('rel') == 'stylesheet':   # .get avoids a KeyError when rel is absent
                self.links.append(dic['href'])

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_data = self._get_new_data(page_url, soup)
        return new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # e.g. href="http://www.cnblogs.com/zhuyuliang/p/5218635.html"
        links = soup.find_all('a', href=re.compile(r'http://www.cnblogs.com/zhuyuliang/p/...'))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # <a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/zhuyuliang/p/5218635.html">Android开发代码规范</a>
        title_node = soup.find('a', class_='postTitle2')
        res_data['title'] = title_node.get_text()
        # The article body lives in <div class="post">.
        summary_node = soup.find('div', class_='post')
        res_data['summary'] = summary_node
        # Replace the whole <body> with just the article markup, keeping the
        # <head> (and hence the css/js links) for offline rendering.
        new_tag = soup.new_tag('body')
        new_tag.string = summary_node.encode('utf-8')
        soup.body.replace_with(new_tag)
        res_data['template'] = soup
        return res_data

    def parseUrls(self, root_url, html_cont):
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(root_url, soup)
        return new_urls
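
To see what the parser produces, here is a hypothetical snippet (sample_post.html stands in for a downloaded cnblogs post page):

pm = ParserManager()
html_cont = open('sample_post.html').read()   # a saved post page, for illustration
data = pm.parse('http://www.cnblogs.com/zhuyuliang/p/5218635.html', html_cont)
print(data['url'])       # the page URL passed in
print(data['title'])     # text of the <a class="postTitle2"> node
# data['summary'] is the <div class="post"> Tag holding the article body;
# data['template'] is the full soup with <body> replaced by that markup.
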


output_manager.py: writes out the complete HTML page files (including css, png, js, etc.)


# -*- coding:utf-8 -*-
import os
import urllib

from com.crawlers.parser_manager import ParserManager


class OutputManager(object):

    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        for data in self.datas:
            # Dump the raw article markup to a .txt file first.
            fout = open('output/%s.txt' % data['title'].encode('utf-8'), 'w')
            fout.write('%s' % data['summary'].encode('utf-8'))
            fout.close()
            url = data['url'].encode('utf-8')
            pagename = data['title'].encode('utf-8')
            # Re-parse the rewritten template to collect its asset links.
            hp = ParserManager()
            html_code = data['template'].encode('utf-8')
            html_code = hp.unescape(html_code)   # expand HTML entities before feeding
            hp.feed(html_code)
            hp.close()
            durl = url.rsplit('/', 1)[0]
            self.download(pagename, html_code, durl, hp.links)

    def download(self, pagename, html_code, durl, links):
        # Save every asset into output/<pagename>_files/ and point the
        # saved page at the local copies.
        if not os.path.exists('output/' + pagename + '_files'):
            os.mkdir('output/' + pagename + '_files')
        for link in links:
            fname = link.split('/')[-1]
            fname = fname.split('?')[0]   # drop query strings from file names
            localpath = '%s%s' % ('output/' + pagename + '_files/', fname)
            replacelocalpath = '%s%s' % (pagename + '_files/', fname)
            try:
                urllib.urlretrieve('http://www.cnblogs.com' + link, localpath)
            except Exception as error:
                print 'download error:', error
            else:
                print 'download ' + fname
                html_code = html_code.replace(link, replacelocalpath)
        open('output/' + pagename + '.html', 'w').write(html_code)
        return True
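
Standalone use of OutputManager looks roughly like this (a sketch: parsed_data stands for one dict returned by ParserManager.parse, and output/ must exist because output_html opens files inside it without creating it):

# coding:utf8
# Hypothetical driver for OutputManager alone.
import os
from com.crawlers.output_manager import OutputManager

if not os.path.exists('output'):
    os.mkdir('output')            # output_html() assumes this directory exists

out = OutputManager()
out.collect_data(parsed_data)     # parsed_data: a dict from ParserManager.parse()
out.output_html()                 # writes output/<title>.txt, .html and _files/
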
The final output is an output/ directory holding, for each post, a <title>.txt dump of the article markup, a browsable <title>.html, and a <title>_files/ folder with its css, js and images.

> End