cassiePython 2014-04-07
代码如下:
def _first_child_value(parent, tag_name, default=''):
    """Return the nodeValue of the first child of the first ``tag_name`` element.

    ``default`` is returned when ``parent`` has no such descendant or when
    that element has no child nodes (e.g. ``<crawl_cycle></crawl_cycle>``),
    instead of failing with ``IndexError``.
    """
    matches = parent.getElementsByTagName(tag_name)
    if not matches or not matches[0].childNodes:
        return default
    return matches[0].childNodes[0].nodeValue


def _dump_seed(seed):
    """Print one seed record in the human-readable debugging layout."""
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('-------------------------------------------')
    print('system_id:%d' % seed['system_id'])
    print('system_name:%s' % seed['system_name'])
    print(' section_id:%d' % seed['section_id'])
    print(' section_name:%s' % seed['section_name'])
    print(' seed_id:%d' % seed['seed_id'])
    print(' userblog_url:%s' % seed['userblog_url'])
    print('=========================')
    print(seed)


def get_seed_data(filename, verbose=True):
    """Parse a seed XML file into a flat list of seed dictionaries.

    The document is walked as ``system`` -> ``section`` -> ``seed``; every
    ``seed`` element yields one dictionary that also carries the attributes
    of its enclosing system and section.

    Args:
        filename: Path (or file object) handed to ``minidom.parse``.
        verbose: When true (the default), print every record as it is
            collected and, on Windows, wait for a key press after each one.
            Pass ``False`` to parse silently.

    Returns:
        A list of dicts with the keys ``crawl_cycle``, ``system_id``,
        ``system_name``, ``section_id``, ``section_name``, ``seed_id`` and
        ``userblog_url``. The three ``*_id`` values are ints; the text
        values are ``''`` when the corresponding element is missing or empty.

    Raises:
        ValueError: If an ``id`` attribute of a system, section or seed is
            missing or is not an integer.
    """
    dom = minidom.parse(filename)
    root = dom.documentElement
    seed_list = []
    for system_node in root.getElementsByTagName("system"):
        system_id = system_node.getAttribute("id")
        system_name = system_node.getAttribute("name")
        for section_node in system_node.getElementsByTagName("section"):
            section_id = section_node.getAttribute('id')
            section_name = section_node.getAttribute('name')
            crawl_cycle = _first_child_value(section_node, "crawl_cycle")
            for seed_node in section_node.getElementsByTagName('seed'):
                # The ids are converted per seed, so a malformed system or
                # section id only matters once a seed actually uses it.
                seed = {
                    'crawl_cycle': crawl_cycle,
                    'system_id': int(system_id),
                    'system_name': system_name,
                    'section_id': int(section_id),
                    'section_name': section_name,
                    'seed_id': int(seed_node.getAttribute('id')),
                    'userblog_url': _first_child_value(seed_node,
                                                       'userblog_url'),
                }
                seed_list.append(seed)
                if verbose:
                    _dump_seed(seed)
                    # NOTE(review): the original formatting was lost, so it
                    # is assumed the pause ran once per seed - confirm.
                    # `pause` is a cmd.exe builtin, so it is skipped on
                    # other platforms where it would only produce an error.
                    if os.name == 'nt':
                        os.system('pause')
    return seed_list
代码如下:
<?xml version="1.0" encoding="utf-8" ?>
<!--
  Sample seed file.
  Layout: <seeds> is the root; it holds <system> elements (id, name), each
  holding <section> elements (id, name). A section has one <crawl_cycle>
  and any number of <seed> elements (id), each wrapping one <userblog_url>.
  Note: <crawl_cycle> contains only whitespace in this sample.
-->
<seeds>
  <system id="1" name="新浪">
    <section id="1" name="娱乐">
      <crawl_cycle> </crawl_cycle>
      <seed id="1">
        <userblog_url>http://aaa.com.cn/loveissuuny</userblog_url>
      </seed>
      <seed id="2">
        <userblog_url>http://aaa.com.cn/loveissuuny</userblog_url>
      </seed>
      <seed id="3">
        <userblog_url>http://aaa.com.cn/sanxiazaixian</userblog_url>
      </seed>
    </section>
    <section id="2" name="读书">
      <crawl_cycle> </crawl_cycle>
      <seed id="11">
        <userblog_url>http://aaa.com.cn/twocold</userblog_url>
      </seed>
      <seed id="12">
        <userblog_url>http://aaa.com.cn/u/1233526741</userblog_url>
      </seed>
    </section>
  </system>
</seeds>