Python 爬取网页数据后存储为 Excel 以及 CSV

mieleizhi0 2019-12-13

import requests
from bs4 import BeautifulSoup
import random
import openpyxl
# Scrape the Douban Top 250 movie list and store it in an Excel workbook
# (douban.xlsx): one row per movie holding its rank, title, rating,
# one-line recommendation and detail-page link.
xls = openpyxl.Workbook()
sheet = xls.active
sheet.title = 'movies'
sheet['A1'] = '序号'
sheet['B1'] = '名称'
sheet['C1'] = '评分'
sheet['D1'] = '推荐语'
sheet['E1'] = '链接'

# Request constants are loop-invariant, so build them once.
url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}


def _text_or_empty(node):
    """Return the text of a parsed tag, or '' when the tag was not found."""
    return node.text if node is not None else ''


# The list is paged with a `start` offset in steps of 25; 250 entries means
# 10 pages (start = 0, 25, ..., 225). An 11th request would be past the end.
for i in range(10):
    params = {
        'start': str(i * 25),
        'filter': '',
    }
    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status surfaces HTTP errors (e.g. being
    # blocked) instead of failing later on an unexpected page body.
    res = requests.get(url, params=params, headers=headers, timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    maindiv = soup.find(class_="grid_view")
    if maindiv is None:
        # No movie list on this page (layout change or anti-bot page).
        print('no movie list found for start=' + params['start'])
        continue
    for titles in maindiv.find_all('li'):
        # Rank number
        num_tag = titles.find('em', class_="")
        # Movie title
        title_tag = titles.find('span', class_="title")
        # Rating
        rating_tag = titles.find('span', class_="rating_num")
        # Link to the movie's detail page
        link_tag = titles.find('a')
        url_movie = link_tag.get('href') if link_tag is not None else None

        # Skip entries that lack an essential field rather than crashing.
        if num_tag is None or title_tag is None or rating_tag is None or url_movie is None:
            continue

        num = num_tag.text
        title = title_tag.text
        rating = rating_tag.text
        # Recommendation quote: not every entry has one, so a missing quote
        # is stored as an empty cell instead of dropping the whole movie.
        quote = _text_or_empty(titles.find('span', class_="inq"))

        print(f'{num}.{title}——{rating}\n推荐语:{quote}\n{url_movie}')
        sheet.append([num, title, rating, quote, url_movie])
xls.save('douban.xlsx')

存储为 CSV:

import requests
from bs4 import BeautifulSoup
import random
import openpyxl
import csv

# Download every article of the Zhihu member "zhang-jia-wei" through the
# paged JSON API (sorted by votes) and write title, excerpt and URL of each
# article to dazhangwei.csv.
url = "https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles"
headers = {
    'referer': 'https://www.zhihu.com/people/zhang-jia-wei/posts/posts_by_votes?page=1',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
header = ['标题', '简介', '连接']

# `with` guarantees the file is flushed and closed even if a request or a
# JSON lookup raises part-way through the download.
with open('dazhangwei.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)
    x = 0  # zero-based page index; each page holds `limit` (10) articles
    while True:
        params = {
            'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
            'offset': str(x * 10),
            'limit': '10',
            'sort_by': 'voteups',
        }
        # Timeout avoids hanging on a stalled connection; raise_for_status
        # reports HTTP errors directly instead of a KeyError on 'data'.
        res = requests.get(url, headers=headers, params=params, timeout=10)
        res.raise_for_status()
        res_json = res.json()
        for article in res_json['data']:
            # Column order must match `header`: title, excerpt, link.
            writer.writerow([article['title'], article['excerpt'], article['url']])
        # The API flags the last page with paging.is_end.
        if res_json['paging']['is_end']:
            break
        x += 1

相关推荐