qyf 2019-12-07
Crawler: requests, BeautifulSoup
Word cloud: wordcloud, jieba
Code with comments:
# -*- coding: utf-8 -*-
import requests
import wordcloud
import jieba
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

if __name__ == "__main__":
    yun = ""
    n = 0  # danmaku counter
    # Bilibili danmaku (comment) XML for a given oid
    target = 'https://api.bilibili.com/x/v1/dm/list.so?oid=132084205'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}  # pretend to be a browser
    req = requests.get(url=target, headers=headers)
    # requests guesses ISO 8859-1 for this response; re-encoding recovers the raw
    # bytes so BeautifulSoup can detect the real UTF-8 encoding
    html = req.text.encode('ISO 8859-1')
    bf = BeautifulSoup(html, "html.parser")
    texts = bf.find('i')             # <i> is the root element of the danmaku XML
    texts_div = texts.find_all('d')  # each <d> element holds one danmaku
    for item in texts_div:
        n = n + 1
        yun += str(item.text)
    yun = yun.replace(" ", "")
    # strip meaningless filler characters from the danmaku text
    yun = yun.replace("哈", "")
    yun = yun.replace("啊", "")
    yun = yun.replace("一", "")
    # segment with jieba first; wordcloud cannot split Chinese text on its own,
    # so join the tokens with spaces before generating the cloud
    cut_text = " ".join(jieba.cut(yun))
    wc = wordcloud.WordCloud(
        # set a Chinese font, otherwise characters render as empty boxes;
        # this is the usual Windows font path and can be replaced with another font
        font_path="C:/Windows/Fonts/simfang.ttf",
        # background colour and canvas size
        background_color="white", width=1000, height=880).generate(cut_text)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    print("Done!")
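Instead of deleting filler characters one by one with str.replace, the filtering can also be done through WordCloud's stopwords parameter. Below is a minimal sketch of that variant; the danmaku_text sample, the stop_words set and the output filename are only illustrative placeholders, not taken from the script above.

# Sketch: drop filler danmaku words via WordCloud's stopwords set
import wordcloud
import jieba

danmaku_text = "哈哈哈 这个up主真厉害 啊啊啊 前方高能"  # placeholder input
cut_text = " ".join(jieba.cut(danmaku_text))          # same jieba segmentation as above
stop_words = {"哈", "啊", "一", "哈哈哈", "啊啊啊"}      # example stopword set
wc = wordcloud.WordCloud(
    font_path="C:/Windows/Fonts/simfang.ttf",         # same font assumption as above
    background_color="white", width=1000, height=880,
    stopwords=stop_words).generate(cut_text)
wc.to_file("danmaku_wordcloud.png")                   # save to a file instead of plt.show()

Saving with wc.to_file is also convenient when the script runs without a display.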
Result screenshot: