# Python数据分析结业作业,还有很多待优化的细节,requests还是不太适合大型爬虫,回头慢慢修……
# ---- 短评爬取 ----
import requests
import json
from fake_useragent import UserAgent
import time
import datetime
import pandas as pd
#模拟浏览器
# Spoof a random browser User-Agent so the API does not reject the requests.
headers = {'User-Agent': UserAgent(verify_ssl=False).random}
# Short-review endpoint for media_id 6038, 20 reviews per page, default sort.
comment_api = 'https://api.bilibili.com/pgc/review/short/list?media_id=6038&ps=20&sort=0'
# Fetch the first page of short reviews.
response = requests.get(comment_api, headers=headers)
data_json = response.text
data = json.loads(data_json)
# Cap at ~1000 reviews — crawling the full set risks getting banned.
# alldata_num = data['data']['total']
alldata_num = 1001
# Pre-allocate the result table, one row per expected review.
cols = ['author', 'content', 'ctime', 'disliked', 'likes', 'score', 'progress']
df = pd.DataFrame(index=range(alldata_num), columns=cols)
#获取short comment
# Page through the short-review API until alldata_num rows are collected.
j = 0
while j < alldata_num:
    data_list = data['data']['list']
    # Guard: an empty page means no more data — without this the loop never ends.
    if not data_list:
        break
    for entry in data_list:
        # Don't overrun the pre-allocated frame (pages hold up to 20 entries).
        if j >= alldata_num:
            break
        df.loc[j, 'author'] = entry['author']['uname']
        df.loc[j, 'content'] = entry['content']
        df.loc[j, 'ctime'] = entry['ctime']
        df.loc[j, 'disliked'] = entry['stat']['disliked']
        df.loc[j, 'likes'] = entry['stat']['likes']
        df.loc[j, 'score'] = entry['score']
        # 'progress' is missing from some reviews; leave the cell as NaN then
        # (was a bare except: pass — narrowed to an explicit membership test).
        if 'progress' in entry:
            df.loc[j, 'progress'] = entry['progress']
        j += 1
    # Follow the cursor to the next page.
    next_cursor = data['data']['next']
    url = comment_api + '&cursor=' + str(next_cursor)
    response = requests.get(url=url, headers=headers)
    data_json = response.text
    data = json.loads(data_json)
    # Progress report roughly every 100 rows (page size 20, so j % 100 hits).
    if j % 100 == 0:
        print('————Have finished {}%————'.format(round(j/alldata_num*100,1)))
    # Throttle so the anti-crawler mechanism does not block us.
    time.sleep(0.6)
# Persist so the analysis below can rerun without re-crawling.
df.to_csv('shortcomments.csv',index=False)
# NOTE(review): this fragment runs AFTER the CSV has been written. It pages
# the API forward len(data_list) more times and discards every response —
# presumably a leftover debugging cell; looks safe to delete. TODO confirm.
for i in range(len(data_list)):
    next_cursor = data['data']['next']
    url = comment_api + '&cursor=' + str(next_cursor)
    response = requests.get(url=url, headers=headers)
    data_json = response.text
    data = json.loads(data_json)
# ---- 相关性数据分析 ----
# Reload the crawled short reviews from disk.
df = pd.read_csv('shortcomments.csv')
# Fill missing cells (e.g. absent 'progress') with 0.
data_all = df.fillna(0)
# Convert ctime epoch seconds into per-day timestamps.
def getDate(x):
    """Return the UTC calendar day of Unix timestamp ``x`` as a pandas Timestamp."""
    t = time.gmtime(x)
    # Keep only year/month/day so reviews can be grouped by calendar day.
    return pd.Timestamp(datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
# Map each review's ctime to its calendar day.
data_all['date'] = data_all['ctime'].apply(getDate)
# Stacked-area chart of how many comments landed on each day.
from collections import Counter
import matplotlib.pyplot as plt
cdays = Counter(data_all['date'])
comm_days = pd.DataFrame(cdays.items(), columns=['date', 'comment_num'])
comment_days = comm_days.sort_values('date')
comment_days.plot.area(x='date', y='comment_num', cmap='tab10_r')
# How many whole days ago a review was posted.
def Days(x):
    """Return the number of whole days between Unix timestamp ``x`` and now.

    Both sides are evaluated in UTC. The original mixed ``time.gmtime``
    (UTC fields) with ``datetime.now()`` (local clock), which skewed every
    age by the local UTC offset and could shift the day count by one.
    """
    t = time.gmtime(x)
    posted = datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday,
                               t.tm_hour, t.tm_min, t.tm_sec)
    return (datetime.datetime.utcnow() - posted).days
# Age of each comment in days.
data_all['days'] = data_all['ctime'].apply(Days)
# Character length of each comment.
data_all['comm_len'] = data_all['content'].apply(len)
# Correlation exploration: score vs. comment age, as a scatter plot.
plt.scatter(data_all['score'], data_all['days'], color='blue')
# Multivariate OLS: do likes depend on comment length, score, and age?
import numpy as np
import statsmodels.api as sm
X = pd.DataFrame(data_all.loc[:, ['comm_len', 'score', 'days']], columns=['comm_len', 'score', 'days'])
y = pd.DataFrame(data_all['likes'], columns=['likes'])
X_add1 = sm.add_constant(X)
model = sm.OLS(y, X_add1).fit()
print(model.summary())
# Only the age regressor is significant — drop the rest and refit on 'days'.
X.drop(columns=['score', 'comm_len'], inplace=True)
X_add1 = sm.add_constant(X)
model = sm.OLS(y, X_add1).fit()
print(model.summary())
print(model.params)  # regression coefficients
# Fit is weak (low R²): post date correlates with likes but explains little.
# Predict likes for a comment posted 6 days ago (leading 1 = the constant term).
X_test = np.array([1, 6])
print(model.predict(X_test))
# Prediction comes out negative — clearly off; more features (e.g. the
# author's follower count) would be needed for a usable model.
#点赞数为负,显然不对,后期可以考虑挖掘更多数据进行模拟。比如发表用户的粉丝数等
# ---- 绘制评论中高频词词云 ----
#绘制短评内容的热点词云
import jieba
import jieba.posseg as pseg
from wordcloud import WordCloud
from PIL import Image
# Segment every comment with jieba and count word frequencies.
# Stop-word list from the Baidu stop-word collection.
with open('stopwords_baidu.txt', encoding='utf-8') as f:
    stop_words = set(line.strip() for line in f)
# Teach jieba the show-specific proper nouns ONCE, before segmenting
# (the original re-registered them on every row of the loop).
jieba.suggest_freq('风灵玉秀', True)
jieba.suggest_freq('铃儿', True)
jieba.suggest_freq('钰袖', True)
# Collect non-stop-word tokens. BUG FIX: the original did
# `outstr += ' '.join(word)` and then `Counter(outstr)`, which split each
# word into characters and counted CHARACTER frequencies (spaces included);
# the word cloud therefore showed single characters. Count whole words.
words = []
for text in data_all['content']:
    for word in jieba.cut(str(text)):
        if word not in stop_words and word != '\t':
            words.append(word)
# Word frequencies, sorted most-common first, as a plain dict for WordCloud.
content = dict(Counter(words))
content1 = sorted(content.items(), key=lambda item: item[1], reverse=True)
content2 = dict(content1)
# Render the top-50 words inside a heart-shaped mask image.
mask_image = np.array(Image.open('heart.png'))
Wc = WordCloud(
    font_path='simhei.ttf',       # SimHei so Chinese glyphs render
    background_color='white',
    mask=mask_image,
    colormap='Oranges',
    width=900,
    height=600,
    max_words=50,
)
wordcloud = Wc.generate_from_frequencies(content2)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
# 看起来都是好评~百合、有爱和能打的主题也完美体现了233
# 完结撒花✿✿ヽ(°▽°)ノ✿