我选用了综合排名第一的悦花来获取评论
链接:https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.1b1d2b9c7CiL7D&id=560453600352&skuId=3669849810834
因为天猫的评论只能抓取99页,所以一共有1980条数据,删除无效数据后有1700条左右。
数据量比较少,分析结果可能不准确,仅供参考
天猫评论爬虫代码:
# coding: utf-8
import requests
import json
import csv
import time
import re
import random
# Browser-like User-Agent sent with every request.
headers = {
    "user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
# Review rows accumulated across all gettbpl() calls, each row being
# [date, content, sku, nickname, id].
data = []


def _parse_rate_list(text):
    """Return the review rows found in the raw response text of one page.

    Raises AttributeError when the ``rateList`` payload is not present,
    ValueError when it is not valid JSON, and KeyError/TypeError when an
    entry lacks one of the expected fields.
    """
    raw = re.search(r'rateList":(.*),"searchinfo"', text).group(1)
    rows = []
    for item in json.loads(raw.strip().strip('()')):
        # Round-tripping through GBK with errors ignored drops every
        # character GBK cannot represent (presumably so the text can be
        # written on a GBK-locale system -- TODO confirm).
        content = item['rateContent'].encode("gbk", "ignore").decode("gbk", "ignore")
        rows.append([
            item['rateDate'],
            content,
            item['auctionSku'],
            item['displayUserNick'],
            item['id'],
        ])
    return rows


def _write_csv(rows, filename='flowerplus.csv'):
    """Overwrite *filename* with a header line followed by *rows*."""
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['date', 'content', 'sku', 'nickname', 'id'])
        writer.writerows(rows)


def gettbpl(page):
    """Fetch one page of reviews, append them to ``data`` and rewrite the CSV.

    Args:
        page: 1-based page number of the review list to download.

    A page that cannot be downloaded or parsed is reported on stdout and
    skipped; rows collected from earlier pages are kept.  The CSV file is
    rewritten in full on every call, so it always holds everything
    collected so far.
    """
    # The query string must use "&currentPage="; the original text had it
    # mangled into the currency sign, so the page number was never sent.
    url = ('https://rate.tmall.com/list_detail_rate.htm'
           '?itemId=560453600352&sellerId=3077671836&currentPage=%d' % page)
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10)
        data.extend(_parse_rate_list(response.text))
    except (requests.RequestException, AttributeError, ValueError,
            KeyError, TypeError):
        # Only network and parsing failures are handled here, so that
        # Ctrl-C (KeyboardInterrupt) still stops the script.
        print("第%d页无法获取!" % page)
    _write_csv(data)
if __name__ == '__main__':
    # Review pages are numbered 1 through 99 (99 pages in total).
    for page_no in range(1, 100):
        gettbpl(page_no)
        # Wait a random 0-3 seconds before requesting the next page.
        time.sleep(random.uniform(0, 3))
用snownlp进行情感分析并绘制情感分布直方图(以0.6为界限,0.6分以上认为是积极的,0.6分以下是负面的)
from snownlp import SnowNLP
from openpyxl import load_workbook
import matplotlib.pyplot as plt
import numpy as np
# Score every review in column B with SnowNLP, store the score in column 7
# (G) under the header "sentiscore", save the workbook, then plot the
# distribution of all scores.
path = "D:/anaconda/shirleylearn/flowerplus/flowerplus_tbpl.xlsx"
myexcel = load_workbook(path)
mysheet = myexcel["Sheet1"]
# max_row is the openpyxl counterpart of xlrd's table.nrows.
n = mysheet.max_row
scores = []
mysheet.cell(row=1, column=7, value="sentiscore")
# Row 1 is the header; data rows run from 2 through max_row inclusive.
for j in range(2, n + 1):
    comment = mysheet.cell(row=j, column=2).value
    if comment is None:
        # Empty cells come back as None and cannot be scored.
        continue
    # Cells may hold non-string values (e.g. numbers); SnowNLP needs text.
    text = str(comment).strip()
    if not text:
        continue
    sentiscore = SnowNLP(text).sentiments
    mysheet.cell(row=j, column=7, value=sentiscore)
    scores.append(sentiscore)
myexcel.save(path)
# 100 equal-width bins over [0, 1]; linspace gives exact end points where a
# float-step arange may not.
plt.hist(scores, bins=np.linspace(0, 1, 101))
plt.show()
情感分布如图所示,可以看出评分为0附近(极差评)的数量还是很多的,从0.6这个界限看,好评稍稍多于差评,但是总体来看评价并不高,有点出乎意料。
我对评论进行了词频统计(代码参照之前的文章),发现排名在前四的几种花分别为玫瑰、康乃馨、睡莲、百合,所以又分别对出现这四种花的评论进行统计绘制直方图。
from openpyxl import load_workbook
import matplotlib.pyplot as plt
import numpy as np
# For each of the four most-mentioned flowers, collect the sentiment scores
# of the reviews that mention it and draw one histogram per flower.
path = "D:/anaconda/shirleylearn/flowerplus/flowerplus_tbpl.xlsx"
myexcel = load_workbook(path)
mysheet = myexcel["Sheet1"]
# max_row is the openpyxl counterpart of xlrd's table.nrows.
n = mysheet.max_row

# The scores were written to the sheet by an earlier step.  Locate their
# column through the "sentiscore" header instead of a hard-coded letter;
# fall back to column F (6), which this script read originally, when no such
# header exists.
score_col = 6
for col in range(1, mysheet.max_column + 1):
    if mysheet.cell(row=1, column=col).value == "sentiscore":
        score_col = col
        break

rose = []
carnation = []
waterlily = []
lily = []
# Keyword searched for in the review text -> list receiving the scores.
flower_scores = {
    "玫瑰": rose,
    "康乃馨": carnation,
    "睡莲": waterlily,
    "百合": lily,
}

# Row 1 is the header; data rows run from 2 through max_row inclusive.
for j in range(2, n + 1):
    comment = mysheet.cell(row=j, column=2).value
    score = mysheet.cell(row=j, column=score_col).value
    # Skip rows without review text or without a numeric score; a None
    # score would make plt.hist fail.
    if not comment or not isinstance(score, (int, float)):
        continue
    text = str(comment)
    for keyword, bucket in flower_scores.items():
        if keyword in text:
            bucket.append(score)

# Five equal-width bins over [0, 1]; one figure per flower, shown in the
# order rose, carnation, water lily, lily.
bins = np.linspace(0, 1, 6)
for bucket in (rose, carnation, waterlily, lily):
    plt.hist(bucket, bins=bins)
    plt.show()
分布与总体类似,都呈现凹字形,因为样本太少,不做具体分析
总结:作为花花爱好者,曾对这种包月鲜花非常感兴趣,除了花加还有花点时间等等。因为不知道会收到什么样的鲜花,所以收到花时既有惊喜也有失望,个人感觉失望的情况比较多,比如花花不是自己喜欢的品种,花花的状态不好,花的数量少等各种情况。我认为这是包月鲜花这种非标准化商品有如此多负面评价的原因之一。