Python 抓取百度百科结构化信息
import pymysql
import re
import requests
from lxml import html
import xlwt,xlrd
def baidubaike(name):
    """Fetch the "basic info" box of a Baidu Baike entry as a dict.

    Requests ``https://baike.baidu.com/item/<name>``, strips a few inline
    tags so values collapse into plain text, then pairs the text nodes of
    ``dt[@class="basicInfo-item name"]`` with those of
    ``dd[@class="basicInfo-item value"]`` in document order.

    Args:
        name: Entry title; it is passed through ``str()`` and appended to
            the base URL.

    Returns:
        dict mapping label text to value text (non-breaking spaces removed).
        An empty dict is returned when the number of labels and values
        differ, so callers can always treat the result as a mapping.

    Raises:
        Whatever ``requests.get`` raises on connection failure or timeout,
        and ``UnicodeDecodeError`` if the body is not valid UTF-8.

    NOTE(review): the selectors target the page layout this snippet was
    written for; the notes at the end of this file list different class
    names as of 2024-01-09 -- confirm against the live page.
    """
    baseurl = 'https://baike.baidu.com/item/'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    url = baseurl + str(name)
    # A timeout keeps one unresponsive request from stalling a batch run.
    response = requests.get(url, headers=headers, timeout=10)
    print(response.status_code)
    print(response.url)

    text = response.content.decode("utf-8").replace('\n', '')
    # Remove inline wrappers so each value becomes contiguous text; closing
    # tags of <em>/<sup> and <br/> are turned into the list separator.
    # The opening-tag patterns require whitespace, '/' or '>' right after the
    # tag name so that e.g. <abbr>, <article> or <embed> are left intact.
    text = text.replace('</a>', '')
    text = re.sub(r'<a(?=[\s/>])[^>]*>', '', text)
    text = text.replace('<br/>', '、')
    text = re.sub(r'<em(?=[\s/>])[^>]*>', '', text)
    text = text.replace('</em>', '、')
    text = re.sub(r'<sup(?=[\s/>])[^>]*>', '', text)
    text = text.replace('</sup>', '、')
    text = text.replace('<i>', '')
    text = text.replace('</i>', '')

    tree = html.fromstring(text)
    labels = [
        node.replace('\xa0', '')
        for node in tree.xpath('//dt[@class="basicInfo-item name"]/text()')
    ]
    values = [
        node.replace('\xa0', '')
        for node in tree.xpath('//dd[@class="basicInfo-item value"]/text()')
    ]

    if len(labels) != len(values):
        # Labels and values cannot be paired reliably; report and return an
        # empty mapping instead of leaving the caller without a dict.
        print(name, "出现了一个错误")
        return {}
    return dict(zip(labels, values))
# Manual smoke check: look up a single entry and show the parsed mapping.
aa = baidubaike("刘诗诗")
print(aa)
Python 从 Excel 读取数据,并将抓取到的数据存入 Excel
import pymysql
import re
import requests
from lxml import html
import xlwt,xlrd
def baidubaike(name):
    """Fetch the "basic info" box of a Baidu Baike entry as a dict.

    Requests ``https://baike.baidu.com/item/<name>``, strips a few inline
    tags so values collapse into plain text, then pairs the text nodes of
    ``dt[@class="basicInfo-item name"]`` with those of
    ``dd[@class="basicInfo-item value"]`` in document order.

    Args:
        name: Entry title; it is passed through ``str()`` and appended to
            the base URL.

    Returns:
        dict mapping label text to value text (non-breaking spaces removed).
        An empty dict is returned when the number of labels and values
        differ, so callers can always treat the result as a mapping.

    Raises:
        Whatever ``requests.get`` raises on connection failure or timeout,
        and ``UnicodeDecodeError`` if the body is not valid UTF-8.

    NOTE(review): the selectors target the page layout this snippet was
    written for; the notes at the end of this file list different class
    names as of 2024-01-09 -- confirm against the live page.
    """
    baseurl = 'https://baike.baidu.com/item/'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    url = baseurl + str(name)
    # A timeout keeps one unresponsive request from stalling a batch run.
    response = requests.get(url, headers=headers, timeout=10)
    print(response.status_code)
    print(response.url)

    text = response.content.decode("utf-8").replace('\n', '')
    # Remove inline wrappers so each value becomes contiguous text; closing
    # tags of <em>/<sup> and <br/> are turned into the list separator.
    # The opening-tag patterns require whitespace, '/' or '>' right after the
    # tag name so that e.g. <abbr>, <article> or <embed> are left intact.
    text = text.replace('</a>', '')
    text = re.sub(r'<a(?=[\s/>])[^>]*>', '', text)
    text = text.replace('<br/>', '、')
    text = re.sub(r'<em(?=[\s/>])[^>]*>', '', text)
    text = text.replace('</em>', '、')
    text = re.sub(r'<sup(?=[\s/>])[^>]*>', '', text)
    text = text.replace('</sup>', '、')
    text = text.replace('<i>', '')
    text = text.replace('</i>', '')

    tree = html.fromstring(text)
    labels = [
        node.replace('\xa0', '')
        for node in tree.xpath('//dt[@class="basicInfo-item name"]/text()')
    ]
    values = [
        node.replace('\xa0', '')
        for node in tree.xpath('//dd[@class="basicInfo-item value"]/text()')
    ]

    if len(labels) != len(values):
        # Labels and values cannot be paired reliably; report and return an
        # empty mapping instead of leaving the caller without a dict.
        print(name, "出现了一个错误")
        return {}
    return dict(zip(labels, values))
"""
读取excel表格
"""
# Read the entry names from the first column of the first sheet.
readbook = xlrd.open_workbook('C:\\Users\\root\\Desktop\\6.xls')
sheet = readbook.sheet_by_index(0)
data = sheet.col_values(0)

# Look every name up exactly once and keep the result, so the header pass
# and the write pass below use the same data (and the site is not queried
# twice per name). `or {}` tolerates a lookup that yields no mapping.
records = []
for name in data:
    print(name)
    record = baidubaike(name) or {}
    print(record.keys())
    records.append(record)

# Column headers: every label seen in any record, in first-seen order, so
# the column layout is the same on every run.
headlist = list(dict.fromkeys(label for record in records for label in record))
print(headlist)
# Label -> column index, avoiding a linear search for every cell.
column_of = {label: col for col, label in enumerate(headlist)}

# Step 1: create the output workbook.
workbook = xlwt.Workbook(encoding='utf-8')
# Step 2: create a worksheet.
worksheet = workbook.add_sheet('My Worksheet')
# Step 3: write cells; arguments are (row, column, value).
for col, label in enumerate(headlist):
    worksheet.write(0, col, label)
# Row 0 holds the headers, so the record for data[i] goes to row i + 1.
for row, record in enumerate(records, start=1):
    for label, value in record.items():
        worksheet.write(row, column_of[label], value)
# Step 4: save -- nothing reaches disk until this call.
workbook.save('C:\\Users\\root\\Desktop\\ls.xls')
2024年1月9日更新:百度百科网页结构变更,解析代码相应调整如下
# Lookups using the earlier class names; both results are reassigned below.
result0 = tree.xpath('//dt[@class="basicInfo-item name"]/text()')
result00 = tree.xpath('//dd[@class="basicInfo-item value"]/text()')

# Labels, selected with the newer class names.
result0 = tree.xpath('//dt[@class="basicInfoItem_h36i9 itemName_S6BA4"]/text()')

# Values: for each <dd>, fold all descendant text into the element's own
# leading text, then detach its direct <span> children before the text
# nodes are collected.
value_nodes = tree.xpath('//dd[@class="basicInfoItem_h36i9 itemValue_UM0sz"]')
for value_node in value_nodes:
    pieces = value_node.xpath(".//text()")
    value_node.text = "".join(pieces)
    for child_span in value_node.xpath("./span"):
        value_node.remove(child_span)

result00 = tree.xpath('//dd[@class="basicInfoItem_h36i9 itemValue_UM0sz"]/text()')