import pandas as pd
import requests
from fake_useragent import UserAgent
import random
from lxml import etree
'''Parse web page data.'''
def parse_html(url):
    """Fetch *url* and return the first table under ``#content`` as an HTML string.

    A random browser User-Agent is sent with each request to avoid trivial
    bot blocking.

    Args:
        url: Page URL to download.

    Returns:
        The serialized ``<table>`` element as a str, or ``None`` when the
        request fails, the status is not 200, or no table is found.
    """
    ua = UserAgent()
    headers = {
        # Random vendor User-Agent so requests look like a real browser.
        'User-Agent': ua.random,
        'Referer': 'https://www.dxsbb.com'
    }
    try:
        # timeout prevents the script from hanging forever on a dead host.
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Report the failure instead of silently swallowing every exception.
        print(f"Request failed: {exc}")
        return None

    # Decode the body using the encoding sniffed from the content itself.
    resp.encoding = resp.apparent_encoding
    if resp.status_code != 200:
        print("出现问题")
        return None

    tree = etree.HTML(resp.text)
    # Locate the data table inside the #content container.
    tables = tree.xpath('//*[@id="content"]/table')
    if not tables:
        # Guard against IndexError when the page layout changes.
        return None
    # Serialize the element back to an HTML string (bytes -> str).
    return etree.tostring(tables[0], encoding='utf8').decode()
def main():
    """Download the university ranking table and save it as a CSV file.

    Fetches the page, extracts the table HTML via ``parse_html``, parses it
    with pandas, and writes the result to ``211_university.csv``.
    """
    from io import StringIO

    url = 'https://www.dxsbb.com/news/50354.html'
    tb = parse_html(url)
    if tb is None:
        # Nothing to parse — the request failed or the table was not found.
        print("出现问题")
        return

    # Wrap the literal HTML in StringIO: passing a raw HTML string to
    # read_html is deprecated in pandas >= 2.1.
    df = pd.read_html(StringIO(tb), encoding='utf-8', header=0)[0]

    # List-of-dicts view of the rows, equivalent to the original
    # list(df.T.to_dict().values()) but idiomatic.
    result = df.to_dict(orient='records')
    print("result:", result)

    # Persist the table as CSV without the pandas index column.
    df.to_csv('211_university.csv', index=False)
main()