import requests
import time
from lxml import etree
import xlwt
# Default value for get_info's ``headers`` parameter: presents the client as
# a desktop Chrome browser. The key must be spelled exactly 'User-Agent';
# a misspelled key is just an unknown header and leaves the library's
# default User-Agent in place.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
}

# Shared accumulator: get_info appends one [id, content, laugh, comment]
# list per scraped post, and the __main__ block writes them to the sheet.
all_info_lists = []
def get_info(url, headers=headers):
    """Scrape one listing page and append its posts to ``all_info_lists``.

    Each post found under ``//div[@class="col1"]/div`` is stored as a list
    ``[id, content, laugh, comment]`` where ``id`` is the author name text,
    ``content`` is the stripped text of the post body, and ``laugh`` /
    ``comment`` are the counter strings as they appear in the page.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request (defaults to the
            module-level ``headers`` mapping).

    Returns:
        None. Results are accumulated in the module-level
        ``all_info_lists``.

    Raises:
        requests.RequestException: if the download fails or times out.
    """
    # Actually send the headers (the parameter used to be ignored) and bound
    # the wait so an unresponsive server cannot hang the script forever.
    res = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(res.text)
    if html is None:
        # NOTE(review): lxml appears to return None for an empty document;
        # treat that as "no posts on this page".
        return

    for info in html.xpath('//div[@class="col1"]/div'):
        # The author name sits in a link (a[2]); when that lookup finds
        # nothing, fall back to the plain span[2] location.
        author = (info.xpath('div[1]/a[2]/h2/text()')
                  or info.xpath('div[1]/span[2]/h2/text()'))
        body = info.xpath('a[1]/div/span[1]')
        laugh = info.xpath('div[2]/span[1]/i/text()')
        comment = info.xpath('div[2]/span[2]/a/i/text()')

        # Skip children that are not complete posts instead of letting an
        # unguarded [0] raise IndexError and abort the whole page.
        if not (author and body and laugh and comment):
            continue

        content = body[0].xpath('string(.)').strip()
        all_info_lists.append([author[0], content, laugh[0], comment[0]])
if __name__ == '__main__':
    # Script entry point: scrape listing pages 1-13, then dump every
    # collected post into a single-sheet .xls workbook.
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('sheet1')

    # Row 0 holds the column titles.
    header = ['id', 'content', 'laugh', 'comment']
    for col, title in enumerate(header):
        sheet.write(0, col, title)

    urls = ['https://www.qiushibaike.com/text/page/{}/'.format(str(i)) for i in range(1, 14)]
    for url in urls:
        get_info(url)
        # Pause between requests to avoid hammering the site.
        time.sleep(2)

    # Data rows start at row 1, directly below the header. enumerate()
    # replaces the hand-maintained counters and avoids rebinding the
    # builtin name ``list`` at module scope.
    for row, record in enumerate(all_info_lists, start=1):
        for col, value in enumerate(record):
            sheet.write(row, col, value)

    book.save('C:/Users/madin/Desktop/test.xls')