Target URL: https://www.qidian.com/all?page=1 (the first 100 pages)
Fields collected: novel title, author ID, genre, completion status, synopsis, word count
import requests
from lxml import etree
import time
import xlwt


def get_info(url):
    # A browser-like User-Agent reduces the chance of being blocked by basic anti-crawler checks
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    r = requests.get(url, headers=headers)
    html = etree.HTML(r.text)
    books = html.xpath('//li[@data-rid]')  # one <li> per novel in the listing
    for book in books:
        name = book.xpath('div/h4/a/text()')[0]         # novel title
        author_id = book.xpath('div/p/a[1]/text()')[0]  # author ID
        type1 = book.xpath('div/p/a[2]/text()')[0]      # main category
        type2 = book.xpath('div/p/a[3]/text()')[0]      # sub-category
        book_type = type1 + '.' + type2
        complete = book.xpath('div/p/span/text()')[0]   # completion status
        summary = book.xpath('div/p[@class="intro"]/text()')[0].strip()
        # summary = book.xpath('div/p[2]/text()')[0].strip()  # equivalent to the line above
        word = book.xpath('div[2]/p[3]/span/text()')[0].strip('万字')  # word count, in units of 10,000 characters
        info_list = [name, author_id, book_type, complete, summary, word]
        info_lists.append(info_list)


if __name__ == '__main__':
    urls = ['https://www.qidian.com/all?page={}'.format(i) for i in range(1, 101)]
    info_lists = []
    for url in urls:
        get_info(url)
        time.sleep(1)  # pause between requests to keep the load on the server low
    head = ['小说名', '作者ID', '小说类型', '完成情况', '摘要', '字数(万字)']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('qidian')
    for x in range(len(head)):
        sheet.write(0, x, head[x])  # write the header row
    i = 1
    for info_list in info_lists:
        j = 0
        for info in info_list:
            sheet.write(i, j, info)  # write one scraped record per row
            j += 1
        i += 1
    book.save('F:/qidian.xls')
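
To check the XPath expressions without requesting the live site, the short sketch below parses a hand-written HTML fragment. The fragment is only an assumption about the listing markup, inferred from the XPaths used in the script rather than copied from the actual Qidian page, so treat it purely as a structural illustration.

from lxml import etree

# Hypothetical listing markup, inferred from the XPath expressions above;
# the real page may differ.
sample = '''
<li data-rid="1">
  <div class="book-img-box"><a href="#"><img/></a></div>
  <div class="book-mid-info">
    <h4><a href="#">示例小说</a></h4>
    <p class="author"><a>某作者</a><a>玄幻</a><a>东方玄幻</a><span>连载</span></p>
    <p class="intro">  这是一段摘要。  </p>
    <p class="update"><span>123.45万字</span></p>
  </div>
</li>
'''

book = etree.HTML(sample).xpath('//li[@data-rid]')[0]
print(book.xpath('div/h4/a/text()')[0])                       # 示例小说
print(book.xpath('div/p/a[1]/text()')[0])                     # 某作者
print(book.xpath('div/p/a[2]/text()')[0] + '.' + book.xpath('div/p/a[3]/text()')[0])  # 玄幻.东方玄幻
print(book.xpath('div/p/span/text()')[0])                     # 连载
print(book.xpath('div/p[@class="intro"]/text()')[0].strip())  # 这是一段摘要。
print(book.xpath('div[2]/p[3]/span/text()')[0].strip('万字'))  # 123.45

If the live page uses a different layout, the positional indices (a[1], p[3], div[2]) are the first things to adjust.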