"""
爬取异步社区的 Python 图书信息。

抓取以 "python" 为关键词的图书搜索结果的第一页。
这次的爬取方式比较稳妥:先把抓取到的异步社区页面 HTML 保存到本地,这样一方面加快了测试速度,另一方面也不会因为请求过于频繁而给对方的服务器造成负担。
所谓"盗亦有道",大概如是。
"""
import requests
from bs4 import BeautifulSoup
import os, re
import xlwt
def getToThetxt(url):
    """Download *url* and cache its prettified HTML as ``book.txt``.

    The cache file is created in the current working directory and is always
    written as UTF-8, so the result does not depend on the platform's default
    text encoding.

    Args:
        url: Address of the page to fetch.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(url, timeout=30)
    # Refuse to cache an error page as if it were a real result page.
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    # Writing UTF-8 explicitly makes the old stripping of '\u0142' and '\xa9'
    # unnecessary: every character of the page can be stored as-is.
    with open(os.path.join(os.getcwd(), 'book.txt'), 'w', encoding='utf-8') as f:
        f.write(soup.prettify())


def getfromtxt():
    """Return the cached HTML stored in ``book.txt`` in the working directory.

    The file is decoded as UTF-8, matching what ``getToThetxt`` writes.
    NOTE: a cache file produced earlier with a different encoding should be
    deleted so that it is downloaded again.

    Raises:
        FileNotFoundError: If the cache file does not exist.
    """
    with open(os.path.join(os.getcwd(), 'book.txt'), 'r', encoding='utf-8') as f:
        return f.read()
def _clean(text):
    """Return *text* without spaces and newlines; ``None`` becomes ``''``."""
    if text is None:
        return ''
    return text.replace(' ', '').replace('\n', '')


def _class_text(parent, css_class):
    """Return the cleaned text of the first node in *parent* with *css_class*."""
    node = parent.find(attrs={'class': css_class})
    return '' if node is None else _clean(node.text)


def _book_title(info_div):
    """Return the cleaned string of the second child of the ``<h3>`` heading."""
    heading = info_div.find('h3')
    if heading is None or len(heading.contents) < 2:
        return ''
    # Presumably contents[0] is leading whitespace and contents[1] the title
    # element -- verify against the page markup.
    return _clean(heading.contents[1].string)


def _list_price(price_div):
    """Return the cleaned text found by descending ``li`` -> ``em`` -> ``del``."""
    node = price_div
    for tag_name in ('li', 'em', 'del'):
        node = node.find(tag_name)
        if node is None:
            return ''
    return _clean(node.text)


def getBookMeg(html):
    """Extract book details from search-result HTML and save them to ``book.xls``.

    For every ``<li class="block-item bookList__item">`` below the element
    with id ``search-result``, the title, author, translator, summary and
    struck-through (``<del>``) price are collected and written to one
    worksheet, one book per row under a header row. A field whose element is
    missing is written as an empty string instead of aborting the export.

    Args:
        html: Markup of the search-result page.

    Raises:
        ValueError: If the ``search-result`` container or its ``div``/``ul``
            list is not present in *html*.
        IndexError: If a book item has fewer than three ``<div>`` descendants.
    """
    soup = BeautifulSoup(html, 'lxml')
    search = soup.find(attrs={'id': 'search-result'})
    result_list = None
    if search is not None and search.div is not None:
        result_list = search.div.ul
    if result_list is None:
        raise ValueError("search-result list not found in the supplied HTML")
    books = result_list.find_all('li', attrs={'class': 'block-item bookList__item'})

    # TODO: cover images (the <img> in the first div) are not exported yet;
    # only the text data is handled for now.
    rows = []
    for book in books:
        divs = book.find_all('div')
        info_div, price_div = divs[1], divs[2]
        rows.append((
            _book_title(info_div),
            _class_text(info_div, 'bookList__author'),
            _class_text(info_div, 'bookList__translator'),
            _class_text(info_div, 'bookList__summary'),
            _list_price(price_div),
        ))

    # The first positional parameter of xlwt.Workbook is the text encoding,
    # not an output location, so it is given explicitly here; the target file
    # is chosen by save() below.
    work_book = xlwt.Workbook(encoding='utf-8')
    sheet = work_book.add_sheet('sheet1')
    for column, title in enumerate(("书名", "作者", "译者", "大纲", "价格")):
        sheet.write(0, column, title)
    # Row 0 holds the headers, so the data starts at row 1.
    for row_index, row in enumerate(rows, start=1):
        for column, value in enumerate(row):
            sheet.write(row_index, column, value)
    work_book.save("book.xls")
# data
if __name__ == "__main__":
    url = "http://www.epubit.com.cn/search?q=python&type=book"
    # Build the cache path with os.path.join so the existence check refers to
    # the same file on every platform; a hard-coded backslash only names
    # <cwd>/book.txt on Windows, which forces a fresh download elsewhere.
    path = os.path.join(os.getcwd(), 'book.txt')
    # Only hit the network when no cached copy of the page exists yet.
    if not os.path.exists(path):
        getToThetxt(url)
    getBookMeg(getfromtxt())