Goal: scrape the Dangdang book rankings with the Python stack I have been learning recently (Selenium + PyQuery).
Motto: the raise has yet to be won, so the coder must keep striving. Write Less, Do More!
Page analysis
- Target URL: http://bang.dangdang.com/books/bestsellers
- The page offers eight rankings: Book Bestsellers, New Book Bestsellers, Children's Books, Book Clearance, Five-Star Books, Fast-Rising Books, E-book Bestsellers, and New E-book Bestsellers, presented as a tab bar. Clicking a tab refreshes the browser's address bar and reloads the page, so the content is not fetched via Ajax. Paging also reloads the page, and the address-bar parameters change: the last URL parameter is the page number. With that pattern identified, scraping can begin (see the sketch below).
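For illustration, here is a minimal sketch of that pagination pattern, assuming the category URL ends with a single-digit page number (the '...' stands for the rest of the path, which varies per ranking; the helper name is hypothetical):

# Hypothetical helper: build the URL for a given page by dropping the
# trailing page number and appending the one we want (sketch only).
def page_url(cate_url, page_number):
    return cate_url[:-1] + str(page_number)

# e.g. page_url('http://bang.dangdang.com/books/bestsellers/...-1', 3)
#      -> 'http://bang.dangdang.com/books/bestsellers/...-3'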
Code
# Environment: Windows 10, Python 3
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
import json
import time

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 30)
'''
Fetch the ranking categories, yielding each category's name and URL.
'''
def books_category():
    browser.get('http://bang.dangdang.com/books/bestsellers')
    try:
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.bang_title > div.tab > h2'))
        )
        # Strip the xmlns attribute; if it is left in, pyquery cannot
        # select child elements by tag name.
        html = browser.page_source.replace('xmlns="http://www.w3.org/1999/xhtml"', '')
        doc = pq(html)
        cate_list = doc('.bang_title .tab h2').items()
        for cate in cate_list:
            cate_name = cate.find('a').text().strip()
            cate_url = cate.find('a').attr('href').strip()
            yield {
                'cate_name': cate_name,
                'cate_url': cate_url
            }
    except TimeoutException as e:
        print(e)
'''
Fetch the total number of pages for a category.
'''
def get_total_page(url):
    browser.get(url)
    try:
        # The third-from-last pager item holds the last page number.
        total = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.paginating > ul.paging > li:nth-last-child(3) > a'))
        )
        return total[0].text
    except TimeoutException as e:
        print(e)
'''
Fetch one ranking page and hand it off for parsing.
'''
def get_page(url, name, page_number):
    browser.get(url)
    try:
        # Scroll to the bottom of the page.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait until the list items have loaded.
        wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.bang_list_box > .bang_list > li'))
        )
        # Confirm the pager shows the page we asked for.
        wait.until(
            EC.text_to_be_present_in_element((By.CSS_SELECTOR, '.paginating > ul.paging > li > a.current'), str(page_number))
        )
        # Strip the xmlns attribute; otherwise pyquery cannot select
        # child elements by tag name.
        html = browser.page_source.replace('xmlns="http://www.w3.org/1999/xhtml"', '')
        write_to_file(parse_html(html, name))
    except TimeoutException:
        # Simple retry: attempt the same page again on timeout.
        get_page(url, name, page_number)
'''
Extract the useful fields with pyquery.
'''
def parse_html(html, cate_name):
    doc = pq(html)
    lis = doc('.bang_list_box .bang_list li').items()
    for li in lis:
        name = li.find('div.name a').text()
        img = li.find('div.pic a img').attr('src')
        author = li.find('div:nth-child(5) a').text()
        publisher = li.find('div:nth-child(6) a').text()
        original_cost = li.find('div.price p span:nth-child(1)').text()
        discount_price = li.find('div.price p span:nth-child(2)').text()
        discount = li.find('div.price p span:nth-child(3)').text()
        yield {
            'cate_name': cate_name,
            'books_infos': {
                'name': name,
                'img': img,
                'author': author,
                'publisher': publisher,
                'original_cost': original_cost,
                'discount_price': discount_price,
                'discount': discount
            }
        }
'''
Append the scraped records to a file, one JSON object per line.
'''
def write_to_file(jsondata):
    with open('dangdang.json', 'a', encoding='utf-8') as f:
        for x in jsondata:
            f.write(json.dumps(x, ensure_ascii=False) + '\n')
def main():
    cate_list = books_category()
    for cate in cate_list:
        total = int(get_total_page(cate['cate_url']))
        for i in range(1, total + 1):
            print('Crawling %s: %i pages in total, fetching page %i' % (cate['cate_name'], total, i))
            # The category URL ends with the page number; swap it out.
            cate_url = cate['cate_url'][:-1] + str(i)
            get_page(cate_url, cate['cate_name'], i)
            time.sleep(3)
    print('Scraping finished')

if __name__ == '__main__':
    main()
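Because write_to_file emits one JSON object per line, the results can be loaded back into Python with a few lines; a minimal sketch:

# Sketch: load the JSON-lines output back into Python dicts.
import json

with open('dangdang.json', encoding='utf-8') as f:
    records = [json.loads(line) for line in f]
print('%i records loaded' % len(records))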
Running the code
cd D:\test\python\
py .\dangdang.py
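A side note: if you would rather not have a Chrome window pop up during the run, Selenium can start Chrome in headless mode. A minimal sketch, assuming a chromedriver that matches the installed Chrome:

# Sketch: create the browser headless instead of plain webdriver.Chrome().
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)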