作业要求:
CODE:
from bs4 import BeautifulSoup
import requests
import time,random
# Request headers sent with every HTTP call in this script; the User-Agent
# value is a desktop Chrome 52 identification string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/52.0.2743.116 Safari/537.36'
    ),
}
def main():
    """Walk the yl.58.com ``shouji`` listing pages and process every item.

    For each listing page, every ``a.t`` anchor is collected and its
    ``href`` is handed to ``get_detail``.  Pages that cannot be fetched
    (network error, timeout, or a non-200 status) are skipped so that one
    bad page does not abort the run.
    """
    # range(1, 2) yields only page 1; widen the range to crawl more pages.
    for page in range(1, 2):
        url = 'http://yl.58.com/shouji/pn{}/?PGTID=0d300024-0173-64b4-17a6-4832c180f4aa&ClickID=2'.format(page)
        try:
            # Without a timeout requests waits indefinitely on a stalled
            # connection, which would hang the whole crawl.
            r = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as err:
            print('skip listing page {}: {}'.format(url, err))
            continue
        if r.status_code != 200:
            continue
        soup = BeautifulSoup(r.text, 'lxml')
        for link in soup.select('a.t'):
            detail_url = link.get('href')
            # Tag.get returns None when the attribute is absent; there is
            # nothing to fetch for such an anchor.
            if not detail_url:
                continue
            get_detail(detail_url)
def get_detail(url):
    """Fetch one item detail page and print its fields as a dict.

    The page is parsed for title, price, owning category, area, quality
    tags and view count; one dict is printed per complete set of matches.
    Nothing is printed when the page cannot be fetched or when any of the
    selectors matches nothing (``zip`` stops at the shortest list).

    Args:
        url: Address of the detail page to download.

    Returns:
        None. Results are written to standard output.
    """
    try:
        # A timeout keeps a single stalled page from blocking the crawl.
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as err:
        print('skip {}: {}'.format(url, err))
        return
    finally:
        # Pause 1-3 seconds after every request, successful or not, so the
        # crawl stays throttled.
        time.sleep(random.randint(1, 3))

    # Same policy as the listing loop in main(): ignore non-200 responses
    # rather than parsing an error page.
    if r.status_code != 200:
        return

    soup = BeautifulSoup(r.text, 'lxml')
    titles = soup.select('h1.info_titile')
    prices = soup.select('span.price_now > i')
    belong_tos = soup.select('span.crb_i > a ')
    areas = soup.select('div.palce_li > span > i')
    qualities = soup.select('div.quality')
    look_times = soup.select('span.look_time')

    for title, price, belong_to, area, quality, look_time in zip(
            titles, prices, belong_tos, areas, qualities, look_times):
        record = {
            'title': title.get_text(),
            'price': price.get_text(),
            'belongs_to': belong_to.get_text(),
            # NOTE(review): 'are' looks like a typo for 'area'; kept as is
            # because it is part of the emitted output.
            'are': area.get_text(),
            # The quality block holds several tags separated by newlines;
            # replace the newlines with spaces to keep it on one line.
            'quality': quality.get_text().replace('\n', ' '),
            'look_time': look_time.get_text(),
        }
        print(record)
# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
结果:
C:\Users\XXZX-0\AppData\Local\Programs\Python\Python35-32\python.exe D:/test/58.py
{'title': '苹果6plus. iPhone6plus手机,屏幕有一处小角破了点, 16g价格最低3000不可商议!', 'quality': ' WiFi/蓝牙正常 无拆无修 无锁机 屏幕正常 无进水 ', 'are': '榆林-佳县', 'look_time': '638次浏览', 'price': '3000', 'belongs_to': '榆林二手市场'}
{'title': '三星A7100 2016版 低价处理了', 'quality': ' WiFi/蓝牙正常 无拆无修 无锁机 屏幕正常 无进水 ', 'are': '榆林-绥德', 'look_time': '258次浏览', 'price': '1700', 'belongs_to': '榆林二手市场'}
.......
Process finished with exit code 0