视频重点
练习代码
扩展
1. 视频重点
- 操作前先与客户端(本地的mongodb)进行连接(激活); 相当于给客服打通电话
client = pymongo.MongoClient('localhost', 27017)
创建数据库并命名 = 给excel文件命名;告诉客服要订一个酒店:hotel = client['hotel']
给数据库的文件创建表单(页) = 给Excel创建sheet;并且订具体的房间:room = hotel['room']
房间是存放东西的,而酒店可以有很多房间,客服又可以联系到很多酒店,但每次都要联系客服。 -
enumerate:会将数组或列表组成一个索引序列。
- 添加数据:room.insert_one()
- 查找:room.find() $lt/$gt/$lte/$gte/$ne分别是<,>,<=,>=,不等于
2.练习代码
from bs4 import BeautifulSoup
import requests
import time
import pymongo
# Client for the MongoDB server at localhost:27017.
client = pymongo.MongoClient('localhost', 27017)
# Database 'dz' and its collection 'dzinfo'; each_info() inserts one document
# per scraped record into 'dzinfo'.
dz = client['dz']
dzinfo = dz['dzinfo']
# HTTP request headers (User-Agent and Cookie) passed to requests.get() in
# each_href().
# NOTE(review): the Cookie value is a captured session string and is
# presumably stale by now - confirm whether it is still needed.
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
"Cookie": "abtest_ABTest4SearchDate=b; xzuuid=d8302b2c; OZ_1U_2282=vid=v742d196332c56.0&ctime=1464069891<ime=1464069890; OZ_1Y_2282=erefer=-&eurl=http%3A//bj.xiaozhu.com/search-duanzufang-p1-0%3Ft%3D1463996817649&etime=1463996822&ctime=1464069891<ime=1464069890&compid=2282; _ga=GA1.2.679671644.1463996823; _gat_UA-33763849-7=1; __utmt=1; __utma=29082403.679671644.1463996823.1463996824.1464069893.2; __utmb=29082403.1.10.1464069893; __utmc=29082403; __utmz=29082403.1463996824.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)"
}
# Shared list that each_href() appends the collected hrefs to.
urls = []
# Example of a search-results page URL (same pattern the driver loop formats):
# url = 'http://bj.xiaozhu.com/search-duanzufang-p1-0/'
# Example of a URL of the kind passed to each_info() (presumably a listing
# detail page):
# urla = 'http://bj.xiaozhu.com/fangzi/1580034935.html'
def each_href(url):
    """Collect the listing links found on one page into ``urls``.

    Fetches ``url`` using the module-level ``headers``, selects every
    ``a.resule_img_a`` anchor in the response, appends each anchor's
    ``href`` to the module-level ``urls`` list, and finally prints the
    accumulated list together with its length.

    Args:
        url: Address of the page to fetch.

    Returns:
        None. Results accumulate in the module-level ``urls`` list.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            does not complete within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    wb_data = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # NOTE(review): the class name 'resule_img_a' is reproduced as-is;
    # presumably it mirrors the site's own markup - confirm before "fixing".
    for anchor in soup.select('a.resule_img_a'):
        href = anchor.get('href')
        # Skip anchors without an href so ``urls`` never contains None.
        if href:
            urls.append(href)
    print(urls, len(urls))
def each_info(urla):
    """Scrape one page and store the extracted record(s) in ``dzinfo``.

    Fetches ``urla``, selects the title, price, address, room image, host
    image, host gender icon and host name elements, and inserts one
    document per zipped group of matches into the ``dzinfo`` collection.
    ``zip`` stops at the shortest selection, so a page where any selector
    matches nothing stores no document.

    Args:
        urla: Address of the page to scrape.

    Returns:
        None. Records are written to the ``dzinfo`` collection.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            does not complete within the timeout.
    """
    # Send the same headers as each_href() so both requests look alike to the
    # server; the timeout keeps a stalled connection from hanging the crawl.
    wb_data = requests.get(urla, headers=headers, timeout=10)
    # Presumably a politeness delay between consecutive page fetches.
    time.sleep(2)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    titles = soup.select('div.pho_info > h4 > em')
    prices = soup.select('#pricePart > div.day_l > span')
    adds = soup.select('span[class="pr5"]')
    img_rooms = soup.select('#detailImageBox > div.pho_show_r > div > ul > li > img')
    img_hosts = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    ffs = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > span')
    names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')

    for title, price, add, img_room, img_host, ff, name in zip(
            titles, prices, adds, img_rooms, img_hosts, ffs, names):
        # Decide the gender from the icon's CSS class instead of comparing the
        # tag's full serialized HTML, which breaks on any extra attribute,
        # additional class or whitespace difference.
        icon_classes = ff.get('class') or []
        if 'member_girl_ico' in icon_classes:
            gender = "女性"
        else:
            gender = "男性"
        data = {
            'title': title.get_text(),
            'price': price.get_text(),
            # Keep only the first line of the address text.
            'add': add.get_text().split("\n")[0],
            'img_room': img_room.get('data-bigimg'),
            'img_host': img_host.get('src'),
            'ff': gender,
            'name': name.get_text(),
        }
        dzinfo.insert_one(data)
# Ask how many search-result pages to crawl; int() raises ValueError when the
# answer is not a whole number.
page_total = int(input("想爬取多少页的信息?"))

# Step 1: gather the hrefs from every requested search-results page.
for page_no in range(1, page_total + 1):
    each_href('http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page_no))
print(urls, len(urls))

# Step 2: scrape each collected link and store its record in MongoDB.
for urla in urls:
    each_info(urla)

# Step 3: print every stored record priced at 500 or more. The price is
# stored as text (see each_info), so it is converted before comparing.
for info in dzinfo.find():
    if not int(info['price']) < 500:
        print(info)
运行结果
注:在pycharm中的pymongo插件中查看爬到的结果,最多看到0~299,共300条信息