Date:2016-9-21
update:2016-9-30
By:Black Crow
前言:
终于进入到网络页面的抓取了。前面一节课静态页面的作业做了之后总是有报错,所以一直没有单独写总结。听课的时候就感觉到内容十分的吸引人,爬取的过程也是特别有意思,后面一节课关于select的条件上是有做优化的,比前一节课更高效。PPT里的地址已失效,所以随便设定条件搜的短租房信息。
作业效果:
看着信息滚动的感觉其实挺爽的
20160921爬取的excel表格:链接: http://pan.baidu.com/s/1nvEVDvN 密码: j4vt
20160922update表格:链接: http://pan.baidu.com/s/1c198fN6 密码: kq4a
20160922update图片:
我的代码:
20160921代码
from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Map a landlord gender-icon CSS class to a readable label.

    Bug fix: the original wrote ``x == 'a' or 'b'``, where the bare string
    'b' is always truthy, so the first branch matched for every input and
    everyone was reported as 'girl'. Use membership tests instead.

    :param gender_lorder: CSS class string taken from the profile icon.
    :return: 'girl', 'boy', or 'unknown gender!' for unrecognized classes.
    """
    if gender_lorder in ('member_girl_ico', 'member_ico1'):
        return 'girl'
    elif gender_lorder in ('member_boy_ico', 'member_ico'):
        return 'boy'
    else:
        return 'unknown gender!'
def info(url):
    """Fetch one listing detail page and print a dict of its fields.

    NOTE(review): ``requests``, ``BeautifulSoup`` and ``gender_change`` come
    from the surrounding script; the CSS selectors match xiaozhu.com's 2016
    page markup.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # One result list per field; zip() pairs them up row-wise below.
    columns = (
        soup.select('div.con_l > div.pho_info > h4 > em'),  # titles
        soup.select('div.pho_info > p'),                    # addresses
        soup.select('img[id="curBigImage"]'),               # house photos
        soup.select('div.day_l > span'),                    # daily fees
        soup.select('div.member_pic > a > img'),            # landlord avatars
        soup.select('div.w_240 > h6 > a'),                  # landlord names
        soup.select('div.w_240 > h6 > span'),               # gender icons
    )
    for title, address, image_house, day_fee, url_lorder, name_lorder, gender_lorder in zip(*columns):
        print({
            'title': title.get_text('em'),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            'gender_lorder': gender_change(str(gender_lorder.get('class')[0])),
            'day_fee': day_fee.get_text(),
        })
        # Be polite to the server between rows.
        time.sleep(0.01)
# Walk search-result pages p0..p14; each page links to listing detail pages
# that info() scrapes one by one.
page_template = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21&endDate=2016-10-01'
for page_number in range(15):
    listing_page = requests.get(page_template.format(page_number))
    listing_soup = BeautifulSoup(listing_page.text, 'lxml')
    # NOTE(review): 'resule_img_a' is the class name used on the site
    # (their typo, kept verbatim).
    for link in listing_soup.select('a[class="resule_img_a"]'):
        info(link.get('href'))
print('Done')
##### 20160922 update代码:修正了性别判断
from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Translate the landlord gender-icon CSS class into 'girl'/'boy'.

    Unrecognized classes fall back to 'unknown gender!'.
    """
    labels = {
        'member_girl_ico': 'girl',
        'member_boy_ico': 'boy',
    }
    return labels.get(gender_lorder, 'unknown gender!')
def info(url):
    """Scrape one listing detail page and print a dict per listing row.

    NOTE(review): relies on ``requests``/``BeautifulSoup`` and the sibling
    ``gender_change`` defined elsewhere in this script.
    """
    detail_html = requests.get(url).text
    detail_soup = BeautifulSoup(detail_html, 'lxml')
    titles = detail_soup.select('div.con_l > div.pho_info > h4 > em')
    addresses = detail_soup.select('div.pho_info > p')
    images_house = detail_soup.select('img[id="curBigImage"]')
    days_fee = detail_soup.select('div.day_l > span')
    urls_lorder = detail_soup.select('div.member_pic > a > img')
    names_lorder = detail_soup.select('div.w_240 > h6 > a')
    genders_lorder = detail_soup.select('div.w_240 > h6 > span')
    # Pair the per-field result lists element-wise.
    rows = zip(titles, addresses, images_house, days_fee,
               urls_lorder, names_lorder, genders_lorder)
    for title, address, image_house, day_fee, url_lorder, name_lorder, gender_lorder in rows:
        print({
            'title': title.get_text('em'),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            'gender_lorder': gender_change(gender_lorder.get('class')[0]),
            'day_fee': day_fee.get_text(),
        })
# Crawl the first 15 search pages and feed every detail URL into info().
search_template = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21&endDate=2016-10-01'
for index in range(15):
    search_html = requests.get(search_template.format(index)).text
    search_soup = BeautifulSoup(search_html, 'lxml')
    # Site's own class name ('resule_img_a') kept verbatim.
    for anchor in search_soup.select('a[class="resule_img_a"]'):
        info(anchor.get('href'))
print('Done')
20160924update:性别表述修改为male和female;去除标题中的换行符,避免影响数据处理;增加写入本地文件;增加计数项,避免爬取过程中无聊。
from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Map the landlord gender-icon CSS class to 'female'/'male'.

    Anything unrecognized yields 'unknown gender'.
    """
    if gender_lorder == 'member_girl_ico':
        return 'female'
    if gender_lorder == 'member_boy_ico':
        return 'male'
    return 'unknown gender'
def counter(last=[0]):
    """Return how many times this function has been called.

    The mutable default list deliberately persists between calls
    (it is created once at definition time), so element 0 acts as a
    running call count. Passing an explicit list uses/updates that
    list instead.
    """
    last[0] += 1
    return last[0]
def info(url):
    """Scrape one listing detail page and append its fields to a local file.

    Bug fix: the pasted code split the ``for ... in zip(...)`` header and the
    ``file_content`` concatenation across lines with no continuation, which is
    a SyntaxError (and the dangling ``+ data[...]`` lines would be unary-plus
    TypeErrors). Both are now wrapped in parentheses so the statements are
    single logical lines.

    NOTE(review): ``requests``/``BeautifulSoup`` and the helpers
    ``gender_change``/``counter`` come from the surrounding script.
    """
    info_data = requests.get(url)
    info_soup = BeautifulSoup(info_data.text, 'lxml')
    titles = info_soup.select('h4 em')
    addresses = info_soup.select('div.pho_info > p')
    images_house = info_soup.select('img[id="curBigImage"]')
    days_fee = info_soup.select('div.day_l > span')
    urls_lorder = info_soup.select('div.member_pic > a > img')
    names_lorder = info_soup.select('div.w_240 > h6 > a')
    genders_lorder = info_soup.select('div.w_240 > h6 > span')
    for (title, address, image_house, day_fee,
         url_lorder, name_lorder, gender_lorder) in zip(
            titles, addresses, images_house, days_fee,
            urls_lorder, names_lorder, genders_lorder):
        data = {
            'title': title.get_text(),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            'gender_lorder': gender_change(str(gender_lorder.get('class')[0])),
            'day_fee': day_fee.get_text(),
        }
        # 'a' appends; 'w' would truncate the file. Adjust the path as needed.
        with open('F://python/2/xiaozhu_data.txt', 'a', encoding='utf-8') as file:
            # Titles contain newlines, which would break the one-line-per-row
            # format, so strip them with replace().
            file_content = (
                data['title'].replace('\n', '') + ';' + data['day_fee'] + ';'
                + data['address'] + ';' + data['image_house'] + ';'
                + data['name_lorder'] + ';' + data['gender_lorder'] + ';'
                + data['url_lorder'] + '\n'
            )
            file.write(file_content)
        # Progress indicator so the long crawl is less boring.
        print(counter())
        time.sleep(0.01)
# Build the 15 search-page URLs (implicit string concatenation keeps the long
# URL readable) and scrape every listing linked from each page.
base_url = ('http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21'
            '&endDate=2016-10-01')
for page_index in range(15):
    search_response = requests.get(base_url.format(page_index))
    search_soup = BeautifulSoup(search_response.text, 'lxml')
    # Site's own class name ('resule_img_a') kept verbatim.
    for anchor in search_soup.select('a[class="resule_img_a"]'):
        info(anchor.get('href'))
print('Done')
####总结:
>1. 该网站也设置了反爬措施,房屋图片及房东图片都采用了障眼法(假src,点击链接后图片下载了但是打不开),但是目前技术有限,绕不过去,只能是暂时搁置了。(update20160930图片的地址是真实的,但是因为我默认浏览器为chrome,打开链接就直接下载了图片,图片无法打开,在该链接复制进IE浏览器后,发现原来可以显示。雾~~~)
![1.png](http://upload-images.jianshu.io/upload_images/1059649-5555e1182aab31d6.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
2. 本来打算将爬取的数据写入文档,但实验了多次后发现dict的转换写入方法还没掌握,这个后续打算问问老师怎么处理比较妥当;此次作业的表格是采用傻瓜式处理的,复制粘贴到excel,然后用excel分列处理的。大致看了下短租房日租金以128-499区间的房屋最多,地址没细作研究,但是觉得可以在excel里用地图展现一下。(dict里的内容打印存储到本地的坑已经填上,20160924update)
3. 代码写的时候是先写的单个页面的解析,后来写的是房屋链接的采集,两段代码合并时稍微做了调整。
4. 速度有些慢,不知道是代码原因还是本身数据爬取过程就比较慢的原因。sleep的时间还是设定了,比较短,以防万一。
5. 性别一项抓取的数据都是girl,估计还是有问题,还没有一个个细看是不是真的如此,但直觉是女性确实比较多。(此项已经修正,20160924update)