Date:2016-9-21
update:2016-9-30
By:Black Crow
前言:
终于进入到网络页面的抓取了。前面一节课静态页面的作业做了之后总是有报错,所以一直没有单独写总结。听课的时候就感觉到内容十分的吸引人,爬取的过程也是特别有意思,后面一节课关于select的条件上是有做优化的,比前一节课更高效。PPT里的地址已失效,所以随便设定条件搜的短租房信息。
作业效果:
看着信息滚动的感觉其实挺爽的
20160921爬取的excel表格:链接: http://pan.baidu.com/s/1nvEVDvN 密码: j4vt
20160922update表格:链接: http://pan.baidu.com/s/1c198fN6 密码: kq4a
20160922update图片:
我的代码:
20160921代码
from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Map a landlord gender-icon CSS class to a readable label.

    Bug fix: the original wrote ``x == 'a' or 'b'``, where the bare string
    'b' is always truthy, so the first branch matched for every input and
    everyone was reported as 'girl'. Use membership tests instead.

    :param gender_lorder: CSS class string taken from the profile icon.
    :return: 'girl', 'boy', or 'unknown gender!' for unrecognized classes.
    """
    if gender_lorder in ('member_girl_ico', 'member_ico1'):
        return 'girl'
    elif gender_lorder in ('member_boy_ico', 'member_ico'):
        return 'boy'
    else:
        return 'unknown gender!'
def info(url):
    """Fetch one listing detail page and print a dict of its fields.

    NOTE(review): ``requests``, ``BeautifulSoup`` and ``gender_change`` come
    from the surrounding script; the CSS selectors match xiaozhu.com's 2016
    page markup.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # One result list per field; zip() pairs them up row-wise below.
    columns = (
        soup.select('div.con_l > div.pho_info > h4 > em'),  # titles
        soup.select('div.pho_info > p'),                    # addresses
        soup.select('img[id="curBigImage"]'),               # house photos
        soup.select('div.day_l > span'),                    # daily fees
        soup.select('div.member_pic > a > img'),            # landlord avatars
        soup.select('div.w_240 > h6 > a'),                  # landlord names
        soup.select('div.w_240 > h6 > span'),               # gender icons
    )
    for title, address, image_house, day_fee, url_lorder, name_lorder, gender_lorder in zip(*columns):
        print({
            'title': title.get_text('em'),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            'gender_lorder': gender_change(str(gender_lorder.get('class')[0])),
            'day_fee': day_fee.get_text(),
        })
        # Be polite to the server between rows.
        time.sleep(0.01)
# Walk search-result pages p0..p14; each page links to listing detail pages
# that info() scrapes one by one.
page_template = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21&endDate=2016-10-01'
for page_number in range(15):
    listing_page = requests.get(page_template.format(page_number))
    listing_soup = BeautifulSoup(listing_page.text, 'lxml')
    # NOTE(review): 'resule_img_a' is the class name used on the site
    # (their typo, kept verbatim).
    for link in listing_soup.select('a[class="resule_img_a"]'):
        info(link.get('href'))
print('Done')
##### 20160922 update代码:修正了性别判断
from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Translate the landlord gender-icon CSS class into 'girl'/'boy'.

    Unrecognized classes fall back to 'unknown gender!'.
    """
    labels = {
        'member_girl_ico': 'girl',
        'member_boy_ico': 'boy',
    }
    return labels.get(gender_lorder, 'unknown gender!')
def info(url):
    """Scrape one listing detail page and print a dict per listing row.

    NOTE(review): relies on ``requests``/``BeautifulSoup`` and the sibling
    ``gender_change`` defined elsewhere in this script.
    """
    detail_html = requests.get(url).text
    detail_soup = BeautifulSoup(detail_html, 'lxml')
    titles = detail_soup.select('div.con_l > div.pho_info > h4 > em')
    addresses = detail_soup.select('div.pho_info > p')
    images_house = detail_soup.select('img[id="curBigImage"]')
    days_fee = detail_soup.select('div.day_l > span')
    urls_lorder = detail_soup.select('div.member_pic > a > img')
    names_lorder = detail_soup.select('div.w_240 > h6 > a')
    genders_lorder = detail_soup.select('div.w_240 > h6 > span')
    # Pair the per-field result lists element-wise.
    rows = zip(titles, addresses, images_house, days_fee,
               urls_lorder, names_lorder, genders_lorder)
    for title, address, image_house, day_fee, url_lorder, name_lorder, gender_lorder in rows:
        print({
            'title': title.get_text('em'),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            'gender_lorder': gender_change(gender_lorder.get('class')[0]),
            'day_fee': day_fee.get_text(),
        })
# Crawl the first 15 search pages and feed every detail URL into info().
search_template = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21&endDate=2016-10-01'
for index in range(15):
    search_html = requests.get(search_template.format(index)).text
    search_soup = BeautifulSoup(search_html, 'lxml')
    # Site's own class name ('resule_img_a') kept verbatim.
    for anchor in search_soup.select('a[class="resule_img_a"]'):
        info(anchor.get('href'))
print('Done')
20160924update:性别表述修改为male和female;去除标题中的换行符,避免影响数据处理;增加写入本地文件;增加计数项,避免爬取过程中无聊。
from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Map the landlord gender-icon CSS class to 'female'/'male'.

    Anything unrecognized yields 'unknown gender'.
    """
    if gender_lorder == 'member_girl_ico':
        return 'female'
    if gender_lorder == 'member_boy_ico':
        return 'male'
    return 'unknown gender'
def counter(last=[0]):
    """Return how many times this function has been called.

    The mutable default list deliberately persists between calls
    (it is created once at definition time), so element 0 acts as a
    running call count. Passing an explicit list uses/updates that
    list instead.
    """
    last[0] += 1
    return last[0]
def info(url):
    """Scrape one listing detail page and append its fields to a local file.

    Bug fix: the pasted code split the ``for ... in zip(...)`` header and the
    ``file_content`` concatenation across lines with no continuation, which is
    a SyntaxError (and the dangling ``+ data[...]`` lines would be unary-plus
    TypeErrors). Both are now wrapped in parentheses so the statements are
    single logical lines.

    NOTE(review): ``requests``/``BeautifulSoup`` and the helpers
    ``gender_change``/``counter`` come from the surrounding script.
    """
    info_data = requests.get(url)
    info_soup = BeautifulSoup(info_data.text, 'lxml')
    titles = info_soup.select('h4 em')
    addresses = info_soup.select('div.pho_info > p')
    images_house = info_soup.select('img[id="curBigImage"]')
    days_fee = info_soup.select('div.day_l > span')
    urls_lorder = info_soup.select('div.member_pic > a > img')
    names_lorder = info_soup.select('div.w_240 > h6 > a')
    genders_lorder = info_soup.select('div.w_240 > h6 > span')
    for (title, address, image_house, day_fee,
         url_lorder, name_lorder, gender_lorder) in zip(
            titles, addresses, images_house, days_fee,
            urls_lorder, names_lorder, genders_lorder):
        data = {
            'title': title.get_text(),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            'gender_lorder': gender_change(str(gender_lorder.get('class')[0])),
            'day_fee': day_fee.get_text(),
        }
        # 'a' appends; 'w' would truncate the file. Adjust the path as needed.
        with open('F://python/2/xiaozhu_data.txt', 'a', encoding='utf-8') as file:
            # Titles contain newlines, which would break the one-line-per-row
            # format, so strip them with replace().
            file_content = (
                data['title'].replace('\n', '') + ';' + data['day_fee'] + ';'
                + data['address'] + ';' + data['image_house'] + ';'
                + data['name_lorder'] + ';' + data['gender_lorder'] + ';'
                + data['url_lorder'] + '\n'
            )
            file.write(file_content)
        # Progress indicator so the long crawl is less boring.
        print(counter())
        time.sleep(0.01)
# Build the 15 search-page URLs (implicit string concatenation keeps the long
# URL readable) and scrape every listing linked from each page.
base_url = ('http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21'
            '&endDate=2016-10-01')
for page_index in range(15):
    search_response = requests.get(base_url.format(page_index))
    search_soup = BeautifulSoup(search_response.text, 'lxml')
    # Site's own class name ('resule_img_a') kept verbatim.
    for anchor in search_soup.select('a[class="resule_img_a"]'):
        info(anchor.get('href'))
print('Done')
####总结:
>1. 该网站也设置了反爬措施,房屋图片及房东图片都采用了障眼法(假src,点击链接后图片下载了但是打不开),但是目前技术有限,绕不过去,只能是暂时搁置了。(update20160930图片的地址是真实的,但是因为我默认浏览器为chrome,打开链接就直接下载了图片,图片无法打开,在该链接复制进IE浏览器后,发现原来可以显示。雾~~~)
![1.png](http://upload-images.jianshu.io/upload_images/1059649-5555e1182aab31d6.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
2. 本来打算将爬取的数据写入文档,但实验了多次后发现dict的转换写入方法还没掌握,这个后续打算问问老师怎么处理比较妥当;此次作业的表格是采用傻瓜式处理的,复制粘贴到excel,然后用excel分列处理的。大致看了下短租房日租金以128-499区间的房屋最多,地址没细作研究,但是觉得可以在excel里用地图展现一下。(dict里的内容打印存储到本地的坑已经填上,20160924update)
3. 代码写的时候是先写的单个页面的解析,后来写的是房屋链接的采集,两段代码合并时稍微做了调整。
4. 速度有些慢,不知道是代码原因还是本身数据爬取过程就比较慢的原因。sleep的时间还是设定了,比较短,以防万一。
5. 性别一项抓取的数据都是girl,估计还是有问题,还没有一个个细看是不是真的如此,但直觉是女性确实比较多。(此项已经修正,20160924update)