写在前面的话
确定在武昌这边买套房,用python爬取链家武昌、洪山、东湖高新三区的租房和二手房信息,计算租售比辅助买房决策。(之前爬过房天下,但满满的都是假房源,没有意义只能放弃)
爬取租房信息
代码
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 20 10:44:38 2017
@author: huanglei
"""
# -*- coding: utf-8 -*-
#-*- coding:utf-8 -*-
import requests
import re
import random
from bs4 import BeautifulSoup
import pandas as pd
def is_num_by_except(num):
    """Return True if *num* parses as an int, False otherwise (EAFP style)."""
    try:
        int(num)
    except ValueError:
        return False
    return True
def spider_1(url):
    """Scrape one Lianjia rental result page.

    Parameters
    ----------
    url : str
        A wh.lianjia.com/zufang result-page URL.

    Returns
    -------
    list[dict]
        One dict per listing (title / court / roomWay / square / price and,
        when the area text parses as an int, 'danjia' = rent per m2).
        Listings of 20 m2 or less are skipped — presumably to drop tiny
        rooms that would skew the rent benchmark.
    """
    user_agent=['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36',
                'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
                ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        # FIX: random.choice instead of randint(0, 5) — no hard-coded length.
        'User-Agent': random.choice(user_agent),
    }
    # FIX: added a timeout so a stalled request cannot hang the scraper.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    page_array = []
    titles = soup.select('#house-lst > li > div.info-panel > h2 > a')  # listing title
    courts = soup.select('span.region')  # residential compound (小区)
    areas = soup.select('#house-lst > li > div.info-panel > div.col-1 > div.where > span.meters')  # floor area
    zones = soup.select('#house-lst > li > div.info-panel > div.col-1 > div.where > span.zone')  # room layout
    prices = soup.select('#house-lst > li > div.info-panel > div.col-3 > div.price > span')  # monthly rent
    for title, court, area, zone, price in zip(titles, courts, areas, zones, prices):
        data = {
            'title': title.get_text().strip(),
            'court': court.get_text().strip(),
            'roomWay': zone.get_text().strip(),
            'square': area.get_text().strip(),
            'price': price.get_text().strip(),
        }
        # The area text ends with a two-character unit ('平米'); strip it.
        square = data['square'][:-2]
        if is_num_by_except(square):  # FIX: dropped the '== True' comparison
            data['square'] = square
            # Rent per square metre, truncated to an int.
            data['danjia'] = int(int(data['price']) / int(data['square']))
            if int(data['square']) > 20:
                page_array.append(data)
    return page_array
def pandas_to_xlsx(info, file_name):
    """Write the scraped rental records to *file_name* as one Excel sheet."""
    frame = pd.DataFrame(info)
    frame.to_excel(file_name, sheet_name='武汉租房')
def sort_xlsx(file_name):
    """Deduplicate a scraped rent workbook and write per-court mean values.

    Reads *file_name*, drops duplicate rows, averages the numeric columns
    per court, writes the result to "均一化" + file_name, and returns the
    list of court names that have rental data.
    """
    df = pd.read_excel(file_name)
    df = df.drop_duplicates()
    df_zufang = df.groupby('court').mean()
    df_zufang.to_excel("均一化" + file_name, '均一化')
    # NOTE(review): add_youzufang is defined in the second script of this
    # post, not in this one — calling sort_xlsx as-is raises a NameError.
    nlist = add_youzufang(df_zufang)
    # FIX: the original computed nlist and discarded it; return it instead.
    return nlist
# Scrape the first two rental result pages for each of the three districts.
array_all = []
list_qu = ['wuchang', 'hongshan', 'donghugaoxin']
for qu in list_qu:
    page = 1
    url_qu = 'https://wh.lianjia.com/zufang/' + qu
    while page < 3:
        url = url_qu + '/pg' + str(page)
        try:
            array_all.extend(spider_1(url))
        except Exception:  # FIX: was a bare except
            # FIX: the original called pandas_to_xlsx(array_all) without the
            # mandatory file_name argument, which raised a TypeError here.
            pandas_to_xlsx(array_all, "链家武汉租房.xlsx")
            break
        page = page + 1
        print(qu + str(page))  # progress indicator
pandas_to_xlsx(array_all, "链家武汉租房.xlsx")
# Average each court's numeric columns to get a per-court rent benchmark.
df = pd.read_excel("链家武汉租房.xlsx")
df = df.drop_duplicates()
df_zufang = df.groupby('court').mean()
df_zufang.to_excel('租房均一化.xlsx', '均一化')
爬取租房信息截图如下
爬取二手房信息
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 20 10:44:38 2017
@author: huanglei
"""
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import random
def spider_1(url):
    """Scrape one Lianjia second-hand-flat (ershoufang) result page.

    Parameters
    ----------
    url : str
        A wh.lianjia.com/ershoufang result-page URL.

    Returns
    -------
    list[dict]
        One dict per listing with the raw text fields plus the derived
        'court' and 'area' values and an empty 'author' placeholder that is
        later overwritten with the rent/price ratio.
    """
    user_agent=['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36',
                'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
                ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        # FIX: random.choice instead of randint(0, 5) — no hard-coded length.
        'User-Agent': random.choice(user_agent),
    }
    # FIX: added a timeout so a stalled request cannot hang the scraper.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    page_array = []
    titles = soup.select('li.clear > div.info.clear > div.title > a')  # listing title
    hrefs = soup.select('ul.sellListContent > li.clear > a.img')  # detail-page link
    details = soup.select("div.address > div.houseInfo")  # 'court | layout | area | ...' text
    prices = soup.select("div.priceInfo > div.totalPrice > span")  # total price
    danjias = soup.select("div.priceInfo > div.unitPrice > span")  # unit price
    loucengs = soup.select("div.info.clear > div.flood > div.positionInfo")  # floor info
    addresss = soup.select("div.info.clear > div.flood > div.positionInfo > a")  # address
    for title, href, detail, price, danjia, louceng, address in zip(
            titles, hrefs, details, prices, danjias, loucengs, addresss):
        detail_text = detail.get_text().strip()
        parts = detail_text.split('|')  # hoisted: split once instead of twice
        data = {
            'title': title.get_text(),
            'href': href.get('href'),
            'detail': detail_text,
            'price': price.get_text(),
            'danjia': danjia.get_text(),
            'louceng': louceng.get_text(),
            'add': address.get_text(),
            'court': parts[0].strip(),
            # assumes the third field ends with a 3-char unit suffix — TODO confirm
            'area': parts[2][:-3],
            'author': "",  # placeholder, later replaced by the rent/price ratio
        }
        page_array.append(data)
    return page_array
def pandas_to_xlsx(info):
    """Write the second-hand-flat records to the fixed workbook on disk."""
    workbook = '链家二手房.xlsx'
    sheet = '武汉二手房'
    pd.DataFrame(info).to_excel(workbook, sheet_name=sheet)
def add_youzufang(info):
    """Return the list of court (小区) names that have rental data.

    *info* is a DataFrame indexed by court name (the groupby('court') result).
    The original looped over iterrows() only to collect the index values,
    which is exactly ``list(info.index)``.
    """
    return list(info.index)
def chinese(info):
    """Rename the scraped English column names to Chinese headers, in place.

    The 'author' column (repurposed to hold the rent/price ratio) becomes
    '租售比'. Returns the same DataFrame for chaining.
    """
    mapping = {
        "author": "租售比",
        "add": "地址",
        "area": "平米",
        "court": "小区名",
        "danjia": "单价",
        "detail": "细节",
        "price": "总价",
        "title": "标题",
    }
    info.rename(columns=mapping, inplace=True)
    return info
# Read the rental workbook back and derive the list of courts with rent data.
df = pd.read_excel("链家武汉租房.xlsx")
df = df.drop_duplicates()
# Per-court mean of the numeric columns (including 'danjia', rent per m2).
df_zufang = df.groupby('court').mean()
df_zufang.to_excel('租房均一化.xlsx','均一化')
# Court names that appear in the rental data — used below to filter listings.
nlist = add_youzufang(df_zufang)
# Scrape up to 299 result pages of filtered second-hand listings; stop on the
# first failed page (typically past the last page, or an anti-bot block).
page = 1
df_ershoufang = []
while page < 300:
    url = 'https://wh.lianjia.com/ershoufang/sf1l1l2l3p1p2p3p4/pg' + str(page)
    try:
        df_ershoufang.extend(spider_1(url))
        page = page + 1
    except Exception as exc:  # FIX: was a bare except that hid the reason
        print("error:", exc)
        break
    print(page)  # progress indicator
pandas_to_xlsx(df_ershoufang)
# Join the resale listings with the per-court average rent computed above.
df_ershoufang = pd.read_excel("链家二手房.xlsx")
# FIX: collect the rows to remove and drop them once after the loop — the
# original dropped rows inplace while iterrows() was still iterating.
rows_to_drop = []
for index, row in df_ershoufang.iterrows():
    if row['court'] in nlist:
        # Store the court's average rent 'danjia' in the placeholder column.
        df_ershoufang.at[index, 'author'] = df_zufang.at[row['court'], 'danjia']
    else:
        rows_to_drop.append(index)
df_ershoufang.drop(rows_to_drop, axis=0, inplace=True)
# Annual rent-to-price ratio. NOTE(review): the 0.12 factor presumably folds
# 12 months together with the price being quoted in 万元 — confirm the units.
df_ershoufang['author'] = 0.12*df_ershoufang['author'].astype('int')/(df_ershoufang['price'].astype('int')/df_ershoufang['area'].astype('float'))
# Best rent-to-price ratio first, then translate headers and write out.
df_ershoufang = df_ershoufang.sort_values(by='author', ascending=False)
df_ershoufang = chinese(df_ershoufang)
pandas_to_xlsx(df_ershoufang)
结果截图如下
写在后面的话
数据分析依旧只能做参考
得到房源信息没能区分70年产权和40年产权的房子。而且也不能满足我其他关于交通、工作的准确信息。最终还是去现场看了几天才确认。
租售比只能做参考
曾经看了各种帖子,说金融界认为租售比(年租金除以卖价)4%为最佳,2.4%以下就有泡沫。但在中国这个房价飞速增长的年代,没点儿泡沫的房子不仅难找,是不是真有意义谁又说得定呢?话说3年前周围人在北京和武汉买的房倒是真的租售比超过4%