1. Background
Last time, I scraped the Ministry of Education's policy documents with Stata, which gave me a rough grasp of the scraping workflow and of regular expressions.
Cleaning the data in Stata, however, was genuinely laborious.
Then, a while ago, I installed Stata 16, and one of its standout new features (see the official "New in Stata 16") is that you can write Python statements inside Stata and call Python packages for data processing.
That gave me plenty of motivation to learn Python scraping: scrape the data with Python, process and analyze it in Stata, and produce the report with Stata and Python together~
Exciting just to think about~
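As a taste of that feature, a Stata 16 do-file can embed Python directly. Below is a minimal sketch; it assumes Python is already linked to Stata (check with python query), that pandas is installed, and the CSV path is the output file produced by the scraper later in this post:

* minimal sketch: running Python inside a Stata 16 do-file
python:
import pandas as pd
# load the scraped CSV into a pandas DataFrame (path from the scraper below)
df = pd.read_csv('C:/study/实战/python/data/zaishou/data/ershoufang_zs_info.csv')
print(df.shape)
end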
So, taking Lianjia's Beijing for-sale second-hand housing listings as the example, I spent two days learning and scraped a total of 84,278 records~
2. Learning process
Finding a tutorial: I went through quite a few posts on scraping Lianjia data with Python; some were too simplistic, others had no comments at all, leaving the logic impossible to follow. I finally settled on this one: python爬取链家网的房屋数据 (scraping Lianjia housing data with Python).
Preparing the tools (a quick Baidu search turns up how to do each of these):
- install Python 3;
- install Sublime Text 3;
- configure Sublime Text 3 so that it can run Python;
- install the Chrome browser and an XPath plugin;
Learning process:
- following the tutorial, type its code out line by line;
- then work through it section by section, running it and searching online whenever something is unclear;
- the tutorial scrapes completed second-hand transactions, whereas what I needed were for-sale listings, so once I understood the code I adapted it to that requirement;
After the steps above, I ended up with working code for scraping Lianjia's Beijing for-sale second-hand housing data, presented in the next section.
3. Scraping code
As the tutorial explains, the scraping approach has two phases: first collect the link to every listing, then extract the data from each listing's page.
The code for collecting the listing links is in: 【实战】 python爬取链家北京在售二手房数据(一) (part one of this series); a rough sketch of that phase follows.
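The sketch below only illustrates the idea: walk a district's paginated list pages and harvest the detail-page links. The pg-numbered URL pattern matches Lianjia's list pages, but the XPath is an assumption and should be verified against the live markup with the Chrome XPath plugin:

import ssl
import urllib.request
from lxml import etree

# globally disable certificate verification, as in the main script
ssl._create_default_https_context = ssl._create_unverified_context

# phase one (sketch): collect detail-page links from a district's list pages
def collect_links(district, pages):
    links = []
    for pg in range(1, pages + 1):
        url = 'https://bj.lianjia.com/ershoufang/%s/pg%d/' % (district, pg)
        html = urllib.request.urlopen(url, timeout=15).read().decode('utf-8')
        selector = etree.HTML(html)
        # illustrative XPath: each list item carries one link to a detail page
        links += selector.xpath('//ul[@class="sellListContent"]//div[@class="title"]/a/@href')
    return links

if __name__ == '__main__':
    # e.g. print the first two pages of Dongcheng links, one URL per line
    for link in collect_links('dongcheng', 2):
        print(link)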
Next comes the code that extracts the data from each listing page.
The code is long, but if you have worked through the earlier part of the tutorial it is easy to follow; it is long mainly because there are so many fields to extract~
I spent a long while trying and failing to add more threads; the current version runs only 17 threads, one per district, so it is slow, close to 10 hours in total~ (one possible improvement is sketched after the full code.)
Something to keep learning~
The code is as follows:
# fetch page HTML, urllib.request.urlopen
import urllib.request
# for disabling certificate verification
import ssl
# parse HTML with XPath
from lxml import etree
# regular expressions, re.findall
import re
# threading
import threading
# CSV output
import csv

# globally disable certificate verification
ssl._create_default_https_context = ssl._create_unverified_context

# fetch a page and return its HTML
def get_page(url):
    page = urllib.request.urlopen(url, timeout=15)
    html = page.read().decode('utf-8')
    return html

# read a file of listing URLs, one per line
def read_file(path_file):
    with open(path_file, 'r') as file:
        lines = file.readlines()
    return lines

# convert a Chinese numeral (0-40) to an Arabic number via its list index,
# e.g. '两' sits at index 2, so zw2alb('两') returns 2
def zw2alb(zw):
    zwsy = ['零', '一', '两', '三', '四', '五', '六', '七', '八', '九', '十',
            '十一', '十二', '十三', '十四', '十五', '十六', '十七', '十八', '十九',
            '二十', '二十一', '二十二', '二十三', '二十四', '二十五', '二十六',
            '二十七', '二十八', '二十九', '三十', '三十一', '三十二', '三十三',
            '三十四', '三十五', '三十六', '三十七', '三十八', '三十九', '四十']
    return zwsy.index(zw)

# parse a listing page and return its fields as a list
def get_data(html):
    list_data = []
    selector = etree.HTML(html)
    # title
    try:
        title = selector.xpath('/html/body/div[3]/div[1]/div[1]/div[1]/h1/text()')[0]
    except:
        title = 'null'
    list_data.append(title)
    # subtitle
    try:
        subtitle = selector.xpath('/html/body/div[3]/div[1]/div[1]/div[1]/div/text()')[0]
    except:
        subtitle = 'null'
    list_data.append(subtitle)
    # number of followers
    try:
        Followers = selector.xpath('/html/body/div[3]/div[1]/div[1]/div[2]/div[1]/div[1]/span/text()')[0]
    except:
        Followers = 'null'
    list_data.append(Followers)
    # total price, converted from 万元 to yuan
    try:
        TotalPrice = selector.xpath('/html/body/div[5]/div[2]/div[3]/span[@class="total"]/text()')[0]
        TotalPrice = float(TotalPrice) * 10000
    except:
        TotalPrice = 'null'
    list_data.append(TotalPrice)
    # unit price
    try:
        unitPrice = selector.xpath('/html/body/div[5]/div[2]/div[3]/div[1]/div[1]/span/text()')[0]
        unitPrice = float(unitPrice)
    except:
        unitPrice = 'null'
    list_data.append(unitPrice)
    # year built
    try:
        BuildYear = selector.xpath('/html/body/div[5]/div[2]/div[4]/div[3]/div[2]/text()')[0]
        BuildYear = re.findall(r'(.+?)年建', BuildYear)[0]
    except:
        BuildYear = 'null'
    list_data.append(BuildYear)
    # residential-community name
    try:
        XQname = selector.xpath('/html/body/div[5]/div[2]/div[5]/div[1]/a[1]/text()')[0]
    except:
        XQname = 'null'
    list_data.append(XQname)
    # area within the district
    try:
        Zone = selector.xpath('/html/body/div[5]/div[2]/div[5]/div[2]/span[2]/a[2]/text()')[0]
    except:
        Zone = 'null'
    list_data.append(Zone)
    # ring-road location
    try:
        Huan = selector.xpath('/html/body/div[5]/div[2]/div[5]/div[2]/span[2]/text()[2]')[0].strip()
    except:
        Huan = 'null'
    list_data.append(Huan)
    # viewing time
    try:
        BookTime = selector.xpath('/html/body/div[5]/div[2]/div[5]/div[3]/span[2]/text()')[0]
    except:
        BookTime = 'null'
    list_data.append(BookTime)
    # Lianjia listing ID
    try:
        LjID = selector.xpath('/html/body/div[5]/div[2]/div[5]/div[4]/span[2]/text()')[0]
    except:
        LjID = 'null'
    list_data.append(LjID)
    # Lianjia agent
    try:
        LjAgent = selector.xpath('//*[@id="zuanzhan"]/div[2]/div/div[1]/a/text()')[0]
    except:
        LjAgent = 'null'
    list_data.append(LjAgent)
    # agent rating (bug fix: a lazy '(.+?)' at the end of a pattern matches
    # only one character, so use a greedy capture instead)
    try:
        LjAgentScore = selector.xpath('//*[@id="zuanzhan"]/div[2]/div/div[2]/span[1]/text()')[0]
        LjAgentScore = re.findall(r'评分:(.+)', LjAgentScore)[0]
        LjAgentScore = float(LjAgentScore)
    except:
        LjAgentScore = 'null'
    list_data.append(LjAgentScore)
    # number of agent ratings
    try:
        LjAgentScoreTimes = selector.xpath('//*[@id="zuanzhan"]/div[2]/div/div[2]/span[2]/text()')[0]
        LjAgentScoreTimes = re.findall(r'/(.+?)次评价', LjAgentScoreTimes)[0]
        LjAgentScoreTimes = float(LjAgentScoreTimes)
    except:
        LjAgentScoreTimes = 'null'
    list_data.append(LjAgentScoreTimes)
    # floor plan, e.g. 2室1厅1厨1卫
    try:
        HouseType = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[1]/text()')[0].strip()
    except:
        HouseType = 'null'
    list_data.append(HouseType)
    # number of bedrooms: the character before 室
    try:
        LivingPos = HouseType.index('室')
        Living = HouseType[LivingPos - 1]
    except:
        Living = 'null'
    list_data.append(Living)
    # number of living rooms
    try:
        DrawingPos = HouseType.index('厅')
        Drawing = HouseType[DrawingPos - 1]
    except:
        Drawing = 'null'
    list_data.append(Drawing)
    # number of kitchens
    try:
        KitchenPos = HouseType.index('厨')
        Kitchen = HouseType[KitchenPos - 1]
    except:
        Kitchen = 'null'
    list_data.append(Kitchen)
    # number of bathrooms
    try:
        BathPos = HouseType.index('卫')
        Bath = HouseType[BathPos - 1]
    except:
        Bath = 'null'
    list_data.append(Bath)
    # floor and total floors
    try:
        Floor = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[2]/text()')[0]
        Floor = Floor.replace(' ', '').replace('楼层', '').replace('共', '').replace('层', '').replace('(', ' ').replace(')', '')
    except:
        Floor = 'null'
    list_data.append(Floor)
    # gross floor area
    try:
        Area = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[3]/text()')[0].replace(' ', '').replace('㎡', '')
    except:
        Area = 'null'
    list_data.append(Area)
    # unit structure
    try:
        Structure = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[4]/text()')[0].replace(' ', '').replace('㎡', '')
    except:
        Structure = 'null'
    list_data.append(Structure)
    # inner floor area
    try:
        InnerArea = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[5]/text()')[0].replace(' ', '').replace('㎡', '')
    except:
        InnerArea = 'null'
    list_data.append(InnerArea)
    # building type
    try:
        Type = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[6]/text()')[0].replace(' ', '')
    except:
        Type = 'null'
    list_data.append(Type)
    # orientation
    try:
        Orientation = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[7]/text()')[0].strip()
    except:
        Orientation = 'null'
    list_data.append(Orientation)
    # building structure
    try:
        ArchStructure = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[8]/text()')[0]
    except:
        ArchStructure = 'null'
    list_data.append(ArchStructure)
    # decoration
    try:
        Decoration = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[9]/text()')[0].replace(' ', '')
    except:
        Decoration = 'null'
    list_data.append(Decoration)
    # elevator-to-household ratio, e.g. 一梯两户
    try:
        LiftHu = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[10]/text()')[0].replace(' ', '')
    except:
        LiftHu = 'null'
    list_data.append(LiftHu)
    try:
        Ti = re.findall(r'(.+?)梯', LiftHu)[0]
        Ti = zw2alb(Ti)
        Hu = re.findall(r'梯(.+?)户', LiftHu)[0]
        Hu = zw2alb(Hu)
    except:
        # bug fix: Ti and Hu must be defined even when parsing fails,
        # otherwise the appends below raise NameError
        Ti = 'null'
        Hu = 'null'
    list_data.append(Ti)
    list_data.append(Hu)
    try:
        LiftRatio = round(Ti / Hu, 5)
    except:
        LiftRatio = 'null'
    list_data.append(LiftRatio)
    # heating
    try:
        Warmth = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[11]/text()')[0].replace(' ', '')
    except:
        Warmth = 'null'
    list_data.append(Warmth)
    # has elevator
    try:
        Elevator = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[12]/text()')[0]
    except:
        Elevator = 'null'
    if Elevator == '有':
        Elevator = 1
    elif LiftRatio != 'null':
        # if an elevator-to-household ratio exists, assume there is an elevator
        Elevator = 1
    else:
        Elevator = 0
    list_data.append(Elevator)
    # property-right duration
    try:
        PropertyRight = selector.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[13]/text()')[0]
    except:
        PropertyRight = 'null'
    list_data.append(PropertyRight)
    # listing date
    try:
        ListTime = selector.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[1]/span[2]/text()')[0]
    except:
        ListTime = 'null'
    list_data.append(ListTime)
    # transaction ownership
    try:
        DealProperty = selector.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[2]/span[2]/text()')[0]
    except:
        DealProperty = 'null'
    list_data.append(DealProperty)
    # last transaction date
    try:
        LastDealTime = selector.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[3]/span[2]/text()')[0]
    except:
        LastDealTime = 'null'
    list_data.append(LastDealTime)
    # housing use
    try:
        HouseUse = selector.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[4]/span[2]/text()')[0]
    except:
        HouseUse = 'null'
    list_data.append(HouseUse)
    # years of ownership
    try:
        HouseUseYear = selector.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[5]/span[2]/text()')[0]
    except:
        HouseUseYear = 'null'
    list_data.append(HouseUseYear)
    # property-right holder
    try:
        ProRightOwner = selector.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[6]/span[2]/text()')[0]
    except:
        ProRightOwner = 'null'
    list_data.append(ProRightOwner)
    # mortgage information
    try:
        Mortgage = selector.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[7]/span[2]/text()')[0].strip()
    except:
        Mortgage = 'null'
    list_data.append(Mortgage)
    # copy of the property certificate on file
    try:
        OwnerCerCopy = selector.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[8]/span[2]/text()')[0].strip()
    except:
        OwnerCerCopy = 'null'
    list_data.append(OwnerCerCopy)
    # viewings in the last 7 days
    try:
        WatchWeekly = selector.xpath('//*[@id="record"]/div[2]/div[2]/text()')[0]
    except:
        WatchWeekly = 'null'
    list_data.append(WatchWeekly)
    # viewings in the last 30 days
    try:
        WatchMonthly = selector.xpath('//*[@id="record"]/div[2]/div[3]/span/text()')[0]
    except:
        WatchMonthly = 'null'
    list_data.append(WatchMonthly)
    # listing tags
    try:
        HTag = selector.xpath('/html/body/div[7]/div[1]/div[2]/div/div[1]/div[2]/a/text()')
        Housetag = [x.strip() for x in HTag if x.strip() != '']
    except:
        Housetag = 'null'
    list_data.append(Housetag)
    # transport
    try:
        traffic = selector.xpath('/html/body/div[7]/div[1]/div[2]/div/div[2]/div[2]/text()')
        Traffictag = [x.strip() for x in traffic if x.strip() != '']
    except:
        Traffictag = 'null'
    list_data.append(Traffictag)
    # neighbourhood amenities
    try:
        Surrounding = selector.xpath('/html/body/div[7]/div[1]/div[2]/div/div[3]/div[2]/text()')
        Surroundtag = [x.strip() for x in Surrounding if x.strip() != '']
    except:
        Surroundtag = 'null'
    list_data.append(Surroundtag)
    # floor-plan description
    try:
        TypeIntro = selector.xpath('/html/body/div[7]/div[1]/div[2]/div/div[4]/div[2]/text()')
        TypeIntrotag = [x.strip() for x in TypeIntro if x.strip() != '']
    except:
        TypeIntrotag = 'null'
    list_data.append(TypeIntrotag)
    # key selling points
    try:
        CorePoint = selector.xpath('/html/body/div[7]/div[1]/div[2]/div/div[5]/div[2]/text()')
        CorePointtag = [x.strip() for x in CorePoint if x.strip() != '']
    except:
        CorePointtag = 'null'
    list_data.append(CorePointtag)
    return list_data

# output path
FILE_WRITE_PATH = 'C:/study/实战/python/data/zaishou/data/ershoufang_zs_info.csv'

# CSV column headers
ROW_TITLE = ['url', 'district', 'title', 'subtitle', 'Followers', 'TotalPrice', 'unitPrice',
             'BuildYear', 'XQname', 'Zone', 'Huan', 'BookTime', 'LjID', 'LjAgent',
             'LjAgentScore', 'LjAgentScoreTimes', 'HouseType', 'Living', 'Drawing',
             'Kitchen', 'Bath', 'Floor', 'Area', 'Structure', 'InnerArea', 'Type', 'Orientation',
             'ArchStructure', 'Decoration', 'LiftHu', 'Ti', 'Hu', 'LiftRatio', 'Warmth',
             'Elevator', 'PropertyRight', 'ListTime', 'DealProperty', 'LastDealTime',
             'HouseUse', 'HouseUseYear', 'ProRightOwner', 'Mortgage', 'OwnerCerCopy',
             'WatchWeekly', 'WatchMonthly', 'Housetag', 'Traffictag', 'Surroundtag',
             'TypeIntrotag', 'CorePointtag']

# bug fix: serialize writes, since all worker threads append to the same file
write_lock = threading.Lock()

# write one row to the CSV file
def write_data(file_path, mode, row_data):
    with write_lock:
        with open(file_path, mode, newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(row_data)

def main(index):
    list_district = ['dongcheng', 'xicheng', 'chaoyang', 'haidian',
                     'fengtai', 'shijingshan', 'tongzhou', 'changping', 'daxing',
                     'yizhuangkaifaqu', 'shunyi', 'fangshan', 'mentougou',
                     'pinggu', 'huairou', 'miyun', 'yanqing']
    list_district_ch = ['东城', '西城', '朝阳', '海淀',
                        '丰台', '石景山', '通州', '昌平', '大兴',
                        '亦庄开发区', '顺义', '房山', '门头沟',
                        '平谷', '怀柔', '密云', '延庆']
    district = list_district[index]
    district_ch = list_district_ch[index]
    # input path: one .txt file of listing URLs per district
    file_read_path = 'C:/study/实战/python/data/zaishou/' + district + '.txt'
    list_url = read_file(file_read_path)
    for url in list_url:
        url = url.replace('\n', '')
        print(url)
        try:
            html = get_page(url)
            row_data = get_data(html)
            # leftover from the sold-listings tutorial: filter rows by deal year
            # deal_date = row_data[0]
            # deal_year = int(deal_date[:4])
            # if deal_year > 2018:
            #     continue
            # if deal_year < 2018:
            #     break
            row_data.insert(0, district_ch)
            row_data.insert(0, url)
            print(row_data)
            # append the row to the CSV file
            write_data(FILE_WRITE_PATH, 'a', row_data)
        except:
            pass

if __name__ == '__main__':
    # bug fix: write the header once before the threads start; writing it
    # inside main() with mode 'w' would let each thread truncate the file
    write_data(FILE_WRITE_PATH, 'w', ROW_TITLE)
    # one thread per district; adjust the thread count as needed
    for index in range(0, 17):
        thread = threading.Thread(target=main, args=(index,))
        thread.start()
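Back to the threading question above: rather than one thread per district, a pool of workers over individual URLs keeps every thread busy even when districts differ in size. Here is a sketch using the standard library's concurrent.futures, reusing get_page, get_data, write_data, and FILE_WRITE_PATH from the script above (untested against the full crawl):

from concurrent.futures import ThreadPoolExecutor

# sketch: fan the crawl out over individual listing URLs
def crawl_one(url, district_ch):
    try:
        row_data = get_data(get_page(url))
        row_data.insert(0, district_ch)
        row_data.insert(0, url)
        write_data(FILE_WRITE_PATH, 'a', row_data)
    except Exception:
        pass

def crawl_all(url_pairs, workers=32):
    # url_pairs: iterable of (url, district_ch) tuples read from the district .txt files
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for url, district_ch in url_pairs:
            pool.submit(crawl_one, url, district_ch)

The pool size is then independent of the number of districts, so a few large districts can no longer leave the other threads idle.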