With the scraping knowledge from the earlier posts, this one is fairly simple: crawling second-hand item listings from Ganji (sh.ganji.com).
1. First, get the main category (channel) links
from bs4 import BeautifulSoup
# import requests
import urllib2
# import re

start_url = "http://sh.ganji.com/wu/"

def get_channel_list(url):
    web_data = urllib2.urlopen(url).read()
    # web_data = requests.get(start_url) only returned part of the page for me
    soup = BeautifulSoup(web_data, 'lxml')
    contents = soup.select('#wrapper > div.content > div > div > dl > dt > a')
    # contents is a list, so get() cannot be called on it directly
    for content in contents:
        name = content.get('href').split('/')[1]
        channel_list = 'http://sh.ganji.com/{}/'.format(name)
        print(channel_list)

get_channel_list(start_url)
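The script above only prints each channel URL, while step 3 below imports a channel_list name from a channel_list module. One minimal way to wire the two together is to paste the printed URLs into a multi-line string in channel_list.py; the file layout and the three example URLs here are assumptions for illustration, not output copied from a real run:
# channel_list.py -- the printed channel URLs pasted into one multi-line string;
# channel_list.split() in step 3 turns it back into a list of URLs.
channel_list = '''
    http://sh.ganji.com/jiaju/
    http://sh.ganji.com/rirongbaihuo/
    http://sh.ganji.com/shouji/
'''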
2. Then get every listing link in each channel and scrape the item details
from bs4 import BeautifulSoup
# import requests
import urllib2
# import re
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
url_list = ceshi['url_list']
info = ceshi['info']
# t_url = "http://zhuanzhuan.ganji.com/detail/788638496047104004z.shtml?from=pc&source=ganji&cate=&cateurl="

def get_url_list(channel, page):
    # page format: page 1 is the channel URL itself, later pages append o2/, o3/, ...
    if page == 1:
        url = channel
    else:
        url = '{}o{}/'.format(channel, str(page))
    web_data = urllib2.urlopen(url).read()
    time.sleep(2)
    soup = BeautifulSoup(web_data, 'lxml')
    if soup.find('td', 't'):
        links = soup.select('#infolist > div.infocon > table > tbody > tr > td.t > a')
        for link in links:
            url = link.get('href').split('?')[0]
            url_list.insert_one({'url': url})
            print(url)
    else:
        pass

def get_info(url):
    web_data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_data, 'lxml')
    name = soup.select('div.box_left_top > h1')[0].text
    price = soup.select('div.price_li > span > i')[0].text
    # take the text; a raw list of Tag objects cannot be stored in MongoDB
    district = soup.select('div.palce_li > span > i')[0].text
    info.insert_one({'name': name, 'price': price, 'district': district})
    print({'name': name, 'price': price, 'district': district})
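get_info() is defined above but never called; a minimal sketch of the second stage, assuming the url_list collection and get_info() from this same script, is simply to walk the saved URLs:
# Second stage (sketch): read every saved listing URL back from MongoDB
# and scrape its detail page with get_info().
for item in url_list.find():
    get_info(item['url'])
    time.sleep(2)  # be polite to the server between requests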
3. Crawl with multiple processes
from multiprocessing import Pool
from channel_list import channel_list
from get_data_from_url import get_url_list

def get_all_date(channel):
    for num in range(1, 101):
        get_url_list(channel, num)

if __name__ == "__main__":
    pool = Pool()
    # with no argument, Pool() picks the number of worker processes automatically
    pool.map(get_all_date, channel_list.split())
    # map() takes the items one by one and feeds each into the function
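The same Pool pattern can also drive the detail-page stage. This is only a sketch under the assumption that get_info and url_list are importable from get_data_from_url; for a long run each worker would probably want its own MongoClient rather than the shared module-level one:
# Sketch: scrape the saved detail pages in parallel with the same Pool pattern.
from multiprocessing import Pool
from get_data_from_url import get_info, url_list

if __name__ == "__main__":
    urls = [item['url'] for item in url_list.find()]
    pool = Pool()
    pool.map(get_info, urls)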
4. Count the results
import time
from get_data_from_url import url_list

while True:
    print(url_list.find().count())
    time.sleep(3)
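Note that calling count() on a cursor was deprecated in pymongo 3.7 and removed in 4.0; on a newer pymongo the equivalent counter would be:
# Equivalent count on newer pymongo versions, where Cursor.count() is gone.
print(url_list.count_documents({}))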
Learning summary:
1. Creating a MongoDB database and inserting documents into it;
2. requests seemed to return only part of the page while urllib2 returned all of it; most likely I passed the Response object to BeautifulSoup instead of its .text (a small sketch follows this list);
3. Crawling data with multiple processes;
4. Installing the extra library: pip install pymongo.
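A small sketch for point 2, assuming the problem was feeding BeautifulSoup the Response object rather than its text:
# Hedged guess at the requests issue: requests.get() returns a Response object,
# while BeautifulSoup wants the HTML string; .text gives the full page, like urllib2's read().
import requests
from bs4 import BeautifulSoup

web_data = requests.get("http://sh.ganji.com/wu/")
soup = BeautifulSoup(web_data.text, 'lxml')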
Week 1 of 2017