流程很清晰:
1.分析拉勾页面
2.发现拉勾下面招聘页面是用Ajax写的,抓包试了一下,很简单:
3.通过代码获取到json
4.通过json里面的一个值positionId拼接得出详情页面的网址
5.通过分析详情页的HTML获取到职位要求
6.将所有的职业要求添加到一个字符串当中,并将无用的词语去掉(比如'职位要求'这四个字这样的)
7.使用jieba分析出高频词汇
8.最后制作出职业技能画像
中间遇到了很多问题,比如PC端的网页不好分析,字符串不在一个标签当中,而且拉勾的headers很迷,所以我抓取详情页面时用的是手机端页面。结果刚运行就出了一个错误,我加了一个try之后,我的IP就被禁掉了
没办法啊,只能去获取IP代理去了,代码先写到这儿
# -*- coding: UTF-8 -*-
import requests,re,json
from bs4 import BeautifulSoup
import sys
# Python 2 only: re-expose sys.setdefaultencoding so implicit str/unicode
# conversions of the Chinese page content do not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)# silence the InsecureRequestWarning raised by verify=False requests below
def get_detail_page(detail_url):
    """Fetch a mobile (m.lagou.com) job-detail page and return the text of
    its ``<div class="content">`` block.

    :param detail_url: full URL of the mobile job-detail page
    :return: the extracted job-description text, or '' when the page has no
             matching tag — returning '' (instead of the original implicit
             None) lets the caller concatenate the result safely
    """
    mobile_headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Mobile Safari/537.36',
        'Cookie': 'JSESSIONID=15A9FE2A6A2CC09A2FB61021BF8E8086; '
                  'user_trace_token=20170501124201-1adf364d88864075b61dde9bdd5871ea; '
                  'LGUID=20170501124202-850be946-2e28-11e7-b43c-5254005c3644; '
                  'index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=index_search;'
                  ' SEARCH_ID=0a596428cb014d3bab7284f879e214f0; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1493613734;'
                  ' Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1493622592;'
                  ' LGRID=20170501150939-247d4c29-2e3d-11e7-8a78-525400f775ce;'
                  ' _ga=GA1.2.1933438823.1493613734'
    }
    # verify=False pairs with the urllib3 warning suppression at import time.
    detail_response = requests.get(detail_url, headers=mobile_headers, verify=False)
    detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
    detail_page_dd = detail_soup.find('div', attrs={'class': 'content'})
    if detail_page_dd is None:
        # Expired postings / anti-crawler pages lack the tag: report and
        # return an empty string instead of None so callers can still
        # concatenate the result (the original try/except returned None here).
        print("这页没找到对应标签")
        return ''
    return detail_page_dd.text
def get_all_text(data):
    """Collect the job-description text of every position in one page of
    Lagou's positionAjax JSON response.

    :param data: parsed JSON dict from the positionAjax.json endpoint;
                 positions live under data['content']['positionResult']['result']
    :return: all detail-page texts joined into one string
    """
    pieces = []
    for position in data['content']['positionResult']['result']:
        # positionId is all we need to build the mobile detail-page URL.
        detail_url = 'https://m.lagou.com/jobs/' + str(position['positionId']) + '.html'
        print(detail_url)
        # Guard against a falsy/None return (page without the content tag)
        # so the concatenation cannot raise TypeError, and join once at the
        # end instead of the original quadratic `all_text +=` loop.
        text = get_detail_page(detail_url)
        if text:
            pieces.append(text)
    all_text = ''.join(pieces)
    print(all_text)
    return all_text
def get_page(city='北京', kd='python', max_pn=30):
    """Page through Lagou's positionAjax.json search endpoint and hand each
    page of results to get_all_text().

    :param city: city filter sent with every request (default: Beijing)
    :param kd: search keyword (default: 'python')
    :param max_pn: exclusive upper bound of the page number (pages 1..max_pn-1)
    """
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    # Headers are identical for every page, so build them once outside the loop.
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python爬虫',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Mobile Safari/537.36',
        'Cookie': 'JSESSIONID=15A9FE2A6A2CC09A2FB61021BF8E8086; '
                  'user_trace_token=20170501124201-1adf364d88864075b61dde9bdd5871ea; '
                  'LGUID=20170501124202-850be946-2e28-11e7-b43c-5254005c3644; '
                  'index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=index_search;'
                  ' SEARCH_ID=0a596428cb014d3bab7284f879e214f0; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1493613734;'
                  ' Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1493622592;'
                  ' LGRID=20170501150939-247d4c29-2e3d-11e7-8a78-525400f775ce;'
                  ' _ga=GA1.2.1933438823.1493613734'
    }
    for pn in range(1, max_pn):
        post_data = {
            'city': city,
            'needAddtionalResult': 'false',
            'first': 'true',
            'pn': str(pn),
            'kd': kd
        }
        # BUG FIX: a form POST must carry the fields in the request body
        # (data=), not the URL query string (params=) as the original did.
        response = requests.post(url=url, verify=False, data=post_data, headers=headers)
        print(response.content)
        data = json.loads(response.content)
        get_all_text(data)


if __name__ == '__main__':
    # Run the crawl only when executed as a script, not on import.
    get_page()