本例仿照崔庆才先生的案例编写。
他的个人博客地址是:http://cuiqingcai.com/
#!/bin/python3.4
# -*- coding:utf-8 -*-
import re
import json
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from requests.exceptions import RequestException
import requests
from config import *
from hashlib import md5
from multiprocessing import Pool
from json.decoder import JSONDecoder
from pymongo import MongoClient
import os
# Module-level MongoDB handles. MONGO_URL and MONGO_DB are brought into scope
# by the star-import of config; `db` is the database object that
# save_to_mongo() indexes by collection name.
# NOTE(review): connect=False presumably defers the actual connection until
# the first operation so that each multiprocessing worker connects on its
# own -- confirm against the PyMongo documentation.
client = MongoClient(MONGO_URL,connect=False)
db = client[MONGO_DB]
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of the Toutiao search index as raw response text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever and stall a worker.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request fails.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Timeouts are raised as RequestException subclasses, so they are
        # reported by the same handler as every other request failure.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求索引页面出错")
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(html):
    """Yield the article URLs found in a search-index JSON response.

    Args:
        html: JSON text as returned by the index request, or ``None`` when
            that request failed.

    Yields:
        The ``article_url`` value of every entry in the top-level ``data``
        list. Entries without a usable ``article_url`` are skipped.

    Empty input, text that is not valid JSON, and payloads without a
    ``data`` list all produce no URLs instead of raising.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError; catching the base class
        # also covers interpreters that predate JSONDecodeError.
        return
    if not isinstance(payload, dict):
        return
    # "data" may be absent or null in the payload; treat both as empty.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url
def get_page_detail(url, timeout=10):
    """Download an article detail page.

    Args:
        url: Address of the detail page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever and stall a worker.

    Returns:
        The page body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求详情页面出错", url)
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    The page is expected to embed its gallery as ``var gallery = {...};``
    in the HTML. Every image URL found there is also passed to
    ``download_image`` as a side effect.

    Args:
        html: Text of the detail page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with the keys ``title``, ``url`` and ``images`` when the page
        carries a gallery object with a ``sub_images`` entry; ``None`` when
        there is no gallery, the gallery is not valid JSON, or it has no
        ``sub_images`` key.
    """
    soup = BeautifulSoup(html, 'lxml')
    # A page without a <title> tag must not abort the crawl with IndexError.
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''

    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None
    try:
        data = json.loads(result.group(1))
    except ValueError:
        # The captured text is not valid JSON; treat it as "no gallery".
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    # "sub_images" may be null; entries lacking a "url" are dropped so that
    # None is never passed to download_image.
    sub_images = data.get('sub_images') or []
    images = [item.get('url') for item in sub_images if item.get('url')]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def save_to_mongo(result):
    """Store one parsed article in the configured MongoDB collection.

    Args:
        result: Document (dict) to insert into ``db[MONGO_TABLE]``.

    Returns:
        ``True`` when the insert produced a document id, ``False`` otherwise.
    """
    # Collection.insert() is deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is the supported single-document call.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print("存储到Mongodb成功", result)
        return True
    return False
def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever and stall a worker.

    Returns:
        Always ``None``. The image is saved only when the server answers
        with HTTP 200; request failures are reported and swallowed.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求图片出错")
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None
def save_image(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so identical images
    map to the same path and are written only once.

    Args:
        content: Raw image bytes.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)
def main(offset):
    """Crawl one page of search results and store every gallery found.

    Fetches the index page for ``offset`` using the configured ``KEYWORD``,
    then downloads, parses and saves each article it lists.

    Args:
        offset: Offset of the search-result page to process.
    """
    index_html = get_page_index(offset, KEYWORD)
    if not index_html:
        # The index request failed; there is nothing to parse for this page.
        return
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_mongo(result)
if __name__ == '__main__':
    # One task per result page: each group number in
    # [GROUP_START, GROUP_END] is multiplied by 20 to form the offset
    # passed to main().
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # Shut the worker processes down explicitly instead of leaving them
        # to interpreter teardown.
        pool.close()
        pool.join()
config.py 配置文件:
#!/bin/python3.4
# -*- coding:utf-8 -*-
# Host passed to MongoClient by the crawler.
MONGO_URL = 'localhost'
# Name of the database selected on the client.
MONGO_DB = 'toutiao'
# Name of the collection that save_to_mongo() inserts into.
MONGO_TABLE = 'toutiao'
# Inclusive range of page groups to crawl; the crawler multiplies each group
# number by 20 to obtain the request offset.
GROUP_START = 1
GROUP_END = 20
# Search keyword sent with every index request.
KEYWORD = '街拍'
images_pattern 正则匹配到的结果(result)转换成 JSON 后,其中 sub_images 字段的数据格式如下:
"sub_images":
[
{
"url":"http:\/\/p2.pstatp.com\/origin\/168300027e4c8323ee22",
"width":700,
"url_list":
[
{"url":"http:\/\/p2.pstatp.com\/origin\/168300027e4c8323ee22"},
{"url":"http:\/\/pb3.pstatp.com\/origin\/168300027e4c8323ee22"},
{"url":"http:\/\/pb3.pstatp.com\/origin\/168300027e4c8323ee22"}
],
"uri":"origin\/168300027e4c8323ee22","height":981
},
{
"url":"http:\/\/p2.pstatp.com\/origin\/168600026fb5ecf86ba9",
"width":700,
"url_list":
[
{"url":"http:\/\/p2.pstatp.com\/origin\/168600026fb5ecf86ba9"},
{"url":"http:\/\/pb3.pstatp.com\/origin\/168600026fb5ecf86ba9"},
{"url":"http:\/\/pb3.pstatp.com\/origin\/168600026fb5ecf86ba9"}
],
"uri":"origin\/168600026fb5ecf86ba9","height":891
},
{
"url":"http:\/\/p3.pstatp.com\/origin\/16870003ef0948da7863",
"width":700,
"url_list":
[
{"url":"http:\/\/p3.pstatp.com\/origin\/16870003ef0948da7863"},
{"url":"http:\/\/pb2.pstatp.com\/origin\/16870003ef0948da7863"},
{"url":"http:\/\/pb3.pstatp.com\/origin\/16870003ef0948da7863"}
],
"uri":"origin\/16870003ef0948da7863","height":1078
},
{
"url":"http:\/\/p1.pstatp.com\/origin\/16820003ee9c72717ad5",
"width":700,
"url_list":
[
{"url":"http:\/\/p1.pstatp.com\/origin\/16820003ee9c72717ad5"},
{"url":"http:\/\/pb3.pstatp.com\/origin\/16820003ee9c72717ad5"},
{"url":"http:\/\/pb3.pstatp.com\/origin\/16820003ee9c72717ad5"}
],
"uri":"origin\/16820003ee9c72717ad5","height":999
},
{
"url":"http:\/\/p1.pstatp.com\/origin\/16870003ef0b2bbec810",
"width":960,
"url_list":
[
{"url":"http:\/\/p1.pstatp.com\/origin\/16870003ef0b2bbec810"},
{"url":"http:\/\/pb3.pstatp.com\/origin\/16870003ef0b2bbec810"},
{"url":"http:\/\/pb3.pstatp.com\/origin\/16870003ef0b2bbec810"}
],
"uri":"origin\/16870003ef0b2bbec810","height":609
}
],