# Analysis is not covered here for now.
import base64
import re
import time
import requests
'''抓取美拍 指定分类 视频'''
class MeiPai:
    """Download Meipai (meipai.com) videos that belong to one category.

    Typical use: create an instance, set ``category_name`` to one of the keys
    of ``category``, set ``DEBUG = False`` if files should really be written,
    then call ``start()``.
    """

    # Seconds to wait on each HTTP request, so that a stalled connection
    # cannot hang the whole run.
    REQUEST_TIMEOUT = 10

    # Number of entries requested per listing page; a shorter page is treated
    # as the last one.
    PAGE_SIZE = 24

    # Size in bytes of each chunk written while streaming a video to disk.
    CHUNK_SIZE = 64 * 1024

    def __init__(self):
        self.home_url = 'http://www.meipai.com'
        # Category name -> [category id, path of the listing endpoint].
        self.category = {
            '搞笑': ['13', '/squares/new_timeline'],
            '爱豆': ['16', '/squares/new_timeline'],
            '高颜值': ['474', '/squares/new_timeline'],
            '舞蹈': ['63', '/topics/hot_timeline'],
            '精选': ['488', '/squares/new_timeline'],
            '音乐': ['62', '/topics/hot_timeline'],
            '美食': ['59', '/topics/hot_timeline'],
            '美妆': ['27', '/squares/new_timeline'],
            '吃秀': ['423', '/squares/new_timeline'],
            '宝宝': ['18', '/topics/hot_timeline'],
            '宠物': ['6', '/topics/hot_timeline'],
            '手工': ['450', '/topics/hot_timeline'],
            '游戏': ['480', '/topics/hot_timeline'],
            '运动': ['487', '/topics/hot_timeline'],
            '穿秀': ['460', '/topics/hot_timeline'],
        }
        self.category_name = None
        self.video_id = None  # id of the video currently being processed
        self.total = 0  # number of videos downloaded so far
        self.page = 1  # listing page to start downloading from
        self.DEBUG = True  # debug mode (default): nothing is really downloaded

    def tid(self):
        """Return the ``tid`` query value for the selected category.

        The value is the ``interested_id`` found in the HTML of the category's
        square page. When the page reports ``0``, the category's own id is
        used instead.

        Raises:
            KeyError: ``category_name`` is not a key of ``category``.
            IndexError: the page contains no ``interested_id`` field.
        """
        category_id = self.category[self.category_name][0]
        url = '{}/square/{}'.format(self.home_url, category_id)
        html = requests.get(url, timeout=self.REQUEST_TIMEOUT).text
        match = re.search(r'interested_id: (\d+)', html)
        if match is None:
            # Same exception type as indexing an empty result list, but with
            # enough context to diagnose a changed page layout.
            raise IndexError('interested_id not found in {}'.format(url))
        found = match.group(1)
        return category_id if found == '0' else found

    def ajax(self, tid, total_page=None):
        """Yield the encoded video link of every entry, page after page.

        ``self.video_id`` is set to the entry's id right before its link is
        yielded; entries without an id are skipped. Iteration ends after
        ``total_page`` pages or when a page comes back with fewer than
        ``PAGE_SIZE`` entries.

        :param tid: value returned by ``tid()``
        :param total_page: how many pages to fetch (24 videos per page);
            ``None`` (default) means no limit
        """
        url = self.home_url + self.category[self.category_name][1]
        while True:
            print('正在下载第{}页...'.format(self.page))
            params = {
                'page': self.page,
                'count': self.PAGE_SIZE,
                'tid': tid,
            }
            response = requests.get(
                url, params=params, timeout=self.REQUEST_TIMEOUT)
            # A missing or null 'medias' field is treated as an empty page.
            medias = response.json().get('medias') or []
            for media in medias:  # each page holds at most PAGE_SIZE entries
                self.video_id = media.get('id')
                if self.video_id:
                    yield media.get('video')
            # '>=' rather than '==' so a limit below the current page still
            # terminates instead of paging forever.
            reached_limit = (
                total_page is not None and self.page >= total_page)
            if reached_limit or len(medias) < self.PAGE_SIZE:
                print('共下载了{}页,{}个视频'.format(self.page, self.total))
                break
            time.sleep(1)
            self.page += 1

    #################################################
    @staticmethod
    def decode(code):
        """Return the plain video URL hidden in an encoded link.

        The first four characters, reversed and read as hexadecimal, give a
        decimal number whose first two digits (``pre``) and remaining digits
        (``tail``) locate two filler runs inside the rest of the string.
        Both runs are removed and what is left is base64-decoded.

        :param code: video link before decoding
        """
        first_4 = str(int(code[:4][::-1], 16))
        pre = [int(digit) for digit in first_4[:2]]
        tail = [int(digit) for digit in first_4[2:]]
        code = code[4:]
        # Drop pre[1] filler characters starting at offset pre[0].
        head_filler = code[pre[0]:pre[0] + pre[1]]
        code = code[:pre[0]] + code[pre[0]:].replace(head_filler, '', 1)
        # The second offset is counted from the end of the remaining text.
        tail[0] = len(code) - sum(tail)
        tail_filler = code[tail[0]:tail[0] + tail[1]]
        code = code[:tail[0]] + code[tail[0]:].replace(tail_filler, '', 1)
        return base64.b64decode(code).decode()

    def download(self, video_url):
        """Save the video as ``<video_id>.mp4`` in the working directory.

        In debug mode (the default) only the progress line is printed and
        nothing is fetched or written.

        :param video_url: decoded download address of the video
        :raises requests.RequestException: the request failed, timed out or
            returned an HTTP error status
        """
        print('正在下载...{}'.format(video_url))
        if self.DEBUG:
            return
        # Stream to disk so a large video is never held in memory at once.
        with requests.get(video_url, stream=True,
                          timeout=self.REQUEST_TIMEOUT) as response:
            # Checked before the file is opened so an error page is never
            # saved under an .mp4 name.
            response.raise_for_status()
            with open('{}.mp4'.format(self.video_id), 'wb') as f:
                for chunk in response.iter_content(
                        chunk_size=self.CHUNK_SIZE):
                    f.write(chunk)

    def start(self, total_page=None):
        """Download every video of the selected category.

        A link that cannot be decoded, or a video whose download fails, is
        reported and skipped; ``self.total`` counts only the videos that
        went through.

        :param total_page: how many pages to download (24 videos per page);
            ``None`` (default) means no limit
        """
        tid = self.tid()
        for encoded in self.ajax(tid, total_page):
            try:
                video = self.decode(encoded)
            except (ValueError, LookupError, TypeError) as e:
                print(e, self.video_id, '解密失败!')
                continue
            try:
                self.download(video)
            except requests.RequestException as e:
                print(e, self.video_id, '下载失败!')
                continue
            self.total += 1
if __name__ == '__main__':
    # Script entry point: really download (debug mode off) one page of
    # videos from the '舞蹈' category.
    downloader = MeiPai()
    downloader.category_name = '舞蹈'
    downloader.DEBUG = False
    downloader.start(1)  # see the start() docstring for the argument