#猫眼电影
import requests
from lxml import etree
import csv
# import time
headers = {
'user-agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Mobile Safari/537.36'
}
def get_url(url):
res = requests.get(url,headers=headers)
# print(res.text)
html = etree.HTML(res.text)
infos = html.xpath('//dl[@class="board-wrapper"]/dd')
for info in infos:
title = info.xpath('div/div/div[1]/p[1]/a/text()')[0]
author = info.xpath('div/div/div[1]/p[2]/text()')[0].strip().strip('主演:')
pub_time = info.xpath('div/div/div[1]/p[3]/text()')[0].strip('上映时间:')
star_1 = info.xpath('div/div/div[2]/p/i[1]/text()')[0]
star_2 = info.xpath('div/div/div[2]/p/i[2]/text()')[0]
star = star_1 + star_2
movie_url = 'https://maoyan.com' + info.xpath('div/div/div[1]/p[1]/a/@href')[0]
# print(title,author,pub_time,star,movie_url)
get_info(movie_url,title,author,pub_time,star)
def get_info(url,title,author,pub_time,star):
res = requests.get(url, headers=headers)
html = etree.HTML(res.text)
style = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/text()')[0]
long_time = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[2]/text()')[0].split('/')[1].strip().strip('分钟')
# print(title,author,pub_time,star,style,long_time)
writer.writerow([title,author,pub_time,star,style,long_time])
if __name__ == '__main__':
fp = open('C://Users/madin/Desktop/maoyan.csv','w',newline='',encoding='utf-8')
writer = csv.writer(fp)
writer.writerow(['title','author','pub_time','star','style','long_time'])
urls = ['https://maoyan.com/board/4?offset={}'.format(str(i)) for i in range(0,100,10)]
for url in urls:
get_url(url)