A practice exercise to get more familiar with regular expressions.
A request headers dict has to be constructed or the site will not respond normally. Each result is assembled into a dictionary and printed directly rather than written out or saved. To avoid getting the IP banned, no multithreading is used; instead a sleep is added between requests.
The program is organized into a few short functions, each doing one simple job, which makes it easy to understand and check. To persist the results, only a function that writes to a file or a database needs to be added; a sketch of such a function follows the code below.
import requests
import re
import time

# The ten list pages of the Maoyan Top 100 board (offsets 0, 10, ..., 90).
urls = ["http://maoyan.com/board/4?offset={}".format(i) for i in range(0, 100, 10)]

# A browser-like User-Agent is required, otherwise the site will not serve the page normally.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
}

def get_onepage(url):
    # Fetch one list page; return its HTML, or None on a non-200 response.
    web_data = requests.get(url, headers=headers)
    if web_data.status_code == 200:
        return web_data.text
    return None

def parse_onepage(web_text):
    # Each movie sits in a <dd> block: rank, title, stars, release time,
    # and a score split into an integer part and a fraction part.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?name">.*?title="(.*?)"'
        r'.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(\d+)</i>.*?</dd>', re.S)
    return re.findall(pattern, web_text)

def main():
    for url in urls:
        web_text = get_onepage(url)
        if web_text is None:
            continue
        items = parse_onepage(web_text)
        for i in items:
            data = {
                "index": i[0],
                "title": i[1],
                "actor": i[2].split()[0],   # take the star line without its surrounding whitespace
                "time": i[3],
                "score": i[4] + i[5],       # join the integer and fractional parts of the score
            }
            print(data)
        time.sleep(2)  # pause between pages to avoid getting the IP banned

if __name__ == '__main__':
    main()
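
As noted above, saving the results only takes one more function. Here is a minimal sketch of such a helper (the save_onepage name and the results.txt path are illustrative choices, not part of the original script); it appends each movie dict as one JSON line, keeping the write step as its own small function in the same style as the rest of the program.

import json

def save_onepage(data, path="results.txt"):
    # Append one movie record as a JSON line; ensure_ascii=False keeps
    # the Chinese titles human-readable in the output file.
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")

Calling save_onepage(data) right after (or instead of) print(data) inside main would persist every record; a database version could follow the same shape, writing with sqlite3 instead of a plain file.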