# Source: http://www.ehcoblog.ml/category/1/
import requests,random,time
from bs4 import BeautifulSoup
# HTTP request headers sent with every request made by this script.
# They imitate a desktop Chrome 58 browser on Windows.
# NOTE(review): the Cookie value is a hard-coded browser session capture and
# is presumably stale by now -- confirm whether the site still needs it.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    ),
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # One cookie pair per line; joined back into a single header value.
    'Cookie': '; '.join((
        '__gads=ID=e90e92452442eef2:T=1497427490:S=ALNI_MZi4whQxIsWkk8QW0yrpnZ7MyTtfw',
        'bdshare_firstime=1497599748961',
        'PHPSESSID=jbnm2jk08p9fripuj44lu07427',
        'yunsuo_session_verify=df3422073e655cde45e3dc9018fb2f94',
        'Hm_lvt_adaf29565debc85c07b8d3c36c148a6b=1497604233,1497604246,1497604251,1497921441',
        'Hm_lpvt_adaf29565debc85c07b8d3c36c148a6b=1497930371',
        'AJSTAT_ok_pages=74',
        'AJSTAT_ok_times=3',
    )),
}
# Candidate HTTP proxies. One of them is picked at random when the module is
# loaded and reused for every request made afterwards.
proxy_list = [
    'http://219.135.164.245:3128',
    'http://106.14.51.145:8118',  # tested on September 25: working
    'http://122.224.227.202:3128',
    # Disabled entries, kept for reference:
    #   http://111.13.7.123:80
    #   http://218.201.98.196:3128
]
proxy_ip = random.choice(proxy_list)
# Only the 'http' scheme is mapped to the chosen proxy.
proxies = dict(http=proxy_ip)
print(proxy_ip)
def get_html(url):
    """Download *url* and return its body decoded as GBK.

    The request is sent with the module-level ``headers`` and ``proxies``
    and a 30 second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the literal string ``'something wrong'``
        when the request fails (invalid URL, connection error, timeout, or
        a 4xx/5xx status reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Catch only request failures: a bare ``except`` would also swallow
        # KeyboardInterrupt and genuine programming errors.
        print('request failed for {}: {}'.format(url, exc))
        return 'something wrong'
    # Author's note: decoding this site as utf-8 yields '???' and gb2312
    # garbles some characters, so gbk has to be forced.
    r.encoding = 'gbk'
    return r.text
def get_content(url):
    """Scrape the movie list at *url*, print each entry and save its poster.

    For every ``<li>`` inside ``<ul class="picList clearfix">`` the image
    URL, title, ``sIntro`` text, actor line and synopsis are printed, and
    the poster image is written to ``<title>.png`` in the current working
    directory. The function pauses five seconds after each movie.

    Args:
        url: Address of the listing page; fetched through ``get_html``.

    Returns:
        None. If the page holds no movie list (for example because the
        download failed) a message is printed and nothing else happens.
    """
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    movies_list = soup.find('ul', class_='picList clearfix')
    if movies_list is None:
        # get_html returns a placeholder string on failure, which parses to
        # a document without the expected list.
        print('no movie list found at {}'.format(url))
        return
    movies = movies_list.find_all('li')
    print(movies)
    for top in movies:
        # The src attribute is used without its query string and prefixed
        # with the scheme (presumably the site emits protocol-relative URLs).
        img_url_1 = top.find('img')['src']
        img_url = 'https:' + img_url_1.split('?')[0]
        print(img_url)
        # Title: text of the <a> inside <span class="sTit">.
        name = top.find('span', class_='sTit').a.text
        print(name)
        # Must not be called ``time``: that would shadow the time module
        # and break the time.sleep() call at the end of the loop body.
        intro_tag = top.find('span', class_='sIntro')
        release_time = intro_tag.text if intro_tag is not None else "暂无"
        actors = top.find('p', class_='pActor')
        actor = ''
        if actors is not None:
            # ``.string`` is None for a tag with several children; treat it
            # as empty text instead of failing on the concatenation.
            actor = ''.join((act.string or '') + ' ' for act in actors.contents)
        # Movie synopsis.
        intro = top.find('p', class_='pTxt pIntroShow').text
        print("片名:{}\t{}\n{}\n{} \n \n ".format(name, release_time, actor, intro))
        # The title comes from the scraped page, so strip path separators
        # before using it as a file name.
        safe_name = name.replace('/', '_').replace('\\', '_')
        try:
            resp = requests.get(img_url, headers=headers, proxies=proxies, timeout=30)
            resp.raise_for_status()
        except requests.RequestException as exc:
            # One failed poster should not abort the rest of the list.
            print('poster download failed for {}: {}'.format(img_url, exc))
        else:
            # Binary mode: no encoding argument applies here.
            with open(safe_name + '.png', 'wb') as f:
                f.write(resp.content)
        # Throttle between movies to go easy on the site.
        time.sleep(5)
def main():
    """Entry point: scrape the configured listing page."""
    # NOTE(review): '2345' has no scheme or host, so it looks like a
    # placeholder for the real listing URL -- confirm before running.
    get_content('2345')


if __name__ == "__main__":
    main()
#```