# 学习BeautifulSoup和requests下载图片

# 出处：http://www.ehcoblog.ml/category/1/
import requests,random,time
from bs4 import BeautifulSoup

# Browser-like request headers passed to the requests.get calls in this script.
# NOTE(review): the Cookie value looks like it was copied from a single browser
# session (PHPSESSID etc.) — presumably stale; confirm it is still needed.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    ),
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # One cookie pair per line, joined with '; ' into a single header value.
    'Cookie': '; '.join((
        '__gads=ID=e90e92452442eef2:T=1497427490:S=ALNI_MZi4whQxIsWkk8QW0yrpnZ7MyTtfw',
        'bdshare_firstime=1497599748961',
        'PHPSESSID=jbnm2jk08p9fripuj44lu07427',
        'yunsuo_session_verify=df3422073e655cde45e3dc9018fb2f94',
        'Hm_lvt_adaf29565debc85c07b8d3c36c148a6b=1497604233,1497604246,1497604251,1497921441',
        'Hm_lpvt_adaf29565debc85c07b8d3c36c148a6b=1497930371',
        'AJSTAT_ok_pages=74',
        'AJSTAT_ok_times=3',
    )),
}

# Candidate HTTP proxies; one is picked at random each time the module loads.
proxy_list = [
    'http://219.135.164.245:3128',
    'http://106.14.51.145:8118',  # tested on Sept 25, working
    # Disabled entries, kept for reference:
    # 'http://111.13.7.123:80',
    'http://122.224.227.202:3128',
    # 'http://218.201.98.196:3128',
]

proxy_ip = random.choice(proxy_list)
print(proxy_ip)  # show which proxy this run uses

# Only the 'http' scheme is mapped to the chosen proxy.
# NOTE(review): requests to https:// URLs presumably bypass it — confirm intent.
proxies = dict(http=proxy_ip)


def get_html(url):
    """Fetch *url* and return the response body decoded as GBK.

    The request is sent with the module-level ``headers`` and ``proxies`` and
    a 30-second timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or the literal string ``'something wrong'``
        when the request fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Catch only request/HTTP failures: a bare ``except`` would also hide
        # KeyboardInterrupt and genuine programming errors.
        print('request failed for {}: {}'.format(url, exc))
        return 'something wrong'
    # This site shows '???' when decoded as utf-8 and garbles some characters
    # as gb2312, so gbk is forced before reading the text.
    r.encoding = 'gbk'
    return r.text

def get_content(url):
    """Scrape the movie list at *url*, print each entry and save its image.

    For every ``<li>`` inside the ``<ul class="picList clearfix">`` element
    the title, the ``sIntro`` text, the actor line and the synopsis are
    printed, and the entry's image is written to ``<title>.png`` in the
    current directory. The function pauses five seconds after each download.

    Args:
        url: Address of the listing page, passed to ``get_html``.

    Returns:
        None.
    """
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    movies_list = soup.find('ul', class_='picList clearfix')
    if movies_list is None:
        # get_html returns a placeholder string on failure, so the list
        # container can be absent; stop here instead of raising
        # AttributeError on None.
        print('movie list not found in page: {}'.format(url))
        return
    movies = movies_list.find_all('li')
    print(movies)
    for top in movies:
        # Take the <img> src without its query string and prefix the scheme.
        img_url_1 = top.find('img')['src']
        img_url = 'https:' + img_url_1.split('?')[0]
        print(img_url)
        # Text of the <a> inside <span class="sTit"> is the title.
        name = top.find('span', class_='sTit').a.text
        print(name)
        # Do not call this local ``time``: that would shadow the ``time``
        # module and break the ``time.sleep`` call below.
        try:
            show_time = top.find('span', class_='sIntro').text
        except AttributeError:
            # find() returned None: the entry has no sIntro span.
            show_time = "暂无"
        actors = top.find('p', class_='pActor')
        # Children whose ``.string`` is None carry no single text value and
        # are skipped; each kept name is followed by two spaces.
        actor = ''
        if actors is not None:
            actor = ''.join(
                act.string + '  '
                for act in actors.contents
                if act.string is not None
            )

        # Synopsis paragraph.
        intro = top.find('p', class_='pTxt pIntroShow').text

        print("片名：{}\t{}\n{}\n{} \n \n ".format(name, show_time, actor, intro))

        # Download first so a failed request does not leave an empty file,
        # then write the raw bytes (binary mode, so no encoding argument).
        image = requests.get(
            img_url, headers=headers, proxies=proxies, timeout=30
        ).content
        with open(name + '.png', 'wb') as f:
            f.write(image)
        # Pause between downloads, after the file has been closed.
        time.sleep(5)


def main():
    """Run the scraper against the hard-coded start address."""
    # NOTE(review): '2345' has no scheme or host, so it looks like a
    # placeholder for the real listing URL — confirm before running.
    get_content('2345')


if __name__ == "__main__":
    main()
#```
# 最后编辑于：2017.12.07 17:21:33