# 这是我的第一个 Python 爬虫,抓取了好奇心日报(qdaily.com)主页实时滚动的五篇文章中的背景图。
import urllib.request
import os
def url_open(url, timeout=30):
    """Open *url* with a browser-like User-Agent and return the response.

    Args:
        url: The URL to request.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang the script forever.

    Returns:
        The response object produced by ``urllib.request.urlopen``. The
        caller is responsible for closing it (it can be used in a ``with``
        statement).

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for HTTP
            error statuses).
    """
    # A desktop-browser User-Agent is sent instead of urllib's default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.13 Safari/537.36',
    }
    request = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(request, timeout=timeout)
def _text_between(text, start_marker, end_marker):
    """Return the text between *start_marker* and the next *end_marker*, or None if either is missing."""
    start = text.find(start_marker)
    if start == -1:
        return None
    start += len(start_marker)
    end = text.find(end_marker, start)
    if end == -1:
        return None
    return text[start:end]


def download_hqx(folder='article_jpg', pages=10):
    """Download the banner image of each of the five homepage banner articles.

    The homepage at http://www.qdaily.com is fetched, the links tagged
    ``pc:click:banner0`` .. ``pc:click:banner4`` are followed, and the image
    inside each article's ``full-banner-bd imgcover`` element is saved into
    *folder* under the image's own file name. Each image URL is printed
    before it is downloaded.

    Args:
        folder: Directory the images are written to. It is created if it
            does not exist; an existing directory is reused. The process
            working directory is left untouched.
        pages: Accepted for backward compatibility; currently unused (the
            number of banners is fixed at five).

    Raises:
        urllib.error.URLError: If any of the HTTP requests fails.
        OSError: If the folder or an image file cannot be written.
    """
    # Imported here so the block does not depend on an extra file-level import.
    from urllib.parse import urljoin, urlsplit

    base_url = 'http://www.qdaily.com'
    banner_count = 5

    # exist_ok avoids FileExistsError when the script is run a second time.
    os.makedirs(folder, exist_ok=True)

    with url_open(base_url) as response:
        home_html = response.read().decode('utf-8')

    for index in range(banner_count):
        href = _text_between(
            home_html, 'pc:click:banner' + str(index) + '" href="', '"')
        if not href:
            print('banner %d: article link not found, skipped' % index)
            continue

        # urljoin copes with both site-relative and absolute hrefs.
        article_url = urljoin(base_url, href)
        with url_open(article_url) as response:
            article_html = response.read().decode('utf-8')

        src = _text_between(
            article_html, 'full-banner-bd imgcover"><img src="', '"')
        if not src:
            print('banner %d: banner image not found, skipped' % index)
            continue

        # Drop any query string so the plain image file is requested and the
        # saved name ends with the real extension, whatever that extension is.
        image_url = urljoin(article_url, src.split('?', 1)[0])
        filename = os.path.basename(urlsplit(image_url).path)
        if not filename:
            print('banner %d: image URL has no file name, skipped' % index)
            continue

        print(image_url)
        # Download before opening the output file so a failed request does
        # not leave an empty file behind.
        with url_open(image_url) as response:
            image_bytes = response.read()
        with open(os.path.join(folder, filename), 'wb') as f:
            f.write(image_bytes)
if __name__ == '__main__':
    # Script entry point: run the scraper with its default arguments.
    # download_hqx() returns nothing, so its result is not bound to a name.
    download_hqx()