- 功能:输入一个想要爬取的关键词(比如“超级玛丽”),程序会自动分页加载并持续爬取,直到没有更多图片为止。
import requests
import os
import re
# Result offset (image index) at which the crawl starts.
pn = 0
# Number of images requested per page. pn and rn mirror the query parameters
# of the same names, which were found with the browser developer tools.
rn = 30
# Search keyword, also used as the download directory name. Kept ASCII because
# a Chinese name produced a garbled folder name.
name = "chaojimali"
def getImagePath(pn=0, max_failures=3):
    """Download the images Baidu image search returns for the module-level ``name``.

    Result pages are requested ``rn`` results at a time, starting at result
    offset ``pn``. Every image URL found on a page is fetched and written into
    a directory called ``name`` (created when missing), using the last path
    segment of the URL as the file name. Crawling stops when a page yields no
    image URL that has not been seen already, or after ``max_failures``
    consecutive page requests fail.

    Args:
        pn: Zero-based offset of the first search result to request.
        max_failures: Number of consecutive failed page requests tolerated
            before giving up.

    Raises:
        OSError: If the download directory cannot be created.
    """
    # A single capture group makes findall() yield plain URL strings.
    # [^"] keeps a match inside one quoted value, and the dot before the
    # extension is escaped so it only matches a literal ".".
    link_pattern = re.compile(
        r'"((?:http|ftp)s?://[^"]*?\.(?:png|jpg|jpeg|gif))"')
    headers = {"User-Agent": "Mozilla/5.0"}
    timeout = 10  # seconds; without it a stalled connection blocks forever

    if not os.path.isdir(name):
        os.mkdir(name)

    seen = set()
    failures = 0
    while failures < max_failures:
        url = '''http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%s&pn=%d&rn=%d''' % (name, pn, rn)
        # Advance first so a failed page is skipped instead of retried.
        pn += rn
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException:
            failures += 1
            continue
        failures = 0

        # response.text is decoded text, so the str pattern works on both
        # Python 2 and Python 3 (response.content is bytes on Python 3).
        new_links = []
        for link in link_pattern.findall(response.text):
            if link not in seen:
                seen.add(link)
                new_links.append(link)
        if not new_links:
            # Nothing new on this page: the result set is exhausted.
            break

        for imgPath in new_links:
            try:
                image = requests.get(imgPath, timeout=timeout)
            except requests.RequestException:
                # One unreachable image must not abort the rest of the page.
                continue
            # Only save the body of a successful response.
            if image.status_code != 200:
                continue
            print(imgPath)
            filename = imgPath.rsplit("/", 1)[-1]
            try:
                with open(os.path.join(name, filename), "wb") as out:
                    out.write(image.content)
            except (IOError, OSError):
                # Saving failed (e.g. invalid file name); skip this image.
                continue
# Entry point: start crawling from the result offset configured in pn.
getImagePath(pn)
全栈第一步~~~~哈哈哈哈哈啊哈哈哈哈哈
Python也很有趣啊