1. Basics
2.1 The urllib library
urllib is a standard-library package commonly used for Python web scraping; its most frequently used modules are urllib.request and urllib.parse.
data = urllib.parse.urlencode(data).encode('utf-8')  # URL-encode data, then encode it as UTF-8 bytes
req = urllib.request.Request(url, headers=headers)  # build a request with custom headers
resp = urllib.request.urlopen(req)  # fetch the page
print(resp.read().decode("utf-8"))  # read and decode the response body
urllib.parse.urlparse(url)  # split a URL into six components (includes a params field)
urllib.parse.urlsplit(url)  # split a URL into five components (no params field)
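A minimal runnable sketch combining the calls above; it assumes the httpbin.org echo service and a made-up form field name:
from urllib import request, parse

url = 'http://httpbin.org/post'  # echo service (an assumption for illustration)
headers = {'User-Agent': 'Mozilla/5.0'}
data = parse.urlencode({'kw': 'python'}).encode('utf-8')  # 'kw' is a made-up field name
req = request.Request(url, data=data, headers=headers)  # a request with a body is sent as POST
resp = request.urlopen(req)
print(resp.read().decode('utf-8'))

# urlparse vs urlsplit: only urlparse separates the ;params segment from the path
print(parse.urlparse('http://example.com/path;type=a?q=1#frag'))
print(parse.urlsplit('http://example.com/path;type=a?q=1#frag'))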
2.2 Using a proxy
Use the ProxyHandler handler to send requests through a proxy.
Sites that list free proxy IPs include:
www.xicidaili.com
www.kuaidaili.com
www.dailiyun.com
# proxy IP example
from urllib import request
url = 'http://httpbin.org/ip'
handler = request.ProxyHandler({"http": "60.191.11.249:3128"})  # pass the proxy address to ProxyHandler to build a handler
opener = request.build_opener(handler)  # build an opener from the handler
resp = opener.open(url)  # send the request through the opener and read the response
print(resp.read())
2.3 Cookies
Cookies identify user information and are commonly used to simulate a login.
The http.cookiejar module provides operations for obtaining and managing cookie information.
# Renren example: log in, then fetch a profile page (cookies carry the login state)
from urllib import request
from urllib import parse
from http.cookiejar import CookieJar

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}

def get_opener():  # build an opener that stores cookies
    cookiejar = CookieJar()  # create a CookieJar object
    handler = request.HTTPCookieProcessor(cookiejar)  # create an HTTPCookieProcessor from the cookiejar
    opener = request.build_opener(handler)  # create an opener from the handler
    return opener

def login_renren(opener):  # log in
    # account and password for the login form
    data = {
        'email': '9701380748@qq.com',
        'password': 'pythonspider'
    }
    login_url = 'http://www.renren.com/PLogin.do'
    data = parse.urlencode(data).encode('utf-8')
    req = request.Request(login_url, data, headers)
    opener.open(req)

def get_dapeng(opener):  # fetch the profile page
    dapeng_url = 'http://www.renren.com/880151247/profile'
    req = request.Request(dapeng_url, headers=headers)
    resp = opener.open(req)
    with open('E:/桌面/dapeng.html', 'w', encoding='utf-8') as fp:
        fp.write(resp.read().decode('utf-8'))
    # print(resp.read().decode('utf-8'))

if __name__ == '__main__':  # entry point
    opener = get_opener()
    login_renren(opener)
    get_dapeng(opener)
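http.cookiejar can also persist cookies to disk. A minimal sketch, assuming MozillaCookieJar and a hypothetical cookie.txt file:
from urllib import request
from http.cookiejar import MozillaCookieJar

cookiejar = MozillaCookieJar('cookie.txt')  # hypothetical file name
handler = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handler)
opener.open('http://httpbin.org/cookies/set?name=value')  # this endpoint sets a cookie
cookiejar.save(ignore_discard=True)  # also save session cookies that would normally be discarded

# in a later run, load the saved cookies back into the jar
cookiejar.load(ignore_discard=True)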
3.1 The requests library
requests is a third-party library; install it with the pip package manager:
pip install requests
Its advantage: the API is more human-friendly and more convenient to use than urllib.
3.2 Sending requests with requests and the parts of a response
resp = requests.get(url, params={'kd': 'python'}, headers=headers)  # params holds the query-string keywords
resp = requests.post(url, data=data, headers=headers)  # data is URL-encoded automatically
print(resp.text)  # decoded response body
print(resp.content)  # raw bytes fetched from the page
print(resp.url)  # final URL of the request
print(resp.encoding)  # character encoding inferred from the response headers
print(resp.status_code)  # HTTP status code
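A runnable sketch, assuming the httpbin.org echo service, that exercises the attributes above:
import requests

headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get('http://httpbin.org/get',
                    params={'kd': 'python'},  # appended to the URL as ?kd=python
                    headers=headers)
print(resp.url)          # http://httpbin.org/get?kd=python
print(resp.status_code)  # 200 on success
print(resp.encoding)
print(resp.text[:200])   # first 200 characters of the decoded body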
3.3 Using a proxy with requests
Just pass the proxies parameter when sending the request.
# set a proxy IP
proxy = {
    'http': '140.143.142.14:1080'
}
requests.post(url, data=data, headers=headers, proxies=proxy)
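A self-contained sketch, assuming httpbin.org/ip (which echoes the caller's IP) and a placeholder proxy address:
import requests

proxy = {'http': 'http://140.143.142.14:1080'}  # placeholder; substitute a live proxy
resp = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=10)
print(resp.text)  # should show the proxy's IP rather than your own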
3.4 Loading and saving cookie information
Use requests.Session(), which keeps cookies across requests.
session = requests.Session()
session.post(url_1, data=data, headers=headers)  # cookies from this response are stored on the session for later requests
resp = session.get(url, headers=headers)
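A minimal sketch, assuming httpbin.org's cookie endpoints, showing a Session carrying a cookie from one request to the next:
import requests

session = requests.Session()
session.get('http://httpbin.org/cookies/set?token=abc123')  # server sets a cookie; the session stores it
resp = session.get('http://httpbin.org/cookies')  # the stored cookie is sent automatically
print(resp.json())  # {'cookies': {'token': 'abc123'}}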
3.5 Handling untrusted SSL certificates with requests
Add the parameter verify=False when sending the request.
requests.post(url, data=data, headers=headers, verify=False)
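verify=False makes urllib3 emit an InsecureRequestWarning on every request; an optional follow-up (not required by requests) is to silence it. A sketch, assuming the badssl.com test host:
import requests
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # suppress the warning
resp = requests.get('https://self-signed.badssl.com/', verify=False)  # host with a self-signed certificate
print(resp.status_code)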
Bonus
A first attempt at scraping Lagou (lagou.com).
# scrape Lagou with the requests module
import requests
url_1 = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
'Referer':'https://www.lagou.com/'
}
session = requests.Session()
session.get(url_1, headers=headers)  # visit the list page first so the session picks up the cookies required by the Ajax endpoint
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {
'origin':'https://www.lagou.com',
'referer':'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
data = {
'first':'true',
'pn':'1',
'kd':'python'
}
response = session.post(url, data=data, headers=headers)
print(response.json())
# with open('E:/桌面/lagou.txt', 'w', encoding='utf-8') as fp:  # optionally write the response to a file
#     fp.write(response.text)