Python 2:
Web request method 1: when request headers need to be set; use urllib.urlencode(data) to encode the POST data
import urllib2
req = urllib2.Request(url)  # request headers can be set and passed in here
response = urllib2.urlopen(req)
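A minimal runnable sketch of method 1; the URL, form fields, and User-Agent value below are placeholders, not from the original notes:
import urllib
import urllib2
url = 'http://example.com/login'  # placeholder URL
headers = {'User-Agent': 'Mozilla/5.0'}  # request headers passed to Request
data = urllib.urlencode({'user': 'me', 'pwd': 'secret'})  # encode the POST form data
req = urllib2.Request(url, data=data, headers=headers)
response = urllib2.urlopen(req)
html = response.read()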
Web request method 2: when no headers are needed
import urllib2
response = urllib2.urlopen(url)
html = response.read()
Using a proxy
import urllib2
proxy=urllib2.ProxyHandler({'http':'127.0.0.1:8087'})
opener=urllib2.build_opener(proxy)
response=opener.open(url)
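If every later urllib2.urlopen() call should go through the proxy, the opener can also be installed globally; a sketch using the same local proxy address as above (the target URL is a placeholder):
import urllib2
proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)  # from here on, urllib2.urlopen() uses the proxy
response = urllib2.urlopen('http://example.com')  # placeholder URL
html = response.read()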
Web request method 3: using cookies (this is the approach I used for 12306)
import urllib2
import urllib
import cookielib
from json import loads
c=cookielib.LWPCookieJar()
cookie=urllib2.HTTPCookieProcessor(c)
opener=urllib2.build_opener(cookie)
urllib2.install_opener(opener)
req = urllib2.Request(url,data=data,headers=headers)
html = opener.open(req).read()  # pass only req to open(), otherwise it will not run correctly
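Putting method 3 together as a self-contained sketch; the login URL, form fields, and headers are placeholders, not the real 12306 parameters:
import urllib
import urllib2
import cookielib
c = cookielib.LWPCookieJar()
cookie = urllib2.HTTPCookieProcessor(c)
opener = urllib2.build_opener(cookie)
urllib2.install_opener(opener)
url = 'http://example.com/login'  # placeholder login URL
headers = {'User-Agent': 'Mozilla/5.0'}
data = urllib.urlencode({'username': 'me', 'password': 'secret'})
req = urllib2.Request(url, data=data, headers=headers)
html = opener.open(req).read()  # cookies returned by the server stay in c for later requests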
Web request method 4: the requests library
requests.get()
requests.post()
ses=requests.session()
ses.get()
ses.post()
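A short sketch of the requests calls listed above; the URLs and form fields are placeholders:
import requests
r = requests.get('http://example.com', headers={'User-Agent': 'Mozilla/5.0'})  # plain GET
r = requests.post('http://example.com/login', data={'user': 'me', 'pwd': 'secret'})  # plain POST
ses = requests.session()  # a session keeps cookies across requests
ses.post('http://example.com/login', data={'user': 'me', 'pwd': 'secret'})
r = ses.get('http://example.com/profile')  # sent with the cookies from the login above
html = r.text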
Python 3:
import urllib.request
Method 1
response=urllib.request.urlopen(url)
Method 2
req=urllib.request.Request(url)  # request headers can be set and passed in here
response=urllib.request.urlopen(req)
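A sketch of method 2 with a header dict; the URL and User-Agent value are placeholders:
import urllib.request
url = 'http://example.com'  # placeholder URL
headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')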
Method 3: using a proxy IP
import urllib.request
proxy=urllib.request.ProxyHandler({
'http':'129.46.13.157:8080',
'https':'182.30.179.29:9797'
})
opener=urllib.request.build_opener(proxy)  # use the ProxyHandler defined above
response=opener.open(url)
response.read()
Method 4: the requests library
requests.get()
requests.post()
ses=requests.session()
ses.get()
ses.post()
Or, using a proxy:
import requests
proxies={
'http':'ip:port',
'https':'ip:port',
}
requests.get(url,proxies=proxies)
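For example, a session combined with the proxies dict (the proxy addresses and URL are placeholders):
import requests
proxies = {
    'http': 'http://127.0.0.1:8080',  # placeholder proxy addresses
    'https': 'http://127.0.0.1:8080',
}
ses = requests.session()
r = ses.get('http://example.com', proxies=proxies)
html = r.text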
Method 5: using cookies
import urllib.request
from http import cookiejar
See 玩转python网络爬虫, Chapter 5, page 49.
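The notes only point to the book here; a minimal sketch of the http.cookiejar plus urllib.request pattern (not the book's code, and the URL is a placeholder) looks like this:
import urllib.request
from http import cookiejar
jar = cookiejar.CookieJar()
cookie_handler = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(cookie_handler)
response = opener.open('http://example.com')  # placeholder URL
for item in jar:  # cookies set by the server
    print(item.name, item.value)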
Method 6: HTTP basic authentication
from urllib.request import HTTPPasswordMgrWithDefaultRealm,HTTPBasicAuthHandler,build_opener
from urllib.error import URLError
username='username'
password='password'
p=HTTPPasswordMgrWithDefaultRealm()
p.add_password(None,url,username,password)
auth_handler=HTTPBasicAuthHandler(p)
opener=build_opener(auth_handler)  # a proxy handler can also be passed in here; see the sketch after this example
try:
    result = opener.open(url)
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)
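As noted above, a ProxyHandler can be passed to build_opener alongside the auth handler; a self-contained sketch (the URL, credentials, and proxy address are placeholders):
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, ProxyHandler, build_opener
url = 'http://example.com/protected'  # placeholder URL
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, 'username', 'password')
auth_handler = HTTPBasicAuthHandler(p)
proxy_handler = ProxyHandler({'http': 'http://127.0.0.1:8080'})  # placeholder proxy
opener = build_opener(proxy_handler, auth_handler)  # both handlers can be combined
result = opener.open(url)
html = result.read().decode('utf-8')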
Other modules and methods
httplib/urllib are used relatively rarely; see 初识网络爬虫, p. 77.