Regular expressions:
Single-character matches (see the example after this list)
. any character except a newline
\d a digit
\D a non-digit
\w a word character [a-zA-Z0-9_]
\W a non-word character
\s a whitespace character: space, \n, \t, etc.
\S a non-whitespace character
^ anchors the match at the start of the string
$ anchors the match at the end of the string
[0-9] character class for the digits 0-9, equivalent to \d
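A quick sanity check of these character classes (a minimal sketch; the sample strings are made up for illustration):

import re

print(re.findall(r'\d', 'a1b2c3'))     # ['1', '2', '3']
print(re.findall(r'\D', 'a1b2'))       # ['a', 'b']
print(re.findall(r'\w', 'py_3 !'))     # ['p', 'y', '_', '3']
print(re.findall(r'\s', 'a b\tc\n'))   # [' ', '\t', '\n']
print(re.findall(r'^abc', 'abcdef'))   # ['abc']  (anchored at the start)
print(re.findall(r'def$', 'abcdef'))   # ['def']  (anchored at the end)
print(re.findall(r'[0-9]', 'p=12'))    # ['1', '2']  (same as \d)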
Multi-character matches (greedy)
* matches the preceding character any number of times (including zero)
+ matches the preceding character one or more times
? matches the preceding character zero or one time
{n,m} matches the preceding character n to m times
Multi-character matches (non-greedy; see the contrast example after this list)
*? non-greedy version of *
+? non-greedy version of +
?? non-greedy version of ?
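A minimal sketch contrasting greedy and non-greedy quantifiers (the HTML snippet and strings are made up):

import re

html = '<li>one</li><li>two</li>'
print(re.findall(r'<li>(.*)</li>', html))      # ['one</li><li>two']  greedy: .* grabs as much as possible
print(re.findall(r'<li>(.*?)</li>', html))     # ['one', 'two']       non-greedy: .*? stops at the first </li>
print(re.findall(r'ab{1,2}', 'abbb'))          # ['abb']              {1,2} allows one or two b's
print(re.findall(r'colou?r', 'color colour'))  # ['color', 'colour']  ? makes the u optional

This is exactly why the scraping patterns further down use .*? between tags rather than .*.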
Other (see the example after this list)
() grouping, captures a sub-match
| alternation (logical OR)
\ escapes a metacharacter
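A small sketch of grouping, alternation, and escaping (sample strings are illustrative):

import re

m = re.search(r'(\d{4})-(\d{2})', '2019-08')
print(m.group(1), m.group(2))                  # 2019 08   (each () captures a sub-match)
print(re.findall(r'cat|dog', 'cat dog bird'))  # ['cat', 'dog']   (| means "or")
print(re.findall(r'3\.14', 'pi=3.14'))         # ['3.14']   (\. escapes the . metacharacter)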
Methods of the re module (each is exercised in the example after this list)
re.compile(): build a compiled regular-expression object
re.match(): match only from the start of the string; single match, returns a Match object on success, otherwise None
re.search(): scan the whole string; single match, returns the first Match object found, otherwise None
re.findall(): find every match of the pattern in the string and return them as a list
re.finditer(): find every match of the pattern in the string and return them as an iterator of Match objects
re.sub(): replace matches of the pattern in a string
re.split(): split a string on matches of the pattern
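A short sketch exercising each of these functions on a made-up string:

import re

text = 'id: 10, id: 20, id: 30'
pattern = re.compile(r'\d+')

print(re.match(pattern, text))             # None, match() only tries at position 0
print(re.search(pattern, text).group())    # '10', the first hit anywhere in the string
print(re.findall(pattern, text))           # ['10', '20', '30'], every hit as a list
for m in re.finditer(pattern, text):       # finditer() yields Match objects one by one
    print(m.group(), m.span())
print(re.sub(pattern, 'N', text))          # 'id: N, id: N, id: N'
print(re.split(r',\s*', text))             # ['id: 10', 'id: 20', 'id: 30']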
Using regular expressions (a complete example: scraping the hengyan.com click ranking):

from urllib import request
import re


def get_rank_data(url='http://top.hengyan.com/dianji/default.aspx?p=1'):
    # Build the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    }

    # urlopen() parameters:
    #   url: the target URL
    #   data=None: None means a GET request; a non-None body turns it into a POST request
    #   timeout: request timeout in seconds
    #   cafile=None, capath=None, cadefault=False: certificate-related parameters
    #   context=None: an SSLContext, e.g. for skipping certificate verification
    # urlopen() itself cannot carry custom request headers:
    #   response = request.urlopen(url=url, timeout=10)
    # To add headers, wrap the URL in a Request object first:
    req = request.Request(url=url, headers=headers)
    response = request.urlopen(req, timeout=10)

    # Status code and URL of the current request
    code = response.status
    url = response.url
    print(code, url)

    # bytes -> str: decode;  str -> bytes: encode
    b_content = response.read()
    # print(b_content)
    html = b_content.decode('utf-8')
    # print(html)

    # File operations; available open() modes: w, w+, wb, wb+, a, a+, ab, ab+, r, rb
    with open('hengyan.html', 'w') as file:
        file.write(html)

    # Parse the data with regular expressions
    # re.S makes . match newline characters as well
    pattern = re.compile(r'<div\sclass="list">(.*?)</div>', re.S)
    ul_str = re.findall(pattern, html)[0]

    pattern1 = re.compile(r'<ul.*?>(.*?)</ul>', re.S)
    li_strs = re.findall(pattern1, ul_str)[1:]

    for li_str in li_strs:
        # print(li_str)
        pattern = re.compile(
            r'<li\sclass="num">(.*?)</li>' +
            r'.*?<a.*?>(.*?)</a>' +
            r'.*?<li.*?>(.*?)</li>' +
            r'.*?<li.*?>(.*?)</li>' +
            r'.*?<li.*?>(.*?)</li>' +
            r'.*?<li.*?>(.*?)</li>',
            re.S
        )
        data = re.findall(pattern=pattern, string=li_str)[0]
        print(data)

    # Extract the next page ('下一页' is the text of the "next page" link)
    if '下一页' in html:
        # There is still another page
        pattern = re.compile(r'<span\sclass="pageBarCurrentStyle">(.*?)</span>', re.S)
        current_page = int(re.findall(pattern, html)[0])
        next_page = current_page + 1
        # Build the URL of the next page
        next_page_url = re.sub(r'\d+', str(next_page), url)
        print(next_page_url)
        get_rank_data(next_page_url)
    else:
        print('All pages extracted')


if __name__ == '__main__':
    get_rank_data()
Author: 某某某的洛先生
Source: CSDN
Original post: https://blog.csdn.net/cc576795555/article/details/98338862
Copyright notice: this is the blogger's original article; please include a link to the original post when reposting.
xpath:
XPath (XML Path Language) is a language for finding information in XML documents; it can be used to traverse the elements and attributes of an XML (or HTML) document.
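Before the full spider below, a minimal sketch of the two kinds of XPath expressions it relies on, run against a made-up HTML fragment (requires lxml):

from lxml import etree

html = '''
<div class="threadlist_title">
    <a href="/p/111">first post</a>
    <a href="/p/222">second post</a>
</div>
'''
content = etree.HTML(html)  # parse the HTML string into an element tree
print(content.xpath('//div[@class="threadlist_title"]/a/@href'))   # ['/p/111', '/p/222']   attribute values
print(content.xpath('//div[@class="threadlist_title"]/a/text()'))  # ['first post', 'second post']   text nodes

The spider below applies the same @href / @src extraction to the real tieba list and detail pages.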
import requests
import re
import time
import urllib.parse
from lxml import etree


class MeiNv():
    def __init__(self):
        self.url = 'http://www.tieba.baidu.com/category/40076/?page='
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'
        }

    # List page: collect the links to the detail pages
    def loadpage(self, url):
        response = requests.get(url=url, headers=self.headers)
        html = response.content.decode('utf-8')
        with open('baidu.html', 'w') as f:
            f.write(html)
        # Turn the HTML string into an element tree
        content = etree.HTML(html)
        # print(content)
        url_list = content.xpath(
            '//div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href'
        )
        # print(url_list)
        for detail_url in url_list:
            full_url = 'http://tieba.baidu.com' + detail_url
            self.detail(full_url)

    # Detail page: collect the image URLs
    def detail(self, url):
        response = requests.get(url=url)
        html = response.content.decode('utf-8')
        content = etree.HTML(html)
        img_list = content.xpath(
            '//img[@class="BDE_Image"]/@src'
        )
        for img in img_list:
            self.download(img)

    # Download the image bytes; no decoding needed
    def download(self, url):
        response = requests.get(url=url)
        self.save(response.content, url)

    # Save to disk (the tieba/ directory must already exist)
    def save(self, content, img_url):
        filename = 'tieba/' + img_url[-10:] + '.jpg'
        print('Downloading ' + filename)
        with open(filename, 'wb') as f:
            f.write(content)

    def main(self):
        kw = input('Enter the tieba keyword: ')
        start = int(input('Enter the start page: '))
        end = int(input('Enter the end page: '))
        for i in range(start, end + 1):
            # Build the URL for page i; URL-encode the keyword so non-ASCII input works
            full_url = self.url + 'f?' + 'kw=' + urllib.parse.quote(kw) + '&' + 'pn=' + str((i - 1) * 50)
            self.loadpage(full_url)


if __name__ == '__main__':
    mn = MeiNv()
    mn.main()
bs4:
Like lxml, Beautiful Soup is an HTML/XML parser whose main job is parsing and extracting HTML/XML data.
lxml only traverses the document locally, while Beautiful Soup is based on the HTML DOM: it loads the whole document and builds the complete DOM tree, so its time and memory overhead are much larger and its performance is lower than lxml's.
BeautifulSoup is simple to use for parsing HTML and has a very friendly API; it supports CSS selectors, the HTML parser from the Python standard library, and lxml's XML parser.
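A minimal sketch of BeautifulSoup with the CSS selectors used below, on a made-up fragment (requires beautifulsoup4 plus lxml as the backend):

from bs4 import BeautifulSoup

html = '''
<ul class="all-img-list cf">
    <li><div class="book-mid-info"><h4><a href="/book/1">Book One</a></h4><p><a>Author A</a></p></div></li>
</ul>
'''
bs = BeautifulSoup(html, 'lxml')                         # parse with the lxml parser
for li in bs.select('ul[class="all-img-list cf"] li'):   # attribute selector + descendant selector
    a = li.select('div[class="book-mid-info"] h4 a')[0]
    print(a.get_text(), a.get('href'))                   # Book One /book/1

select() returns a list of matching tags; get_text() extracts the text and get() reads an attribute, which is all the spider below needs.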
import requests
from bs4 import BeautifulSoup
import urllib.parse
import jsonpath
import json
import re


class QiDianSpider():
    def __init__(self):
        self.url = 'https://www.address.com/all?page=1'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'
        }

    # List page: extract title, author, category, link and book id for every book
    def loadpage(self, url):
        response = requests.get(url=url, headers=self.headers)
        bs = BeautifulSoup(response.text, 'lxml')
        li_list = bs.select('ul[class="all-img-list cf"] li')
        for li in li_list:
            title = li.select('div[class="book-mid-info"] h4 a')[0].get_text()
            href = urllib.parse.urljoin(response.url, li.select('div[class="book-mid-info"] h4 a')[0].get('href'))
            author = li.select('div[class="book-mid-info"] p a')[0].get_text()
            type1 = li.select('div[class="book-mid-info"] p a')[1].get_text()
            type2 = li.select('div[class="book-mid-info"] p a')[2].get_text()
            book_id = re.search(r'(\d+)', href).group(1)
            print(book_id)
            item = {
                'title': title,
                'author': author,
                'type': type1 + '.' + type2,
                'href': href,
                'id': book_id
            }
            # print(item)
            self.loaddetail(book_id, item)

    # Detail request: fetch the volume/chapter data from the Ajax endpoint
    def loaddetail(self, bookid, item):
        response = requests.get(
            url='https://book.qidian.com/ajax/book/category?_csrfToken=asYDuKBW3fwHjeBdQNcX1GFeE2B9KcEe6dJyt&bookId=' + bookid,
            headers=self.headers
        )
        html = response.content.decode('utf-8')
        vs = jsonpath.jsonpath(json.loads(html), '$..vs')           # list of volumes
        count = sum(jsonpath.jsonpath(json.loads(html), '$..cnt'))  # total chapter count (computed but not stored)
        item['vs'] = vs[0]
        # Append one JSON line per book ('a' so earlier books are not overwritten)
        with open('qidian.html', 'a') as f:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    def start(self):
        self.loadpage(self.url)


if __name__ == '__main__':
    qds = QiDianSpider()
    qds.start()
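For reference, a small sketch of how the jsonpath lookups in loaddetail() behave. It assumes the third-party jsonpath package and a made-up payload that is only shaped like the real Ajax response:

import json
import jsonpath

payload = json.loads('{"data": {"vs": [{"name": "Volume 1", "cnt": 12}, {"name": "Volume 2", "cnt": 8}]}}')
vs = jsonpath.jsonpath(payload, '$..vs')     # recursive descent: every value stored under a "vs" key
cnts = jsonpath.jsonpath(payload, '$..cnt')  # every "cnt" value anywhere in the document
print(vs[0])       # the list of volume dicts
print(sum(cnts))   # 20, mirroring the sum() over '$..cnt' in loaddetail()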