1. XPath syntax
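Common XPath forms used in the examples below:
//tag        select matching nodes anywhere in the document (e.g. //tr)
[n]          1-based position predicate (e.g. //tr[2] is the second tr)
[@attr='v']  attribute predicate (e.g. //tr[@class='even'])
/@attr       select an attribute's value (e.g. //a/@href)
text()       select text nodes (e.g. //td[1]//text())
. and .//    make a path relative to the current element (e.g. tr.xpath(".//td"))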
2.1 Parsing HTML strings and files with the lxml library
from lxml import etree
# Parse an HTML string (here text holds the HTML source)
html = etree.HTML(text)
print(etree.tostring(html,encoding="utf-8").decode("utf-8"))
# Parse an HTML file (etree.HTML() only parses strings; use etree.parse() for files)
html = etree.parse("lagou.html")
print(etree.tostring(html,encoding="utf-8").decode("utf-8"))
# If parsing the HTML file fails (etree.parse() uses an XML parser by default), create an HTML parser explicitly
parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse("lagou.html", parser=parser)
print(etree.tostring(html,encoding="utf-8").decode("utf-8"))
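Alternatively, you can read the file yourself and hand the string to etree.HTML(); a minimal sketch reusing the lagou.html file name from above and assuming it is UTF-8 encoded:
with open("lagou.html", encoding="utf-8") as f:
    html = etree.HTML(f.read())
print(etree.tostring(html, encoding="utf-8").decode("utf-8"))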
2.2 Using XPath together with the lxml library
from lxml import etree
parser = etree.HTMLParser(encoding = "utf-8")
html = etree.parse('tencet.html',parser = parser)
# 1. Get all tr tags
trs = html.xpath("//tr")  # xpath() returns a list
for tr in trs:
print(etree.tostring(tr,encoding='utf-8').decode("utf-8"))
# 2. Get the second tr tag
tr = html.xpath("//tr[2]")[0]
print(etree.tostring(tr,encoding='utf-8').decode("utf-8"))
# 3. Get all tr tags with class='even'
trs_even = html.xpath("//tr[@class='even']")
# 4. Get the href attribute of all a tags
alist = html.xpath("//a/@href")
# 5. Get the text inside a given tag (a path starting with '.' is relative to that element)
title = tr.xpath(".//td[1]//text()")
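Putting these pieces together, a minimal sketch that walks every data row of the same table and collects the cell texts (the dict keys below are chosen purely for illustration):
rows = []
for tr in html.xpath("//tr")[1:]:       # skip the header row
    tds = tr.xpath(".//td//text()")     # '.' makes the path relative to this row
    if tds:
        rows.append({"first_cell": tds[0].strip(), "rest": [t.strip() for t in tds[1:]]})
print(rows)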
Example: scraping dytt8.net (电影天堂)
import requests
from lxml import etree
Base_DOMAIN = 'https://www.dytt8.net/'
# url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
"Referer": "https://www.dytt8.net/html/gndy/dyzz/list_23_2.html"}
def get_url(url):
resp = requests.get(url, headers=HEADERS)
# text = resp.content.decode('gbk', errors='ignore')
text = resp.text
html = etree.HTML(text)
links = html.xpath("//table[@class='tbspan']//a/@href")
urls = map(lambda url: Base_DOMAIN + url, links)
return urls
def jx(url):
movies = {}
resp = requests.get(url, headers=HEADERS)
text = resp.content.decode('gbk', errors='ignore')
html = etree.HTML(text)
movie_title = html.xpath(
"//div[@class='title_all']//font[@color='#07519a']/text()")[0]
movies["title"] = movie_title
Zoom = html.xpath("//div[@id='Zoom']")[0]
photos = Zoom.xpath(".//img/@src")
haibao = photos[0]
movies["haibao"] = haibao
infos = Zoom.xpath(".//text()")
    for index, info in enumerate(infos):  # enumerate() yields (index, value) pairs; the index is needed later to collect the actor list
        if info.startswith("◎年 代"):  # startswith(prefix) is True when the string begins with prefix
            # info = info.replace("◎年 代","").strip()
            # replace() removes the label by replacing it with an empty string; strip() trims leading/trailing whitespace
year = info_1(info, "◎年 代")
movies["years"] = year
elif info.startswith("◎豆瓣评分"):
# info = info.replace("◎豆瓣评分","").strip()
scores = info_1(info, "◎豆瓣评分")
movies["scores"] = scores
elif info.startswith("◎主 演"):
info = info_1(info, "◎主 演")
actor = [info]
for x in range(index + 1, len(infos)):
actors = infos[x].strip()
if actors.startswith("◎"):
break
actor.append(actors)
movies["actors"] = actor
elif info.startswith("◎简 介"):
info = info_1(info, "◎简 介")
for x in range(index + 1, len(infos)):
profile = infos[x].strip()
if profile.startswith("◎"):
break
movies["profile"] = profile
download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
movies["download_url"] = download_url
return movies
def info_1(info, rule):
return info.replace(rule, "").strip()
def spider():
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'  # '{}' reserves a slot for the page number
film = []
for x in range(1, 2):
        url = base_url.format(x)  # fill in the page number to get the full page URL
films_details = get_url(url)
for page_url in films_details:
movie = jx(page_url)
film.append(movie)
print(film)
# with open("E:/桌面/电影.txt","w")as f:
# for x in film:
# f.write("\n"+str(x))
if __name__ == '__main__':
spider()
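As an alternative to the commented-out plain-text write above, the collected list can be dumped as JSON; a small sketch (the output file name is just an example):
import json

def save(film, path='movies.json'):  # 'movies.json' is an arbitrary example path
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(film, f, ensure_ascii=False, indent=2)  # ensure_ascii=False keeps the Chinese readable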
3. The BeautifulSoup4 library
BeautifulSoup is another HTML/XML parser, used mainly to parse and extract data from HTML/XML. It operates on the HTML DOM and loads the entire document, whereas XPath only traverses the parts it needs, so BeautifulSoup performs worse than XPath, but it is simpler to use for parsing HTML.
Installation: install it with pip
pip install bs4
Basic usage
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')  # parse the HTML string (html) with the lxml parser
# 1. Get all tr tags
trs = soup.find_all('tr')
# 2. Get the second tr tag
tr_2 = soup.find_all('tr', limit=2)[1]  # limit caps how many results are fetched; find_all returns a list
# 3. Get all tr tags with class='even'
tr_even = soup.find_all('tr',class_='even')
tr_even = soup.find_all('tr', attrs={'class': 'even'})  # attrs lets you filter on arbitrary attributes
# 4. Get the a tags with id='test' and class='test'
alist = soup.find_all('a',id='test',class_='test')
alist = soup.find_all('a',attrs={'id':'test','class':'test'})
# 5. Get the href attribute of the a tags
alist = soup.find_all('a')
for a in alist:
    # Method 1: index the tag like a dict
    href = a['href']
    # Method 2: through the attrs dict
    href = a.attrs['href']
# 6. Get all the text
tr_3 = soup.find_all('tr')[1:]  # skip the first (header) row
for tr in tr_3:
    # infos = tr.strings  # strings also yields whitespace such as '\n'; tag.string returns a single string; get_text() returns a string, not a list
    infos = tr.stripped_strings  # stripped_strings skips whitespace-only strings
    infos = list(infos)  # convert to a list so individual elements can be extracted
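To make the differences between the text accessors concrete, a small illustration on a hypothetical one-row table:
demo = BeautifulSoup("<table><tr><td> Tom </td><td> 25 </td></tr></table>", 'lxml')
tr = demo.find('tr')
print(list(tr.strings))           # untrimmed strings such as ' Tom ' and ' 25 '
print(list(tr.stripped_strings))  # ['Tom', '25']
print(tr.get_text())              # one combined string, not a list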
CSS selectors: select()
# 1. Find by tag name
print(soup.select('a'))
# 2. Find by class name, e.g. class='sy'
print(soup.select('.sy'))
# 3. Find by id
print(soup.select('#sy'))
# 4. Combined lookup: 'p #sy' matches elements with id='sy' inside a <p>; use 'p#sy' (no space) for a <p> that itself has id='sy'
print(soup.select('p #sy'))
# 5. Find by attribute
print(soup.select("a[href='http://......']"))
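select() also returns a list, so the earlier row extraction can be written with CSS selectors as well (a sketch assuming the same table layout as above); select_one() returns only the first match:
for tr in soup.select('tr')[1:]:                  # skip the header row
    cells = [td.get_text(strip=True) for td in tr.select('td')]
    print(cells)
first_link = soup.select_one('a')                 # first <a> tag, or None if there is none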
Example
# A BeautifulSoup example with a simple data-visualization application
import requests
from bs4 import BeautifulSoup
from pyecharts.charts import Bar  # data-visualization library; written for version 1.7.1, newer versions have breaking changes
from pyecharts import options as opts
weather = []
def page_parse(url):
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
resp = requests.get(url,headers=headers).content.decode('utf-8')
    soup = BeautifulSoup(resp, 'html5lib')  # the Hong Kong/Macao/Taiwan page needs html5lib to parse correctly
conmidtabs = soup.find('div',class_='conMidtab')
tables = conmidtabs.find_all('table')
for table in tables:
trs = table.find_all('tr')[2:]
for index,tr in enumerate(trs):
tds = tr.find_all('td')
city_td = tds[0]
if index == 0:
city_td = tds[1]
city = list(city_td.stripped_strings)[0]
temp_td = tds[-2]
temp = list(temp_td.stripped_strings)[0]
# print({'city':city,'min-temp':temp})
weather.append({'city':city,'min_temp':int(temp)})
# with open('E:/桌面/weather.txt','w')as fp:
# for x in weather:
# fp.write('\n'+str(x))
def main():
urls = [
'http://www.weather.com.cn/textFC/hb.shtml',
'http://www.weather.com.cn/textFC/db.shtml',
'http://www.weather.com.cn/textFC/hd.shtml',
'http://www.weather.com.cn/textFC/hz.shtml',
'http://www.weather.com.cn/textFC/hn.shtml',
'http://www.weather.com.cn/textFC/xb.shtml',
'http://www.weather.com.cn/textFC/xn.shtml',
'http://www.weather.com.cn/textFC/gat.shtml'
]
for url in urls:
page_parse(url)
weather.sort(key=lambda weather:weather['min_temp'])
data = weather[0:10]
# print(data)
cities = list(map(lambda x:x['city'],data))
min_temps = list(map(lambda x:x['min_temp'],data))
    chart = Bar()  # create a bar chart
    chart.add_xaxis(cities)
    chart.add_yaxis('', min_temps)
    # set all global options in a single call; in pyecharts 1.x, repeated calls to
    # set_global_opts() can reset options passed in an earlier call to their defaults
    chart.set_global_opts(title_opts=opts.TitleOpts(title="天气预报"),  # main title
                          xaxis_opts=opts.AxisOpts(name='城市'),        # x-axis name
                          yaxis_opts=opts.AxisOpts(name='温度'))        # y-axis name
    chart.render('E:/桌面/天气.html')
if __name__ == '__main__':
main()
4. Regular expressions
Basics
Common small regex examples
import re
# 1. Match a mobile phone number
text = '13691612426'
ret = re.match(r'1[34578]\d{9}', text)
print(ret.group())
# 2. Match an email address
text = '1871759153@qq.com'
ret = re.match(r'\w+@[a-z0-9]+\.[a-z]+', text)
print(ret.group())
# 3. Match a URL
text = 'https://www.runoob.com/python3/python3-tutorial.html'
ret = re.match(r'(http|https|ftp)://[^\s]+', text)
print(ret.group())
# 4. Validate an ID-card number (17 digits plus a final digit or x/X)
text = '32042519121281241x'
ret = re.match(r'\d{17}[\dxX]', text)
print(ret.group())
# 5. Match a number within 100 (the trailing $ rejects extra digits)
text = '98'
ret = re.match(r'[1-9]\d?$|100$', text)
print(ret.group())
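Note that re.match() only anchors at the beginning of the string; for validation it is often safer to require the whole string to match, e.g. with re.fullmatch() (a small illustrative sketch):
# re.fullmatch() succeeds only if the entire string matches the pattern
print(re.fullmatch(r'1[34578]\d{9}', '13691612426'))    # a match object
print(re.fullmatch(r'1[34578]\d{9}', '13691612426xx'))  # None: trailing characters are rejected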
Grouping with group()
import re
# group() grouping
text = 'apple price is $5,iphone price is $300'
ret = re.match(r'.*(\$\d+).*(\$\d+)', text)
print(ret.group(1))
print(ret.group(2))
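Groups can also be named, which makes the extraction self-documenting; a brief sketch reusing the same sample text (the group names are arbitrary):
ret = re.match(r'.*?(?P<apple>\$\d+).*?(?P<iphone>\$\d+)', text)
print(ret.group('apple'))   # $5
print(ret.group('iphone'))  # $300
print(ret.groups())         # ('$5', '$300')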
Commonly used functions in the re module
import re
# Commonly used re functions
# 1. findall(): return every match as a list
text = 'apple price is $5,iphone price is $300'
ret = re.findall(r'\d+', text)
print(ret)
# 2. sub(): replace every match
text = 'apple price is $5,iphone price is $300'
ret = re.sub(r'\d+', '0', text)
print(ret)
# 3. split(): split the string, returning a list
text = 'hello world ni hao'
ret = re.split(' ',text)
print(ret)
# 4. compile(): pre-compiling a frequently used pattern improves efficiency
text = 'the number is 20.50'
r = re.compile(r"""
\d+   # digits before the decimal point
\.?   # the (optional) decimal point
\d+   # digits after the decimal point
""", re.VERBOSE)
ret = re.search(r,text)
print(ret.group())
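A compiled pattern exposes the same methods directly, so it can be reused without passing the pattern again (a small sketch reusing r and text from above):
print(r.findall(text))         # ['20.50']
print(r.search(text).group())  # same result as re.search(r, text)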
Worked example
# Scraping the gushiwen.cn poetry site (an application of regular expressions)
import requests
import re
poems = []
def page_parse(url):
headers = {
"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
}
resp = requests.get(url,headers=headers)
text = resp.text
    titles = re.findall(r'<div class="cont">.*?<b>(.*?)</b>', text, re.DOTALL)  # re.DOTALL lets '.' also match '\n'
dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>',text,re.DOTALL)
authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>',text,re.DOTALL)
contents_tags = re.findall(r'<div class="contson".*?>(.*?)</div>',text,re.DOTALL)
contents = []
for content in contents_tags:
content = re.sub('<.*?>','',content)
contents.append(content.strip())
    # A quick note on zip():
    # zip() takes two or more sequences and pairs up their elements, producing tuples
    # x = [1, 2, 3]
    # y = [4, 5, 6]
    # list(zip(x, y))  ->  [(1, 4), (2, 5), (3, 6)]
# poems = []
for x in zip(titles,dynasties,authors,contents):
title,dynasty,author,content = x
poem = {
"title":title,
"dynasty":dynasty,
"author":author,
"content":content
}
poems.append(poem)
# for poem in poems:
# print(poem)
def main1():
base_url = 'https://www.gushiwen.cn/default_{}.aspx'
for x in range(1,4):
        url = base_url.format(x)  # build the URL of each of the first few pages
page_parse(url)
if __name__ == '__main__':
main1()
    with open('E:/桌面/poems.txt', 'w', encoding='utf-8') as fp:
for poem in poems:
fp.write("\n"+str(poem))