预热知识
预热知识---html 超文本标记语言
前端 html + css + javascript
1.新建html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Title</title>
</head>
<body>
<!-- h1 renders the main heading text -->
<h1>欢迎来到王者荣耀</h1>
<!-- ol is an ordered (numbered) list -->
<ol>
<li>猪八戒</li>
<!-- src="" and alt="" are attributes of the img tag -->
<li><a href="https://pvp.qq.com/web201605/herodetail/513.shtml"><img src="waner.jpg" alt="">上官婉儿</a></li>
<!-- the href attribute of <a> holds the hyperlink target, jumping to another page -->
<li>虞姬</li>
<li>典韦</li>
</ol>
<!-- style sets inline styling; background-color picks the background colour.
     FIX: the original comment ended with "->" instead of "-->", leaving the
     comment unterminated and hiding the markup below it. -->
<div id="header" style="background-color: aquamarine" >
<ul>
<li>曹操</li>
<li>吕布</li>
<li>安琪拉</li>
</ul>
</div>
<!-- div + css box model -->
<!-- div is a plain, styleless container -->
<div class="div-top" >这是div标签</div>
<div id="container">
<p>上官婉儿选择一个点,然后以自己的位置为终点进行书写,落笔的一瞬间对触碰的敌人造成150/180/210/240/270/300(+20%法术加成)点法术伤害,书写过程中将会对触碰敌人造成300/360/420/480/540/600(+40%法术加成)点法术伤害和50%减速持续2秒(每12秒储存一次笔势,最多可储备2次)</p>
<a href="http://www.baidu.com">点击跳转至百度</a>
</div>
</body>
</html>
2.爬虫
1.安装lxml包(pip install lxml),然后在代码中导入
from lxml import hxml
2.使用xpath语法进行html页面提取
# Parse a previously saved HTML page with lxml and extract data via XPath.
# FIX: the original imported the non-existent "hxml" (then called html.fromstring
# anyway), and the statements under "with"/"for" lacked indentation.
from lxml import html

# Read the saved page from disk.
with open('index1.html', 'r', encoding='utf-8') as f:
    html_data = f.read()

# Build an element tree that supports XPath queries.
selector = html.fromstring(html_data)

# XPath basics: "/" starts at the root, text() extracts the text content,
# and "@" reads an attribute value. Here: the h1 heading.
h1 = selector.xpath('/html/body/h1/text()')
h1 = ' ' if len(h1) == 0 else h1[0]
# Conditional (ternary) expression — equivalent long form:
# if len(a) == 0:
#     a = ' '
# else:
#     a[0]
# pattern: result = value_if_true if condition else value_if_false

# "//" searches from any node; general form for grabbing text:
#   //tag1[@attr=value]/tag2[@attr=value].../text()
# xpath() always returns a list, so [0] unwraps the single match.
a = selector.xpath('//div[@id="container"]/a/text()')[0]

# Every <li> inside the ordered list — print each hero's name.
ol_list = selector.xpath('//ol/li')
for li in ol_list:
    hero = li.xpath('./text()')[0]
    print(hero)
3.requests库
利用requests库可以进行网页内容的查看,具体代码如下所示:
# Demo of the requests library: fetch a page and inspect the response.
import requests

# Fetch the target URL.
response = requests.get('https://www.baidu.com')
# Encoding requests guessed from the response headers.
print(response.encoding)
# Override it (Baidu actually serves UTF-8).
response.encoding = 'utf-8'
# Body as str (decoded using response.encoding).
html_data = response.text
# Body as raw bytes.
bytes_data = response.content
# Decode the raw bytes explicitly.
response.content.decode('utf-8')
# HTTP status code.
# FIX: the original printed the literal string 'response.status_code'.
print(response.status_code)
# Response headers.
# FIX: same string-literal bug as above.
print(response.headers)

# Save the downloaded page to disk.
# FIX: the original was missing the closing quote after 'baidu.html'.
with open('baidu.html', 'w', encoding='utf-8') as f:
    f.write(html_data)

# Request headers, defined as a dict, make the request look like a real browser.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
resp = requests.get('https://www.zhihu.com/', headers=headers)
4.豆瓣top250爬虫
import requests
from lxml import etree
import pandas as pd
def parse():
    """Scrape the Douban movie Top-250 list: print each movie, download its
    poster into ./img/, and save everything to douban_top250_info.csv."""
    # Browser-like headers so Douban does not reject the request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # Collected rows: a list of dicts, one dict per movie.
    movie_info_list = []
    # Page through the list: start = 0, 25, ..., 225 (10 pages of 25 movies).
    for i in range(0, 226, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(i)
        # Fetch the page; use the raw bytes and let etree handle decoding.
        resp = requests.get(url, headers=headers)
        data = resp.content
        # Build an html object from the bytes, then query it with XPath.
        html = etree.HTML(data)
        movie_list = html.xpath('//div[@id="content"]//ol/li')
        for movie in movie_list:
            # Ranking number (1-250).
            serial_number = movie.xpath('./div[@class="item"]/div[@class="pic"]/em/text()')
            serial_number = '' if len(serial_number) == 0 else serial_number[0]
            # Movie title.
            movie_name = movie.xpath('./div[@class="item"]/div[@class="info"]/div[@class="hd"]/a/span[1]/text()')
            movie_name = '' if len(movie_name) == 0 else movie_name[0]
            # Director / cast line; strip surrounding whitespace.
            introduce = movie.xpath('./div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[1]/text()')
            introduce = '' if len(introduce) == 0 else introduce[0]
            introduce = introduce.strip()
            # Star rating.
            star = movie.xpath('./div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[2]/text()')
            star = '' if len(star) == 0 else star[0]
            # Number of ratings; drop the trailing "人评价" suffix.
            evalute = movie.xpath('./div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()')
            evalute = '人评价' if len(evalute) == 0 else evalute[0]
            evalute = evalute.replace('人评价', '')
            # One-line quote describing the movie.
            describe = movie.xpath('./div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()')
            describe = '' if len(describe) == 0 else describe[0]
            # Detail-page URL.
            detail_link = movie.xpath('./div[@class="item"]/div[@class="pic"]/a/@href')
            detail_link = '' if len(detail_link) == 0 else detail_link[0]
            # Poster image URL.
            img_url = movie.xpath('./div[@class="item"]/div[@class="pic"]/a/img/@src')
            img_url = '' if len(img_url) == 0 else img_url[0]
            movie_info_list.append({
                'serial_number': serial_number,
                'movie_name': movie_name,
                'introduce': introduce,
                'star': star,
                'evalute': evalute,
                'describe': describe,
                'detail_link': detail_link,
                'img_url': img_url
            })
    # FIX: make sure the image directory exists before writing into it
    # (the original crashed if ./img was absent). Local import keeps the
    # snippet self-contained.
    import os
    os.makedirs('./img', exist_ok=True)
    for movie_info in movie_info_list:
        print(movie_info)
        rsp = requests.get(movie_info['img_url'])
        if rsp.status_code == 200:
            # Name files like 0000001.jpg.
            # FIX: the original concatenated '000000' + serial (variable width,
            # e.g. 00000025) and used .png despite the stated .jpg convention;
            # zfill gives a consistent 7-digit name.
            img_name = '{}.jpg'.format(movie_info['serial_number'].zfill(7))
            with open('./img/{}'.format(img_name), 'wb') as f:
                # Write the downloaded image bytes.
                f.write(rsp.content)
    # Persist all rows as CSV.
    df = pd.DataFrame(movie_info_list)
    df.to_csv('douban_top250_info.csv')
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    parse()