因为对 requests、cookie 等不甚了解,所以选用最简单的 selenium 爬取
selenium 的特点是所见即所得,爬取到的网页结构和正常加载的一样
配置也很简单,使用driver将谷歌浏览器驱动起来即可
功能需求很简单:
- 爬取个人的动态
- 保存至数据库
- 如果检测到更新,通过邮件通知
一 分析个人界面的网页url:
以 https://music.163.com/#/user/event?id=XXXXXXXX 为例,
可以看出 id 后面的XXXXXXXX(位数不固定)标识了每个用户,想要更换用户只需要找到对应用户的id即可
二 分析网页结构:
可以很容易地看出,网页的结构如下
注意: ⚠️爬取时需要从frame 切换到 iframe
<iframe>
<html>
<div>.........</div> //为要爬取的内容
</html>
</iframe>
三 分析如何获取元素:
selenium提供的方法有很多,因为有的div的id是随机生成的,class结构也比较复杂,所以我使用了full xpath的方法;获取方法也比较简单,只需要使用chrome浏览器,
在网页任意位置单击右键 --> 点击检查 --> 选中要获取的元素标签(可以是<div>、<li>、<a>、<span>)--> 在该标签上单击右键 --> 选择copy --> 选择copy full xpath
此时会得到如下的串:
是该元素从网页<html>标签下的结构
/html/body/div[3]/div/div[2]/div[1]/div/div/div/div/ul/li[1]/div[2]/div/div[4]/div/div/div[2]/h3/a
通过.text方法即可获取其中的内容
源码:
import re
from datetime import datetime,timedelta
import smtplib
from email.mime.text import MIMEText
from email.header import Header
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import pymysql
"""
selenium 模块 爬取动态,返回入库系统需要的基本信息
"""
def eye(url):
    """Scrape a user's activity page with headless Chrome and return its text.

    Args:
        url: Address of the page to load; the content is read from the
            ``contentFrame`` iframe of that page.

    Returns:
        A tuple ``(name, dynamic, addtimes, comments, songs, singers)``.
        ``name`` and ``dynamic`` are the text of the first element matched by
        their XPath (the caller treats ``dynamic`` as the activity count).
        The remaining four are lists with the text of every element matched by
        the corresponding XPath, in document order.

    Raises:
        IndexError: If the ``name`` or ``dynamic`` element cannot be found.
    """
    # Run Chrome without a visible window.
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('blink-settings=imagesEnabled=false')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_experimental_option(
        "excludeSwitches", ["ignore-certificate-errors", "enable-automation"])

    # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Implicit wait: element lookups poll for up to 1 second.
        driver.implicitly_wait(1)
        # The page body lives inside the ``contentFrame`` iframe.
        driver.switch_to.frame('contentFrame')

        def texts(xpath):
            # "xpath" is the locator strategy string (same value as By.XPATH),
            # so this works without the removed find_elements_by_xpath helper.
            return [element.text for element in driver.find_elements("xpath", xpath)]

        # Every element is addressed by its full XPath.
        name = texts('/html/body/div[3]/div/dl/dd/div[1]/div/h2/span[1]')[0]
        dynamic = texts('/html/body/div[3]/div/dl/dd/ul/li[1]/a/strong')[0]
        addtimes = texts(
            '/html/body/div[3]/div/div[2]/div[1]/div/div/div/div/ul/li/div[2]/div/div[2]/a')
        comments = texts(
            '/html/body/div[3]/div/div[2]/div[1]/div/div/div/div/ul/li/div[2]/div/div[3]')
        songs = texts(
            '/html/body/div[3]/div/div[2]/div[1]/div/div/div/div/ul/li/div[2]/div/div[4]/div/div/div[2]/h3/a')
        singers = texts(
            '/html/body/div[3]/div/div[2]/div[1]/div/div/div/div/ul/li/div[2]/div/div[4]/div/div/div[2]/h4/a')
    finally:
        # Always shut the browser down, even when a lookup above fails,
        # so no orphaned Chrome process is left behind.
        driver.quit()
    return name, dynamic, addtimes, comments, songs, singers
"""
数据持久化模块 将爬取的信息存储到数据库中
"""
def _resolve_addtime(raw, now=None):
    """Turn the relative timestamp shown on a post into ``%m月%d日 %H:%M`` text.

    "刚刚" maps to one minute ago, "<N>分钟前" to N minutes ago and anything
    starting with "昨天" to the current time of day one day earlier. Any other
    value is returned unchanged.
    """
    now = datetime.now() if now is None else now
    if raw == "刚刚":
        print("刚刚")
        moment = now - timedelta(minutes=1)
    elif raw[-3:] == "分钟前":
        print('分钟前')
        minutes = raw[:-3]
        print(minutes, "计算时间")
        moment = now - timedelta(minutes=int(minutes))
    elif raw[:2] == "昨天":
        print("昨天")
        # NOTE(review): any clock time following "昨天" is ignored, exactly as
        # in the original code — confirm whether it should be parsed.
        moment = now - timedelta(days=1)
    else:
        return raw
    return moment.strftime("%m月%d日 %H:%M")


def keep(url):
    """Scrape the page via ``eye`` and persist anything new to MySQL.

    The newest ``dynamic`` value stored in ``eye_on_title`` is compared with
    the freshly scraped one. When they differ, a new title row is inserted and
    every scraped post that is not yet in ``eye_on_timeline`` is inserted too.

    Args:
        url: Page address, passed straight to ``eye``.

    Returns:
        A tuple ``(flag, email_message, error_message)``. ``flag`` is ``False``
        when the stored and scraped ``dynamic`` values match (nothing to
        report) and ``True`` otherwise. ``email_message`` is the notification
        text built up for the mail step. ``error_message`` is set when the last
        scraped post already exists in the database.
    """
    flag = True
    email_message = ''
    error_message = ''
    name, dynamic, addtimes, comments, songs, singers = eye(url)

    # Keyword arguments: positional connect() arguments are rejected by
    # PyMySQL 1.0 and later.
    db = pymysql.connect(host="XXXX", user="root", password="XXXXXX.", database="数据库名称")
    try:
        cursor = db.cursor()
        cursor.execute("SELECT VERSION()")
        data = cursor.fetchone()
        print("Database version : %s " % data)

        # Check whether the activity count changed. A delete followed by an
        # add leaves the count unchanged and is therefore not detected.
        dynamic_number = None  # stays None if the table is empty or the query fails
        try:
            cursor.execute('select dynamic from eye_on_title order by `date` desc limit 1 ')
            row = cursor.fetchone()
            if row:
                dynamic_number = row[0]
            print("查询成功")
        except pymysql.MySQLError as exc:
            print("出错", exc)

        # Compare as text so an integer column still matches the scraped string.
        if dynamic_number is not None and str(dynamic_number) == str(dynamic):
            print("没有更新")
            flag = False
            return flag, email_message, error_message

        today = datetime.today()
        sql = "insert into eye_on_title (name,dynamic,date) values (%s, %s, %s)"
        params = (name, dynamic, today)
        print(sql, params)
        try:
            # Values are bound by the driver instead of being formatted into
            # the statement, so quotes in scraped text cannot break the SQL.
            cursor.execute(sql, params)
            db.commit()
        except pymysql.MySQLError as exc:
            db.rollback()
            print("出错", exc)
        else:
            print("保存成功")
            # ``dynamic`` is scraped text, so compare with the string "0".
            if str(dynamic) == "0":
                email_message = email_message + "keep_an_eye_on失败,计划暴露或结束请求撤离" + dynamic
            else:
                email_message = email_message + "提示:动态更新 " + dynamic + "\n"

        # Look for posts that are not stored yet.
        # NOTE(review): zip() stops at the shortest list; the original indexed
        # all four lists by len(comments) and raised IndexError when a post
        # lacked a song or singer. Pairing is still positional — confirm the
        # lists line up when a post has no song.
        entries = list(zip(addtimes, comments, songs, singers))
        last_index = len(entries) - 1
        for index, (addtime_raw, comment, song, singer) in enumerate(entries):
            sql = "select * from eye_on_timeline where comment = %s and song = %s "
            print(sql, (comment, song))
            cursor.execute(sql, (comment, song))
            if cursor.fetchall():
                print("此条动态已存在")
                if index == last_index:
                    # Count changed but even the last post is already stored.
                    error_message = " \n(删除后未添加新的内容) " + today.strftime("%m月%d日 %H:%M")
                continue

            print("未检测到此条动态,准备写入 ")
            addtime = _resolve_addtime(addtime_raw)
            # NOTE(review): column name ``addtine`` is kept verbatim from the
            # original statement — confirm it matches the table schema.
            sql = "insert into eye_on_timeline (song,singer,`comment`,addtine)values(%s,%s,%s,%s) "
            params = (song, singer, comment, addtime)
            print(sql, params)
            sql_message = ("分享歌曲: " + song) + (" 歌手: " + singer) + (" 评论: " + comment + "\n")
            email_message = email_message + sql_message
            try:
                cursor.execute(sql, params)
                db.commit()
                print("保存成功")
            except pymysql.MySQLError as exc:
                db.rollback()
                print("出错", exc)
        return flag, email_message, error_message
    finally:
        # Release the connection on every exit path, including errors.
        db.close()
"""邮件模块将检测到的更新信息发送到邮箱内提醒"""
def mail(email_message, url, error_message, flag):
    """Send the update notification by e-mail through QQ Mail's SMTP server.

    Args:
        email_message: Notification text collected while saving the data.
        url: Page address appended to the message as a "view" link.
        error_message: Extra note appended after the link (may be empty).
        flag: Nothing is sent unless this is truthy.

    Returns:
        None. Success or failure is reported on standard output.
    """
    if not flag:
        return

    from_addr = 'XXXXXXXX@qq.com'  # sending account
    to_addrs = 'XXXXXXXXX@qq.com'  # receiving account
    auth_code = 'XXXXXXX'  # SMTP authorization code (fill in your own)
    smtp_server = 'smtp.qq.com'  # fixed server name
    smtp_port = 465  # fixed SSL port

    # Assemble the message.
    email_message = email_message + ("点击查看: " + url) + error_message
    print(email_message)
    message = MIMEText(email_message, 'plain', 'utf-8')  # body
    message['From'] = Header("EYE", 'utf-8')  # sender display name
    message['To'] = Header("boss", 'utf-8')  # recipient display name
    message['Subject'] = Header('Keep_an_eye_on', 'utf-8')  # subject line

    try:
        # The context manager closes the connection on every exit path.
        with smtplib.SMTP_SSL(smtp_server, smtp_port) as smtp:
            smtp.login(from_addr, auth_code)
            smtp.sendmail(from_addr, to_addrs, message.as_string())
    except (smtplib.SMTPException, OSError) as e:
        print('邮件发送失败--' + str(e))
    else:
        # Report success only when no error occurred.
        print('邮件发送成功')
if __name__ == '__main__':
    # NetEase Cloud Music activity page to scrape.
    target_url = 'https://music.163.com/#/user/event?id=XXXXXXXXX'
    # Scrape and store first, then mail whatever was reported as new.
    outcome = keep(target_url)
    has_update, mail_body, extra_note = outcome
    mail(mail_body, target_url, extra_note, has_update)
"""整体有三大模块:
1. 爬取模块,使用selenium 爬取网易云的动态上的基本信息
2. 入库模块,使用pymysql 将爬取到的信息存入数据库
3. 邮件模块,使用smtp 将数据发送到用户邮箱以提醒
4. 需要添加一个日志模块 保证服务持久运行,报错有据可循
"""