I've been following a serialized novel whose author updates on a very irregular schedule, so I decided to write a scheduled task that checks for new chapters and, when one appears, emails me a link to the latest update.
The script uses urllib to fetch the page, and email plus smtplib to send the mail.
from urllib import request
from email.mime.text import MIMEText
from email.header import Header
from smtplib import SMTP_SSL
import time

time.sleep(2)  # brief pause before hitting the site

# Fetch the book's info page with a browser-like User-Agent
req = request.Request('http://www.piaotian.com/bookinfo/5/5623.html')
req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25')

port_num = []
with request.urlopen(req) as f:
    port_num.append(f.read().decode('GBK'))  # the site serves GBK-encoded pages
# print(port_num)

# Locate the latest-chapter link by searching for the '" title=' marker,
# then keep a fixed-size snippet around it as the page's fingerprint
start = port_num[0].find('" title=')
# print(start)
port_num.append(port_num[0][start - 70:start + 50])
print(start)
print(port_num[0][start - 70:start + 50])
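The fixed offsets (start - 70, start + 50) break as soon as the page layout shifts. A slightly more robust sketch, assuming the latest chapter appears as an <a href="..." title="..."> tag in the page source (an unverified assumption about piaotian.com's markup):

import re
# hypothetical pattern: capture the first href/title pair on the page
m = re.search(r'<a href="([^"]+)" title="([^"]+)"', port_num[0])
if m:
    latest_link, latest_title = m.group(1), m.group(2)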
# Compare against the snippet saved last time
with open('/home/pi/test/latest.txt', 'r') as f:
    pre = f.read()

if pre == port_num[1]:
    print('no change')
else:
    # A new chapter appeared: remember it, then send the notification mail
    with open('/home/pi/test/latest.txt', 'w') as f:
        f.write(port_num[1])

    host_server = 'smtp.qq.com'
    sender_qq = 'xxxxxxxx'
    pwd = 'xxxxxxx'  # for QQ Mail this is the SMTP authorization code, not the login password
    sender_qq_mail = 'xxxxxx@qq.com'
    receiver = 'xxxxxxxx@gmail.com'

    mail_content = str(port_num[1])
    mail_title = '有更新'  # "there is an update"

    smtp = SMTP_SSL(host_server)
    smtp.set_debuglevel(0)  # set to 1 to see the SMTP conversation
    smtp.ehlo(host_server)
    smtp.login(sender_qq, pwd)

    msg = MIMEText(mail_content, "plain", 'utf-8')
    msg["Subject"] = Header(mail_title, 'utf-8')
    msg["From"] = sender_qq_mail
    msg["To"] = receiver
    smtp.sendmail(sender_qq_mail, receiver, msg.as_string())
    smtp.quit()
Then use crontab -e to schedule this script to run once an hour.
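For reference, the crontab entry could look like this (check_update.py is a hypothetical name; point it at wherever you actually saved the script):

0 * * * * /usr/bin/python3 /home/pi/test/check_update.py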
The script below downloads videos from YouTube's trending page.
import requests
import re
import urllib.request
import time
import json
import pafy
from datetime import datetime

database_url = []  # sub-page paths we have already downloaded

def saveHTML(data):
    # debugging helper: dump a fetched page to disk
    with open('example.html', 'w') as f_obj:
        f_obj.write(data)

def Schedule(a, b, c):
    # progress callback: a = blocks transferred, b = block size, c = total size
    per = 100 * a * b / c
    if per > 100:
        per = 100
    print('%.2f%%' % per)
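Schedule matches the (block count, block size, total size) signature of the reporthook argument of urllib.request.urlretrieve, although this script never actually wires it up (pafy prints its own progress). A hypothetical use, with placeholder URL and filename:

# hypothetical: download a file directly, printing progress through Schedule
urllib.request.urlretrieve('https://example.com/video.mp4', 'video.mp4', Schedule)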
def main_spider():
    global video_number01
    video_number02 = video_number01
    url = 'https://www.youtube.com/feed/trending'  # start from the trending page
    # pretend to be a regular browser
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
    req = requests.get(url, headers=headers)
    # collect the /watch sub-page links
    reg = r'href="(/watch.*?)"'
    double_url = re.findall(reg, req.text)
    # drop duplicates while keeping the original order
    single_url = list(dict.fromkeys(double_url))
    for i in single_url:
        if i in database_url:
            print(i, 'video exists')
            video_number02 = video_number02 - 1
            if video_number02 < 1:
                break
            continue
        else:
            real_url = 'https://www.youtube.com' + i  # build the full video URL
            print(real_url)
            spider_youtube(real_url)  # download it
            database_url.append(i)
            video_number02 = video_number02 - 1
            time.sleep(5)  # pause like a real visitor would
            if video_number02 < 1:
                break
def spider_youtube(url):
    # resolve the real video source with pafy
    video = pafy.new(url)
    print(video.title)
    best = video.getbest()  # pick the best-quality stream
    print(best.url)
    # name the file after its position in database_url; include the extension so players recognize it
    best.download(quiet=False, filepath="%s.%s" % (len(database_url), best.extension))
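If you want a specific container format, pafy's getbest also accepts a preftype argument, e.g.:

# prefer an mp4 stream when one is available
best = video.getbest(preftype="mp4")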
def keeper():
    global video_number01
    video_number01 = int(input('How many videos do you want to download each cycle? '))
    for i in range(10000):
        main_spider()
        print('scanning time:', datetime.now())
        time.sleep(30)  # wait before the next scan

keeper()