Web scraping is one of Python's strengths and one of its most popular application areas, with plenty of mature libraries to choose from: scrapy, selenium, beautifulsoup, and the simpler urllib and requests. This post shows how to use requests to scrape PubMed search results and batch-download every hit that comes with a DOI. The key lines are commented; just run the script, type in your search keywords separated by spaces, and it will collect the DOIs and download the papers from a sci-hub mirror into the current folder. One caveat: letting Python do the downloading is not very reliable on a slow connection, since there is no easy way to resume an interrupted transfer. For that reason the script also writes each sci-hub PDF URL to a file; you can paste those URLs into the Xunlei (迅雷) download manager instead, and in that case simply comment out the part of the code that downloads the PDFs directly.
import requests
import re
import os
if __name__ == "__main__":
    # Spoof a browser User-Agent so PubMed serves the normal HTML page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
    }
    # PubMed search URL; the keywords are typed in by the user, separated by spaces
    url = "https://pubmed.ncbi.nlm.nih.gov/"
    # requests URL-encodes the spaces, so the whole line is sent as a single "term" query
    term = input("Please input your keyword: ")
    # Number of results PubMed shows per page
    size = 200
    # Page number in the result list
    page = 1
    param = {
        "term": term,
        "size": size,
        "page": page
    }
    doi_list = []
    # Send the search request
    response = requests.get(url=url, params=param, headers=headers)
    page_text = response.text
    # Total number of results (strip the thousands separators before converting)
    results_amount = int(re.search(r"""<span class="value">(\d+(?:,\d+)*)</span>.*?results""", page_text,
                                   re.DOTALL).group(1).replace(",", ""))
    # Pull the DOIs on the first page out of the HTML with a regular expression
    doi_list += re.findall(r"""doi: (10\..*?)\.[ <]""", page_text)
    # Simulate paging through the remaining result pages and collect their DOIs
    if results_amount % size == 0:
        step_num = results_amount // size - 1
    else:
        step_num = results_amount // size
    if step_num:
        for page in range(2, step_num + 2):
            param = {
                "term": term,
                "size": size,
                "page": page
            }
            response = requests.get(url=url, params=param, headers=headers)
            page_text = response.text
            doi_list += re.findall(r"""doi: (10\..*?)\.[ <]""", page_text)
    # Download every paper from the sci-hub mirror
    for doi in doi_list:
        down_url = r"https://sci.bban.top/pdf/" + doi + ".pdf"
        # Append the download URL to a text file (paste these into Xunlei/Thunder if you prefer)
        with open(r"./down_url.txt", "a") as u:
            u.write(down_url + "\n")
        # Download the PDF directly; comment out the three lines below to only collect the URLs
        r = requests.get(url=down_url)
        with open(f"./{os.path.basename(down_url)}", "wb") as f:
            f.write(r.content)
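If you would rather let Python handle the downloads after all, the plain f.write(r.content) call above can be made a little more forgiving. The sketch below is only an illustration under the same assumptions as the script (a down_url built from the sci-hub mirror and the same headers dict): it streams each response, checks that the mirror actually returned a PDF rather than an error page, and skips a file instead of crashing when a request fails.

import os
import requests

def download_pdf(down_url, headers, timeout=30):
    """Try to stream one PDF into the current folder; return True on success.
    down_url and headers are assumed to look like the ones built in the script above."""
    try:
        r = requests.get(down_url, headers=headers, stream=True, timeout=timeout)
        r.raise_for_status()
        # Skip responses that are not PDFs (e.g. an HTML error page from the mirror)
        if "pdf" not in r.headers.get("Content-Type", "").lower():
            return False
        with open(os.path.basename(down_url), "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        return True
    except requests.RequestException:
        return False

Any DOI that fails here is still listed in down_url.txt, so you can always hand the leftovers to a download manager.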