# 特别声明:
# - 供交流学习使用,不得用作商业用途。
# - 如有违规侵权,请联系删除。
import json
import os
import re
import subprocess
import sys
import time

import requests
# from pyquery import PyQuery as pq
# Root directory where downloaded data is stored, and the URL that is requested.
wd = r'/share/disk1/Data/Users/luohb/spider/Cell_BLAST/result/'
url = 'https://cblast.gao-lab.org/datasets_meta'

# HTTP headers sent with the request that fetches the JSON data.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
    'Connection': 'keep-alive',
    # Adjacent string literals are joined at compile time into one value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.122 Safari/537.36'
    ),
}
# Request the dataset metadata; `res` is the response the parsing step reads.
try:
    # The timeout keeps the script from hanging forever on an unresponsive
    # server (requests waits indefinitely when no timeout is given).
    res = requests.post(url=url, headers=headers, timeout=30)
    print(res.status_code)
except requests.RequestException as err:
    print('request fail...please check!')
    print(err, file=sys.stderr)
    # Without a response `res` is never bound, so carrying on would only
    # fail later with a NameError; stop here with a non-zero exit status.
    sys.exit(1)
# Parse the JSON response and download the files of every dataset it lists.
# Each dataset gets its own directory under `wd`, named
# <dataset_name>-<organism>-<organ>-<platform>-<cell_number>.
i = 0  # number of items whose fields were read successfully so far
json_list = json.loads(res.text)
for index, item in enumerate(json_list):
    time.sleep(2)  # pause between items so requests are not sent too fast
    try:
        dataset_name = item['dataset_name'].replace(' ', '_')
        organism = item['organism'].replace(' ', '_')
        organ = item['organ'].replace(' ', '_')
        platform = str(item['platform'])
        cell_number = str(item['cell_number'])
        visualization = item['visualization'].split(',')
    except KeyError:
        print('item {} has key error, please check!'.format(index))
        # Skip this item: otherwise the code below would run with the
        # previous item's values (or with undefined names on the first item).
        continue
    i += 1

    # Create the dataset directory and make it the working directory so that
    # wget saves into it. exist_ok lets the script be run again after an
    # interrupted crawl instead of failing on the first existing directory.
    dir_name = '-'.join([dataset_name, organism, organ, platform, cell_number])
    path = os.path.join(wd, dir_name)
    os.makedirs(path, exist_ok=True)
    os.chdir(path)
    print(os.getcwd())

    # Download the h5 file. The URL is passed as a single argv entry (no
    # shell), so characters in the server-supplied name cannot be interpreted
    # as shell syntax. As before, wget's exit status is not checked.
    h5_url = 'https://cblast.gao-lab.org/{name}/{name}.h5'.format(name=dataset_name)
    subprocess.run(['wget', h5_url])

    # Download each file named in the comma-separated 'visualization' field.
    for viz in visualization:
        viz = viz.strip()
        if not viz:
            # An empty entry would otherwise fetch the dataset's directory URL.
            continue
        svg_path = 'https://cblast.gao-lab.org/{name}/{svg_type}'.format(name=dataset_name, svg_type=viz)
        print(svg_path)
        subprocess.run(['wget', svg_path])