写在前面
- 声明一下代码不是我写的,“我不生产代码,我只是代码的搬运工”
- 本文目的是稍微记录和分享如何使用 scrapy 来爬取 OMIM 数据库,顺便学习 Python 的 scrapy 和 bs4
爬取流程
scrapy project 的构建:
scrapy startproject omimScrapy
cd omimScrapy
scrapy genspider omim omim.org
01 | items.py
配置
import scrapy
class OmimscrapyItem(scrapy.Item):
    """Scrapy item holding the values collected for a single OMIM entry."""

    # Entry identification (the spider fills these two from mim2gene.txt).
    geneSymbol = scrapy.Field()
    mimNumber = scrapy.Field()

    # Columns of the entry's phenotype table.
    location = scrapy.Field()
    phenotype = scrapy.Field()
    phenotypeMimNumber = scrapy.Field()
    # NOTE: the key really is spelled "nheritance" (no leading "i"); it is
    # kept as-is because code elsewhere may address the field by this name.
    nheritance = scrapy.Field()
    mappingKey = scrapy.Field()

    # Text of the page sections identified by the element ids of the same name.
    descriptionFold = scrapy.Field()
    diagnosisFold = scrapy.Field()
    inheritanceFold = scrapy.Field()
    populationGeneticsFold = scrapy.Field()
02 | spider/omim.py
配置
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from omimScrapy.items import OmimscrapyItem
class OmimSpider(scrapy.Spider):
    """Download the OMIM entry page of every gene listed in mim2gene.txt.

    The spider reads ``mim2gene.txt`` from the current working directory,
    requests ``https://www.omim.org/entry/<mimNumber>`` for each entry of
    type "gene" or "gene/phenotype", and stores the raw HTML under
    ``scrapy-data/entry/<mimNumber>.html``.
    """

    name = 'omim'
    allowed_domains = ['omim.org']
    # No start_urls: start_requests() builds one URL per MIM number instead.

    def readMim2Gene(self, filename):
        """Parse an OMIM mim2gene.txt file.

        Args:
            filename: Path of the mim2gene.txt file to read.

        Returns:
            A list of ``[mimNumber, mimEntryType, geneSymbol]`` lists, one per
            line whose entry type is "gene" or "gene/phenotype".
            ``geneSymbol`` is "." when the line carries no symbol.
        """
        filelist = []
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                line = line.rstrip("\r\n")
                # Blank lines and "#" header lines carry no entry.
                if not line.strip() or line.startswith("#"):
                    continue
                # Split on tabs when present so that an empty column (e.g. a
                # missing Entrez ID) does not shift the gene symbol's position;
                # fall back to whitespace for files that are not tab-delimited.
                strs = line.split("\t") if "\t" in line else line.split()
                if len(strs) < 2:
                    continue
                mimNumber = strs[0].strip()
                mimEntryType = strs[1].strip()
                geneSymbol = "."
                if len(strs) >= 4 and strs[3].strip():
                    geneSymbol = strs[3].strip()
                if mimEntryType in ("gene", "gene/phenotype"):
                    filelist.append([mimNumber, mimEntryType, geneSymbol])
        return filelist

    def start_requests(self):
        """Yield one GET request per gene entry found in mim2gene.txt.

        Each request carries a partially filled OmimscrapyItem (mimNumber and
        geneSymbol) in ``meta['item']`` for the saveHtml callback.
        """
        filelist = self.readMim2Gene("mim2gene.txt")
        for row in filelist:
            item = OmimscrapyItem()
            item['mimNumber'] = row[0]
            item['geneSymbol'] = row[2]
            url = "https://www.omim.org/entry/" + row[0]
            yield scrapy.Request(url, method='GET', callback=self.saveHtml, meta={'item': item})

    def saveHtml(self, response):
        """Write the response body to scrapy-data/entry/<mimNumber>.html.

        Args:
            response: The response for an entry page; ``response.meta['item']``
                must hold the item created in start_requests.
        """
        import os

        item = response.meta['item']
        html = response.body.decode("utf-8")
        # Create the output directory on first use instead of failing.
        os.makedirs("scrapy-data/entry", exist_ok=True)
        # Explicit UTF-8 so the write does not depend on the platform's
        # default encoding.
        with open("scrapy-data/entry/" + item['mimNumber'] + ".html", 'w', encoding="utf-8") as f:
            f.write(html)
03 | settings.py
配置
- OMIM robots.txt 设置了爬虫策略,只允许微软必应bingbot 和谷歌googlebot 爬虫获取指定路径内容;
# NOTE(review): the bot name and user agent below present this crawler as
# Bing's bot, the identity the site's robots.txt is said to allow. That is
# not an honest self-identification -- confirm it is permitted by OMIM's
# terms of use before running this crawl.
BOT_NAME = "bingbot"

# User-Agent header sent with every request.
USER_AGENT = "bingbot (+https://www.bing.com/bingbot.htm)"

# Keep honouring robots.txt.
ROBOTSTXT_OBEY = True

# Delay between requests to the same website (the default is 0).
DOWNLOAD_DELAY = 4

# Cookies are enabled by default; switch them off.
COOKIES_ENABLED = False
04 | scrapy运行
scrapy crawl omim
05 | html解析
def _cell_text(cell):
    """Return the stripped text of a table cell, or '.' when it is empty."""
    text = cell.get_text().strip()
    return text if text != '' else '.'


def _fold_text(soup, element_id):
    """Return the stripped text of the first element with this id, or '.' if absent."""
    matches = soup.select("#" + element_id)
    return matches[0].get_text().strip() if matches else "."


def parseHtmlTable(html):
    """Parse the Phenotype-Gene Relationships table of an OMIM entry page.

    Only the first ``<table>`` of the document is read. Rows with five cells
    set location, phenotype, MIM number, inheritance and mapping key; rows
    with four cells (no location cell) append their values to the current
    ones, separated by "|". Empty cells are recorded as ".".

    Args:
        html: HTML text of an entry page.

    Returns:
        A dict with the keys ``result``, ``location``, ``phenotype``,
        ``phenotypeMimNumber``, ``nheritance``, ``mappingKey``,
        ``descriptionFold``, ``diagnosisFold``, ``inheritanceFold`` and
        ``populationGeneticsFold``. ``result`` is "SUCCESS" when a table was
        found and every data row had four or five cells, otherwise "ERROR".
        The other keys mirror the OmimscrapyItem field names; the table
        values are "" when no table row was parsed.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.table
    location = phenotype = mimNumber = nheritance = mappingKey = ""
    if table is None:
        result = "ERROR"
    else:
        result = "SUCCESS"
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) == 0:
                # Rows without <td> cells (e.g. header rows) hold no data.
                continue
            elif len(tds) == 4:
                # Continuation row: append to the values collected so far.
                phenotype = phenotype + "|" + _cell_text(tds[0])
                mimNumber = mimNumber + "|" + _cell_text(tds[1])
                nheritance = nheritance + "|" + _cell_text(tds[2])
                mappingKey = mappingKey + "|" + _cell_text(tds[3])
            elif len(tds) == 5:
                # Row with a location cell: (re)start all five values.
                location = _cell_text(tds[0])
                phenotype = _cell_text(tds[1])
                mimNumber = _cell_text(tds[2])
                nheritance = _cell_text(tds[3])
                mappingKey = _cell_text(tds[4])
            else:
                result = "ERROR"
    # The section texts are looked up on the whole page, table or not.
    return {
        "result": result,
        "location": location,
        "phenotype": phenotype,
        # MIM number column of the phenotype table.
        "phenotypeMimNumber": mimNumber,
        "nheritance": nheritance,
        "mappingKey": mappingKey,
        "descriptionFold": _fold_text(soup, "descriptionFold"),
        "diagnosisFold": _fold_text(soup, "diagnosisFold"),
        "inheritanceFold": _fold_text(soup, "inheritanceFold"),
        "populationGeneticsFold": _fold_text(soup, "populationGeneticsFold"),
    }
参考资料
[1] https://www.fee.im/2020/03/crawling-data-from-omim/
[2] https://pzweuj.github.io/2021/11/08/omim-crawler.html