1.counting DNA nucleotides
#style1
seq=''
ntcount=[]
with open('rosalind1.txt') as f:
for line in f:
line = line.rstrip() #去末尾空格
seq += line.upper()
for nt in ['A','C','G','T']:
ntcount.append(seq.count(nt))
print('\t'.join(map(str,ntcount)))
#style2
with open('rosalind1.txt') as f:
for a in f:
b = list(str(a)) #b是一个个核苷酸的列表
c={}
for d in b:
c[d] = b.count(d)
print(c)
#style3
from collections import Counter
with open('rosalind_dna.txt') as f:
list=f.read().rstrip()
print(Counter(list))
2.将DNA转录成RNA
#style1 使用replace函数
with open('rosalind_rna.txt','r') as f:
list=f.read()
print(list.replace('T','U'))
#style2 使用re.sub正则表达式
import re
with open('rosalind_rna.txt', 'r') as f:
list = f.read()
print(re.sub('T','U',list))
3.获取反向互补链
#rosalind 3
#style1
def reverse_complement(seq):
ntComplement={'A':'T','C':'G','G':'C','T':'A'}
revSeqList = list(reversed(seq))
revComSeqList = [ntComplement[k] for k in revSeqList]
revComSeq = ''.join(revComSeqList)
return revComSeq
seq=''
with open('rosalind_revc.txt') as f:
for line in f:
line = line.rstrip()
seq += line.rstrip()
print(reverse_complement(seq))
#style2 使用翻译表
seq=''
with open('rosalind_revc.txt') as f:
for line in f:
line = line.rstrip()
seq += line.rstrip()
transtable = str.maketrans('ATCG','TAGC')
print(seq.translate(transtable)[::-1])
#style3 使用biopython
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
seq=''
with open('rosalind_revc.txt') as f:
for line in f:
line = line.rstrip()
seq += line.rstrip()
my_seq = Seq(seq,IUPAC.unambiguous_dna)
print(my_seq)
print(my_seq.complement()) #互补链
print(my_seq.reverse_complement()) #反向互补链
4.兔子和递归关系
def fibonacciRabbits(n, k):
F = [1, 1]
generation = 2
while generation <= n:
F.append(F[generation - 1] + F[generation - 2] * k)
generation += 1
# 函数返回列表最后一项
return (F[n - 1])
#调用函数并打印
print (fibonacciRabbits(5, 3))
5.计算GC含量
#style1
from Bio import SeqIO
Target_Seq = SeqIO.parse('rosalind_gc.txt','fasta')
GC_content = 0
ID = ''
for target in Target_Seq:
if GC_content < (float(target.seq.count('C')+target.seq.count('G'))/len(target.seq))*100:
GC_content = (float(target.seq.count('C')+target.seq.count('G'))/len(target.seq))*100
ID = target.description
print(ID,'\n',GC_content)
#style2
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
my_seq = Seq("GGGATCGTAC",IUPAC.unambiguous_dna)
for index,letter in enumerate(my_seq):
print(index,letter)
print(len(my_seq))
print(my_seq.count('A'))
print(GC(my_seq))
6.计算突变个数
d = open('rosalind_hamm.txt','r')
s = d.readlines()
a = s[0].strip()
b = s[1].strip()
hum = 0
for i in range(len(a)):
if a[i] != b[I]:
hum+=1
print(hum)
7.孟德尔第一定律
from scipy.misc import comb
def fun(k,m,n):
y = k+m+n
nn = comb(n,2)/comb(y,2)
kk = comb(m,2)/comb(y,2)
mm = comb(n,1)/comb(y,2)
p = round(1-(nn+kk*1/4+mm*1/2),5)
return p
#读取数字
d = open('rosalind_iprb.txt','r')
s = d.read()
f = s.split()
k,m,n = int(f[0]),int(f[1]),int(f[2])
print(fun(k,m,n))
8.将RNA翻译成蛋白质
# -*- coding: utf-8 -*-
'''rosaline 将RNA翻译成蛋白质'''
#style1 使用biopython
from Bio import Seq
from Bio.Alphabet import generic_dna,generic_rna
from Bio import SeqIO
from Bio.Data import CodonTable
#载入mRNA序列
RNA_string=open('rosalind_prot.txt','r').read().strip()
my_seq=Seq.Seq(RNA_string,generic_rna)
#载入标准遗传密码表
standard_table=CodonTable.unambiguous_rna_by_name['Standard']
#将mRNA翻译成蛋白质
protein=my_seq.translate(table='Standard')
print(protein)
#style2 自己写出密码表
import re
def mRNA_protein(RNA_string):
#将mRNA翻译成蛋白质
start_code = 'AUG'
end_code = ['UAA', 'UAG', 'UGA']
protein_table = {'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V', \
'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V', \
'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V', \
'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V', \
'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A', \
'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A', \
'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A', \
'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A', \
'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D', \
'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D', \
'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E', \
'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E', \
'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G', \
'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G', \
'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G', \
'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G'
}
# 找到起始密码子的位置
start_sit = re.search(start_code, RNA_string)
protein = ''
# 按阅读框匹配蛋白质
for sit in range(start_sit.end(), len(RNA_string), 3):
protein = protein + protein_table[RNA_string[sit:sit + 3]]
print(protein)
if __name__ == '__main__':
RNA_string = open('rosalind_prot.txt','r').read().strip()
mRNA_protein(RNA_string)
#style3 使用biopython
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
d=open('rosalind_prot.txt','r').read().strip()
messenger_rna=Seq(d,IUPAC.unambiguous_rna)
print(messenger_rna.translate().strip('*'))
9.找到子串位置
# -*- coding: utf-8 -*-
#rosalind9:finding a motif in dna
#style1
d=open('rosalind_subs.txt','r')
s=d.readlines() #返回列表
a=s[0].strip()
b=s[1].strip()
print(s)
num1,num2=len(a),len(b)
n=num1-num2
I=0
site=[]
for i in range(n):
c=a[i:i+num2]
if b==c:
site.append(i+1)#python的第1位是0,第2位是1,所以要加1
print(" ".join(str(j) for j in site))#输出成sampleoutput里的格式,使用join需要改为字符串格式
#style2
import re
# 如果overlapped 为False, 匹配到位置为2,10
matches = re.finditer('ATAT', 'GATATATGCATATACTT')
# 返回所有匹配项,
for match in matches: #迭代器可循环
print(match.start())
- re.match,re.search,re.findall,re.finditer区别
re.match
从首字母开始匹配,没有,就返回none
re.search
返回对象
re.findall
返回数组
re.finditer
返回迭代器
#style3
seq = 'GATATATGCATATACTT'
pattern = 'ATAT'
def find_motif(seq, pattern):
position = []
for i in range(len(seq) - len(pattern)):
if seq[i:i + len(pattern)] == pattern:
position.append(str(i + 1))
print('\t'.join(position))
find_motif(seq, pattern)
10.寻找最可能的共同祖先
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#rosalind10: consensus and profile
#style1
from collections import Counter
from collections import OrderedDict #有序字典
# 写出到文件
fh = open('cons.txt', 'wt')
rosalind = OrderedDict()
seqLength = 0
with open('rosalind_cons.txt') as f:
for line in f:
line = line.rstrip()
if line.startswith('>'):
rosalindName = line
rosalind[rosalindName] = ''
# 跳出这个循环,进入下一个循环
continue
# 如果不是'>'开头,执行这一句
rosalind[rosalindName] += line
# len(ATCCAGCT)
seqLength = len(rosalind[rosalindName])
A, C, G, T = [], [], [], []
consensusSeq = ''
for i in range(seqLength):
seq = ''
# rosalind.keys = Rosalind_1...Rosalind_7
for k in rosalind.keys():
# 把 Rosalind_1...Rosalind_7 相同顺序的碱基加起来
seq += rosalind[k][i]
A.append(seq.count('A'))
C.append(seq.count('C'))
G.append(seq.count('G'))
T.append(seq.count('T'))
# 为seq计数
counts = Counter(seq)
# 从多到少返回,是个list啊,只返回第一个
consensusSeq += counts.most_common()[0][0]
print(counts.most_common())
fh.write(consensusSeq + '\n')
fh.write('\n'.join(['A:\t' + '\t'.join(map(str, A)), 'C:\t' + '\t'.join(map(str, C)),
'G:\t' + '\t'.join(map(str, G)), 'T:\t' + '\t'.join(map(str, T))]))
# .join(map(str,A)) 把 A=[5, 1, 0, 0, 5, 5, 0, 0] 格式转化成 A:5 1 0 0 5 5 0 0
fh.close()
#style2(未成功)
import numpy as np
import os
from collections import Counter
fhand = open('rosalind_cons.txt')
t = []
for line in fhand:
if line.startswith('>'):
continue
else:
line = line.rstrip()
line_list = list(line) # 变成一个list
t.append(line_list) # 再把list 放入list
a = np.array(t) # 创建一个二维数组
print(a)
L1, L2, L3, L4 = [], [], [], []
comsquence = ''
for i in range(a.shape[1]): # shape[0],是7 行数,shape[1]是8 项目数
l = [x[i] for x in a] # 调出二维数组的每一列
L1.append(l.count('A'))
L2.append(l.count('C'))
L3.append(l.count('T'))
L4.append(l.count('G'))
comsquence = comsquence + Counter(l).most_common()[0][0]
print(comsquence)
print('A:', L1, '\n', 'C:', L2, '\n', 'T:', L3, '\n', 'G:', L4)
#style3
def update_cnt(list, char, str):
res = [a + b for a,b in zip(list, [1 if x == char else 0 for x in str])]
return(res)
def print_cnt(list, label):
print(label,end = ": ")
for x in list:
print(x, end=" ")
print()
f = open("rosalind_cons.txt", 'r')
fas = f.readlines()
seq = ''
j = 0
for i in range(len(fas)):
if fas[i][0] == '>' and j == 0:
j += 1
next
elif fas[i][0] == '>':
if j == 1:
A = [0 for x in seq]
C = A
G = A
T = A
A = update_cnt(A, 'A', seq)
C = update_cnt(C, 'C', seq)
G = update_cnt(G, 'G', seq)
T = update_cnt(T, 'T', seq)
j += 1
seq = ''
else:
seq += fas[i].strip()
## last seq
A = update_cnt(A, 'A', seq)
C = update_cnt(C, 'C', seq)
G = update_cnt(G, 'G', seq)
T = update_cnt(T, 'T', seq)
consensus = ''
for i in range(len(A)):
m = max(A[i], C[i], G[i], T[I])
if A[i] == m:
consensus += 'A'
elif C[i] == m:
consensus += 'C'
elif G[i] == m:
consensus += 'G'
elif T[i] == m:
consensus += 'T'
print(consensus)
print_cnt(A, 'A')
print_cnt(C, 'C')
print_cnt(G, 'G')
print_cnt(T, 'T')
#style4
import numpy as np #引入numpy
d=open('rosalind_cons.txt','r')
from collections import Counter #问题1里求A/T/C/G出现过多少次时用到的函数
list2=[]
a=1 #起标记作用,看是不是第一次遇到'>'
dline=''
for line in d:
if line.startswith('>'):
if a==0: #如果不是第一次遇到'>'
list2.append(list(dline)) #list是为了创建二维数组
dline=''
continue
else:
a=0 #如果第一次遇到'>'
dline+=line.rstrip() #去掉每一行右边的换行
#最后一行
list2.append(list(dline))
bi=np.array(list2)#创建二维数组
LA,LT,LC,LG=[],[],[],[]
consensus =''#共同序列
for i in range(bi.shape[1]):#数组第1行有多少个
l=bi[:,i] #调出数组每一列
listl=list(l)
LA.append(listl.count('A'))#数一个加一个,LA是数值列表,每数一列就加一个在末尾
LC.append(listl.count('C'))
LT.append(listl.count('T'))
LG.append(listl.count('G'))
most=Counter(listl).most_common() #算出最多的那个
print(most)
consensus=consensus+most[0][0]#第一列的(排序)第一的相加
print (consensus)
print("A:"," ".join(str(j) for j in LA))
print("C:"," ".join(str(j) for j in LC))
print("G:"," ".join(str(j) for j in LG))
print("T:"," ".join(str(j) for j in LT))
- 矩阵运算: numpy spicy pandas
- counters包