转自http://blog.csdn.net/u014801157/article/details/24372465
感觉挺有用的,注意原作版权哈,咱们学习学习~
Biostrings最后一节,介绍模式匹配和序列比对的相关函数和操作。
下面我们使用拟南芥基因转录起始点上游1kb的序列进行分析。序列文件可以从TAIR网站(http://www.arabidopsis.org)下载。先用readDNAStringSet函数从FASTA文件中读取序列并查看头2个序列的信息:
library(Biostrings)
upstream.1k <- readDNAStringSet(file.choose(), "fasta")
head(upstream.1k, 2)
## A DNAStringSet instance of length 2
## width seq names
## [1] 1000 ACAAGCATGCTTAGCATACGT...ATCTTAATGTGGATAGTGCT AT1G08520 | chr1:...
## [2] 1000 CGTAACAGTAACAACTATATT...CTCCTGGAGAAGAGAAGACT AT1G08530 | chr1:...
# 获取第29条序列
DNA.str <- upstream.1k[[29]]
一、模式匹配(pattern match)
1、单模式匹配
matchPattern():1个查询模式1条序列
countPattern():1个查询模式1条序列,仅计数
vmatchPattern():1个查询模式n条序列
vcountPattern():1个查询模式n条序列,仅计数
pattern, 匹配模式
subject, 参考序列
max.mismatch=0, 最大错配数
min.mismatch=0, 最小错配数
with.indels=FALSE/TRUE, 是否允许Indels(插入/缺失)。如果设置为TRUE,min.mismatch必需为0,而max.mismatch解释为编辑位点与匹配区域间的距离
fixed=TRUE/FALSE, 是否允许按照IUPAC代码进行简并碱基的匹配
algorithm="auto", 可设"auto", "naive-exact", "naive-inexact", "boyer-moore", "shift-or" or "indels"等值。但不能随便设置,得看前面几个参数的设置情况。
#
# 1模式查询1条序列:非模糊匹配,不能使用IUPAC简并代码,所以结果找不到(views:
# NONE)
matchPattern("ACGTGKC", DNA.str)
## Views on a 1000-letter DNAString subject
## subject: AGAGAAGGCTCCAAACACGTGTCTCAGCCATA...AATAAATCTTTGCTCATTATATGTCCTCAGC
## views: NONE
# 模糊匹配,使用IUPAC简并代码,K可匹配G/T
matchPattern("ACGTGKC", DNA.str, fixed = FALSE)
## Views on a 1000-letter DNAString subject
## subject: AGAGAAGGCTCCAAACACGTGTCTCAGCCATA...AATAAATCTTTGCTCATTATATGTCCTCAGC
## views:
## start end width
## [1] 17 23 7 [ACGTGTC]
## [2] 65 71 7 [ACGTGTC]
# 仅计数
countPattern("ACGTGKC", DNA.str, fixed = FALSE)
## [1] 2
# 1模式查询n条序列,仅计数:
vc <- vcountPattern("ACGTGKC", upstream.1k, fixed = FALSE)
# 看看最多匹配数,太多了
max(vc)
## [1] 970
# 用subset函数提取具有最多匹配数的序列子集
subs <- subset(upstream.1k, vc == max(vc))
# 原来匹配的都是NNN...,怪不得那么多
matchPattern("ACGTGKC", subs[[1]], fixed = FALSE)
## Views on a 1000-letter DNAString subject
## subject: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNGAATTCGTCGACCAGGACGGCGGA
## views:
## start end width
## [1] 1 7 7 [NNNNNNN]
## [2] 2 8 7 [NNNNNNN]
## [3] 3 9 7 [NNNNNNN]
## [4] 4 10 7 [NNNNNNN]
## [5] 5 11 7 [NNNNNNN]
## [6] 6 12 7 [NNNNNNN]
## [7] 7 13 7 [NNNNNNN]
## [8] 8 14 7 [NNNNNNN]
## [9] 9 15 7 [NNNNNNN]
## ... ... ... ... ...
## [962] 962 968 7 [NNNNNNN]
## [963] 963 969 7 [NNNNNNN]
## [964] 964 970 7 [NNNNNNN]
## [965] 965 971 7 [NNNNNNN]
## [966] 966 972 7 [NNNNNNN]
## [967] 967 973 7 [NNNNNNN]
## [968] 968 974 7 [NNNNNNN]
## [969] 969 975 7 [NNNNNNN]
## [970] 970 976 7 [NNNNNNN]
vc <- vcountPattern("ACGTGKC", upstream.1k, fixed = "subject")
subs <- subset(upstream.1k, vc == max(vc))
# 查看结果,这才是我们想要的:
matchPattern("ACGTGKC", subs[[1]], fixed = "subject")
## Views on a 1000-letter DNAString subject
## subject: TTACCCGAGTTGGGTAATCGACTCGGTTGGTT...CCTTTTCATTTTCTTCTCCCTCTCCTGGGTT
## views:
## start end width
## [1] 171 177 7 [ACGTGGC]
## [2] 350 356 7 [ACGTGTC]
## [3] 421 427 7 [ACGTGTC]
## [4] 449 455 7 [ACGTGGC]
matchPattern("ACGTGKC", subs[[1]], fixed = c(subject = TRUE, pattern = FALSE))
## Views on a 1000-letter DNAString subject
## subject: TTACCCGAGTTGGGTAATCGACTCGGTTGGTT...CCTTTTCATTTTCTTCTCCCTCTCCTGGGTT
## views:
## start end width
## [1] 171 177 7 [ACGTGGC]
## [2] 350 356 7 [ACGTGTC]
## [3] 421 427 7 [ACGTGTC]
## [4] 449 455 7 [ACGTGGC]
names(subset(upstream.1k, vc > 2))
## [1] "AT1G24580 | chr1:8709180-8710179 FORWARD LENGTH=1000"
## [2] "AT1G09210 | chr1:2976751-2977750 REVERSE LENGTH=1000"
## [3] "AT1G19490 | chr1:6754000-6754999 REVERSE LENGTH=1000"
## [4] "AT1G07870 | chr1:2432058-2433057 REVERSE LENGTH=1000"
## [5] "AT1G51140 | chr1:18945754-18946753 REVERSE LENGTH=1000"
## [6] "AT1G20180 | chr1:6995210-6996209 FORWARD LENGTH=1000"
## [7] "AT1G77450 | chr1:29098954-29099953 FORWARD LENGTH=1000"
## [8] "AT1G72510 | chr1:27302366-27303365 FORWARD LENGTH=1000"
## [9] "AT1G60190 | chr1:22197403-22198402 FORWARD LENGTH=1000"
## [10] "AT1G79040 | chr1:29735016-29736015 FORWARD LENGTH=1000"
## [11] "AT1G55520 | chr1:20728092-20729091 REVERSE LENGTH=1000"
## [12] "AT2G04350 | chr2:1514805-1515804 FORWARD LENGTH=1000"
## [13] "AT2G22240 | chr2:9454095-9455094 REVERSE LENGTH=1000"
## [14] "AT3G63350 | chr3:23398468-23399467 FORWARD LENGTH=1000"
## [15] "AT3G33154 | chr3:13964599-13965598 FORWARD LENGTH=1000"
## [16] "AT3G42065 | chr3:14254659-14255658 FORWARD LENGTH=1000"
## [17] "AT3G63070 | chr3:23301365-23302364 FORWARD LENGTH=1000"
## [18] "AT3G49440 | chr3:18336318-18337317 REVERSE LENGTH=1000"
## [19] "AT3G10985 | chr3:3441528-3442527 FORWARD LENGTH=1000"
## [20] "AT3G15280 | chr3:5144694-5145693 REVERSE LENGTH=1000"
## [21] "AT3G03680 | chr3:906500-907499 FORWARD LENGTH=1000"
## [22] "AT3G02140 | chr3:386539-387538 REVERSE LENGTH=1000"
## [23] "AT3G53040 | chr3:19666488-19667487 REVERSE LENGTH=1000"
## [24] "AT4G21940 | chr4:11639802-11640801 FORWARD LENGTH=1000"
## [25] "AT4G25570 | chr4:13055624-13056623 REVERSE LENGTH=1000"
## [26] "AT4G27410 | chr4:13709150-13710149 REVERSE LENGTH=1000"
## [27] "AT5G50360 | chr5:20506339-20507338 REVERSE LENGTH=1000"
## [28] "AT5G10490 | chr5:3304762-3305761 REVERSE LENGTH=1000"
## [29] "AT5G15950 | chr5:5204869-5205868 FORWARD LENGTH=1000"
## [30] "AT5G28667 | chr5:10689062-10690061 FORWARD LENGTH=1000"
## [31] "AT5G06980 | chr5:2166468-2167467 FORWARD LENGTH=1000"
## [32] "AT5G15948 | chr5:5204871-5205870 FORWARD LENGTH=1000"
as(maskMotif(DNA.str, "ACGTGKC", fixed = "subject"), "Views")
## Views on a 1000-letter DNAString subject
## subject: AGAGAAGGCTCCAAACACGTGTCTCAGCCATA...AATAAATCTTTGCTCATTATATGTCCTCAGC
## views:
## start end width
## [1] 1 16 16 [AGAGAAGGCTCCAAAC]
## [2] 24 64 41 [TCAGCCATATTATATCATTCGTCCGAGAGAAGGCTCCACAC]
## [3] 72 1000 929 [TCAGCCATATGTCTATCCGACGTAA...CTTTGCTCATTATATGTCCTCAGC]
gaps(matchPattern("ACGTGKC", DNA.str, fixed = "subject"))
## Views on a 1000-letter DNAString subject
## subject: AGAGAAGGCTCCAAACACGTGTCTCAGCCATA...AATAAATCTTTGCTCATTATATGTCCTCAGC
## views:
## start end width
## [1] 1 16 16 [AGAGAAGGCTCCAAAC]
## [2] 24 64 41 [TCAGCCATATTATATCATTCGTCCGAGAGAAGGCTCCACAC]
## [3] 72 1000 929 [TCAGCCATATGTCTATCCGACGTAA...CTTTGCTCATTATATGTCCTCAGC]
2、多模式匹配与PDict类
matchPDict():n个查询模式1条序列
countPDict():n个查询模式1条序列,仅计数
vmatchPDict():n个查询模式n条序列
vcountPDict():n个查询模式n条序列,仅计数
getClass("TB_PDict")
## Class "TB_PDict" [package "Biostrings"]
##
## Slots:
##
## Name: threeparts dict0 constant_width dups0
## Class: PDict3Parts DNAStringSet logical Dups
##
## Name: elementType elementMetadata metadata
## Class: character DataTableORNULL list
##
## Extends:
## Class "PDict", directly
## Class "List", by class "PDict", distance 2
## Class "Vector", by class "PDict", distance 3
## Class "Annotated", by class "PDict", distance 4
##
## Known Subclasses: "Expanded_TB_PDict"
PDict(x, max.mismatch = NA, tb.start = NA, tb.end = NA, tb.width = NA, algorithm = "ACtree2",
skip.invalid.patterns = FALSE)
x是为DNAStringSet对象或可以转成DNAStringSet对象的字符串或XStringViews对象。
如果PDict全部使用默认参数,那么x中只能包含确定的碱基,即A/T/G/C;如果要使用模糊匹配,那就得设定tb.start/tb.end/tb.width。
# 这是错误的,因为tb.start和tb.end的设置导致tb含有非DNA碱基
xx <- PDict("ATCNGGC", tb.start = 1, tb.end = 4)
## Error: non base DNA letter found in Trusted Band for pattern 1
# 下面是正确的:
xx <- PDict("ATCNGGC", tb.start = 2, tb.end = 3)
# head部分为A
head(xx)
## A DNAStringSet instance of length 1
## width seq
## [1] 1 A
# tb部分为TC
tb(xx)
## A DNAStringSet instance of length 1
## width seq
## [1] 2 TC
# tail部分为NGGC
tail(xx)
## A DNAStringSet instance of length 1
## width seq
## [1] 4 NGGC
xx <- PDict(c("ATCNGGC", "ANTCGGCG"), tb.start = 1, tb.width = 1)
class(xx)
## [1] "TB_PDict"
## attr(,"package")
## [1] "Biostrings"
# xx的head为NULL
head(xx)
## NULL
tb(xx)
## A DNAStringSet instance of length 2
## width seq
## [1] 1 A
## [2] 1 A
tail(xx)
## A DNAStringSet instance of length 2
## width seq
## [1] 6 TCNGGC
## [2] 7 NTCGGCG
# 产生随机序列的函数
rndSeq <- function(dict, n) {
paste(sample(dict, n, replace = T), collapse = "")
}
#
# matchPattern和matchPDict运行时比较函数:参数len.p为查询模式的长度,n.p为查询模式的数量
comp.runtime <- function(len.p, n.p) {
dna <- DNAString(rndSeq(DNA_BASES, 1e+07))
pat <- mapply(rndSeq, list(DNA_BASES), rep(len.p, n.p))
pd <- PDict(pat)
t.matchPattern <- system.time(for (i in 1:length(pat)) matchPattern(pat[i],
dna))
t.matchPDict <- system.time(matchPDict(pd, dna))
rbind(t.matchPattern, t.matchPDict)[, 1:3]
}
# 1个长度为20bp的motif,matchPattern反而比matchPDict要快
comp.runtime(20, 1)
## user.self sys.self elapsed
## t.matchPattern 0.02 0 0.01
## t.matchPDict 0.09 0 0.10
# 10个长度为20bp的motif
comp.runtime(20, 10)
## user.self sys.self elapsed
## t.matchPattern 0.27 0 0.28
## t.matchPDict 0.09 0 0.09
# 100个长度为20bp的motif
comp.runtime(20, 100)
## user.self sys.self elapsed
## t.matchPattern 2.41 0 2.45
## t.matchPDict 0.07 0 0.09
# 1000个长度为20bp的motif
comp.runtime(20, 1000)
## user.self sys.self elapsed
## t.matchPattern 25.05 0.04 25.39
## t.matchPDict 0.17 0.00 0.17
3、位置权重匹配
motif <- DNAStringSet(c(rep("ACGTGGC", 10), rep("ACGTGTC", 3)))
(pwm <- PWM(motif))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## A 0.1442 0.0000 0.0000 0.0000 0.0000 0.00000 0.0000
## C 0.0000 0.1442 0.0000 0.0000 0.0000 0.00000 0.1442
## G 0.0000 0.0000 0.1442 0.0000 0.1442 0.13487 0.0000
## T 0.0000 0.0000 0.0000 0.1442 0.0000 0.09315 0.0000
matchPWM(pwm, subject, min.score = "80%", ...)
countPWM(pwm, subject, min.score = "80%", ...)
PWMscoreStartingAt(pwm, subject, starting.at = 1)
(xx <- matchPWM(pwm, DNA.str, min.score = 0.9))
## Views on a 1000-letter DNAString subject
## subject: AGAGAAGGCTCCAAACACGTGTCTCAGCCATA...AATAAATCTTTGCTCATTATATGTCCTCAGC
## views:
## start end width
## [1] 17 23 7 [ACGTGTC]
## [2] 65 71 7 [ACGTGTC]
(xx <- matchPWM(pwm, DNA.str, min.score = 0.7))
## Views on a 1000-letter DNAString subject
## subject: AGAGAAGGCTCCAAACACGTGTCTCAGCCATA...AATAAATCTTTGCTCATTATATGTCCTCAGC
## views:
## start end width
## [1] 17 23 7 [ACGTGTC]
## [2] 65 71 7 [ACGTGTC]
## [3] 105 111 7 [TCGTTGC]
## [4] 135 141 7 [AAGTGAC]
## [5] 266 272 7 [ATGTGGG]
## [6] 328 334 7 [ACGGTGC]
## [7] 372 378 7 [AATTGGC]
## [8] 641 647 7 [ATGTGGG]
## [9] 711 717 7 [TCGTGCC]
countPWM(pwm, DNA.str, min.score = 0.7)
## [1] 9
PWMscoreStartingAt(pwm, DNA.str, starting.at = start(xx))
## [1] 0.9583 0.9583 0.7116 0.7209 0.7116 0.7116 0.7116 0.7116 0.7209
二、序列比对
pwa <- pairwiseAlignment("ATCGCAC", "ATCGAAAC", gapOpening = -3, gapExtension = -1)
pwa
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [1] ATCG--CAC
## subject: [1] ATCGAA-AC
## score: 2.891
score(pwa)
## [1] 2.891
pattern(pwa)
## [1] ATCG--CAC
subject(pwa)
## [1] ATCGAA-AC
aligned(pwa)
## A BStringSet instance of length 1
## width seq
## [1] 8 ATCG--AC
PairwiseAlignments("-PA--W-HEAE", "HEAGAWGHE-E")
## Global PairwiseAlignments (1 of 1)
## pattern: [1] PA--W-HEAE
## subject: [2] EAGAWGHE-E
## score: -6
PairwiseAlignments("-PA--W-HEAE", "HEAGAWGHE-E", gapOpening = -3, gapExtension = -1)
## Global PairwiseAlignments (1 of 1)
## pattern: [1] PA--W-HEAE
## subject: [2] EAGAWGHE-E
## score: -18
PairwiseAlignments("PAWHEAE", "HEAGAWGHEE")
## Error: 'pattern' and 'subject' must have the same number of characters
DNAMultipleAlignment(x = character(), start = NA, end = NA, width = NA, use.names = TRUE,
rowmask = NULL, colmask = NULL)
RNAMultipleAlignment(x = character(), start = NA, end = NA, width = NA, use.names = TRUE,
rowmask = NULL, colmask = NULL)
AAMultipleAlignment(x = character(), start = NA, end = NA, width = NA, use.names = TRUE,
rowmask = NULL, colmask = NULL)