余大神的力作之一:Guangchuang Yu (https://yulab-smu.top)。绘制出的进化树还是很专业美观的!
——电子书: Data Integration, Manipulation and Visualization of Phylogenetic Trees
——书包括4个部分:1)数据输入、输出和操作。2)树数据的可视化和注释。3)ggtree延展。4)其它。
ggtree包的功能很多。这里摘抄几个例子,详见原作网站。
1.加载包
library(treedataverse)
## Attaching packages treedataverse 0.0.1
## ape 5.5 treeio 1.18.0
## dplyr 1.0.7 ggtree 3.2.0
## ggplot2 3.3.5 ggtreeExtra 1.4.0
## tidytree 0.3.6
2.Aligning graph to the tree based on the tree structure
library(ggtree)
library(TDbook)
## load `tree_nwk`, `df_info`, `df_alleles`, and `df_bar_data` from 'TDbook'
tree <- tree_nwk
snps <- df_alleles
snps_strainCols <- snps[1,]
snps<-snps[-1,] # drop strain names
colnames(snps) <- snps_strainCols
gapChar <- "?"
snp <- t(snps)
lsnp <- apply(snp, 1, function(x) {
x != snp[1,] & x != gapChar & snp[1,] != gapChar
})
lsnp <- as.data.frame(lsnp)
lsnp$pos <- as.numeric(rownames(lsnp))
lsnp <- tidyr::gather(lsnp, name, value, -pos)
snp_data <- lsnp[lsnp$value, c("name", "pos")]
## visualize the tree
p <- ggtree(tree)
## attach the sampling information data set
## and add symbols colored by location
p <- p %<+% df_info + geom_tippoint(aes(color=location))
## visualize SNP and Trait data using dot and bar charts,
## and align them based on tree structure
p + geom_facet(panel = "SNP", data = snp_data, geom = geom_point,
mapping=aes(x = pos, color = location), shape = '|') +
geom_facet(panel = "Trait", data = df_bar_data, geom = geom_col,
aes(x = dummy_bar_value, color = location,
fill = location), orientation = 'y', width = .6) +
theme_tree2(legend.position=c(.05, .85))
3.Visualize a tree with associated matrix
beast_file <- system.file("examples/MCC_FluA_H3.tree", package="ggtree")
beast_tree <- read.beast(beast_file)
genotype_file <- system.file("examples/Genotype.txt", package="ggtree")
genotype <- read.table(genotype_file, sep="\t", stringsAsFactor=F)
colnames(genotype) <- sub("\\.$", "", colnames(genotype))
p <- ggtree(beast_tree, mrsd="2013-01-01") +
geom_treescale(x=2008, y=1, offset=2) +
geom_tiplab(size=2)
gheatmap(p, genotype, offset=5, width=0.5, font.size=3,
colnames_angle=-45, hjust=0) +
scale_fill_manual(breaks=c("HuH3N2", "pdm", "trig"),
values=c("steelblue", "firebrick", "darkgreen"), name="genotype")
p <- ggtree(beast_tree, mrsd="2013-01-01") +
geom_tiplab(size=2, align=TRUE, linesize=.5) +
theme_tree2()
gheatmap(p, genotype, offset=8, width=0.6,
colnames=FALSE, legend_title="genotype") +
scale_x_ggtree() +
scale_y_continuous(expand=c(0, 0.3))
4.Visualize a tree with multiple associated matrices
nwk <- system.file("extdata", "sample.nwk", package="treeio")
tree <- read.tree(nwk)
circ <- ggtree(tree, layout = "circular")
df <- data.frame(first=c("a", "b", "a", "c", "d", "d", "a",
"b", "e", "e", "f", "c", "f"),
second= c("z", "z", "z", "z", "y", "y",
"y", "y", "x", "x", "x", "a", "a"))
rownames(df) <- tree$tip.label
df2 <- as.data.frame(matrix(rnorm(39), ncol=3))
rownames(df2) <- tree$tip.label
colnames(df2) <- LETTERS[1:3]
p1 <- gheatmap(circ, df, offset=.8, width=.2,
colnames_angle=95, colnames_offset_y = .25) +
scale_fill_viridis_d(option="D", name="discrete\nvalue")
library(ggnewscale)
p2 <- p1 + new_scale_fill()
gheatmap(p2, df2, offset=15, width=.3,
colnames_angle=90, colnames_offset_y = .25) +
scale_fill_viridis_c(option="A", name="continuous\nvalue")
5.Visualize a tree with multiple sequence alignment
library(TDbook)
# load `tree_seq_nwk` and `AA_sequence` from 'TDbook'
p <- ggtree(tree_seq_nwk) + geom_tiplab(size=3)
msaplot(p, AA_sequence, offset=3, width=2)
p <- ggtree(tree_seq_nwk, layout='circular') +
geom_tiplab(offset=4, align=TRUE) + xlim(NA, 12)
msaplot(p, AA_sequence, window=c(120, 200))
6. Composite plots
library(ggplot2)
library(ggtree)
set.seed(2019-10-31)
tr <- rtree(10)
d1 <- data.frame(
# only some labels match
label = c(tr$tip.label[sample(5, 5)], "A"),
value = sample(1:6, 6))
d2 <- data.frame(
label = rep(tr$tip.label, 5),
category = rep(LETTERS[1:5], each=10),
value = rnorm(50, 0, 3))
g <- ggtree(tr) + geom_tiplab(align=TRUE)
p1 <- ggplot(d1, aes(label, value)) + geom_col(aes(fill=label)) +
geom_text(aes(label=label, y= value+.1)) +
coord_flip() + theme_tree2() + theme(legend.position='none')
p2 <- ggplot(d2, aes(x=category, y=label)) +
geom_tile(aes(fill=value)) + scale_fill_viridis_c() +
theme_minimal() + xlab(NULL) + ylab(NULL)
7.The phylo4 and phylo4d objects
library(phylobase)
data(geospiza_raw)
g1 <- as(geospiza_raw$tree, "phylo4")
g2 <- phylo4d(g1, geospiza_raw$data, missing.data="warn")
d1 <- data.frame(x = seq(1.1, 2, length.out = 5),
lab = names(geospiza_raw$data))
p1 <- ggtree(g2) + geom_tippoint(aes(size = wingL), x = d1$x[1], shape = 1) +
geom_tippoint(aes(size = tarsusL), x = d1$x[2], shape = 1) +
geom_tippoint(aes(size = culmenL), x = d1$x[3], shape = 1) +
geom_tippoint(aes(size = beakD), x = d1$x[4], shape = 1) +
geom_tippoint(aes(size = gonysW), x = d1$x[5], shape = 1) +
scale_size_continuous(range = c(3,12), name="") +
geom_text(aes(x = x, y = 0, label = lab), data = d1, angle = 45) +
geom_tiplab(offset = 1.3) + xlim(0, 3) +
theme(legend.position = c(.1, .75))
## users can use `as.treedata(g2)` to convert `g2` to a `treedata` object
## and use `get_tree_data()` function to extract the associated data
p2 <- gheatmap(ggtree(g1), data=geospiza_raw$data, colnames_angle=45) +
geom_tiplab(offset=1) + hexpand(.2) + theme(legend.position = c(.1, .75))
aplot::plot_list(p1, p2, ncol=2, tag_levels='A')
8.ggtree for other tree-like structures
library(treeio)
library(ggplot2)
library(ggtree)
data("GNI2014", package="treemap")
n <- GNI2014[, c(3,1)]
n[,1] <- as.character(n[,1])
n[,1] <- gsub("\\s\\(.*\\)", "", n[,1])
w <- cbind("World", as.character(unique(n[,1])))
colnames(w) <- colnames(n)
edgelist <- rbind(n, w)
y <- as.phylo(edgelist)
ggtree(y, layout='circular') %<+% GNI2014 +
aes(color=continent) + geom_tippoint(aes(size=population), alpha=.6) +
geom_tiplab(aes(label=country), offset=.1) +
theme(plot.margin=margin(60,60,60,60))
9.Visualizing pairwise nucleotide sequence distance with a phylogenetic tree
library(TDbook)
library(tibble)
library(tidyr)
library(Biostrings)
library(treeio)
library(ggplot2)
library(ggtree)
# loaded from TDbook package
tree <- tree_HPV58
clade <- c(A3 = 92, A1 = 94, A2 = 108, B1 = 156,
B2 = 159, C = 163, D1 = 173, D2 = 176)
tree <- groupClade(tree, clade)
cols <- c(A1 = "#EC762F", A2 = "#CA6629", A3 = "#894418", B1 = "#0923FA",
B2 = "#020D87", C = "#000000", D1 = "#9ACD32",D2 = "#08630A")
## visualize the tree with tip labels and tree scale
p <- ggtree(tree, aes(color = group), ladderize = FALSE) %>%
rotate(rootnode(tree)) +
geom_tiplab(aes(label = paste0("italic('", label, "')")),
parse = TRUE, size = 2.5) +
geom_treescale(x = 0, y = 1, width = 0.002) +
scale_color_manual(values = c(cols, "black"),
na.value = "black", name = "Lineage",
breaks = c("A1", "A2", "A3", "B1", "B2", "C", "D1", "D2")) +
guides(color = guide_legend(override.aes = list(size = 5, shape = 15))) +
theme_tree2(legend.position = c(.1, .88))
## Optional
## add labels for monophyletic (A, C and D) and paraphyletic (B) groups
dat <- tibble(node = c(94, 108, 131, 92, 156, 159, 163, 173, 176,172),
name = c("A1", "A2", "A3", "A", "B1",
"B2", "C", "D1", "D2", "D"),
offset = c(0.003, 0.003, 0.003, 0.00315, 0.003,
0.003, 0.0031, 0.003, 0.003, 0.00315),
offset.text = c(-.001, -.001, -.001, 0.0002, -.001,
-.001, 0.0002, -.001, -.001, 0.0002),
barsize = c(1.2, 1.2, 1.2, 2, 1.2, 1.2, 3.2, 1.2, 1.2, 2),
extend = list(c(0, 0.5), 0.5, c(0.5, 0), 0, c(0, 0.5),
c(0.5, 0), 0, c(0, 0.5), c(0.5, 0), 0)
) %>%
dplyr::group_split(barsize)
p <- p +
geom_cladelab(
data = dat[[1]],
mapping = aes(
node = node,
label = name,
color = group,
offset = offset,
offset.text = offset.text,
extend = extend
),
barsize = 1.2,
fontface = 3,
align = TRUE
) +
geom_cladelab(
data = dat[[2]],
mapping = aes(
node = node,
label = name,
offset = offset,
offset.text =offset.text,
extend = extend
),
barcolor = "darkgrey",
textcolor = "darkgrey",
barsize = 2,
fontsize = 5,
fontface = 3,
align = TRUE
) +
geom_cladelab(
data = dat[[3]],
mapping = aes(
node = node,
label = name,
offset = offset,
offset.text = offset.text,
extend = extend
),
barcolor = "darkgrey",
textcolor = "darkgrey",
barsize = 3.2,
fontsize = 5,
fontface = 3,
align = TRUE
) +
geom_strip(65, 71, "italic(B)", color = "darkgrey",
offset = 0.00315, align = TRUE, offset.text = 0.0002,
barsize = 2, fontsize = 5, parse = TRUE)
## Optional
## display support values
p <- p + geom_nodelab(aes(subset = (node == 92), label = "*"),
color = "black", nudge_x = -.001, nudge_y = 1) +
geom_nodelab(aes(subset = (node == 155), label = "*"),
color = "black", nudge_x = -.0003, nudge_y = -1) +
geom_nodelab(aes(subset = (node == 158), label = "95/92/1.00"),
color = "black", nudge_x = -0.0001,
nudge_y = -1, hjust = 1) +
geom_nodelab(aes(subset = (node == 162), label = "98/97/1.00"),
color = "black", nudge_x = -0.0001,
nudge_y = -1, hjust = 1) +
geom_nodelab(aes(subset = (node == 172), label = "*"),
color = "black", nudge_x = -.0003, nudge_y = -1)
## extract accession numbers from tip labels
tl <- tree$tip.label
acc <- sub("\\w+\\|", "", tl)
names(tl) <- acc
## read sequences from GenBank directly into R
## and convert the object to DNAStringSet
tipseq <- ape::read.GenBank(acc) %>% as.character %>%
lapply(., paste0, collapse = "") %>% unlist %>%
DNAStringSet
## align the sequences using muscle
tipseq_aln <- muscle::muscle(tipseq)
tipseq_aln <- DNAStringSet(tipseq_aln)
## calculate pairwise hamming distances among sequences
tipseq_dist <- stringDist(tipseq_aln, method = "hamming")
## calculate the percentage of differences
tipseq_d <- as.matrix(tipseq_dist) / width(tipseq_aln[1]) * 100
## convert the matrix to a tidy data frame for facet_plot
dd <- as_tibble(tipseq_d)
dd$seq1 <- rownames(tipseq_d)
td <- gather(dd,seq2, dist, -seq1)
td$seq1 <- tl[td$seq1]
td$seq2 <- tl[td$seq2]
g <- p$data$group
names(g) <- p$data$label
td$clade <- g[td$seq2]
## visualize the sequence differences using dot plot and line plot
## and align the sequence difference plot to the tree using facet_plot
p2 <- p + geom_facet(panel = "Sequence Distance",
data = td, geom = geom_point, alpha = .6,
mapping = aes(x = dist, color = clade, shape = clade)) +
geom_facet(panel = "Sequence Distance",
data = td, geom = geom_path, alpha = .6,
mapping=aes(x = dist, group = seq2, color = clade)) +
scale_shape_manual(values = 1:8, guide = FALSE)
print(p2)
10.Highlighting different groups.
library(TDbook)
mytree <- tree_treenwk_30.4.19
# Define nodes for coloring later on
tiplab <- mytree$tip.label
cls <- tiplab[grep("^ch", tiplab)]
labeltree <- groupOTU(mytree, cls)
p <- ggtree(labeltree, aes(color=group, linetype=group), layout="circular") +
scale_color_manual(values = c("#efad29", "#63bbd4")) +
geom_nodepoint(color="black", size=0.1) +
geom_tiplab(size=2, color="black")
p2 <- flip(p, 136, 110) %>%
flip(141, 145) %>%
rotate(141) %>%
rotate(142) %>%
rotate(160) %>%
rotate(164) %>%
rotate(131)
### Group V and II coloring
dat <- data.frame(
node = c(110, 88, 156,136),
fill = c("#229f8a", "#229f8a", "#229f8a", "#f9311f")
)
p3 <- p2 +
geom_hilight(
data = dat,
mapping = aes(
node = node,
fill = I(fill)
),
alpha = 0.2,
extendto = 1.4
)
### Putting on a label on the avian specific expansion
p4 <- p3 +
geom_cladelab(
node = 113,
label = "Avian-specific expansion",
align = TRUE,
angle = -35,
offset.text = 0.05,
hjust = "center",
fontsize = 2,
offset = .2,
barsize = .2
)
### Adding the bootstrap values with subset used to remove all bootstraps < 50
p5 <- p4 +
geom_nodelab(
mapping = aes(
x = branch,
label = label,
subset = !is.na(as.numeric(label)) & as.numeric(label) > 50
),
size = 2,
color = "black",
nudge_y = 0.6
)
### Putting labels on the subgroups
p6 <- p5 +
geom_cladelab(
data = data.frame(
node = c(114, 121),
name = c("Subgroup A", "Subgroup B")
),
mapping = aes(
node = node,
label = name
),
align = TRUE,
offset = .05,
offset.text = .03,
hjust = "center",
barsize = .2,
fontsize = 2,
angle = "auto",
horizontal = FALSE
) +
theme(
legend.position = "none",
plot.margin = grid::unit(c(-15, -15, -15, -15), "mm")
)
print(p6)
11.Phylogenetic tree with genome locus structure
library(dplyr)
library(ggplot2)
library(gggenes)
library(ggtree)
get_genes <- function(data, genome) {
filter(data, molecule == genome) %>% pull(gene)
}
g <- unique(example_genes[,1])
n <- length(g)
d <- matrix(nrow = n, ncol = n)
rownames(d) <- colnames(d) <- g
genes <- lapply(g, get_genes, data = example_genes)
for (i in 1:n) {
for (j in 1:i) {
jaccard_sim <- length(intersect(genes[[i]], genes[[j]])) /
length(union(genes[[i]], genes[[j]]))
d[j, i] <- d[i, j] <- 1 - jaccard_sim
}
}
tree <- ape::bionj(d)
p <- ggtree(tree, branch.length='none') +
geom_tiplab() + xlim_tree(5.5) +
geom_facet(mapping = aes(xmin = start, xmax = end, fill = gene),
data = example_genes, geom = geom_motif, panel = 'Alignment',
on = 'genE', label = 'gene', align = 'left') +
scale_fill_brewer(palette = "Set3") +
scale_x_continuous(expand=c(0,0)) +
theme(strip.text=element_blank(),
panel.spacing=unit(0, 'cm'))
facet_widths(p, widths=c(1,2))