12个组,每组3个重复,一共36个样品的主成分分析图
1、导入包和数据处理
library(tidyr)
library(dplyr)
library(ggplot2)
data_o <- read.table("./PCA.txt",
header = T)
> data_o
S1 S2 S3 S4 S5 S6 S7
M1 7.395760e+04 7.078706e+04 8.417177e+04 137621.1106 147372.6292 164831.9100 7.309625e+04
M2 3.943851e+03 5.747331e+03 5.227088e+03 6523.3657 6013.1556 12186.0886 5.114227e+03
M3 6.510884e+03 5.606016e+03 6.351800e+03 12000.8553 13808.4395 14835.7556 6.354940e+03
M4 1.384136e+05 1.317561e+05 1.553619e+05 266360.3396 289288.2152 339461.9371 1.301302e+05
M5 3.801214e+03 4.022403e+03 5.158690e+03 6174.7307 6762.0510 8130.7640 4.632161e+03
M6 1.201156e+03 1.123829e+03 1.782138e+03 2042.1169 1610.9445 2399.4726 9.799112e+02
M7 5.773008e+04 3.799364e+04 6.130102e+04 30516.9603 63046.2774 24196.7880 4.144398e+04
#翻转数据
data <- t(data_o)
> data
M1 M2 M3 M4 M5 M6 M7 M8 M9 M10
S1 73957.60 3943.851 6510.884 138413.57 3801.214 1201.1558 57730.08 416.6729 1316.5815 118533.6
S2 70787.06 5747.331 5606.016 131756.14 4022.403 1123.8288 37993.64 1213.8793 1540.8043 134324.5
S3 84171.77 5227.088 6351.800 155361.89 5158.690 1782.1378 61301.02 534.9507 1244.3340 124778.7
S4 137621.11 6523.366 12000.855 266360.34 6174.731 2042.1169 30516.96 2634.5004 2461.8294 122279.8
S5 147372.63 6013.156 13808.440 289288.22 6762.051 1610.9445 63046.28 401.3044 2603.8165 119305.2
S6 164831.91 12186.089 14835.756 339461.94 8130.764 2399.4726 24196.79 473.9180 3278.3469 100409.4
#读取数据分组情况
group <- read.table("./group.txt",
header = T)
> group
Group samplenames
1 G1 S1
2 G1 S2
3 G1 S3
4 G2 S4
5 G2 S5
6 G2 S6
7 G3 S7
8 G3 S8
9 G3 S9
10 G4 S10
11 G4 S11
12 G4 S12
###上述数据格式只列出一部分
2、主成分分析
#主成分计算
pca_data <- prcomp(data, scale = TRUE)
#查看合适主成分个数
screeplot(pca_data, type = "lines")
summary(pca_data)
#查看行名,确认是否为36个样品的名称
rownames(pca_data$x)
> rownames(pca_data$x)
[1] "S1" "S2" "S3" "S4" "S5" "S6" "S7" "S8" "S9" "S10" "S11" "S12" "S13" "S14" "S15" "S16" "S17"
[18] "S18" "S19" "S20" "S21" "S22" "S23" "S24" "S25" "S26" "S27" "S28" "S29" "S30" "S31" "S32" "S33" "S34"
[35] "S35" "S36"
#提取PC1的百分比
x_per <- round(summary(pca_data)$importance[2, 1]*100, 1)
#提取PC2的百分比
y_per <- round(summary(pca_data)$importance[2, 2]*100, 1)
#按照样品名称添加组并且合并
df_sample <- data.frame(samplenames=rownames(pca_data$x), pca_data$x) %>%
left_join(group, by = "samplenames")
#数据尾部添加变量Group,只截取部分数据展示
> df_sample
PC31 PC32 PC33 PC34 PC35 PC36 Group
1 0.306133837 0.161442452 0.03766617 0.30708158 -0.186722792 -1.422473e-15 G1
2 0.237143749 -0.599197140 -0.52253461 -0.36597316 0.205268460 -1.526557e-15 G1
3 -0.073255941 -0.069312851 0.02810155 -0.15750184 0.194224495 -1.741662e-15 G1
4 0.593003778 -0.003056265 -0.31652774 -0.35467364 -0.153817055 -1.013079e-15 G2
5 -0.368112005 0.085168850 0.47080323 0.50937052 0.114099679 -1.249001e-15 G2
6 -0.160988819 0.140851886 -0.28354428 -0.05074411 -0.068941571 -1.082467e-15 G2
7 0.323091269 0.126995166 0.44969245 -0.65800755 -0.352720233 -6.522560e-16 G3
8 -0.801287671 -0.206218914 -0.16401072 0.19778263 0.516041135 -1.085070e-15 G3
9 -0.082727209 0.303859473 -0.04526318 0.51639150 -0.266017284 -1.762479e-15 G3
10 0.023928552 0.526276376 -0.37558251 -0.52455436 -0.378126147 -7.771561e-16 G4
11 0.324106409 -0.183442282 0.36741010 -0.20372081 0.423367465 -1.332268e-15 G4
12 -0.339002487 -0.271889208 0.36575640 0.66261644 0.001102365 -1.235123e-15 G4
3、ggplot绘图
#设置适合科研的背景色
theme_set(theme_bw())
#绘图
plot_1 <- ggplot(df_sample, aes(x = PC1, y = PC2)) +
geom_point(aes(color=Group),size=5) +
xlab(paste("PC1","(", x_per,"%)",sep=" ")) +
ylab(paste("PC2","(", y_per,"%)",sep=" ")) +
#t图例按照G1,G2,G3...排列
scale_color_discrete(limits = c(paste("G", seq(1:12),sep = "")))