Requirement:
I want to count the SV-affected gene positions in column 6 and the SV types in column 11.
#!/bin/bash
# Check the command-line arguments
if [ $# -ne 1 ]; then
    echo "Usage: $0 filename"
    exit 1
fi
filename=$1
# Process the file with awk
awk -F'\t' '{
    # Split columns 6 and 11 on spaces and count each word
    n6 = split($6, words6, " ");
    for (i = 1; i <= n6; i++) {
        wordCounts6[words6[i]]++;
    }
    n11 = split($11, words11, " ");
    for (i = 1; i <= n11; i++) {
        wordCounts11[words11[i]]++;
    }
}
END {
    # Print the word counts for column 6
    for (word in wordCounts6) {
        print "Column 6: Word \"" word "\" appears " wordCounts6[word] " times";
    }
    # Print the word counts for column 11
    for (word in wordCounts11) {
        print "Column 11: Word \"" word "\" appears " wordCounts11[word] " times";
    }
}' "$filename"
Workflow
1. Extract SVs with grep
mkdir -p res   # make sure the output directory exists
# "sort" holds the SV IDs to extract; "txt" lists the input text file names, one per line
cat sort | while read id; do
    # Loop over each text file listed in "txt"
    cat txt | while read txt; do
        # Keep lines from the current file that contain the ID
        grep "$id" "$txt" >> "res/${txt%.txt}.tsv"
    done
done
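With many IDs, the nested loops rescan every file once per ID. A single-pass sketch of the same extraction, assuming the same "sort" and "txt" list files as above (-F treats the IDs as fixed strings rather than regular expressions):
cat txt | while read txt; do
    # grep -f reads all patterns from "sort", so each file is scanned only once
    grep -F -f sort "$txt" >> "res/${txt%.txt}.tsv"
done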
2. Standardize the output
#!/bin/bash
### wo1.sh
# Check the command-line arguments
if [ $# -ne 1 ]; then
    echo "Usage: $0 filename"
    exit 1
fi
filename=$1
# Process the file with awk
awk -F'\t' 'BEGIN {
    OFS = "\t"; # tab-separated output
    print "Column\tWord\tCount";
} {
    # Split columns 6 and 11 on spaces and count each word
    n6 = split($6, words6, " ");
    for (i = 1; i <= n6; i++) {
        wordCounts6[words6[i]]++;
    }
    n11 = split($11, words11, " ");
    for (i = 1; i <= n11; i++) {
        wordCounts11[words11[i]]++;
    }
}
END {
    # Print the word counts for column 6
    for (word in wordCounts6) {
        print "6", word, wordCounts6[word];
    }
    # Print the word counts for column 11
    for (word in wordCounts11) {
        print "11", word, wordCounts11[word];
    }
}' "$filename"
# For each sample listed in ../txt, run wo1.sh on its extracted .tsv and write the standardized table to res/
cat ../txt | while read txt
do
    sh wo1.sh "${txt%.txt}.tsv" > "res/${txt%.txt}.tsv"
done
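Because the merge step below assumes every standardized file carries the same header, an optional check (a sketch matching the res/ output path used above) can flag files that came out empty or malformed:
# Report any standardized file whose first line is not the expected header
for f in res/*.tsv; do
    head -n 1 "$f" | grep -q $'^Column\tWord\tCount$' || echo "unexpected header in $f"
done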
3. Merge the results
#!/bin/bash
output_file="merged_files"
# Take the header from the first TSV and prepend a "sample" column;
# the new first column will hold the source file name, and the R step below merges on "sample"
header_file=$(ls *.tsv | head -n 1)
header=$(awk 'BEGIN{OFS="\t"} NR==1 {print "sample", $0; exit}' "$header_file")
# Write the combined header to the merged file
echo "$header" > "$output_file"
# Append every TSV in the current directory, skipping each file's own header
# and prefixing each row with the file it came from
for file in *.tsv; do
    if [ -f "$file" ]; then
        awk -v file="$file" 'BEGIN{OFS="\t"} NR>1 {print file, $0}' "$file" >> "$output_file"
    fi
done
echo "Files have been merged into $output_file"
Statistics and plotting in R
setwd("D:\\科研助理工作\\data\\7-24")
sv_type <- read.table("sv_type.tsv", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
head(sv_type)
group <- read.table("grou.tsv", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# Look at the first few rows
head(group)
# Both data frames share a column named "sample"
merged_data <- merge(sv_type, group, by = "sample")
# Show the first few rows of the merged data frame
head(merged_data)
write.table(merged_data, file = "merged_data.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
if (!require(dplyr)) {
  install.packages("dplyr")
}
library(dplyr)
result <- merged_data %>%
  group_by(k5, Word) %>%
  summarize(
    avg_count = round(mean(Count), 0), # keep as an integer
    sd_count = round(sd(Count), 0)     # keep as an integer
  )
print(result)
write.csv(result, file = "results.csv", row.names = FALSE)
library(ggplot2)
p = ggplot(merged_data, aes(x = k5, y = Count, fill = Word)) + # colour the boxes by Word
  geom_boxplot() +
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000", "#999999", "#DDCC77",
                               "#FF6666", "#993333", "#6699CC", "#999933", "#336699")) + # custom colours
  labs(title = "Boxplot of Count by K5 and Group",
       x = "K5",
       y = "Count",
       fill = "Group") +
  theme_minimal()
p
## Add significance annotations
library(ggsignif)
compaired <- list(c("Xian", "Geng"))
p1 = p + theme_bw() +
  theme(panel.grid = element_blank()) +
  geom_signif(comparisons = compaired,
              step_increase = 0.3,
              map_signif_level = TRUE, # show significance stars instead of raw p-values
              test = wilcox.test)
p1
ggsave(filename = "xi_geng_xianz.png", plot = p1, width = 12,height = 6, dpi = 400)