进行各项指标之间的相关性分析

1.下载数据脚本

# 定义样本列表
SAMPLES = ["SRR1214199", "SRR1214198",
           "SRR1144866", "SRR1144863", "SRR1144860", "SRR1144852",
           "SRR1144850", "SRR1144846"]

# 定义下载规则
rule all:
    input:
        expand("{sample}/{sample}_1.fastq.gz", sample=SAMPLES),
        expand("{sample}/{sample}_2.fastq.gz", sample=SAMPLES)

# 使用prefetch下载数据
rule download_SRR:
    output:
        "{sample}/{sample}.sra"
    shell:
        """
        prefetch -X 100G {wildcards.sample}
        """

# 将sra转换为fastq文件
rule translate_fastq:
    input:
        "{sample}/{sample}.sra"
    output:
        "{sample}/{sample}_1.fastq.gz",
        "{sample}/{sample}_2.fastq.gz"
    shell:
        """
        fastq-dump {input} --split-3 --gzip -O {wildcards.sample}/
        """

2.进行ROH的计算

plink --vcf SCT.recode.vcf --make-bed --out SCT

plink --bfile SCT --recode --out SCT

plink --bfile SCT --homozyg --homozyg-density 20 --homozyg-gap 100 --homozyg-kb 500 --homozyg-snp 50 --homozyg-window-het 3 --homozyg-window-snp 50 --homozyg-window-threshold 0.05 --homozyg-window-missing 5 --out SCT

awk '{print $2,$5}' SCT.hom.indiv > SCT.txt

echo "AT.txt" > name.id

import pandas as pd
import os

def read_filenames_from_id_file(id_file_path):
    with open(id_file_path, 'r') as file:
        filenames = file.read().splitlines()
    return filenames

def process_files(filenames, base_path=''):
    result_df = pd.DataFrame()
    for filename in filenames:
        file_path = os.path.join(base_path, filename)
        # 读取数据，此处假设数据是以空格分隔的
        df = pd.read_csv(file_path, sep="\s+", header=None, usecols=[1])

        # 尝试将数据转换为浮点数，以确保数学运算可以进行
        df[1] = pd.to_numeric(df[1], errors='coerce')

        # 进行除法运算
        df[1] = df[1] / 2258664.721

        column_name = os.path.basename(file_path).split('.')[0]
        result_df[column_name] = df[1]

    result_df.to_csv('result.csv', index=False)

# 示例路径，根据实际情况调整
filenames = read_filenames_from_id_file('name.id')
process_files(filenames)

2.计算het

plink --bfile SCT1_SNP_ind_maf0.01_LD0.9 --export vcf --out India_tiger_maf0.01_LD0.9 --allow-extra-chr
vcftools --vcf SCT.recode.vcf --het --out SCT

sed '1d' SCT.het | awk '{print $1,"\t",$2,"\t",$4,"\t",($4-$2)/2258664721}' > SCT_het.txt

3.计算load

perl /home/yjc/Software/annovar/convert2annovar.pl --format vcf4old SCT.recode.vcf --outfile SCT_mit

perl /home/yjc/Software/annovar/annotate_variation.pl SCT_mit /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/ref/test/ --aamatrixfile /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/ref/test/grantham.matrix --outfile sct -buildver SCT

grep "AAMatrix" sct.exonic_variant_function > AAMatrix.txt

## 脚本（其中sample.id是AAMatrix）

import re

# 读取样本ID文件
sample_id_file = 'sample.id'

# 读取样本ID列表
with open(sample_id_file, 'r') as f:
    sample_ids = [line.strip() for line in f]

# 定义处理单个文件的函数
def process_file(sample_id):
    input_file = f'{sample_id}.txt'
    output_file = f'{sample_id}_filtered.txt'

    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            # 使用正则表达式提取AAMatrix的值
            match = re.search(r'AAMatrix=(\d+)', line)
            if match:
                # 将值转换为整数
                aamatrix_value = int(match.group(1))
                # 判断AAMatrix值是否大于150
                if aamatrix_value > 150:
                    outfile.write(line)
    print(f'Processed {input_file}, results written to {output_file}')

# 批量处理所有样本文件
for sample_id in sample_ids:
    process_file(sample_id)

awk '{print $5,$6}' AAMatrix_filtered.txt > position.txt

bcftools query -l SCT.recode.vcf

vi sample.id

### 拆分个体脚本
#!/bin/bash

# 输入文件
input_vcf="SCT.recode.vcf"
sample_list="sample.id"

# 读取样本列表
while IFS= read -r sample
do
    # 输出文件名
    output_vcf="${sample}.vcf"

    # 使用 bcftools 拆分 VCF 文件
    bcftools view -s "$sample" -Oz -o "${output_vcf}.gz" $input_vcf

    # 解压缩生成的 VCF.GZ 文件
    gunzip "${output_vcf}.gz"

    echo "Generated ${output_vcf}"
done < "$sample_list"

### snakemake批处理脚本
import os

# 读取样本 ID 列表
with open('sample.id') as f:
    samples = [line.strip() for line in f]

# 定义规则 all，列出所有需要生成的输出文件
rule all:
    input:
        expand("{sample}_position.recode.vcf", sample=samples)

# 定义规则，处理每个 VCF 文件
rule process_vcf:
    input:
        vcf = "{sample}.vcf",
        positions = "position.txt"
    output:
        recoded_vcf = "{sample}_position.recode.vcf"
    shell:
        "vcftools --vcf {input.vcf} --positions {input.positions} --recode --out {wildcards.sample}_position"


ls *recode.vcf | tr "\t" "\n" | while read id ; do awk '{print $1,$2,$10}' $id | sed "/#/d" > $id.txt ; done

ls *_position.recode.vcf.txt | tr "\t" "\n" | while read id ; do grep "1/1" $id |wc -l ;done

ls *_position.recode.vcf.txt | tr "\t" "\n" | while read id ; do grep "0/1" $id |wc -l ;done

LOF分析

snpEff ref_1 SCT.recode.vcf > SCT.ann

###筛选功能有害突变位点
import re

# 读取样本ID文件
sample_id_file = 'sample.id'

# 读取样本ID列表
with open(sample_id_file, 'r') as f:
    sample_ids = [line.strip() for line in f]

# 定义要匹配的关键字
keywords = ["stop_gained", "splice_acceptor_variant", "splice_donor_variant"]

# 定义处理单个文件的函数
def process_file(sample_id):
    input_file = f'{sample_id}.ann'
    output_file = f'{sample_id}_filtered.ann'

    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            # 检查行中是否包含任意一个关键字
            if any(keyword in line for keyword in keywords):
                outfile.write(line)
    print(f'Processed {input_file}, results written to {output_file}')

# 批量处理所有样本文件
for sample_id in sample_ids:
    process_file(sample_id)

awk '{print $1,$2}' SCT_filtered.ann > position1.txt

### snakemake脚本
import os

# 读取样本 ID 列表
with open('sample.id') as f:
    samples = [line.strip() for line in f]

# 定义规则 all，列出所有需要生成的输出文件
rule all:
    input:
        expand("{sample}_position.recode.vcf", sample=samples)

# 定义规则，处理每个 VCF 文件
rule process_vcf:
    input:
        vcf = "{sample}.vcf",
        positions = "position1.txt"
    output:
        recoded_vcf = "{sample}_position.recode.vcf"
    shell:
        "vcftools --vcf {input.vcf} --positions {input.positions} --recode --out {wildcards.sample}_position"

ls *recode.vcf | tr "\t" "\n" | while read id ; do awk '{print $1,$2,$10}' $id | sed "/#/d" > $id.txt ; done

ls *_position.recode.vcf.txt | tr "\t" "\n" | while read id ; do grep "1/1" $id |wc -l ;done

ls *_position.recode.vcf.txt | tr "\t" "\n" | while read id ; do grep "0/1" $id |wc -l ;done

个体之间平均杂合度计算脚本

import pandas as pd
import itertools

# 读取数据文件
df = pd.read_csv('SCT_het.txt', delim_whitespace=True, header=None, names=['ID', 'Col2', 'Col3', 'Het'])

# 创建一个字典以存储个体的杂合度
het_dict = df.set_index('ID')['Het'].to_dict()

# 获取所有个体ID的列表
individuals = list(het_dict.keys())

# 初始化一个列表来存储所有两两之间杂合度平均值
avg_het_values = []

# 计算所有两两之间杂合度的平均值，并四舍五入保留小数点后6位
for pair in itertools.combinations(individuals, 2):
    id1, id2 = pair
    avg_het = round((het_dict[id1] + het_dict[id2]) / 2, 6)
    avg_het_values.append((id1, id2, avg_het))

# 将结果转换为DataFrame并保存到文件
avg_het_df = pd.DataFrame(avg_het_values, columns=['ID1', 'ID2', 'Avg_Het'])
avg_het_df.to_csv('avg_het_values.txt', sep='\t', index=False)

print("计算完成，结果保存在 avg_het_values.txt 文件中。")

个体之间亲缘系数计算

plink --bfile SCT --genome --min 0 --out kinship

awk '{print $2,$4,$10}' kinship.genome > kinship.txt

个体之间相同位点共同杂合度不同情况统计[1-（2*两个体相同杂合位点数量）/ (个体1杂合位点数+个体2杂合位点数）]

import pandas as pd

# 读取VCF文件
vcf_file = 'SCT.vcf'

# 读取VCF文件内容，跳过头部的注释行
data = pd.read_csv(vcf_file, comment='#', sep='\t')

# 提取样本ID
sample_ids = data.columns[9:]

# 计算每个个体的杂合位点数量
heterozygous_counts = {}
for sample_id in sample_ids:
    heterozygous_counts[sample_id] = ((data[sample_id] == '0/1') | (data[sample_id] == '1/0')).sum()

# 初始化结果列表
results = []

# 计算任意两个个体的杂合位点相同数量并计算Avg_Het
for i in range(len(sample_ids)):
    for j in range(i + 1, len(sample_ids)):
        id1 = sample_ids[i]
        id2 = sample_ids[j]
        
        # 计算相同杂合位点数量
        same_het_count = ((data[id1] == '0/1') & (data[id2] == '0/1')).sum()
        
        # 计算Avg_Het
        avg_het = 1 - (2 * same_het_count) / (heterozygous_counts[id1] + heterozygous_counts[id2])
        
        # 记录结果
        results.append([id1, id2, avg_het])

# 转换为DataFrame
results_df = pd.DataFrame(results, columns=['ID1', 'ID2', 'Avg_Het'])

# 输出结果到CSV文件
results_df.to_csv('avg_het_results.csv', index=False)

# 输出结果到屏幕上
print(results_df)

最后编辑于：2024.08.19 08:56:10

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 199,271评论 5赞 466
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 83,725评论 2赞 376
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 146,252评论 0赞 328
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 53,634评论 1赞 270
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 62,549评论 5赞 359
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 47,985评论 1赞 275
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 37,471评论 3赞 390
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 36,128评论 0赞 254
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 40,257评论 1赞 294
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 35,233评论 2赞 317
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 37,235评论 1赞 328
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 32,940评论 3赞 316
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 38,528评论 3赞 302
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 29,623评论 0赞 19
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 30,858评论 1赞 255
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 42,245评论 2赞 344
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 41,790评论 2赞 339

进行各项指标之间的相关性分析

1.下载数据脚本

2.进行ROH的计算

2.计算het

3.计算load

LOF分析

个体之间平均杂合度计算脚本

个体之间亲缘系数计算

个体之间相同位点共同杂合度不同情况统计[1-（2*两个体相同杂合位点数量）/ (个体1杂合位点数+个体2杂合位点数）]

推荐阅读更多精彩内容