plink格式数据依据染色体拆分数据、依据染色体合并数据

1、准备测试数据

[root@linuxprobe test]# ls
test.map  test.ped
[root@linuxprobe test]# wc -l *  ## 46827个位点,60个样本
   46827 test.map
      60 test.ped
   46887 total
[root@linuxprobe test]# cut -f 1 test.map | uniq -c  ## 一共 26条染色体
   5244 1
   4970 2
   4465 3
   2436 4
   2109 5
   2318 6
   2005 7
   1851 8
   1901 9
   1649 10
   1077 11
   1528 12
   1527 13
   1045 14
   1506 15
   1395 16
   1291 17
   1272 18
   1124 19
    998 20
    768 21
    984 22
   1005 23
    662 24
    892 25
    805 26

2、依据染色体拆分数据 

for i in `seq 26`;do plink --file test --chr $i --sheep --recode tab --out chr$i;done  ## 利用for循环,结合plink软件提取每条染色体数据,(plink提前加入环境变量或者给绝对路径)
……
……
[root@linuxprobe test]# ls
chr10.log    chr12.ped    chr15.nosex  chr18.map    chr20.log    chr22.ped    chr25.nosex  chr3.map    chr6.log    chr8.ped
chr10.map    chr13.log    chr15.ped    chr18.nosex  chr20.map    chr23.log    chr25.ped    chr3.nosex  chr6.map    chr9.log
chr10.nosex  chr13.map    chr16.log    chr18.ped    chr20.nosex  chr23.map    chr26.log    chr3.ped    chr6.nosex  chr9.map
chr10.ped    chr13.nosex  chr16.map    chr19.log    chr20.ped    chr23.nosex  chr26.map    chr4.log    chr6.ped    chr9.nosex
chr11.log    chr13.ped    chr16.nosex  chr19.map    chr21.log    chr23.ped    chr26.nosex  chr4.map    chr7.log    chr9.ped
chr11.map    chr14.log    chr16.ped    chr19.nosex  chr21.map    chr24.log    chr26.ped    chr4.nosex  chr7.map    test.map
chr11.nosex  chr14.map    chr17.log    chr19.ped    chr21.nosex  chr24.map    chr2.log     chr4.ped    chr7.nosex  test.ped
chr11.ped    chr14.nosex  chr17.map    chr1.log     chr21.ped    chr24.nosex  chr2.map     chr5.log    chr7.ped
chr12.log    chr14.ped    chr17.nosex  chr1.map     chr22.log    chr24.ped    chr2.nosex   chr5.map    chr8.log
chr12.map    chr15.log    chr17.ped    chr1.nosex   chr22.map    chr25.log    chr2.ped     chr5.nosex  chr8.map
chr12.nosex  chr15.map    chr18.log    chr1.ped     chr22.nosex  chr25.map    chr3.log     chr5.ped    chr8.nosex

 [root@linuxprobe test]# wc -l *.map
 1649 chr10.map
 1077 chr11.map
 1528 chr12.map
 1527 chr13.map
 1045 chr14.map
 1506 chr15.map
 1395 chr16.map
 1291 chr17.map
 1272 chr18.map
 1124 chr19.map
 5244 chr1.map
 998 chr20.map
 768 chr21.map
 984 chr22.map
 1005 chr23.map
 662 chr24.map
 892 chr25.map
 805 chr26.map
 4970 chr2.map
 4465 chr3.map
 2436 chr4.map
 2109 chr5.map
 2318 chr6.map
 2005 chr7.map
 1851 chr8.map
 1901 chr9.map
 46827 test.map
 93654 total

3、依据染色体合并数据  

[root@linuxprobe test]# cp chr1.map merge.map  ##复制染色体1为起始的合并结果
[root@linuxprobe test]# cp chr1.ped merge.ped  ##同上
for i in `seq 2 26`;do plink --file merge --merge chr$i.ped chr$i.map --sheep --recode tab --out merge;done  ## 利用for循环,结合plink进行合并 (这种合并仅适合样本完全一致时合并)

  [root@linuxprobe test]# wc -l merge.ped merge.map  ##统计合并结果
  60 merge.ped
  46827 merge.map
  46887 total
  [root@linuxprobe test]# md5sum test.ped test.map merge.ped merge.map ## 检查合并后是否一致 
  04a12361169ff0fdc77349f280f11b6b test.ped
  8d4537314a4c7beaeb0da726a8c5fcba test.map
  89b9ee552b2ae3b5e9fcb5282cbbb5c9 merge.ped
  8d4537314a4c7beaeb0da726a8c5fcba merge.map

## ped文件不一致是由于在部分位点基因频率为0.5时,主次等位基因顺序颠倒引起的,不影响结果

4、验证ped文件md5sum不一致的情况   

[root@linuxprobe test]# mkdir test  ## 单独建立文件夹,将需要验证结果复制至该文件夹
[root@linuxprobe test]# cp test.map test.ped merge.map merge.ped test
[root@linuxprobe test]# cd test/
[root@linuxprobe test]# ls
merge.map  merge.ped  test.map  test.ped
 plink --file merge --freq --sheep --out merge;rm *.nosex   ## 利用plink统计基因频率
 plink --file test --freq --sheep --out test ;rm *.nosex   ## 利用plink统计基因频率
 [root@linuxprobe test]# ls
merge.frq  merge.log  merge.map  merge.ped  test.frq  test.log  test.map  test.ped
[root@linuxprobe test]# head merge.frq  ## 查看生成文件格式
 CHR                           SNP   A1   A2          MAF  NCHROBS
   1                      s64199.1    A    G       0.1667      120
   1              OAR19_64675012.1    G    C         0.25      120
   1              OAR19_64715327.1    A    G       0.1583      120
   1              OAR19_64803054.1    A    G         0.25      120
   1                DU281551_498.1    G    A          0.5      120
   1                      s18939.1    0    A            0      120
   1                  OAR1_88143.1    A    G       0.2083      120
   1                      s09912.1    G    C       0.3417      120
   1                      s36301.1    A    G        0.325      120
[root@linuxprobe test]# sed 's/^[\t ]*//' merge.frq | sed 's/[\t ]\+/\t/g' | head  ## 去除行首空格,将多个空格或者制表符转换为一个制表符,查看前十行
CHR     SNP     A1      A2      MAF     NCHROBS
1       s64199.1        A       G       0.1667  120
1       OAR19_64675012.1        G       C       0.25    120
1       OAR19_64715327.1        A       G       0.1583  120
1       OAR19_64803054.1        A       G       0.25    120
1       DU281551_498.1  G       A       0.5     120
1       s18939.1        0       A       0       120
1       OAR1_88143.1    A       G       0.2083  120
1       s09912.1        G       C       0.3417  120
1       s36301.1        A       G       0.325   120
[root@linuxprobe test]# sed 's/^[\t ]*//' merge.frq | sed 's/[\t ]\+/\t/g' | cut -f 3-4 --complement | md5sum   ## 剔除3、4列后查看md5sum
643867ea0404ada96aa61702bc61f678  -
[root@linuxprobe test]# sed 's/^[\t ]*//' test.frq | sed 's/[\t ]\+/\t/g' | cut -f 3-4 --complement | md5sum    ##剔除3、4列后查看买的sum
643867ea0404ada96aa61702bc61f678  -
## 以上说明每个稳点的基因频率完全相同

5、利用diff检查

[root@linuxprobe test]# diff merge.frq test.frq | head   ## 差异的部分基因频率全部为0.5,主次等位基因颠倒,这种颠倒不影响结果
6c6
<    1                DU281551_498.1    G    A          0.5      120
---
>    1                DU281551_498.1    A    G          0.5      120
218c218
<    1                      s64317.1    G    A          0.5      120
---
>    1                      s64317.1    A    G          0.5      120
251c251
<    1                      s45057.1    G    A          0.5      120
[root@linuxprobe test]# diff merge.frq test.frq | tail
---
>   24              OAR24_21591643.1    A    G          0.5      120
46100c46100
<   26               OAR26_5333971.1    A    G          0.5      120
---
>   26               OAR26_5333971.1    G    A          0.5      120
46697c46697
<   26                      s52004.1    G    A          0.5      120
---
>   26                      s52004.1    A    G          0.5      120

   

原文地址:https://www.cnblogs.com/liujiaxin2018/p/13770366.html