plink格式数据依据染色体拆分数据、依据染色体合并数据
1、准备测试数据
[root@linuxprobe test]# ls
test.map test.ped
[root@linuxprobe test]# wc -l * ## 46827个位点,60个样本
46827 test.map
60 test.ped
46887 total
[root@linuxprobe test]# cut -f 1 test.map | uniq -c ## 一共 26条染色体
5244 1
4970 2
4465 3
2436 4
2109 5
2318 6
2005 7
1851 8
1901 9
1649 10
1077 11
1528 12
1527 13
1045 14
1506 15
1395 16
1291 17
1272 18
1124 19
998 20
768 21
984 22
1005 23
662 24
892 25
805 26
2、依据染色体拆分数据
for i in `seq 26`;do plink --file test --chr $i --sheep --recode tab --out chr$i;done ## 利用for循环,结合plink软件提取每条染色体数据,(plink提前加入环境变量或者给绝对路径)
……
……
[root@linuxprobe test]# ls
chr10.log chr12.ped chr15.nosex chr18.map chr20.log chr22.ped chr25.nosex chr3.map chr6.log chr8.ped
chr10.map chr13.log chr15.ped chr18.nosex chr20.map chr23.log chr25.ped chr3.nosex chr6.map chr9.log
chr10.nosex chr13.map chr16.log chr18.ped chr20.nosex chr23.map chr26.log chr3.ped chr6.nosex chr9.map
chr10.ped chr13.nosex chr16.map chr19.log chr20.ped chr23.nosex chr26.map chr4.log chr6.ped chr9.nosex
chr11.log chr13.ped chr16.nosex chr19.map chr21.log chr23.ped chr26.nosex chr4.map chr7.log chr9.ped
chr11.map chr14.log chr16.ped chr19.nosex chr21.map chr24.log chr26.ped chr4.nosex chr7.map test.map
chr11.nosex chr14.map chr17.log chr19.ped chr21.nosex chr24.map chr2.log chr4.ped chr7.nosex test.ped
chr11.ped chr14.nosex chr17.map chr1.log chr21.ped chr24.nosex chr2.map chr5.log chr7.ped
chr12.log chr14.ped chr17.nosex chr1.map chr22.log chr24.ped chr2.nosex chr5.map chr8.log
chr12.map chr15.log chr17.ped chr1.nosex chr22.map chr25.log chr2.ped chr5.nosex chr8.map
chr12.nosex chr15.map chr18.log chr1.ped chr22.nosex chr25.map chr3.log chr5.ped chr8.nosex
[root@linuxprobe test]# wc -l *.map
1649 chr10.map
1077 chr11.map
1528 chr12.map
1527 chr13.map
1045 chr14.map
1506 chr15.map
1395 chr16.map
1291 chr17.map
1272 chr18.map
1124 chr19.map
5244 chr1.map
998 chr20.map
768 chr21.map
984 chr22.map
1005 chr23.map
662 chr24.map
892 chr25.map
805 chr26.map
4970 chr2.map
4465 chr3.map
2436 chr4.map
2109 chr5.map
2318 chr6.map
2005 chr7.map
1851 chr8.map
1901 chr9.map
46827 test.map
93654 total
3、依据染色体合并数据
[root@linuxprobe test]# cp chr1.map merge.map ##复制染色体1为起始的合并结果
[root@linuxprobe test]# cp chr1.ped merge.ped ##同上
for i in `seq 2 26`;do plink --file merge --merge chr$i.ped chr$i.map --sheep --recode tab --out merge;done ## 利用for循环,结合plink进行合并 (这种合并仅适合样本完全一致时合并)
[root@linuxprobe test]# wc -l merge.ped merge.map ##统计合并结果
60 merge.ped
46827 merge.map
46887 total
[root@linuxprobe test]# md5sum test.ped test.map merge.ped merge.map ## 检查合并后是否一致
04a12361169ff0fdc77349f280f11b6b test.ped
8d4537314a4c7beaeb0da726a8c5fcba test.map
89b9ee552b2ae3b5e9fcb5282cbbb5c9 merge.ped
8d4537314a4c7beaeb0da726a8c5fcba merge.map
## ped文件不一致是由于在部分位点基因频率为0.5时,主次等位基因顺序颠倒引起的,不影响结果
4、验证ped文件md5sum不一致的情况
[root@linuxprobe test]# mkdir test ## 单独建立文件夹,将需要验证结果复制至该文件夹
[root@linuxprobe test]# cp test.map test.ped merge.map merge.ped test
[root@linuxprobe test]# cd test/
[root@linuxprobe test]# ls
merge.map merge.ped test.map test.ped
plink --file merge --freq --sheep --out merge;rm *.nosex ## 利用plink统计基因频率
plink --file test --freq --sheep --out test ;rm *.nosex ## 利用plink统计基因频率
[root@linuxprobe test]# ls
merge.frq merge.log merge.map merge.ped test.frq test.log test.map test.ped
[root@linuxprobe test]# head merge.frq ## 查看生成文件格式
CHR SNP A1 A2 MAF NCHROBS
1 s64199.1 A G 0.1667 120
1 OAR19_64675012.1 G C 0.25 120
1 OAR19_64715327.1 A G 0.1583 120
1 OAR19_64803054.1 A G 0.25 120
1 DU281551_498.1 G A 0.5 120
1 s18939.1 0 A 0 120
1 OAR1_88143.1 A G 0.2083 120
1 s09912.1 G C 0.3417 120
1 s36301.1 A G 0.325 120
[root@linuxprobe test]# sed 's/^[\t ]*//' merge.frq | sed 's/[\t ]\+/\t/g' | head ## 去除行首空格,将多个空格或者制表符转换为一个制表符,查看前十行
CHR SNP A1 A2 MAF NCHROBS
1 s64199.1 A G 0.1667 120
1 OAR19_64675012.1 G C 0.25 120
1 OAR19_64715327.1 A G 0.1583 120
1 OAR19_64803054.1 A G 0.25 120
1 DU281551_498.1 G A 0.5 120
1 s18939.1 0 A 0 120
1 OAR1_88143.1 A G 0.2083 120
1 s09912.1 G C 0.3417 120
1 s36301.1 A G 0.325 120
[root@linuxprobe test]# sed 's/^[\t ]*//' merge.frq | sed 's/[\t ]\+/\t/g' | cut -f 3-4 --complement | md5sum ## 剔除3、4列后查看md5sum
643867ea0404ada96aa61702bc61f678 -
[root@linuxprobe test]# sed 's/^[\t ]*//' test.frq | sed 's/[\t ]\+/\t/g' | cut -f 3-4 --complement | md5sum ##剔除3、4列后查看买的sum
643867ea0404ada96aa61702bc61f678 -
## 以上说明每个稳点的基因频率完全相同
5、利用diff检查
[root@linuxprobe test]# diff merge.frq test.frq | head ## 差异的部分基因频率全部为0.5,主次等位基因颠倒,这种颠倒不影响结果
6c6
< 1 DU281551_498.1 G A 0.5 120
---
> 1 DU281551_498.1 A G 0.5 120
218c218
< 1 s64317.1 G A 0.5 120
---
> 1 s64317.1 A G 0.5 120
251c251
< 1 s45057.1 G A 0.5 120
[root@linuxprobe test]# diff merge.frq test.frq | tail
---
> 24 OAR24_21591643.1 A G 0.5 120
46100c46100
< 26 OAR26_5333971.1 A G 0.5 120
---
> 26 OAR26_5333971.1 G A 0.5 120
46697c46697
< 26 s52004.1 G A 0.5 120
---
> 26 s52004.1 A G 0.5 120