linux系统shell实现统计 plink文件基因频率

 

1、

复制代码
[root@centos79 test]# cat test.sh
#!/bin/bash

#step1 check consistence of columns
temp1=`head -n 1 $1 | awk '{print NF}'`
for i in $(seq `sed -n "$=" $1`)
do
temp2=$(sed -n "$i"p $1 | awk '{print NF}')
if [ $temp2 -ne $temp1 ]
then
echo "$i row option exception!"
echo -e "row1: $temp1 columns \nrow$i: $temp2 columns"
exit
fi
done

echo "step1 done!"

#step2 check nucleotide A T G C or 0
awk '{for(i = 7; i <= NF; i++) {print $i}}' $1 | sed 's/[\t ]/\n/' | sort -u  > tempped
for i in `cat tempped`
do
if [ $i != "G" ] && [ $i != "C" ] && [ $i != "A" ] && [ $i != "T" ] && [ $i != "0" ]
then
echo "abnormal nucleotide: $i"
exit
fi
done
rm -f tempped

echo "step2 done!"

#step3 check match of map and ped

map=$(sed -n "$=" $2)
ped=$(head -n 1 $1 | awk '{print (NF - 6)/2}')
if [ $map -ne $ped ]
then
echo "map and ped do not match!"
exit
fi

echo "step3 done!"

#step4 statistic
for i in $(seq `head -n 1 $1 | awk '{print (NF - 6)/2}'`); do awk -v a=$i '{print $(a*2 + 6),$(a*2+5) }' $1 | sed 's/ /\n/' | sort | uniq -c | sed 's/^[\t ]\+//g' | sort -n | xargs | awk '{if(NF == 2 && $2 != 0) {print "NA", "0", $2, "1"} else if(NF == 2 && $2 == 0) {print "0", "0", "0", "1"} else if(NF == 4 && $2 != 0) {print $2, $1/($1 + $3), $4, $3/($1 + $3)} else if(NF == 4 && $2 == 0) {print "NA", "0", $4, "1"} else if(NF == 6) {print $4, $3/($3 + $5), $6, $5/($3 + $5)}}' | sed 's/ /\t/g' >> tempresult.txt; done

echo "step 4 done!"

#step5 add coordinate
cut -f 1-2,4 $2 | paste -  tempresult.txt | sed "1i chr\tsnp\tpos\tmaf\tfreq\tmaj\tfreq" > freqresult.txt
rm tempresult.txt

echo "finished!"
复制代码

用法:

复制代码
[root@centos79 test]# ls   ## 测试数据
outcome.map  outcome.ped  test.sh
[root@centos79 test]# cat outcome.map
1       snp1    0       55910
1       snp2    0       85204
1       snp3    0       122948
1       snp4    0       203750
1       snp5    0       312707
1       snp6    0       356863
1       snp7    0       400518
1       snp8    0       487423
[root@centos79 test]# cat outcome.ped
DOR     1       0       0       0       -9      G G     0 0     0 0     0 0     A A     T T     G G     G C
DOR     2       0       0       0       -9      G G     G G     0 0     0 0     A A     T T     A G     C C
DOR     3       0       0       0       -9      G G     C C     0 0     G G     G G     T T     A G     G C
DOR     4       0       0       0       -9      G G     C C     0 0     G G     G G     A A     G G     G G
DOR     5       0       0       0       -9      G G     C C     0 0     G G     G G     A A     A G     G C
DOR     6       0       0       0       -9      G G     C C     0 0     G G     G G     A A     A A     C C
[root@centos79 test]# bash test.sh outcome.ped outcome.map
step1 done!
step2 done!
step3 done!
step 4 done!
finished!
[root@centos79 test]# ls   ##生成的结果
freqresult.txt  outcome.map  outcome.ped  test.sh
[root@centos79 test]# cat freqresult.txt
chr     snp     pos     maf     freq    maj     freq
1       snp1    55910   NA      0       G       1
1       snp2    85204   G       0.2     C       0.8
1       snp3    122948  0       0       0       1
1       snp4    203750  NA      0       G       1
1       snp5    312707  A       0.333333        G       0.666667
1       snp6    356863  A       0.5     T       0.5
1       snp7    400518  A       0.416667        G       0.583333
1       snp8    487423  G       0.416667        C       0.583333
复制代码

 

2、plink软件验证, 结果一致

复制代码
[root@centos79 test]# plink --file outcome --freq --out verify > /dev/null; rm *.log *.nosex
[root@centos79 test]# ls
freqresult.txt  outcome.map  outcome.ped  test.sh  verify.frq
[root@centos79 test]# cat freqresult.txt
chr     snp     pos     maf     freq    maj     freq
1       snp1    55910   NA      0       G       1
1       snp2    85204   G       0.2     C       0.8
1       snp3    122948  0       0       0       1
1       snp4    203750  NA      0       G       1
1       snp5    312707  A       0.333333        G       0.666667
1       snp6    356863  A       0.5     T       0.5
1       snp7    400518  A       0.416667        G       0.583333
1       snp8    487423  G       0.416667        C       0.583333
[root@centos79 test]# cat verify.frq
 CHR  SNP   A1   A2          MAF  NCHROBS
   1 snp1    0    G            0       12
   1 snp2    G    C          0.2       10
   1 snp3    0    0           NA        0
   1 snp4    0    G            0        8
   1 snp5    A    G       0.3333       12
   1 snp6    A    T          0.5       12
   1 snp7    A    G       0.4167       12
   1 snp8    G    C       0.4167       12
复制代码

 

posted @   小鲨鱼2018  阅读(97)  评论(0编辑  收藏  举报
编辑推荐:
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
历史上的今天:
2020-10-30 linux系统中firewalld防火墙管理工具firewall-cmd(CLI命令行)
2020-10-30 linux系统中iptables防火墙管理工具
点击右上角即可分享
微信分享提示