Linux 中 shell 脚本实现根据gff 计算每一个基因的最长转录本

 

001、生成基因名称的列表

awk -F "\t" '$3 == "gene" && $NF ~ /gene=/ {print $NF}' chr1.gff | sed 's/\(.*\)\(gene=[^;]\+\)\(.*\)/\2/' | sort | uniq > gene.list

 

002、计算最长转录本

a、脚本文件

#!/bin/bash

num=0
for i in $(cat gene.list)
do
        let num=$num+1
        mRNA=$(grep "$i;" chr1.gff | awk '$3 ~ /^mRNA$/' | wc -l)
        exon=$(grep "$i;" chr1.gff | awk '$3 ~ /^exon$/' | wc -l)

        if [ $exon -eq 0 ]
        then
                echo "$i has no exon"
                exit
        fi

        if [ $mRNA -gt 1 ]
        then
                grep "$i;" chr1.gff | awk '$3 ~ /^mRNA$|^exon$/' | cut -f 3-5 | awk 'BEGIN{a=0}{if($1 ~ /^mRNA$/){a++}; if($1 ~ /^exon$/) {print "exon"a, $2, $3}}' | awk '{ay[$1] += ($3 - $2 + 1)} END {for(j in ay) print j, ay[j]}' | awk -v sam=$i '{if(NR == 1) {max = $2} else if($2 > max) {max = $2}} END {print sam, max}' >> result.txt
        else
                grep "$i;" chr1.gff | awk -v sam=$i '$3 ~ /^exon$/ {sum += ($5 - $4 + 1)} END {print sam, sum}' >> result.txt
        fi
        echo $num done

done

 

b、执行程序

[root@PC1 test]# bash record.sh

 

c、结果文件:

[root@PC1 test]# head result.txt
gene=2-CP2 1080
gene=AAP1 1684
gene=abz1 1196
gene=ACI112 1787
gene=ACI36 1445
gene=ACS2 1847
gene=AdBiL 2089
gene=ADC 2800
gene=ADH 1152
gene=Adi3 2494

 

posted @ 2023-05-16 22:27  小鲨鱼2018  阅读(145)  评论(0编辑  收藏  举报