linux shell中将fasta文件按照每行指定碱基数输出

1、测试数据

复制代码
root@PC1:/home/test# ls
record.txt  test.fa
root@PC1:/home/test# cat test.fa
>OR4F29_ENSG00000284733_ENST00000426406_20_955_995
AGCCCAGTTGGCTGGACCAATGGAT
GGAGAGAATCACTCAGTGGTATCTGAG
TTTTTGTTTCTGGGACTC
>OR4F16_ENSG00000284662_ENST00000332831_20_955_995
AGCCCAGTTGGCTGGACCAATGGATGGAG
AGAATCACTCAGTGGTATCTGAGTTTTTGTTTCTGGGACTCAC
>OR4F29_ENSG00000284733_ENST00000426406_20_955_995
AGCCCAGTTGGCTGGA
CCAATGGATGGAGAGAATCACTCAGTGGTATCTGAGTTTTT
GTTTCTGGGACTCACT
>OR4F16_ENSG00000284662_ENST00000332831_20_955_995
AGCCCAGTTGGCTGGACCAATGGATGGAGAGA
ATCACTCAGTGGTATCTGAGTTTTTGTTTCTGG
>OR4F29_ENSG00000284733_ENST00000426406_20_955_995
AGCCCAGTTGGCTGGA
CCAATGGATGGAGAGAATCACTCAGTGGTATCTGAGTTTTT
GTTTCTGGGACTCACT
复制代码

 

2、脚本

复制代码
root@PC1:/home/test# ls
record.txt  test.fa
root@PC1:/home/test# cat record.txt
#step1
grep -n "^>" test.fa | cut -d ":" -f 1 | paste -d " " -s | awk '{for(i = 1; i < NF; i++) printf("%d %d ", $i+1,$(i+1)-1); printf("\n")}' | awk '{for(i = 1; i <= NF; i++) if(i % 2 == 0) {print $i} else {printf("%s ", $i)}}' > topindex.txt

#step2
sed -n "/^>/=" test.fa | awk 'END{print $0 + 1}' | paste - -d " " <(sed -n "$=" test.fa ) > endindex.txt

#step3   此处6位指定每行多少个碱基,可以设定为其他数值
cat topindex.txt endindex.txt | while read {i,j}; do awk -v a=$i -v b=$j "NR == a, NR == b" test.fa | awk '{printf("%s", $0)} END {print}' | awk -v c=$i -F "" '{printf("tag%d",c); for(i = 1; i <= NF; i++) if(i % 6 == 0) {print $i} else {printf("%s ", $i)}; printf("\n"); idx = idx + 1} ' >> tempresult; done

#step4
grep "^>" test.fa | paste - -d " " <(grep "^tag" tempresult ) | while read {i,j}; do sed -i "/$j/i $i"  tempresult ; done

#step5
sed 's/^tag[0-9]*//g' tempresult > result.fa

rm endindex.txt topindex.txt tempresult
复制代码

 

3、测试

复制代码
root@PC1:/home/test# ls
record.txt  test.fa
root@PC1:/home/test# bash record.txt
root@PC1:/home/test# ls
record.txt  result.fa  test.fa
root@PC1:/home/test# cat result.fa   ## 查看结果
>OR4F29_ENSG00000284733_ENST00000426406_20_955_995
A G C C C A
G T T G G C
T G G A C C
A A T G G A
T G G A G A
G A A T C A
C T C A G T
G G T A T C
T G A G T T
T T T G T T
T C T G G G
A C T C T T
T T T G T T
T C T G G G
A C T C
>OR4F16_ENSG00000284662_ENST00000332831_20_955_995
A G C C C A
G T T G G C
T G G A C C
A A T G G A
T G G A G A
G A A T C A
C T C A G T
G G T A T C
T G A G T T
T T T G T T
T C T G G G
A C T C A C
A G A A T C
A C T C A G
T G G T A T
C T G A G T
T T T T G T
T T C T G G
G A C T C A
C
复制代码

 

4、每行20个碱基

复制代码
root@PC1:/home/test# ls
record.txt  test.fa
root@PC1:/home/test# cat record.txt
#step1
grep -n "^>" test.fa | cut -d ":" -f 1 | paste -d " " -s | awk '{for(i = 1; i < NF; i++) printf("%d %d ", $i+1,$(i+1)-1); printf("\n")}' | awk '{for(i = 1; i <= NF; i++) if(i % 2 == 0) {print $i} else {printf("%s ", $i)}}' > topindex.txt

#step2
sed -n "/^>/=" test.fa | awk 'END{print $0 + 1}' | paste - -d " " <(sed -n "$=" test.fa ) > endindex.txt

#step3  此处改为20
cat topindex.txt endindex.txt | while read {i,j}; do awk -v a=$i -v b=$j "NR == a, NR == b" test.fa | awk '{printf("%s", $0)} END {print}' | awk -v c=$i -F "" '{printf("tag%d",c); for(i = 1; i <= NF; i++) if(i % 20 == 0) {print $i} else {printf("%s ", $i)}; printf("\n"); idx = idx + 1} ' >> tempresult; done

#step4
grep "^>" test.fa | paste - -d " " <(grep "^tag" tempresult ) | while read {i,j}; do sed -i "/$j/i $i"  tempresult ; done

#step5
sed 's/^tag[0-9]*//g' tempresult > result.fa

rm endindex.txt topindex.txt tempresult
root@PC1:/home/test# bash record.txt
root@PC1:/home/test# ls
record.txt  result.fa  test.fa
root@PC1:/home/test# cat result.fa   ## 查看结果
>OR4F29_ENSG00000284733_ENST00000426406_20_955_995
A G C C C A G T T G G C T G G A C C A A
T G G A T G G A G A G A A T C A C T C A
G T G G T A T C T G A G T T T T T G T T
T C T G G G A C T C T T T T T G T T T C
T G G G A C T C
>OR4F16_ENSG00000284662_ENST00000332831_20_955_995
A G C C C A G T T G G C T G G A C C A A
T G G A T G G A G A G A A T C A C T C A
G T G G T A T C T G A G T T T T T G T T
T C T G G G A C T C A C A G A A T C A C
T C A G T G G T A T C T G A G T T T T T
G T T T C T G G G A C T C A C
>OR4F29_ENSG00000284733_ENST00000426406_20_955_995
A G C C C A G T T G G C T G G A C C A A
T G G A T G G A G A G A A T C A C T C A
G T G G T A T C T G A G T T T T T G T T
T C T G G G A C T C A C T G T T T C T G
G G A C T C A C T
>OR4F16_ENSG00000284662_ENST00000332831_20_955_995
A G C C C A G T T G G C T G G A C C A A
T G G A T G G A G A G A A T C A C T C A
G T G G T A T C T G A G T T T T T G T T
T C T G G A T C A C T C A G T G G T A T
C T G A G T T T T T G T T T C T G G
>OR4F29_ENSG00000284733_ENST00000426406_20_955_995
A G C C C A G T T G G C T G G A C C A A
T G G A T G G A G A G A A T C A C T C A
G T G G T A T C T G A G T T T T T G T T
T C T G G G A C T C A C T G T T T C T G
G G A C T C A C T
复制代码

 

posted @   小鲨鱼2018  阅读(228)  评论(3编辑  收藏  举报
相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
历史上的今天:
2021-02-03 c语言中求数组元素的最大值和最小值
2021-02-03 c语言中对象式宏(全局变量)
2021-02-03 c语言中实现数组的倒序排列
2021-02-03 linux系统 centos7.9 中安装 Rsudio-server
点击右上角即可分享
微信分享提示