python中将fasta文件按照每行指定碱基数输出

1、测试数据test.fa (一共两条序列,其中第一条序列在文件中占两行)

>OR4F5_ENSG00000186092_ENST00000641515_61_1038_2618
CCCAGATCTCTTCAGTTTTTATGCCTCATTCTGTGAAAATTGCTGTAGTCTCTTCCAGTTATGAAGAAGGTAACTGCAGAGGCTATTTCCTGGAATGAATCAACGAGTGAAACGAATAACTCTATGGTGACTGAATTCATTTTTCTGGGTCTCTCTGATTCTCAGGAACTCCAGACCTTCCTATTTATGTTGTTTTTTGTATTCTATGGAGGAATCGTGTTTGGAAACCTTCTTATTGTCATAACAGTGGTATCTGACTCCCACCTTCACTCTCCCATGTACTTCCTGCTAGCCAACCTCTCACTCATTGATCTGTCTCTGTCTTCAGTCACAGCCCCCAAGATGATTACTGACTTTTTCAGCCAGCGCAAAGTCATCTCTTTCAAGGGCTGCCTTGTTCAGATATTTCTCCTTCACTTCTTTGGTGGGAGTGAGATGGTGATCCTCATAGCCATGGGCTTTGACAGATATATAGCAATATGCAAGCCCCTACACTACACTACAATTATGTGTGGCAACGCATGTGTCGGCATTATGGCTGTCACATGGGGAATTGGCTTTCTCCATTCGGTGAGCCAGTTGGCGTTTGCCGTGCACTTACTCTTCTGTGGTCCCAATGAGGTCGATAGTTTTTATTGTGACCTTCCTAGGGTAATCAAACTTGCCTGTACAGATACCTACAGGCTAGATATTATGGTCATTGCTAACAGTGGTGTGCTCACTGTGTGTTCTTTTGTTCTTCTAATCATCTCATACACTATCATCCTAATGACCATCCAGCATCGCCCTTTAGATAAGTCGTCCAAAGCTCTGTCCACTTTGACTGCTCACATTACAGTAGTTCTTTTGTTCTTTGGACCATGTGTCTTTATTTATGCCTGGCCATTCCCCATCAAGTCATTAGATAAATTCCTTGCTGTATTTTATTCTGTGATCACCCCTCTCTTGAACCCAATTATATACACACTGAGGAACAAAGACATGAAGACGGCAATAAGACAGCTGAGAAAATGGGATGCACATTCTAGTGTAAAGTTTTAGATCTTATATAACTGTGAGATTAATCTCAGATAATGACACAAAATATAGTGAAGTTGGTAAGTTATTTAGTAAAGCTCATGAAAATTGTGCCCTCCATTCCCATATAATTTAGTAATTGTCTAGGAACTTCCACATACATTGCCTCAATTTATCTTTCAACAACTTGTGTGTTATATTTTGGAATACAGATACAAAGTTATTATGCTTTCAAAATATTCTTTTGCTAATTCTTAGAACAAAGAAAGGCATAAATATATTAGTATTTGTGTACACCTGTTCCTTCCTGTGTGACCCTAAGTTTAGTAGAAGAAAGGAGAGAAAATATAGCCTAGCTTATAAATTTAAAAAAAAATTTATTTGGTCCATTTTGTGAAAAACATAAAAAAAGAACTGTCACATCTTAATTTAAAAAATATATGCTTAGTGGTAAGGAGATATATGTCAACTTTTAAGAGGTTGAAAAACAAACGCCTCCCATTATAAGTTTATACTTCACCTCCCACCACTATAACAACCCAGAATCCATGAGGGCATTATCAGGAGTGAGTGGAAGAGTAAGTTTGCCAATGTGAAATGTGCCTTCTAGGTCCTAGACGTCTGTGGTATAACTGCTCATAAGCAGTAGAAAGAATTTAGAGGGATCCAGGCTCTCATCACGTTGGCACAAAGTATATTACTTGGATCCATCTATGTCATTTTCCATGGTTAATGTTTAAAAGCACAGGCTTTAAAGTAAAAAACAAAGAGCTGGATTCAACTCTACTGACTCTTATTAATCATGATTTTGGGCACATTACGTAGCTTTCATGAGCTTTAGTTTCTACATTTATAAACAGGAGATTATACCTATTATGCATGGTTATTATGAAGGAAAATGACAAAATAGATATAAATCAAATAGCCCACTTCGAGACATATTAAGCATGAATAAACATTAGATACTATTAAAATCCTATATA
TTAACAAAGCCAAAAGTTTCAAACTTTACTTTTTCCCAACATTCTTGTGAAATATGACACATCCCAATCTTAACAGATGCTCATTTGGGATACTGTACTTGTGAGTGGAAGTGTGTATATTTGTGTGCAAGTGTGTACTCATATACTTCCACCTTACCACCCTAGAAAGGCATGATGAAAATTTAAGATAGAAGGAAAATATAAATTGAAAAAAAAAAACCTTAACAAATGATTCTGACAAATATCTTCTCTTTCCAGGGAGAATCACTGAGCCAGAATAAAATTGAACACTAAATATTCTAAGAAAAAAGGAATCTAGTTTGTCAAAATGTGACTTGAATTAATAGATAAGGAGAGTCAGATGATAAGAGGGTCAAAATTATGTTTATCTTAGGAAAAGTAGAATAGAAAATTTATAAGCAGATTAAAAACACATAATAAAAGTAGTAAATAATAATGACAGTATCTCAAATCAGTGCAGGGGGGAAAGGCCTACTAATGTGATGGTGGGATAATTGGATAGCAATATGGGAAAAGATATATTTAATTTATTTGCTACACCAAATGCCAGGACAATCTCTAAGTGAATTCAAGACATAACTCTTTTTTCAAAAAAAC
>OR4F29_ENSG00000284733_ENST00000426406_20_955_995
AGCCCAGTTGGCTGGACCAATGGATGGAGAGAATCACTCAGTGGTATCTGAGTTTTTGTTTCTGGGACTCACTCATTCATGGGAGATCCAGCTCCTCCTCCTAGTGTTTTCCTCTGTGCTCTATGTGGCAAGCATTACTGGAAACATCCTCATTGTGTTTTCTGTGACCACTGACCCTCACTTACACTCCCCCATGTACTTTCTACTGGCCAGTCTCTCCTTCATTGACTTAGGAGCCTGCTCTGTCACTTCTCCCAAGATGATTTATGACCTGTTCAGAAAGCGCAAAGTCATCTCCTTTGGAGGCTGCATCGCTCAAATCTTCTTCATCCACGTCGTTGGTGGTGTGGAGATGGTGCTGCTCATAGCCATGGCCTTTGACAGATATGTGGCCCTATGTAAGCCCCTCCACTATCTGACCATTATGAGCCCAAGAATGTGCCTTTCATTTCTGGCTGTTGCCTGGACCCTTGGTGTCAGTCACTCCCTGTTCCAACTGGCATTTCTTGTTAATTTAGCCTTCTGTGGCCCTAATGTGTTGGACAGCTTCTACTGTGACCTTCCTCGGCTTCTCAGACTAGCCTGTACCGACACCTACAGATTGCAGTTCATGGTCACTGTTAACAGTGGGTTTATCTGTGTGGGTACTTTCTTCATACTTCTAATCTCCTACGTCTTCATCCTGTTTACTGTTTGGAAACATTCCTCAGGTGGTTCATCCAAGGCCCTTTCCACTCTTTCAGCTCACAGCACAGTGGTCCTTTTGTTCTTTGGTCCACCCATGTTTGTGTATACACGGCCACACCCTAATTCACAGATGGACAAGTTTCTGGCTATTTTTGATGCAGTTCTCACTCCTTTTCTGAATCCAGTTGTCTATACATTCAGGAATAAGGAGATGAAGGCAGCAATAAAGAGAGTATGCAAACAGCTAGTGATTTACAAGAGGATCTCATAAATGATATAATAAGCCCTTCTCATTAAACATGATATGG

 

2、python脚本

root@PC1:/home/test# cat test.py
# Re-wrap every sequence of a FASTA file to a fixed number of bases per line.
# Run as a script: reads 'test.fa' from the current directory and writes the
# re-wrapped records to 'new_fasta.fa'.


def parse_fasta(lines):
    """Collect FASTA records from an iterable of text lines.

    Args:
        lines: iterable of strings, e.g. an open text-mode file.

    Returns:
        dict mapping each header line (leading '>' kept) to its sequence
        joined into a single string. If the same header occurs more than
        once, the later record replaces the earlier one.

    Raises:
        ValueError: if non-blank sequence data appears before any header.
    """
    parts_by_header = {}
    key = None
    for line in lines:
        # Strip both '\n' and '\r' so files with Windows line endings do not
        # leak carriage returns into the sequence.
        line = line.rstrip('\r\n')
        if line.startswith('>'):
            key = line
            parts_by_header[key] = []
        elif key is not None:
            parts_by_header[key].append(line)
        elif line.strip():
            raise ValueError(
                'sequence data found before the first FASTA header: %r' % line)
    # Join once per record instead of growing a string line by line.
    return {header: ''.join(parts) for header, parts in parts_by_header.items()}


def wrap_sequence(seq, width):
    """Split seq into consecutive pieces of at most `width` characters.

    Returns a list of strings; an empty sequence yields an empty list.

    Raises:
        ValueError: if width is smaller than 1 (no progress could be made).
    """
    if width < 1:
        raise ValueError('line width must be a positive integer, got %r' % (width,))
    return [seq[start:start + width] for start in range(0, len(seq), width)]


def format_records(records, width):
    """Yield the output lines (without trailing newline) for all records.

    Each header is followed by its sequence wrapped to `width` characters.
    A record with an empty sequence contributes only its header line.
    """
    for header, seq in records.items():
        yield header
        for piece in wrap_sequence(seq, width):
            yield piece


if __name__ == '__main__':
    # choose length you want to separate: number of bases per output line
    my_length = 20

    # read fasta and save in dict
    with open('test.fa', 'r') as rawfa:
        input_fa = parse_fasta(rawfa)

    # output save in another file; 'with' closes it even if a write fails
    with open('new_fasta.fa', 'w') as output_fa:
        for out_line in format_records(input_fa, my_length):
            output_fa.write(out_line + '\n')

 

3、执行脚本

root@PC1:/home/test# ls    ## 测试数据和脚本
test.fa  test.py
root@PC1:/home/test# python3 test.py   ## 执行脚本
root@PC1:/home/test# ls
new_fasta.fa  test.fa  test.py        ## 结果文件
root@PC1:/home/test# head new_fasta.fa   ## 查看前10行
>OR4F29_ENSG00000284733_ENST00000426406_20_955_995
AGCCCAGTTGGCTGGACCAA
TGGATGGAGAGAATCACTCA
GTGGTATCTGAGTTTTTGTT
TCTGGGACTCACTCATTCAT
GGGAGATCCAGCTCCTCCTC
CTAGTGTTTTCCTCTGTGCT
CTATGTGGCAAGCATTACTG
GAAACATCCTCATTGTGTTT
TCTGTGACCACTGACCCTCA
root@PC1:/home/test# tail new_fasta.fa   ## 查看后10行
CACACCCTAATTCACAGATG
GACAAGTTTCTGGCTATTTT
TGATGCAGTTCTCACTCCTT
TTCTGAATCCAGTTGTCTAT
ACATTCAGGAATAAGGAGAT
GAAGGCAGCAATAAAGAGAG
TATGCAAACAGCTAGTGATT
TACAAGAGGATCTCATAAAT
GATATAATAAGCCCTTCTCA
TTAAACATGATATGG

 

来源:https://mp.weixin.qq.com/s?__biz=MzkyMTI1MTYxNA==&mid=2247494994&idx=1&sn=e1db20f09eac3ce6296ce3236970dbfc&chksm=c184d723f6f35e35406d92953e3ba9c72faf40192a8f8ea8507ed85cd52eb76744aae597cc16&mpshare=1&scene=23&srcid=0202lOxgPkVBELbemkiqnRTf&sharer_sharetime=1643763712632&sharer_shareid=4ed060cc4cd1efce40e3ab6dd8d8c7d4#rd

 

posted @ 2022-02-02 22:57  小鲨鱼2018  阅读(697)  评论(0)  编辑  收藏  举报