Python 中统计 fasta 文件每条序列的长度和 GC 含量,以及所有序列的总长度和总 GC 含量

 

001、

root@PC1:/home/test# ls
test.fasta  test.py
root@PC1:/home/test# cat test.fasta             ## 测试文件
>scaffold_1
CCCGGGTAAAACGGGTCTTCAAGAAAACGCTCCTCCGTTAATGCCGGCCGATTCAAATAA
CGCTGATTCTGATTCAGGATATACAATCTGACATGATGAACAGGTTTTCCAATTGGAATC
CGTT
>scaffold_2
CACGCCGCCAGCGTTCGTCCTGAGCCAGGATCAAACTCTCCGATAAATGGATCACAGGTT
AAGTTCACCGCATCCTGCGGCGACACCTGTGTGGCCTGCGTCGTGCAGGCCCTAGTTTGA
>scaffold_3
TTGATCCAGTGGCTCCGGTTACTCCAGTTGATCCTGTTGCGCCTGTTGCTCCAGTTTCTC
CGGTTGGTCCGGTTGATCCGGTTGCACCTGTTACTCCAGTGGCTCCGGTTACTCCCGTCG
CACCAGTTTCTCCTGTCGCACCAGTTGATCCTGTTGCGCCTGTTGGTCCTGTATCTCCAG
>scaffold_4
CCTGAGCCAGGATCAAACTCTCCGATA
root@PC1:/home/test# cat test.py                ## 脚本
#!/usr/bin/python

import re
def gc_ratio(gc_count, length):
    """Return gc_count / length, or 0.0 when length is 0.

    Guarding the zero case keeps the report from crashing on an empty
    input file or on a header that has no sequence lines.
    """
    return gc_count / length if length else 0.0


def gc_stats(lines):
    """Collect per-record and overall length / GC counts from FASTA lines.

    Args:
        lines: iterable of text lines (e.g. an open file object).

    Returns:
        A tuple (stats, total_len, total_gc) where stats maps each header
        line (including its leading ">") to [sequence_length, gc_count],
        in order of first appearance.

    Raises:
        ValueError: if a sequence line appears before any ">" header.
    """
    gc_pattern = re.compile('[GCgc]')
    stats = dict()
    total_len = 0
    total_gc = 0
    current_id = None

    for raw in lines:
        line = raw.strip()
        if not line:
            # Blank lines (often a trailing one) carry no data; indexing
            # line[0] on them would raise IndexError.
            continue
        if line[0] == ">":
            current_id = line
            # A repeated header restarts that record's counters; the
            # overall totals keep accumulating.
            stats[current_id] = [0, 0]
            continue
        if current_id is None:
            raise ValueError("sequence data found before the first '>' header line")

        line_len = len(line)
        line_gc = len(gc_pattern.findall(line))
        stats[current_id][0] += line_len
        stats[current_id][1] += line_gc
        total_len += line_len
        total_gc += line_gc

    return stats, total_len, total_gc


def write_report(stats, total_len, total_gc, out_file):
    """Write the tab-separated report: header row, "all" row, one row per record."""
    print("id", "length","len_all_gc","percentage", file = out_file, sep = "\t")
    print("all", total_len, total_gc, gc_ratio(total_gc, total_len), file = out_file, sep = "\t")
    for seq_id, (length, gc_count) in stats.items():
        print(seq_id, length, gc_count, gc_ratio(gc_count, length), file = out_file, sep = "\t")


def main(in_path="test.fasta", out_path="result.txt"):
    """Read in_path as FASTA and write the GC report to out_path."""
    # Context managers close both files even if parsing fails, so the
    # output is flushed deterministically.
    with open(in_path, "r") as in_file:
        stats, total_len, total_gc = gc_stats(in_file)
    with open(out_path, "w") as out_file:
        write_report(stats, total_len, total_gc, out_file)


if __name__ == "__main__":
    main()


root@PC1:/home/test# python test.py        ## 执行脚本
root@PC1:/home/test# ls
result.txt  test.fasta  test.py
root@PC1:/home/test# cat result.txt        ## 结果文件
id      length  len_all_gc      percentage
all     451     241     0.5343680709534369
>scaffold_1     124     55      0.4435483870967742
>scaffold_2     120     70      0.5833333333333334
>scaffold_3     180     102     0.5666666666666667
>scaffold_4     27      14      0.5185185185185185

 

参考:https://mp.weixin.qq.com/s?__biz=MzIxNzc1Mzk3NQ==&mid=2247491476&idx=1&sn=c580bf5e497442599df8ede1d382ff23&chksm=97f5af8ca082269aa436c4fb9c40abd622c4bba68e749e0492bc83c7127959d0871344eb5960&scene=178&cur_album_id=2403674812188688386#rd

 

posted @ 2022-08-07 22:30  小鲨鱼2018  阅读(393)  评论(0)  编辑  收藏  举报