Linux 中使用 awk 和 python 给 fasta 文件中重复的染色体名添加重复标记

001、 awk实现

[root@pc1 test1]# ls
a.txt
[root@pc1 test1]# cat a.txt       ## 测试文件
>jcf7180003470556
2 7
>jcf7180003470556
3 8
>jcf7180003470552
4 9 6
>jcf7180003470546
5 3
>jcf7180003470558
6 2
>jcf7180003470556
7 1
>jcf7180003470550
8 5
>jcf7180003470558
10 4 3

[root@pc1 test1]# awk '/^>/ { n = ++seen[$0]; if (n > 1) $0 = $0 "_" n } { print }' a.txt   ## only lines starting with ">" are headers; the N-th repeat (N >= 2) gets "_N" appended
>jcf7180003470556             ## 给重复的染色体名做标记
2 7
>jcf7180003470556_2
3 8
>jcf7180003470552
4 9 6
>jcf7180003470546
5 3
>jcf7180003470558
6 2
>jcf7180003470556_3
7 1
>jcf7180003470550
8 5
>jcf7180003470558_2
10 4 3

002、python实现

a、

[root@pc1 test1]# cat a.fa       ## 测试数据
>jcf7180003470556
2 7
>jcf7180003470556
3 8
>jcf7180003470552
4 9 6
>jcf7180003470546
5 3
>jcf7180003470558
6 2
>jcf7180003470556
7 1
>jcf7180003470550
8 5
>jcf7180003470558
10 4 3
[root@pc1 test1]# cat test.py      ## python程序
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Mark repeated FASTA header names with a running "_N" suffix.

Reads "a.fa" from the current directory and prints every line to stdout.
The first occurrence of a header (a line starting with ">") is printed
unchanged; the N-th occurrence (N >= 2) is printed as "<header>_N".
"""


def mark_duplicates(lines):
    """Yield each stripped line, tagging repeated header lines.

    lines: any iterable of strings (for example an open text file).
    Yields: each line with surrounding whitespace stripped; a header seen
    for the N-th time (N >= 2) has "_N" appended.
    """
    seen = {}  # header text -> number of times it has appeared so far
    for line in lines:
        line = line.strip()
        # startswith() is safe on an empty string, unlike line[0], so blank
        # lines are passed through instead of raising IndexError.
        if line.startswith(">"):
            count = seen.get(line, 0) + 1
            seen[line] = count
            if count > 1:
                line = line + "_" + str(count)
        yield line


if __name__ == "__main__":
    # The context manager closes the file even if printing fails part-way.
    with open("a.fa", "r") as in_file:
        for out_line in mark_duplicates(in_file):
            print(out_line)
[root@pc1 test1]# python test.py      ## 执行程序
>jcf7180003470556
2 7
>jcf7180003470556_2
3 8
>jcf7180003470552
4 9 6
>jcf7180003470546
5 3
>jcf7180003470558
6 2
>jcf7180003470556_3
7 1
>jcf7180003470550
8 5
>jcf7180003470558_2
10 4 3

b、引入 collections 中的 defaultdict

[root@pc1 test1]# ls
a.fa  test.py
[root@pc1 test1]# cat a.fa      ## 测试数据
>jcf7180003470556
2 7
>jcf7180003470556
3 8
>jcf7180003470552
4 9 6
>jcf7180003470546
5 3
>jcf7180003470558
6 2
>jcf7180003470556
7 1
>jcf7180003470550
8 5
>jcf7180003470558
10 4 3
[root@pc1 test1]# cat test.py     ## 程序
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Mark repeated FASTA header names, counting with collections.defaultdict.

Usage: python3 test.py [FASTA_FILE]
FASTA_FILE defaults to "a.fa". Every input line is printed to stdout; a
header (line starting with ">") seen for the N-th time (N >= 2) is printed
as "<header>_N".
"""

import sys
from collections import defaultdict


def mark_duplicates(lines):
    """Yield each stripped line, tagging repeated header lines.

    lines: any iterable of strings (for example an open text file).
    Yields: each line with surrounding whitespace stripped; a header seen
    for the N-th time (N >= 2) has "_N" appended.
    """
    counts = defaultdict(int)  # missing headers start at 0 automatically
    for line in lines:
        line = line.strip()
        if line.startswith(">"):
            counts[line] += 1
            occurrence = counts[line]
            if occurrence > 1:
                line = line + "_" + str(occurrence)
        yield line


if __name__ == "__main__":
    # An optional first argument overrides the default input file.
    fasta_path = sys.argv[1] if len(sys.argv) > 1 else "a.fa"
    # The context manager closes the file even if printing fails part-way.
    with open(fasta_path, "r") as in_file:
        for out_line in mark_duplicates(in_file):
            print(out_line)
[root@pc1 test1]# python3 test.py    ## 执行程序
>jcf7180003470556
2 7
>jcf7180003470556_2
3 8
>jcf7180003470552
4 9 6
>jcf7180003470546
5 3
>jcf7180003470558
6 2
>jcf7180003470556_3
7 1
>jcf7180003470550
8 5
>jcf7180003470558_2
10 4 3