Extract sequences from FASTA file based on a pair of list
import os
import sys
from Bio import SeqIO
USAGE = "\nusage: python extract_seq.py pair_list protein.fas genome.fas\n"
if len(sys.argv) < 3:
print USAGE
sys.exit()
# pairs dict
accids = open(sys.argv[1], 'r') # 读取 pair of list 文件
# 以 pair of list 文件的第一列为 keys,第二列为 values,创建字典
ACC_DICT = {}
for line in accids:
line = line.rstrip()
pdID, ptID = line.split(' ')
ACC_DICT[pdID] = ptID
# id vs seq dict
pd_infa = SeqIO.parse(open(sys.argv[2]), 'fasta')
pt_infa = SeqIO.parse(open(sys.argv[3]), 'fasta')
# 创建以 FASTA 文件的 ID 为 keys, sequence 为 values 的字典
SEQ_DICT = {}
for rec in pd_infa:
SEQ_DICT[rec.id] = str(rec.seq)
for rec in pt_infa:
SEQ_DICT[rec.id] = str(rec.seq)
# extracting, pair sequences per directory
for pd, pt in ACC_DICT.items():
os.mkdir(pd)
pd_outfa = open(pd + "/" + pd + ".fa", 'w')
pt_outfa = open(pd + "/" + pt + ".fa", 'w')
pd_outfa.write(">" + pd + "\n" + SEQ_DICT[pd] + "\n")
pt_outfa.write(">" + pt + "\n" + SEQ_DICT[pt] + "\n")
pd_outfa.close()
pt_outfa.close()
accids.close()
pd_infa.close()
pt_infa.close()
input list pairs
head pd_pt.list # 这个文件为 tblastn 输出的最优结果
TRINITY_DN100009_c0_g1_i1|m.181585 Ptaeda_C6522853
TRINITY_DN100016_c0_g1_i1|m.181572 Ptaeda_tscaffold95
TRINITY_DN100056_c0_g1_i1|m.181582 Ptaeda_scaffold110831
TRINITY_DN10006_c1_g1_i1|m.4279 Ptaeda_tscaffold4204
TRINITY_DN100074_c0_g1_i1|m.181586 Ptaeda_tscaffold2205
TRINITY_DN100077_c0_g1_i1|m.181566 Ptaeda_scaffold152599.2
TRINITY_DN100096_c0_g1_i1|m.181570 Ptaeda_tscaffold4539
TRINITY_DN100098_c0_g1_i1|m.181563 Ptaeda_scaffold888278.2
TRINITY_DN100108_c0_g1_i1|m.181648 Ptaeda_C31542994
TRINITY_DN100114_c0_g1_i1|m.181649 Ptaeda_tscaffold6504
input fasta
pd_proteins.fa
Ptaeda.fas
run
python extract_seq.py pd_pt.list pd_proteins.fa Ptaeda.fas
ls TRINITY_DN100077_c0_g1_i1\|m.181566/
Ptaeda_scaffold152599.2.fa TRINITY_DN100077_c0_g1_i1|m.181566.fa
该脚本运行结果作为 Genewise
的输入文件