Extract sequences from FASTA file based on a pair of list

import os
import sys
from Bio import SeqIO

USAGE = "\nusage: python extract_seq.py pair_list protein.fas genome.fas\n"

if len(sys.argv) < 3:
    print USAGE
    sys.exit()

# pairs dict
accids = open(sys.argv[1], 'r') # 读取 pair of list 文件

# 以 pair of list 文件的第一列为 keys,第二列为 values,创建字典
ACC_DICT = {}
for line in accids:
    line = line.rstrip()
    pdID, ptID = line.split(' ')
    ACC_DICT[pdID] = ptID

# id vs seq dict
pd_infa = SeqIO.parse(open(sys.argv[2]), 'fasta')
pt_infa = SeqIO.parse(open(sys.argv[3]), 'fasta')

# 创建以 FASTA 文件的 ID 为 keys, sequence 为 values 的字典
SEQ_DICT = {}
for rec in pd_infa:
    SEQ_DICT[rec.id] = str(rec.seq)

for rec in pt_infa:
    SEQ_DICT[rec.id] = str(rec.seq)

# extracting, pair sequences per directory
for pd, pt in ACC_DICT.items():
    os.mkdir(pd)
    pd_outfa = open(pd + "/" + pd + ".fa", 'w')
    pt_outfa = open(pd + "/" + pt + ".fa", 'w')
    pd_outfa.write(">" + pd + "\n" + SEQ_DICT[pd] + "\n")
    pt_outfa.write(">" + pt + "\n" + SEQ_DICT[pt] + "\n")
    pd_outfa.close()
    pt_outfa.close()

accids.close()
pd_infa.close()
pt_infa.close()

input list pairs

head pd_pt.list # 这个文件为 tblastn 输出的最优结果
TRINITY_DN100009_c0_g1_i1|m.181585 Ptaeda_C6522853
TRINITY_DN100016_c0_g1_i1|m.181572 Ptaeda_tscaffold95
TRINITY_DN100056_c0_g1_i1|m.181582 Ptaeda_scaffold110831
TRINITY_DN10006_c1_g1_i1|m.4279 Ptaeda_tscaffold4204
TRINITY_DN100074_c0_g1_i1|m.181586 Ptaeda_tscaffold2205
TRINITY_DN100077_c0_g1_i1|m.181566 Ptaeda_scaffold152599.2
TRINITY_DN100096_c0_g1_i1|m.181570 Ptaeda_tscaffold4539
TRINITY_DN100098_c0_g1_i1|m.181563 Ptaeda_scaffold888278.2
TRINITY_DN100108_c0_g1_i1|m.181648 Ptaeda_C31542994
TRINITY_DN100114_c0_g1_i1|m.181649 Ptaeda_tscaffold6504

input fasta

pd_proteins.fa
Ptaeda.fas

run

python extract_seq.py pd_pt.list pd_proteins.fa Ptaeda.fas

ls TRINITY_DN100077_c0_g1_i1\|m.181566/
Ptaeda_scaffold152599.2.fa  TRINITY_DN100077_c0_g1_i1|m.181566.fa

该脚本运行结果作为 Genewise 的输入文件

posted @ 2017-02-05 14:09  liuhui_pine  阅读(261)  评论(0编辑  收藏  举报