#!/usr/bin/python import re import sys import gzip change={'A':'T','T':'A','C':'G','G':'C','N':'N'} CODE = { 'GCA' : 'A', 'GCC' : 'A', 'GCG' : 'A', 'GCT' : 'A', 'TGC' : 'C', 'TGT' : 'C', # Cysteine 'GAC' : 'D', 'GAT' : 'D', # Aspartic Acid 'GAA' : 'E', 'GAG' : 'E', # Glutamic Acid 'TTC' : 'F', 'TTT' : 'F', # Phenylalanine 'GGA' : 'G', 'GGC' : 'G', 'GGG' : 'G', 'GGT' : 'G', # Glycine 'CAC' : 'H', 'CAT' : 'H', # Histidine 'ATA' : 'I', 'ATC' : 'I', 'ATT' : 'I', # Isoleucine 'AAA' : 'K', 'AAG' : 'K', # Lysine 'CTA' : 'L', 'CTC' : 'L', 'CTG' : 'L', 'CTT' : 'L', 'TTA' : 'L', 'TTG' : 'L', # Leucine 'ATG' : 'M', # Methionine 'AAC' : 'N', 'AAT' : 'N', # Asparagine 'CCA' : 'P', 'CCC' : 'P', 'CCG' : 'P', 'CCT' : 'P', # Proline 'CAA' : 'Q', 'CAG' : 'Q', # Glutamine 'CGA' : 'R', 'CGC' : 'R', 'CGG' : 'R', 'CGT' : 'R', 'AGA' : 'R', 'AGG' : 'R', # Arginine 'TCA' : 'S', 'TCC' : 'S', 'TCG' : 'S', 'TCT' : 'S', 'AGC' : 'S', 'AGT' : 'S', # Serine 'ACA' : 'T', 'ACC' : 'T', 'ACG' : 'T', 'ACT' : 'T', # Threonine 'GTA' : 'V', 'GTC' : 'V', 'GTG' : 'V', 'GTT' : 'V', # Valine 'TGG' : 'W', # Tryptophan 'TAC' : 'Y', 'TAT' : 'Y', # Tyrosine 'TAA' : '*', 'TAG' : '*', 'TGA' : '*' # Stop } def readfa(l): col={} arr =[] sca ='' li= gzip.open(l,'rb') for line in li: if '>' in line: arr =[] sca = line.split()[0].lstrip('>') col[sca]=arr else: without = re.sub(r'\n',"",line) arr.append(without) return col def readgff(l): col ={} arr =[] li= gzip.open(l,'rb') for line in li: sp = line.split( ) if sp[2] == 'mRNA': gene = re.match(r'ID=(.*?);',sp[8]).group(1) arr=[] col[gene]=[arr,sp[0],sp[6]] # start=sp[3] elif sp[2] == 'CDS': gene = re.match(r'Parent=(.*?);',sp[8]).group(1) col[gene][0].append([int(sp[3])-1,int(sp[4])]) return col #main### out= gzip.open(sys.argv[3],'wb') gff=readgff(sys.argv[2]) c=gff s='' fa =readfa(sys.argv[1]) for k1,v1 in gff.items(): if v1[1] in fa.keys(): lon=s.join(fa[v1[1]]) short='' for i in v1[0]: short +=lon[i[0]:i[1]] if v1[2] == '-': short=''.join(change[i] for i in short)[::-1] j=0 AA='' while j <= (len(short)-3): sp = short[j:3+j] if 'N' in sp: j=j+3 continue else: AA += CODE[sp] j=j+3 print >>out,">",k1,"\n",AA else: print "no fa"