六框翻译

 

六框翻译-生信人必练的200个数据处理任务-生信技能树  http://www.biotrainee.com/thread-1444-1-1.html(出处: 生信技能树)
密码子是按3个碱基翻译的,
所以从第一位开始翻译会得到一个氨基酸序列,
从第二位翻译会得到一个不同的氨基酸序列
从第三位开始又会得到一个不同的序列。
从第四位开始翻译,则会和从第一位开始翻译落在同一个阅读框里(密码子的划分完全相同,只是少了第一个氨基酸),
所以相对于单链,会有3种翻译的方式,
同样的情况在互补链上也会有3种,所以就一共有六种翻译方式。读出六条序列,即6框翻译。
关于真核生物翻译的起始:
首先是核糖体40S小亚基以及一些真核翻译起始因子结合mRNA的5'帽结构形成复合体,之后沿mRNA向3'端(下游)滑动扫描,寻找AUG。翻译起始还受AUG附近序列的控制,比如Kozak序列:CC(A/G)CCAUGG。如果找到了AUG,而且附近的序列又比较适合起始,那么翻译就从这里开始。
use strict;
use warnings;

# Six-frame translation of a DNA sequence given as the first command-line
# argument.
#
# Usage:  perl six_frame.pl ATGGCCTAA
#
# Prints six lines, "+1", "+2", "+3" for the given strand and "-1", "-2",
# "-3" for its reverse complement, each followed by a tab and the protein.

# Standard genetic code: upper-case DNA codon => one-letter amino acid.
# '*' marks a stop codon.
my %codon = (
    TCA => 'S', TCC => 'S', TCG => 'S', TCT => 'S',    # Serine
    TTC => 'F', TTT => 'F',                            # Phenylalanine
    TTA => 'L', TTG => 'L',                            # Leucine
    TAC => 'Y', TAT => 'Y',                            # Tyrosine
    TAA => '*', TAG => '*',                            # Stop
    TGC => 'C', TGT => 'C',                            # Cysteine
    TGA => '*',                                        # Stop
    TGG => 'W',                                        # Tryptophan
    CTA => 'L', CTC => 'L', CTG => 'L', CTT => 'L',    # Leucine
    CCA => 'P', CCC => 'P', CCG => 'P', CCT => 'P',    # Proline
    CAC => 'H', CAT => 'H',                            # Histidine
    CAA => 'Q', CAG => 'Q',                            # Glutamine
    CGA => 'R', CGC => 'R', CGG => 'R', CGT => 'R',    # Arginine
    ATA => 'I', ATC => 'I', ATT => 'I',                # Isoleucine
    ATG => 'M',                                        # Methionine
    ACA => 'T', ACC => 'T', ACG => 'T', ACT => 'T',    # Threonine
    AAC => 'N', AAT => 'N',                            # Asparagine
    AAA => 'K', AAG => 'K',                            # Lysine
    AGC => 'S', AGT => 'S',                            # Serine
    AGA => 'R', AGG => 'R',                            # Arginine
    GTA => 'V', GTC => 'V', GTG => 'V', GTT => 'V',    # Valine
    GCA => 'A', GCC => 'A', GCG => 'A', GCT => 'A',    # Alanine
    GAC => 'D', GAT => 'D',                            # Aspartic Acid
    GAA => 'E', GAG => 'E',                            # Glutamic Acid
    GGA => 'G', GGC => 'G', GGG => 'G', GGT => 'G',    # Glycine
);

# translate_from_offset($seq, $offset)
# Translate $seq three bases at a time, starting at the 0-based $offset.
# Trailing bases that do not fill a whole codon are ignored.
# Returns the protein as a string of one-letter codes.
sub translate_from_offset {
    my ($seq, $offset) = @_;
    my $protein = '';
    for (my $i = $offset; $i < length($seq) - 2; $i += 3) {
        my $residue = $codon{ substr($seq, $i, 3) };

        # A codon that is not in the table (for example one containing N)
        # becomes 'X' instead of vanishing, so later residues keep their
        # positions.
        $protein .= defined $residue ? $residue : 'X';
    }
    return $protein;
}

# reverse_complement($seq)
# Return $seq reversed with A<->T and C<->G swapped (case preserved).
sub reverse_complement {
    my ($seq) = @_;
    my $revcom = reverse $seq;
    $revcom =~ tr/ACGTacgt/TGCAtgca/;
    return $revcom;
}

# six_frame_lines($dna)
# Return the six output lines ("+1\t...\n" through "-3\t...\n") for $dna.
# The sequence is upper-cased first because the codon table only has
# upper-case keys; an undefined $dna is treated as an empty sequence.
sub six_frame_lines {
    my ($dna) = @_;
    $dna = defined $dna ? uc $dna : '';

    my @lines;
    for my $strand ([ '+', $dna ], [ '-', reverse_complement($dna) ]) {
        my ($sign, $seq) = @{$strand};
        for my $offset (0 .. 2) {
            push @lines,
                $sign . ($offset + 1) . "\t"
                . translate_from_offset($seq, $offset) . "\n";
        }
    }
    return @lines;
}

print six_frame_lines(shift @ARGV);

 

注:还可以参照下面的代码

 

  1 use strict;  
  2 use warnings;  
  3   
  4   
  5 my $dna      ='';  
  6 my $protein  ='';  
  7 my @file_data=( );  
  8 my @filedata;
  9 my $revcom='';
 10  
 11  
 12 #打开文件 
 13 @filedata  = get_file_data();
 14 #得到序列
 15 $dna       = extract_sequence_from_fasta_data(@filedata);  
 16  
 17 #六框阅读翻译
 18  
 19 print "\n---------------------Reading Frame 1-----------------\n";
 20 $protein=translate_frame($dna,1);
 21 print_sequence($protein,70);
 22  
 23 print "\n---------------------Reading Frame 2-----------------\n";
 24 $protein=translate_frame($dna,2);
 25 print_sequence($protein,70);
 26  
 27 print "\n---------------------Reading Frame 3-----------------\n";
 28 $protein=translate_frame($dna,3);
 29 print_sequence($protein,70);
 30  
 31 print "\n---------------------Reading Frame 4-----------------\n";
 32 $protein=translate_frame($dna,4);
 33 print_sequence($protein,70);
 34  
 35 print "\n---------------------Reading Frame 5-----------------\n";
 36 $protein=translate_frame($dna,5);
 37 print_sequence($protein,70);
 38  
 39 print "\n---------------------Reading Frame 6-----------------\n";
 40 $protein=translate_frame($dna,6);
 41 print_sequence($protein,70);
 42  
 43 sub get_file_data
 44 {  
 45     # A subroutine to get data from a file given its filename
 46     #读取文件的子序列
 47     my $dna_filename;
 48     my @filedata;
 49     print "please input the Path just like this f:\\\\perl\\\\data.txt\n";   
 50     chomp($dna_filename=<STDIN>); 
 51     open(DNAFILENAME,$dna_filename)||die("can not open the file!");    
 52     @filedata     = <DNAFILENAME>;  
 53     close DNAFILENAME;  
 54     return @filedata;#子函数的返回值一定要记住写
 55 }
 56  
 57 sub extract_sequence_from_fasta_data  
 58 {  
 59     #*******************************************************************  
 60     # A subroutine to extract FASTA sequence data from an array  
 61     # 得到其中的序列  
 62     # fasta格式介绍:  
 63     # 包括三个部分  
 64     # 1.第一行中以>开头的注释行,后面是名称和序列的来源  
 65     # 2.标准单字母符号的序列  
 66     # 3.*表示结尾  
 67     #*******************************************************************  
 68   
 69     my (@fasta_file_data) =@_;  
 70     my $sequence =' ';  
 71     foreach my $line (@fasta_file_data)  
 72     {  
 73         #这里忽略空白行  
 74         if ($line=~/^\s*$/)  
 75         {  
 76             next;  
 77         }  
 78         #忽略注释行  
 79         elsif($line=~/^\s*#/)  
 80         {  
 81             next;  
 82         }  
 83         #忽略fasta的第一行  
 84         elsif($line=~/^>/)  
 85         {  
 86             next;  
 87         }  
 88         else  
 89         {  
 90             $sequence .=$line;  
 91         }  
 92     }  
 93     $sequence=~s/\s//g;  
 94     return $sequence;  
 95 }  
 96   
 97 sub print_sequence  
 98 {  
 99     # A subroutine to format and print sequence data  
100     my ($sequence, $length) = @_;  
101     for (my $pos =0; $pos<length($sequence);$pos+=$length)  
102     {  
103         print substr($sequence,$pos,$length),"\n";  
104     }  
105 }  
106   
107      
108   
# codon2aa($codon)
# Translate one three-letter DNA codon into its one-letter amino acid code
# by hash lookup. The codon is upper-cased first, so input case does not
# matter. Stop codons translate to '_'.
#
# Dies with 'Bad codon "..."!!' when the codon is not in the table. Unlike
# a bare exit, this gives the program a non-zero exit status and lets a
# caller trap the error with eval.
#
# The table lives in a BEGIN block so it is built once, at compile time,
# and is already filled even when codon2aa is called by code that appears
# earlier in the file than this definition.
BEGIN {
    my %genetic_code = (
        TCA => 'S', TCC => 'S', TCG => 'S', TCT => 'S',    # Serine
        TTC => 'F', TTT => 'F',                            # Phenylalanine
        TTA => 'L', TTG => 'L',                            # Leucine
        TAC => 'Y', TAT => 'Y',                            # Tyrosine
        TAA => '_', TAG => '_',                            # Stop
        TGC => 'C', TGT => 'C',                            # Cysteine
        TGA => '_',                                        # Stop
        TGG => 'W',                                        # Tryptophan
        CTA => 'L', CTC => 'L', CTG => 'L', CTT => 'L',    # Leucine
        CCA => 'P', CCC => 'P', CCG => 'P', CCT => 'P',    # Proline
        CAC => 'H', CAT => 'H',                            # Histidine
        CAA => 'Q', CAG => 'Q',                            # Glutamine
        CGA => 'R', CGC => 'R', CGG => 'R', CGT => 'R',    # Arginine
        ATA => 'I', ATC => 'I', ATT => 'I',                # Isoleucine
        ATG => 'M',                                        # Methionine
        ACA => 'T', ACC => 'T', ACG => 'T', ACT => 'T',    # Threonine
        AAC => 'N', AAT => 'N',                            # Asparagine
        AAA => 'K', AAG => 'K',                            # Lysine
        AGC => 'S', AGT => 'S',                            # Serine
        AGA => 'R', AGG => 'R',                            # Arginine
        GTA => 'V', GTC => 'V', GTG => 'V', GTT => 'V',    # Valine
        GCA => 'A', GCC => 'A', GCG => 'A', GCT => 'A',    # Alanine
        GAC => 'D', GAT => 'D',                            # Aspartic Acid
        GAA => 'E', GAG => 'E',                            # Glutamic Acid
        GGA => 'G', GGC => 'G', GGG => 'G', GGT => 'G',    # Glycine
    );

    sub codon2aa {
        my ($codon) = @_;

        # uc = upper case; the table keys are all upper case.
        $codon = defined $codon ? uc $codon : '';

        return $genetic_code{$codon} if exists $genetic_code{$codon};

        die "Bad codon \"$codon\"!!\n";
    }
}
204   
# dna2peptide($dna)
# Translate a DNA string into a peptide by passing each consecutive
# three-base codon, starting at the first base, to codon2aa.
# Trailing bases that do not fill a whole codon are ignored.
# Returns the peptide string ('' for an empty or undefined sequence).
sub dna2peptide {
    my ($dna) = @_;
    $dna = '' unless defined $dna;

    my $protein     = '';
    my $codon_count = int(length($dna) / 3);
    for my $index (0 .. $codon_count - 1) {
        $protein .= codon2aa(substr($dna, 3 * $index, 3));
    }

    # The explicit return matters: without it callers get no peptide back.
    return $protein;
}
215  
# translate_frame($seq, $start, $end)
# Translate the part of $seq between the 1-based positions $start and $end
# (both inclusive) with dna2peptide.
#   $start  defaults to 1 when omitted or 0
#   $end    defaults to the length of $seq when omitted or 0
# Returns '' when the requested range is empty or starts beyond the end of
# the sequence, instead of handing substr an out-of-range position.
sub translate_frame {
    my ($seq, $start, $end) = @_;
    $seq = '' unless defined $seq;

    $start = 1 unless $start;
    $end = length($seq) unless $end;

    return '' if $start > length($seq) || $end < $start;

    return dna2peptide(substr($seq, $start - 1, $end - $start + 1));
}

 

posted @ 2018-08-14 17:49  忆昔烟雨情  阅读(2117)  评论(0编辑  收藏  举报