[笔记]reads_classifier

reads_classifier.pl
  1 #!/usr/bin/perl 
  2 #===============================================================================
  3 #
  4 #         FILE: reads_classifier.pl
  5 #
  6 #        USAGE: ./reads_classifier.pl  
  7 #
  8 #  DESCRIPTION: 
  9 #
 10 #      OPTIONS: ---
 11 # REQUIREMENTS: ---
 12 #         BUGS: ---
 13 #        NOTES: ---
 14 #       AUTHOR: YAN Yun (Puriney), yanyun@whu.edu.cn
 15 #      COMPANY: 
 16 #      VERSION: 1.0
 17 #      CREATED: 2012年07月17日 15时32分48秒
 18 #     REVISION: ---
 19 #===============================================================================
 20 
 21 use strict;
 22 use warnings;
 23 
 24 my ($inbed, $index, $out) = @ARGV; 
 25 die <DATA> unless (@ARGV == 3);
 26 open IN, "$inbed" || die $!;
 27 open OUT , ">$out" || die $!;
 28 while (<IN>){
 29 
 30     chomp;
 31     print OUT "$_";
 32     my ($reads_ref, $reads_s, $reads_e, $reads_str) = (split /\t/,$_)[0,1,2,5];
 33     
 34     my $class = 0; 
 35     $class = &reads_type_classifer($reads_ref, $reads_s,$reads_e, $reads_str, $index);
 36     print OUT "\t$class\n";
 37 }
 38 
 39 
 40 sub reads_type_classifer{
 41     my ($reads_ref, $reads_s,$reads_e, $reads_str, $index) = @_;
 42     #print "ini:$reads_s\t$reads_e\n";
 43     open INDEX, $index;
 44     #index format (1-based cordination):
 45     #accesion_num, CDS/tRNA, strand, rRNA/Gene, Joint/Single_positioin, position_start1, postion_end1, position_start2(if J), position_end2(if J)
 46     
 47     #position hash, f=forward, r=reverse
 48     my (%trnaf,%cdsf,%rrnaf,%trnar,%cdsr,%rrnar);
 49     my ($ref, $cort, $str, $rorg, $js);
 50         
 51     #my $classtag = 0; 
 52     #my $class = "unknow1";
 53     my $go = "Y";
 54     while (<INDEX>){
 55         
 56     chomp;
 57         my @col = split (/\t/,$_);
 58         
 59         #$cort: CDS or tRNA
 60         #$rorg: ribosomal RNA or Gene
 61         #$js : joint or single
 62         ($ref, $cort, $str, $rorg, $js) = (split /\t/,$_)[0,1,2,3,4];
 63         
 64         if ($reads_ref =~ /$ref/){
 65             
 66         my ($s, $e) = (0,0);
 67     
 68             #get forward information
 69             if ($str ==1){
 70             #get (%trnaf,%cdsf,%rrnaf,)
 71                 if ($js eq "J"){
 72                     if ($cort eq "tRNA"){ #tRNA
 73                         #parse joint position
 74                         for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) {
 75                             ($s, $e)=($col[$list], $col[$list+1]);
 76                             $trnaf{$s} = $e;
 77                         }
 78                         next;
 79                     }
 80                     elsif ( $cort eq "CDS"){ #CDS
 81                         #parse join position
 82                         for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) {
 83                             ($s, $e)=($col[$list], $col[$list+1]);
 84                             $cdsf{$s} = $e;
 85                         }
 86                         if ($rorg ne "G"){ #ribosomal RNA
 87                             #parse joint position
 88                             for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) {
 89                             ($s, $e)=($col[$list], $col[$list+1]);
 90                             $rrnaf{$s} = $e;
 91                             }
 92                         }
 93                     }
 94                 }
 95                 elsif ($js eq "S"){
 96                     ($s, $e) = (split /\t/,$_)[5,6];
 97                     
 98                     #CORE CODES
 99                     #CLASSIFIER
100                     if ($cort eq "tRNA"){ #tRNA
101                         $trnaf{$s} = $e;
102                         next;
103                     }
104                     elsif ( $cort eq "CDS"){ #CDS
105                         $cdsf{$s} = $e;
106                             if ($rorg ne "G"){ #ribosomal RNA
107                             $rrnaf{$s} = $e;
108                             next;
109                             }
110                     }
111     
112                 }
113             }
114     
115             #get complementary strand information
116             elsif ($str == -1){
117             #get %trnar,%cdsr,%rrnar
118                 if ($js eq "J"){
119                     if ($cort eq "tRNA"){ #tRNA
120                         #parse joint position
121                         for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) {
122                             ($s, $e)=($col[$list], $col[$list+1]);
123                             $trnar{$s} = $e;
124                         }
125                         next;
126                     }
127                     elsif ( $cort eq "CDS"){ #CDS
128                         #parse join position
129                         for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) {
130                             ($s, $e)=($col[$list], $col[$list+1]);
131                             $cdsr{$s} = $e;
132                         }
133                         if ($rorg ne "G"){ #ribosomal RNA
134                             #parse joint position
135                             for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) {
136                             ($s, $e)=($col[$list], $col[$list+1]);
137                             $rrnar{$s} = $e;
138                             }
139                         }
140                     }
141                 }
142                 elsif ($js eq "S"){
143                     ($s, $e) = (split /\t/,$_)[5,6];
144                     
145                     #CORE CODES
146                     #CLASSIFIER
147                     if ($cort eq "tRNA"){ #tRNA
148                         $trnar{$s} = $e;
149                         next;
150                     }
151                     elsif ( $cort eq "CDS"){ #CDS
152                         $cdsr{$s} = $e;
153                             if ($rorg ne "G"){ #ribosomal RNA
154                             $rrnar{$s} = $e;
155                             next;
156                             }
157                     }
158                 }
159     
160             }
161             
162             }
163         else {
164             $go = "N";
165             print "no kidding me\n";
166             last;
167         }
168     }
169     
170     
171     my $class = "initial";
172     print "$go\n" if ($class eq "initial" and $go eq "N");
173     my $classtag = 0;
174     if ($go ne "N"){
175         # now we have (%trnaf,%cdsf,%rrnaf,%trnar,%cdsr,%rrnar);
176         # 
177         # decide read's class, tRNA, rRNA, CDS, inter genenic region
178         #
179         #my $classtag = 0; 
180         %trnaf = &sorthash(%trnaf);
181         %trnar = &sorthash(%trnar);
182         %cdsf = &sorthash(%cdsf);
183         %cdsr = &sorthash(%cdsr);
184         %trnaf = &sorthash(%trnaf);
185         %trnar = &sorthash(%trnar);
186         if ($reads_str eq "+"){
187             
188             #if tRNA start
189             if ( &withinorout($reads_s, $reads_e, %trnaf) != 0){
190             #tRNA            
191             $classtag = &withinorout($reads_s, $reads_e, %trnaf);
192             #print "tRNA:$reads_s\t$reads_e\n";
193                 if ($classtag == 1){ 
194                     $class = "tRNA";
195                 }
196                 elsif ($classtag == -1) {
197                     if (&withinorout($reads_s, $reads_e,%cdsf) == 1){
198                         $class = "CDS";
199                     }
200                     else {
201                         #$class = "unknow2";
202                         $class = "IGR+1";
203                     }
204                 }
205                 else {
206                     $class = "unknown";
207                 }
208             }
209             #if trna end
210             
211             elsif ( &withinorout($reads_s, $reads_e, %cdsf) != 0){
212             #elsif cds start
213             #CDS
214             $classtag = &withinorout($reads_s, $reads_e, %cdsf);
215             
216                 if ($classtag ==1){
217                     $class = "CDS+";
218                     
219                     #rRNA
220                     if (&withinorout($reads_s, $reads_e,%rrnaf) == 1){
221                         $class = (split /\s+/, $rorg)[0];
222                         $class = "rRNA-".$class;   
223                     }
224                     elsif (&withinorout($reads_s, $reads_e,%rrnaf) == -1){
225                         $class = "unknow3";
226                     }
227     
228                 }
229                 elsif ($classtag == -1 ){
230                     $class = "IGR+2";
231                 }
232                 else {
233                     $class = "unknow+4";
234                 }
235             }
236             #elsif cds end
237             
238             else {
239                 $class = "unkown+5";
240             }
241         }
242     
243         elsif ($reads_str eq "-"){
244             
245             #if tRNA start
246             if (&withinorout($reads_s, $reads_e, %trnar) != 0){
247             #tRNA
248             $classtag = &withinorout($reads_s, $reads_e, %trnar);
249                 if ($classtag == 1){ 
250                     $class = "tRNA";
251                 }
252                 elsif ($classtag == -1) {
253                     if (&withinorout($reads_s, $reads_e,%cdsr) == 1){
254                         $class = "CDS";
255                     }
256                     else {
257                         #$class = "unknow2";
258                         $class = "IGR-1";
259                     }
260                 }
261                 else {
262                     $class = "unknown";
263                 }
264             }
265             #if tRNA end
266             
267             
268             #if cds start
269             elsif (&withinorout($reads_s, $reads_e, %cdsr) != 0){
270                 #print "cds:$reads_s\t$reads_e\n";
271                 #print &withinorout($reads_s, $reads_e, %cdsr) ,"\n";
272 =head
273                 foreach my $key (sort {$a <=> $b} keys %cdsr){
274                     print "$key\n";
275                 }
276 =cut
277             #CDS
278             $classtag = &withinorout($reads_s, $reads_e, %cdsr);
279                 if ($classtag == 1){
280                     $class = "CDS-";
281                     
282                     #rRNA
283                     if (&withinorout($reads_s, $reads_e,%rrnar) == 1){
284                         $class = (split /\s+/, $rorg)[0];
285                         $class = "rRNA-".$class;   
286                     }
287                     elsif (&withinorout($reads_s, $reads_e,%rrnar) == -1){
288                         $class = "unknow3";
289                     }
290     
291                 }
292                 elsif ($classtag == -1 ){
293                     $class = "IGR-2";
294                 }
295                 else {
296                     $class = "unknow4";
297                 }
298             }
299             #if cds end
300         
301         
302             else {
303                 $class = "unkown-5";
304             }
305         }
306     }
307     
308     elsif ($go eq "N"){
309         print "no kidding me\n";
310         die ;
311     }
312     else {
313         $class = "unkown6"
314     }
315     
316 $class;
317 }
318 
319 
320 sub withinorout{
321     my $tag = 0; 
322     my ($s, $e, %hash) = @_;
323     #s = reads start, e = reads end
324     # bed format reads, 0-based cordination
325     $s = $s + 1;
326     $e = $e + 1;
327     
328     #rs = reference start, re = reference end
329     # reference genebank, 1-based cordination
330     my ($rs,$re);
331     my $num = (keys %hash);
332     
333     for (my $step = 1;$step<=$num;$step++){
334         
335         ($rs, $re) = each %hash;
336 =head
337         if ($step = $num){
338                 $tag = -1 if ($re < $s);
339                 last;
340         }
341 =cut
342         #last unless ($rs && $re);
343 #        elsif ($step <$num){
344             if ($rs<=$s and $e<=$re){
345                 #print "$rs\t$s\t$e\t$re\n";
346                     #within
347                 
348                     $tag = 1;    
349                     last;
350                 
351             }
352             else {
353                 
354                 #without
355                 
356                 $tag = -1;
357                 my $tmp_e = $re;
358                 ;
359             }
360 #        }
361 #        else {
362 #            next;
363 #        }
364 
365     }
366     $tag;
367 }
368 
369 sub sorthash{
370     my %hash = @_;
371     my %sorthash;
372     foreach my $key (sort {$a<=>$b} keys %hash){
373         $sorthash{$key} = $hash{$key};
374     }    
375     %sorthash;
376 }
377 
378 
379 __DATA__
380 $0 <in.bed> <in.homemade.index> <out.plus.bed>

 

小题大做什么的

posted @ 2012-07-19 19:21  Puriney  阅读(199)  评论(0编辑  收藏  举报