shell和perl对区间交集处理(坐标区间有重叠,则合并之并取其最小的起始坐标和最大的终止坐标)
-----------------------------------------------------------
处理数据:
refseq1 8 13 + NM_152486 SAMD11
refseq1 6 11 + NM_152486 SAMD11
refseq1 14 16 + NM_152486 SAMD11
refseq1 6 12 - NM_021170 HES4
refseq1 8 10 - NM_021170 HES4
refseq2 9 11 - NM_001002919 FAM150B
refseq2 3 8 - NM_001002919 FAM150B
--------------------------------------------------------
期望得到的结果是:
refseq1 6 13 + NM_152486_E1 SAMD11
refseq1 14 16 + NM_152486_E2 SAMD11
refseq1 6 12 - NM_021170_E1 HES4
refseq2 9 11 - NM_001002919_E1 FAM150B
refseq2 3 8 - NM_001002919_E2 FAM150B
------------------------------------------------------
1 ---------------------------------------------------------------------------------------- 2 #!/usr/bin/perl 3 my %R; 4 while () { 5 my @s = split; 6 push @{ $R{ $s[4] }{R} }, [ $s[1], $s[2] ]; 7 $R{ $s[4] }{ $s[1] } = [ @s[ 0, 3, 4, 5 ] ]; 8 } 9 10 while ( my ( $k, $v ) = each %R ) { 11 my @tmp = sort { $a->[0] <=> $b->[0] } @{ $v->{R} }; 12 my ( $E, @R ) = ( 1, shift @tmp ); 13 for my $t (@tmp) { 14 $t->[0] > $R[-1][1] && push @R, $t and next; 15 $t->[1] > $R[-1][1] && ( $R[-1][1] = $t->[1] ) 16 } 17 for my $R (@R) { 18 my ( $A, $B, $C, $D ) = @{ $v->{ $R->[0] } }; 19 print join( "\t", $A, @$R, $B, "${C}_E" . $E++, $D ) . $/; 20 } 21 } 22 -------------------------------------------------------------------------------
1 ------------------------------------------------------------------------------- 2 use strict; 3 use warnings; 4 5 my @name_array ; 6 my $hash; 7 while(){ 8 #get serial from each line 9 my ($seq,$min,$max, $sign, $serial, $name) = split(/\s+/); 10 11 #in Python ,use in for convince 12 unless (grep $_->[2] eq $serial ,@name_array){ 13 push @name_array, [$seq,$sign,$serial,$name]; 14 } ; 15 16 ($min,$max) = ($max,$min) if $min > $max ; 17 18 if($hash->{$serial}){ 19 sort_matrix($min,$max,$hash->{$serial}); 20 } 21 else{ 22 push @{$hash->{$serial}},[$min,$max]; 23 } 24 25 } 26 ###print sort result 27 for my $item( @name_array ){ 28 my $serial = $item->[2]; 29 for (0..$#{$hash->{$serial}}){ 30 print $item->[0]," ",$hash->{$serial}->[$_][0]," ",$hash->{$serial}->[$_][1]," ",$item->[1]," ",$serial,"_E",($_+1)," ",$item->[3],"\n"; 31 } 32 } 33 34 sub sort_matrix{ 35 my ($cur_min,$cur_max,$matrix) = @_; 36 # record index 37 my @splice_index; 38 for (0..$#$matrix){ 39 my $item_min = $matrix->[$_]->[0]; 40 my $item_max = $matrix->[$_]->[1]; 41 next if $cur_max < $item_min or $cur_min > $item_max; 42 return if $cur_min >= $item_min and $cur_max <= $item_max; 43 $cur_min = $cur_min < $item_min ? $cur_min : $item_min; 44 $cur_max = $cur_max < $item_max ? $item_max : $cur_max; 45 push @splice_index, $_; 46 } 47 push @$matrix,[$cur_min,$cur_max]; 48 #print "@splice_index\n"; 49 50 for (0..$#splice_index){ 51 splice(@$matrix,$splice_index[$_]-$_,1);##This is trick 52 } 53 #print $cur_min,":",$cur_max,"\n"; 54 }
1 首先sort -k5 -k2 file ,使$2(坐标起点)升序排列,由于$2<$3(起点<终点)这个是恒成立的。所以按照我的这个思路第N+1行和第N行比较,然后合并。 2 # awk -v OFS="\t" 'NR==1{tag=$5;start=$2;end=$3;a=$6;b=$4;c=$1}NR>1{ 3 4 if($5==tag){ 5 if($2end) 6 7 end=$3 8 9 else if($3 10 11 else{ 12 13 print ">"tag"_"++count,a,c,b,start,end; 14 15 start=$2;end=$3;tag=$5;a=$6;b=$4;c=$1 16 17 } 18 19 20 } 21 22 23 else { 24 print ">"tag"_"++count,a,c,b,start,end; 25 26 tag=$5;start=$2;end=$3;a=$6;b=$4;c=$1 27 count=0 28 29 } 30 31 }END{print ">"tag"_"++count,a,c,b,start,end}' <<<"refseq2 279363 280352 - NM_001002919 FAM150B 32 refseq2 282911 283375 - NM_001002919 FAM150B 33 refseq2 285923 286403 - NM_001002919 FAM150B 34 refseq2 286090 286543 - NM_001002919 FAM150B 35 refseq2 287383 288092 - NM_001002919 FAM150B 36 refseq2 287813 288508 - NM_001002919 FAM150B 37 refseq1 934142 935012 - NM_021170 HES4 38 refseq1 934706 935193 - NM_021170 HES4 39 refseq1 934872 935367 - NM_021170 HES4 40 refseq1 935046 935752 - NM_021170 HES4 41 refseq1 860921 861380 + NM_152486 SAMD11 42 refseq1 861102 861593 + NM_152486 SAMD11 43 refseq1 865335 865916 + NM_152486 SAMD11 44 refseq1 866219 866669 + NM_152486 SAMD11 45 refseq1 870952 871476 + NM_152486 SAMD11 46 refseq1 874220 874709 + NM_152486 SAMD11 47 refseq1 874455 875040 + NM_152486 SAMD11 48 refseq1 879088 880161 + NM_152486 SAMD11" 49 -------------------------------------------------------------------------- 50 # cat file1 51 A 10 30 52 A 20 40 53 A 50 70 54 A1 38 168 55 56 # awk '{k=sprintf("%s_0d",$1,$2);for(n=0;n++<3;)a[k,n]=$n;c[k]}END{$1="";t=asorti(c,s);for(n=0;n++=a[N,2]){a[N,2]=$2;if($3>a[N,3])a[N,3]=$3}else if($1)print $1,$2,$3;for(m=0;m++<3;)$m=a[N,m]}}' file1 57 A1 38 168 58 A 10 40 59 A 50 70