怎么检测自己fastq的Phred类型 | phred33 phred64
http://wiki.bits.vib.be/index.php/Identify_the_Phred_scale_of_quality_scores_used_in_fastQ
1 2 3 4 5 | # S - Sanger Phred+33, raw reads typically (0, 40) # X - Solexa Solexa+64, raw reads typically (-5, 40) # I - Illumina 1.3+ Phred+64, raw reads typically (0, 40) # J - Illumina 1.5+ Phred+64, raw reads typically (3, 40) with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold) # L - Illumina 1.8+ Phred+33, raw reads typically (0, 41) |
用别人的工具会比价靠谱,自己写容易出错,或考虑不周:
BBMap as a little tool for this:
1 | $ testformat.sh in =N0174.fq.gz |
可以这么封装到流程里:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 | fq1=SRR8501263_1.fastq.gz fq2=SRR8501263_2.fastq.gz /home/lizhixin/softwares/bbmap/testformat .sh in =$fq1 | cut -f 1 > fastq. format # phred="" for line in $( cat fastq. format ); do phred=$line; done if [ $phred== "sanger" ] then echo "sanger" else echo "not sanger" fi |
直接用awk命令:
这个对SRA转的fastq无法判断
1 | zcat A14_1.fastq.gz | head -100 | awk '{if(NR%4==0) printf("%s",$0);}' | od -A n -t u1 | awk 'BEGIN{min=100;max=0;}{for(i=1;i<=NF;i++) {if($i>max) max=$i; if($i<min) min=$i;}}END{if(max<=74 && min<59) print "Phred+33"; else if(max>73 && min>=64) print "Phred+64"; else if(min>=59 && min<64 && max>73) print "Solexa+64"; else print "Unknown score encoding!";}' |
一个Perl脚本
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | #!/usr/bin/perl -w # http://wiki.bits.vib.be/index.php/Identify_the_Phred_scale_of_quality_scores_used_in_fastQ use strict; use File::Basename; use List::MoreUtils qw( minmax ) ; # fastq_detect.pl fastq.file sample-size # detect fastQ format from quality scores in fastQ input file # Version 3 # # Stephane Plaisance - VIB-BITS - July-04-2012 # Joachim Jacob - Aug-02-2012 - joachim.jacob@gmail.com # - changed the maximum value of Sanger to 73 # - changed reading the file with a file handle # (was a file handle !! supporting several archive formats. SP) # - changed the diagnosing algoritm # Stephane Plaisance - VIB-BITS - April-08-2013 # - merged both versions and corrected flaw in min/max # thanks to Sergey Mitrfanov for perl reformatting ##################################################################### # diagnose # SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS..................................................... # ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...................... # ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...................... # .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ...................... # LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL.................................................... # !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ # | | | | | | # 33 59 64 73 104 126 # S 0........................26...31.......40 # X -5....0........9.............................40 # I 0........9.............................40 # J 3.....9.............................40 # L 0.2......................26...31........41 # # S - Sanger Phred+33, raw reads typically (0, 40) # X - Solexa Solexa+64, raw reads typically (-5, 40) # I - Illumina 1.3+ Phred+64, raw reads typically (0, 40) # J - Illumina 1.5+ Phred+64, raw reads typically (3, 40) with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold) # L - Illumina 1.8+ Phred+33, raw reads typically (0, 41) ##################################################################### my $script = basename($0); @ARGV gt 0 or die "usage: $script <fastq file> <opt:sample-size (100)>\n" ; my ( $inputfile , $limit ) = @ARGV ; if (! defined $limit ) { $limit = 100}; # check first 100 records my $cnt =0; my ( $min , $max ); # global min and max values # print STDERR "\n## Analysing ".$limit." records from $inputfile ... \n"; my $z = ReadFile ( $inputfile ) || die "Error: cannot read from variant file $inputfile: $!\n" ; ## parse while ( my $id = < $z >) { $id =~ m/^@/ || die "expected @ not found in line 1!\n" ; my $seq = < $z >; my $sep = < $z >; $sep =~ m/^\+/ || die "expected + not found in line 3!\n" ; my $qual = < $z >; chomp ( $qual ); $cnt ++; $cnt >= $limit && last ; # char to ascii my @chars = split ( "" , $qual ); my @nums = sort { $a <=> $b } ( map { unpack ( "C*" , $_ )} @chars ); if ( $cnt ==1) { ( $min , $max ) = minmax @nums ; } else { my ( $lmin , $lmax ) = minmax @nums ; # local values for this read $lmin < $min ? $min = $lmin : $min = $min ; $lmax > $max ? $max = $lmax : $max = $max ; } } undef $z ; ## diagnose my %diag =( 'Sanger' => '.' , 'Solexa' => '.' , 'Illumina 1.3+' => '.' , 'Illumina 1.5+' => '.' , 'Illumina 1.8+' => '.' , ); my %comment =( 'Sanger' => 'Phred+33, Q[33; 73], (0, 40)' , 'Solexa' => 'Solexa+64, Q[59; 104], (-5, 40)' , 'Illumina 1.3+' => 'Phred+64, Q[64; 104], (0, 40)' , 'Illumina 1.5+' => 'Phred+64, Q[66; 104], (3, 40), with 0=N/A, 1=N/A, 2=Read Segment Quality Control Indicator' , 'Illumina 1.8+' => 'Phred+33, Q[33; 74], (0, 41)' , ); if ( $min <33 || $max >104) { die "Quality values corrupt. found [$min; $max] where [33; 104] was expected\n" ; } if ( $min >=33 && $max <=73) { $diag { 'Sanger' }= 'x' ;} if ( $min >=59 && $max <=104) { $diag { 'Solexa' }= 'x' ;} if ( $min >=64 && $max <=104) { $diag { 'Illumina 1.3+' }= 'x' ;} if ( $min >=66 && $max <=104) { $diag { 'Illumina 1.5+' }= 'x' ;} if ( $min >=33 && $max <=74) { $diag { 'Illumina 1.8+' }= 'x' ;} ## report # print STDERR "# sampled raw quality values are in the range of [".$min."; ".$max."]\n"; # print STDERR "# format(s) marked below with 'x' agree with this range\n"; foreach my $format ( sort keys %diag ) { #print STDERR sprintf(" %-13s : %2s [%-30s] \n", $format, $diag{$format}, $comment{$format}); if ( $diag { $format } eq "x" ) { print "$format\n" } } ############## #### Subs #### # reads from uncompressed, gzipped and bgzip fastQ files sub ReadFile { my $infile = shift ; my $FH ; if ( $infile =~ /.bz2$/) { open ( $FH , "bzcat $infile |" ) or die ( "$!: can't open file $infile" ); } elsif ( $infile =~ /.gz$/) { open ( $FH , "zcat $infile |" ) or die ( "$!: can't open file $infile" ); } elsif ( $infile =~ /.fq|.fastq|.txt$/) { open ( $FH , "cat $infile |" ) or die ( "$!: can't open file $infile" ); } else { die ( "$!: do not recognise file type $infile" ); } return $FH ; } |
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· winform 绘制太阳,地球,月球 运作规律
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)
2017-03-27 Pymonitor使用方法和常见问题