一些脚本
1. 统计fasta文件序列条数(Shell)
grep -c ">" x.fasta
2. fastq转换成fasta格式(Shell)
gzip -dc BGIseq500.fq.gz | awk 'NR%4==1{printf ">%s\n",substr($0,2)} NR%4==2{print}' > BGIseq500.fa
3. 统计read数目和read长度(Shell)
readNum=`gzip -dc BGIseq500_1.fq.gz | wc -l | awk '{print $1/4}'` echo read number:$readNum readLen=`less newBGIseq500_1.fq.gz | head -2 | awk -F"\n" '{if($1 !~ /^@/) { print length($1)}}'` echo read length:$readLen totalBase=`echo "$readNum $readLen" | awk '{ print $1*$2*2} '` echo total base number:$totalBase
4. coverage乘数概率问题(R)
totalBp <- 1599999*100*2 print(totalBp) deepth <- totalBp/6000000 print(deepth) ppois(40, deepth, lower=FALSE) qpois(0.99, deepth, lower=FALSE)
5. SOAPdenovo组装
config文件
# config文件 #maximal read length max_rd_len=100 [LIB] #average insert size avg_ins=450 #if sequence needs to be reversed reverse_seq=0 #in which part(s) the reads are used asm_flags=3 #use only first 100 bps of each read rd_len_cutoff=100 #in which order the reads are used while scaffolding rank=1 # cutoff of pair number for a reliable connection (at least 3 for short insert size) pair_num_cutoff=3 #minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) map_len=32 #a pair of fastq file, read 1 file should always be followed by read 2 file q1=newBGIseq500_1.fq.gz q2=newBGIseq500_2.fq.gz
SOAPdenovo命令(Shell)
SOAPdenovo-63mer all -s my.cfg -K 35 -D 1 -o new 1>new.log 2>new.err
6. 统计fasta文件中每一条序列的长度(Perl)
use strict; my $num = 0; my @lengthList = (); my $len = undef; open(OUT, ">scanffoldLength.list") or die ("can't open file!\n"); while(<>) { chomp; if (/^>/) { if ($len == undef) { $len = 0; next; } else { $lengthList[$num] = $len; $num += 1; $len = 0; } } else { $len += length($_); } } $lengthList[$num] = $len; $num += 1; print "num: $num\n"; print "num2: ", scalar @lengthList, "\n"; foreach (@lengthList) { print OUT "$_\n"; } close(OUT);
7. 按数字从大到小排序(Shell)
sort -r -g scanffoldLength.list > scanffoldLength.sort.list
8. 统计scanffold序列累积长度,计算N50,并画图(R)
lengthList <- read.table("scanffoldLength.sort.list") lengthList <- lengthList[,1] head(lengthList) length(lengthList) accLen <- numeric(length(lengthList)) totLen <- lengthList[1]; accLen[1] <- lengthList[1]; for (i in (2:length(lengthList))) { accLen[i] <- (accLen[i-1] + lengthList[i]) totLen <- totLen + lengthList[i]; } head(accLen) length(accLen) totLen N50 <- 0 index <- 0 accLen[length(lengthList)] for (i in (1:length(lengthList))) { if (accLen[i] > totLen/2) { N50 <- lengthList[i] index <- i break } } N50 pdf("length.pdf") plot(x=(1:length(lengthList)), y=accLen, type="l", xlab="l", ylab="accLen") print(index) #lines(c(index,index), c(0,accLen[index]), col="blue") #lines(c(0,index), c(accLen[index],accLen[index]), col="blue") text <- paste("(", as.character(index), "," , as.character(N50), ")", sep = "") points(index, accLen[index], col="red") text(x=index+200, y=accLen[index], labels=text, col="red") dev.off()
9. vcftools的一些使用(Shell)
# get Qual ./vcftools --gzvcf chr17.vcf.gz --site-quality --out Qual # get interval ./vcftools --gzvcf chr17.vcf.gz --chr chr17 --from-bp 7661779 --to-bp 7687550 --remove-indels --out TP53 --recode
10. 画数据分布Hist图(R)
data <- read.table("Qual.lqual", header=TRUE) pdf("Qual.pdf") hist(data[,3], breaks=100 , main="Qual Hist", freq=F) dev.off()
11. 统计vcf文件中样本的genotype(Perl)
use strict; my @samples = (); my %statTable = (); while (<>) { chomp; next if(/^##/); if(/^#CHROM/) { my @labels = split; for my $index (9..@labels-1) { push(@samples, $labels[$index]); } #print "@samples\n"; for (0..@samples-1) { my %subTab = (); $statTable{$samples[$_]}{"0/0"} = 0; $statTable{$samples[$_]}{"0/1"} = 0; $statTable{$samples[$_]}{"1/1"} = 0; } #print %statTable; next; } my @vcfData = split; for my $index (9..@vcfData-1) { my @values = split /:/, $vcfData[$index]; if($values[0] eq "0/0") { $statTable{$samples[$index-9]}{"0/0"} += 1; } elsif($values[0] eq "0/1") { $statTable{$samples[$index-9]}{"0/1"} += 1; } elsif($values[0] eq "1/1") { $statTable{$samples[$index-9]}{"1/1"} += 1; } else { die("error~"); } } #print "\n"; } foreach my $sampleName (keys %statTable) { print "$sampleName:\n"; foreach my $gt (keys $statTable{$sampleName}) { print "$gt\t$statTable{$sampleName}{$gt}\n"; } }