肾病质控下机数据的脚本
library("tidyverse") library(ggplot2) library(ggpubr) data <- read_tsv("./rawDataStat20220215141517.txt") #fq qc dataPlot<-data %>% select("TotalBases") %>% mutate(x = 1:length(data$sampleId)) dataPlot$RawReads <- as.numeric(dataPlot$TotalBases )/1000000000 rawdata<-ggplot(dataPlot,aes(x=`RawReads`)) + geom_histogram(bins = 30,fill="steelblue")+ geom_vline(xintercept = 134, color="blue",linetype="dashed",size=1)+ geom_vline(xintercept = 90, color="red",linetype="dashed",size=1)+ # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+ scale_y_continuous(expand = c(0, 0),limits = c(0, 2200),breaks=seq(0, 2000, 500))+ scale_x_continuous(breaks = c(0,90,130,170,210,250), labels = c(0,90,130,170,210,250),limits = c(30,260)) + labs(x="Raw data(G)",y = "Number of Samples",title="Raw data")+ theme_bw()+ theme( panel.grid = element_blank(), axis.text=(element_text(size=11)), axis.title = (element_text(size=11)), plot.title = element_text(hjust = 0.5) )+ annotate('text', x=134, y=2006, label='Mean of raw data:134G',size=4) rawdata ggsave("rawread.hist.png", width = 6, height = 5, plot = rawdata) #non SOAPnuke filed, get the raw filed data$`CG%`=(data$`%GC_1`+data$`%GC_2`)/2 data$`Q20%`=(data$`%Q20_1`+data$`%Q20_2`)/2 data$`Q30%`=(data$`%Q30_1`+data$`%Q30_2`)/2 #q20 dataPlot<-data %>% select("Q20%") %>% mutate(x = 1:length(data$sampleId))%>%filter(`Q20%` >=90) #dataPlot$Q20_clean = as.numeric(dataPlot$Q20_clean)*100 q20<-ggplot(dataPlot,aes(x=`Q20%`)) + geom_histogram(bins=30,fill="steelblue")+ geom_vline(xintercept = 96.98, color="blue",linetype="dashed",size=1)+ geom_vline(xintercept = 90, color="red",linetype="dashed",size=1)+ # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+ scale_y_continuous(expand = c(0, 0),limits = c(0,2300 ),breaks=seq(0, 2000, 500))+ #scale_x_continuous(limits = c(90, 100),breaks=seq(90, 100, 2))+ #scale_x_continuous(breaks = c(0,100,120,200,300,400,500), labels = c(0,100,120,200,300,400,500)) + labs(x="Q20%",y = "Number of Samples",title="Q20")+ theme_bw()+ theme( panel.grid = element_blank(), axis.text=(element_text(size=11)), axis.title = (element_text(size=11)), plot.title = element_text(hjust = 0.5) )+ annotate('text', x=97.28, y=2200, label='Mean of Q20:96.98%',size=4) q20 ggsave("q20.hist.png", width = 6, height = 5, plot = q20) #q30 dataPlot<-data %>% select("Q30%") %>% mutate(x = 1:length(data$sampleId))%>% filter(`Q30%`>=80) #dataPlot$Q30_clean = as.numeric(dataPlot$Q30_clean)*100 q30<-ggplot(dataPlot,aes(x=`Q30%`)) + geom_histogram(bins=30,fill="steelblue")+ geom_vline(xintercept = 90.7, color="blue",linetype="dashed",size=1)+ geom_vline(xintercept = 80, color="red",linetype="dashed",size=1)+ # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+ scale_y_continuous(expand = c(0, 0),limits = c(0,1700 ),breaks=seq(0, 5000, 500))+ #scale_x_continuous(limits = c(80, 100),breaks=seq(80, 100, 2))+ #scale_x_continuous(breaks = c(0,100,120,200,300,400,500), labels = c(0,100,120,200,300,400,500)) + labs(x="Q30%",y = "Number of Samples",title="Q30")+ theme_bw()+ theme( panel.grid = element_blank(), axis.text=(element_text(size=11)), axis.title = (element_text(size=11)), plot.title = element_text(hjust = 0.5) )+ annotate('text', x=90.7, y=1600, label='Mean of Q30:90.7%',size=4) q30 ggsave("q30.hist.png", width = 6, height = 5, plot = q30) #CG% dataPlot<-data %>% select("CG%") %>% mutate(x = rep("GC%",length(data$sampleId))) %>%filter( `CG%`> 39 & `CG%` <44) #dataPlot$GC_clean <- as.numeric(dataPlot$GC_clean)*100 gc<-ggplot(dataPlot,aes(x=`CG%`)) + geom_histogram(bins=30 ,fill="steelblue")+ geom_vline(xintercept = 41.18, color="blue",linetype="dashed",size=1)+ geom_vline(xintercept = 39, color="red",linetype="dashed",size=1)+ geom_vline(xintercept = 44, color="red",linetype="dashed",size=1)+ # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+ scale_y_continuous(expand = c(0, 0),limits = c(0,2100), breaks=seq(0, 2000, 500))+ #scale_x_continuous(limits = c(38, 45),breaks=seq(38, 45, 1))+ scale_x_continuous(breaks = c(39,41,43,44,45,47), labels = c(39,41,43,44,45,47)) + labs(x="GC%",y = "Number of Samples",title="GC")+ theme_bw()+ theme( panel.grid = element_blank(), axis.text=(element_text(size=11)), axis.title = (element_text(size=11)), plot.title = element_text(hjust = 0.5) )+ annotate('text', x=41.18, y=2050, label='Mean of GC%:41.18%',size=4) gc ggsave("gc.hist.png", width = 6, height = 5, plot = gc) p<-ggarrange(rawdata,q20,q30,gc,align = "v") ggsave("qc.total.hist.png", width = 8, height = 6, plot = p)
本文来自博客园,作者:BioinformaticsMaster,转载请注明原文链接:https://www.cnblogs.com/koujiaodahan/p/16143632.html
posted on 2022-04-14 11:03 BioinformaticsMaster 阅读(25) 评论(0) 编辑 收藏 举报