肾病质控下机数据的脚本

library("tidyverse")
library(ggplot2)
library(ggpubr)

data <- read_tsv("./rawDataStat20220215141517.txt")
#fq qc

dataPlot<-data %>%
  select("TotalBases") %>%
  mutate(x = 1:length(data$sampleId))

dataPlot$RawReads <- as.numeric(dataPlot$TotalBases )/1000000000

rawdata<-ggplot(dataPlot,aes(x=`RawReads`)) + 
  geom_histogram(bins = 30,fill="steelblue")+
  geom_vline(xintercept = 134, color="blue",linetype="dashed",size=1)+
  geom_vline(xintercept = 90, color="red",linetype="dashed",size=1)+
  # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
  scale_y_continuous(expand = c(0, 0),limits = c(0, 2200),breaks=seq(0, 2000, 500))+
  scale_x_continuous(breaks = c(0,90,130,170,210,250), labels = c(0,90,130,170,210,250),limits = c(30,260)) +
  labs(x="Raw data(G)",y = "Number of Samples",title="Raw data")+
  theme_bw()+
  theme(
    panel.grid = element_blank(),
    axis.text=(element_text(size=11)),
    axis.title = (element_text(size=11)),
    plot.title = element_text(hjust = 0.5)
  )+
  annotate('text', x=134, y=2006, label='Mean of raw data:134G',size=4)
rawdata

ggsave("rawread.hist.png", width = 6, height = 5, plot = rawdata)

#non SOAPnuke filed, get the raw filed
data$`CG%`=(data$`%GC_1`+data$`%GC_2`)/2
data$`Q20%`=(data$`%Q20_1`+data$`%Q20_2`)/2
data$`Q30%`=(data$`%Q30_1`+data$`%Q30_2`)/2


#q20

dataPlot<-data %>%
  select("Q20%") %>%
  mutate(x = 1:length(data$sampleId))%>%filter(`Q20%` >=90)

#dataPlot$Q20_clean = as.numeric(dataPlot$Q20_clean)*100

q20<-ggplot(dataPlot,aes(x=`Q20%`)) + 
  geom_histogram(bins=30,fill="steelblue")+
  geom_vline(xintercept = 96.98, color="blue",linetype="dashed",size=1)+
  geom_vline(xintercept = 90, color="red",linetype="dashed",size=1)+
  # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
  scale_y_continuous(expand = c(0, 0),limits = c(0,2300 ),breaks=seq(0, 2000, 500))+
  #scale_x_continuous(limits = c(90, 100),breaks=seq(90, 100, 2))+
  #scale_x_continuous(breaks = c(0,100,120,200,300,400,500), labels = c(0,100,120,200,300,400,500)) +
  labs(x="Q20%",y = "Number of Samples",title="Q20")+
  theme_bw()+
  theme(
    panel.grid = element_blank(),
    axis.text=(element_text(size=11)),
    axis.title = (element_text(size=11)),
    plot.title = element_text(hjust = 0.5)
  )+
  annotate('text', x=97.28, y=2200, label='Mean of Q20:96.98%',size=4) 
q20
ggsave("q20.hist.png", width = 6, height = 5, plot = q20)

#q30
dataPlot<-data %>%
  select("Q30%") %>%
  mutate(x = 1:length(data$sampleId))%>% filter(`Q30%`>=80)

#dataPlot$Q30_clean = as.numeric(dataPlot$Q30_clean)*100

q30<-ggplot(dataPlot,aes(x=`Q30%`)) + 
  geom_histogram(bins=30,fill="steelblue")+
  geom_vline(xintercept = 90.7, color="blue",linetype="dashed",size=1)+
  geom_vline(xintercept = 80, color="red",linetype="dashed",size=1)+
  # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
  scale_y_continuous(expand = c(0, 0),limits = c(0,1700 ),breaks=seq(0, 5000, 500))+
  #scale_x_continuous(limits = c(80, 100),breaks=seq(80, 100, 2))+
  #scale_x_continuous(breaks = c(0,100,120,200,300,400,500), labels = c(0,100,120,200,300,400,500)) +
  labs(x="Q30%",y = "Number of Samples",title="Q30")+
  theme_bw()+
  theme(
    panel.grid = element_blank(),
    axis.text=(element_text(size=11)),
    axis.title = (element_text(size=11)),
    plot.title = element_text(hjust = 0.5)
  )+
  annotate('text', x=90.7, y=1600, label='Mean of Q30:90.7%',size=4) 
q30
ggsave("q30.hist.png", width = 6, height = 5, plot = q30)

#CG%
dataPlot<-data %>%
  select("CG%") %>%
  mutate(x = rep("GC%",length(data$sampleId))) %>%filter( `CG%`> 39 &  `CG%` <44)

#dataPlot$GC_clean <- as.numeric(dataPlot$GC_clean)*100

gc<-ggplot(dataPlot,aes(x=`CG%`)) + 
  geom_histogram(bins=30
                 ,fill="steelblue")+
  geom_vline(xintercept = 41.18, color="blue",linetype="dashed",size=1)+
  geom_vline(xintercept = 39, color="red",linetype="dashed",size=1)+
  geom_vline(xintercept = 44, color="red",linetype="dashed",size=1)+
  # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
  scale_y_continuous(expand = c(0, 0),limits = c(0,2100), breaks=seq(0, 2000, 500))+
  #scale_x_continuous(limits = c(38, 45),breaks=seq(38, 45, 1))+
  scale_x_continuous(breaks = c(39,41,43,44,45,47), labels = c(39,41,43,44,45,47)) +
  labs(x="GC%",y = "Number of Samples",title="GC")+
  theme_bw()+
  theme(
    panel.grid = element_blank(),
    axis.text=(element_text(size=11)),
    axis.title = (element_text(size=11)),
    plot.title = element_text(hjust = 0.5)
  )+
  annotate('text', x=41.18, y=2050, label='Mean of GC%:41.18%',size=4) 
gc

ggsave("gc.hist.png", width = 6, height = 5, plot = gc)
p<-ggarrange(rawdata,q20,q30,gc,align = "v")

ggsave("qc.total.hist.png", width = 8, height = 6, plot = p)

  

posted on 2022-04-14 11:03  BioinformaticsMaster  阅读(22)  评论(0编辑  收藏  举报

导航