肾病质控下机数据的脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
library("tidyverse")
library(ggplot2)
library(ggpubr)
 
data <- read_tsv("./rawDataStat20220215141517.txt")
#fq qc
 
dataPlot<-data %>%
  select("TotalBases") %>%
  mutate(x = 1:length(data$sampleId))
 
dataPlot$RawReads <- as.numeric(dataPlot$TotalBases )/1000000000
 
rawdata<-ggplot(dataPlot,aes(x=`RawReads`)) +
  geom_histogram(bins = 30,fill="steelblue")+
  geom_vline(xintercept = 134, color="blue",linetype="dashed",size=1)+
  geom_vline(xintercept = 90, color="red",linetype="dashed",size=1)+
  # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
  scale_y_continuous(expand = c(0, 0),limits = c(0, 2200),breaks=seq(0, 2000, 500))+
  scale_x_continuous(breaks = c(0,90,130,170,210,250), labels = c(0,90,130,170,210,250),limits = c(30,260)) +
  labs(x="Raw data(G)",y = "Number of Samples",title="Raw data")+
  theme_bw()+
  theme(
    panel.grid = element_blank(),
    axis.text=(element_text(size=11)),
    axis.title = (element_text(size=11)),
    plot.title = element_text(hjust = 0.5)
  )+
  annotate('text', x=134, y=2006, label='Mean of raw data:134G',size=4)
rawdata
 
ggsave("rawread.hist.png", width = 6, height = 5, plot = rawdata)
 
#non SOAPnuke filed, get the raw filed
data$`CG%`=(data$`%GC_1`+data$`%GC_2`)/2
data$`Q20%`=(data$`%Q20_1`+data$`%Q20_2`)/2
data$`Q30%`=(data$`%Q30_1`+data$`%Q30_2`)/2
 
 
#q20
 
dataPlot<-data %>%
  select("Q20%") %>%
  mutate(x = 1:length(data$sampleId))%>%filter(`Q20%` >=90)
 
#dataPlot$Q20_clean = as.numeric(dataPlot$Q20_clean)*100
 
q20<-ggplot(dataPlot,aes(x=`Q20%`)) +
  geom_histogram(bins=30,fill="steelblue")+
  geom_vline(xintercept = 96.98, color="blue",linetype="dashed",size=1)+
  geom_vline(xintercept = 90, color="red",linetype="dashed",size=1)+
  # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
  scale_y_continuous(expand = c(0, 0),limits = c(0,2300 ),breaks=seq(0, 2000, 500))+
  #scale_x_continuous(limits = c(90, 100),breaks=seq(90, 100, 2))+
  #scale_x_continuous(breaks = c(0,100,120,200,300,400,500), labels = c(0,100,120,200,300,400,500)) +
  labs(x="Q20%",y = "Number of Samples",title="Q20")+
  theme_bw()+
  theme(
    panel.grid = element_blank(),
    axis.text=(element_text(size=11)),
    axis.title = (element_text(size=11)),
    plot.title = element_text(hjust = 0.5)
  )+
  annotate('text', x=97.28, y=2200, label='Mean of Q20:96.98%',size=4)
q20
ggsave("q20.hist.png", width = 6, height = 5, plot = q20)
 
#q30
dataPlot<-data %>%
  select("Q30%") %>%
  mutate(x = 1:length(data$sampleId))%>% filter(`Q30%`>=80)
 
#dataPlot$Q30_clean = as.numeric(dataPlot$Q30_clean)*100
 
q30<-ggplot(dataPlot,aes(x=`Q30%`)) +
  geom_histogram(bins=30,fill="steelblue")+
  geom_vline(xintercept = 90.7, color="blue",linetype="dashed",size=1)+
  geom_vline(xintercept = 80, color="red",linetype="dashed",size=1)+
  # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
  scale_y_continuous(expand = c(0, 0),limits = c(0,1700 ),breaks=seq(0, 5000, 500))+
  #scale_x_continuous(limits = c(80, 100),breaks=seq(80, 100, 2))+
  #scale_x_continuous(breaks = c(0,100,120,200,300,400,500), labels = c(0,100,120,200,300,400,500)) +
  labs(x="Q30%",y = "Number of Samples",title="Q30")+
  theme_bw()+
  theme(
    panel.grid = element_blank(),
    axis.text=(element_text(size=11)),
    axis.title = (element_text(size=11)),
    plot.title = element_text(hjust = 0.5)
  )+
  annotate('text', x=90.7, y=1600, label='Mean of Q30:90.7%',size=4)
q30
ggsave("q30.hist.png", width = 6, height = 5, plot = q30)
 
#CG%
dataPlot<-data %>%
  select("CG%") %>%
  mutate(x = rep("GC%",length(data$sampleId))) %>%filter( `CG%`> 39 &  `CG%` <44)
 
#dataPlot$GC_clean <- as.numeric(dataPlot$GC_clean)*100
 
gc<-ggplot(dataPlot,aes(x=`CG%`)) +
  geom_histogram(bins=30
                 ,fill="steelblue")+
  geom_vline(xintercept = 41.18, color="blue",linetype="dashed",size=1)+
  geom_vline(xintercept = 39, color="red",linetype="dashed",size=1)+
  geom_vline(xintercept = 44, color="red",linetype="dashed",size=1)+
  # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
  scale_y_continuous(expand = c(0, 0),limits = c(0,2100), breaks=seq(0, 2000, 500))+
  #scale_x_continuous(limits = c(38, 45),breaks=seq(38, 45, 1))+
  scale_x_continuous(breaks = c(39,41,43,44,45,47), labels = c(39,41,43,44,45,47)) +
  labs(x="GC%",y = "Number of Samples",title="GC")+
  theme_bw()+
  theme(
    panel.grid = element_blank(),
    axis.text=(element_text(size=11)),
    axis.title = (element_text(size=11)),
    plot.title = element_text(hjust = 0.5)
  )+
  annotate('text', x=41.18, y=2050, label='Mean of GC%:41.18%',size=4)
gc
 
ggsave("gc.hist.png", width = 6, height = 5, plot = gc)
p<-ggarrange(rawdata,q20,q30,gc,align = "v")
 
ggsave("qc.total.hist.png", width = 8, height = 6, plot = p)

  

posted on   BioinformaticsMaster  阅读(26)  评论(0编辑  收藏  举报

相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
历史上的今天:
2018-04-14 os.popen(cmd) .read() 获取执行后的结果且带有换行符\n

导航

< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5
点击右上角即可分享
微信分享提示