肾病质控下机数据的脚本
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | library ( "tidyverse" ) library (ggplot2) library (ggpubr) data <- read_tsv ( "./rawDataStat20220215141517.txt" ) #fq qc dataPlot<-data %>% select ( "TotalBases" ) %>% mutate (x = 1: length (data$sampleId)) dataPlot$RawReads <- as.numeric (dataPlot$TotalBases )/1000000000 rawdata<- ggplot (dataPlot, aes (x=`RawReads`)) + geom_histogram (bins = 30,fill= "steelblue" )+ geom_vline (xintercept = 134, color= "blue" ,linetype= "dashed" ,size=1)+ geom_vline (xintercept = 90, color= "red" ,linetype= "dashed" ,size=1)+ # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+ scale_y_continuous (expand = c (0, 0),limits = c (0, 2200),breaks= seq (0, 2000, 500))+ scale_x_continuous (breaks = c (0,90,130,170,210,250), labels = c (0,90,130,170,210,250),limits = c (30,260)) + labs (x= "Raw data(G)" ,y = "Number of Samples" ,title= "Raw data" )+ theme_bw ()+ theme ( panel.grid = element_blank (), axis.text=( element_text (size=11)), axis.title = ( element_text (size=11)), plot.title = element_text (hjust = 0.5) )+ annotate ( 'text' , x=134, y=2006, label= 'Mean of raw data:134G' ,size=4) rawdata ggsave ( "rawread.hist.png" , width = 6, height = 5, plot = rawdata) #non SOAPnuke filed, get the raw filed data$`CG%`=(data$`%GC_1`+data$`%GC_2`)/2 data$`Q20%`=(data$`%Q20_1`+data$`%Q20_2`)/2 data$`Q30%`=(data$`%Q30_1`+data$`%Q30_2`)/2 #q20 dataPlot<-data %>% select ( "Q20%" ) %>% mutate (x = 1: length (data$sampleId))%>% filter (`Q20%` >=90) #dataPlot$Q20_clean = as.numeric(dataPlot$Q20_clean)*100 q20<- ggplot (dataPlot, aes (x=`Q20%`)) + geom_histogram (bins=30,fill= "steelblue" )+ geom_vline (xintercept = 96.98, color= "blue" ,linetype= "dashed" ,size=1)+ geom_vline (xintercept = 90, color= "red" ,linetype= "dashed" ,size=1)+ # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+ scale_y_continuous (expand = c (0, 0),limits = c (0,2300 ),breaks= seq (0, 2000, 500))+ #scale_x_continuous(limits = c(90, 100),breaks=seq(90, 100, 2))+ #scale_x_continuous(breaks = c(0,100,120,200,300,400,500), labels = c(0,100,120,200,300,400,500)) + labs (x= "Q20%" ,y = "Number of Samples" ,title= "Q20" )+ theme_bw ()+ theme ( panel.grid = element_blank (), axis.text=( element_text (size=11)), axis.title = ( element_text (size=11)), plot.title = element_text (hjust = 0.5) )+ annotate ( 'text' , x=97.28, y=2200, label= 'Mean of Q20:96.98%' ,size=4) q20 ggsave ( "q20.hist.png" , width = 6, height = 5, plot = q20) #q30 dataPlot<-data %>% select ( "Q30%" ) %>% mutate (x = 1: length (data$sampleId))%>% filter (`Q30%`>=80) #dataPlot$Q30_clean = as.numeric(dataPlot$Q30_clean)*100 q30<- ggplot (dataPlot, aes (x=`Q30%`)) + geom_histogram (bins=30,fill= "steelblue" )+ geom_vline (xintercept = 90.7, color= "blue" ,linetype= "dashed" ,size=1)+ geom_vline (xintercept = 80, color= "red" ,linetype= "dashed" ,size=1)+ # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+ scale_y_continuous (expand = c (0, 0),limits = c (0,1700 ),breaks= seq (0, 5000, 500))+ #scale_x_continuous(limits = c(80, 100),breaks=seq(80, 100, 2))+ #scale_x_continuous(breaks = c(0,100,120,200,300,400,500), labels = c(0,100,120,200,300,400,500)) + labs (x= "Q30%" ,y = "Number of Samples" ,title= "Q30" )+ theme_bw ()+ theme ( panel.grid = element_blank (), axis.text=( element_text (size=11)), axis.title = ( element_text (size=11)), plot.title = element_text (hjust = 0.5) )+ annotate ( 'text' , x=90.7, y=1600, label= 'Mean of Q30:90.7%' ,size=4) q30 ggsave ( "q30.hist.png" , width = 6, height = 5, plot = q30) #CG% dataPlot<-data %>% select ( "CG%" ) %>% mutate (x = rep ( "GC%" , length (data$sampleId))) %>% filter ( `CG%`> 39 & `CG%` <44) #dataPlot$GC_clean <- as.numeric(dataPlot$GC_clean)*100 gc<- ggplot (dataPlot, aes (x=`CG%`)) + geom_histogram (bins=30 ,fill= "steelblue" )+ geom_vline (xintercept = 41.18, color= "blue" ,linetype= "dashed" ,size=1)+ geom_vline (xintercept = 39, color= "red" ,linetype= "dashed" ,size=1)+ geom_vline (xintercept = 44, color= "red" ,linetype= "dashed" ,size=1)+ # scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+ scale_y_continuous (expand = c (0, 0),limits = c (0,2100), breaks= seq (0, 2000, 500))+ #scale_x_continuous(limits = c(38, 45),breaks=seq(38, 45, 1))+ scale_x_continuous (breaks = c (39,41,43,44,45,47), labels = c (39,41,43,44,45,47)) + labs (x= "GC%" ,y = "Number of Samples" ,title= "GC" )+ theme_bw ()+ theme ( panel.grid = element_blank (), axis.text=( element_text (size=11)), axis.title = ( element_text (size=11)), plot.title = element_text (hjust = 0.5) )+ annotate ( 'text' , x=41.18, y=2050, label= 'Mean of GC%:41.18%' ,size=4) gc ggsave ( "gc.hist.png" , width = 6, height = 5, plot = gc) p<- ggarrange (rawdata,q20,q30,gc,align = "v" ) ggsave ( "qc.total.hist.png" , width = 8, height = 6, plot = p) |
本文来自博客园,作者:BioinformaticsMaster,转载请注明原文链接:https://www.cnblogs.com/koujiaodahan/p/16143632.html
posted on 2022-04-14 11:03 BioinformaticsMaster 阅读(26) 评论(0) 编辑 收藏 举报
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
2018-04-14 os.popen(cmd) .read() 获取执行后的结果且带有换行符\n