Prepare the basic packages
conda install -c bioconda snakemake samtools hisat2 trim-galore subread deeptools -y
# (deeptools provides the bamCoverage command used in the coverage step below)
Prepare the data
wget https://ftp.ensembl.org/pub/release-110/fasta/sus_scrofa/dna/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz
wget https://ftp.ensembl.org/pub/release-110/gtf/sus_scrofa/Sus_scrofa.Sscrofa11.1.110.gtf.gz
# Decompress
gunzip Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz && mv Sus_scrofa.Sscrofa11.1.dna.toplevel.fa pig.fa
gunzip Sus_scrofa.Sscrofa11.1.110.gtf.gz && mv Sus_scrofa.Sscrofa11.1.110.gtf pig.gtf
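The later steps (the featureCounts call in the bash script and the Snakefile) reference these files under genome/, so move them there:
mkdir -p genome && mv pig.fa pig.gtf genome/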
# plus the corresponding sample data
$ ls
# B1-nao_1.fastq.gz B2-1nao_2.fastq.gz B4-1nao_1.fastq.gz B5-nao_2.fastq.gz B8-nao_1.fastq.gz
# B1-nao_2.fastq.gz B3-1nao_1.fastq.gz B4-1nao_2.fastq.gz B7-nao_1.fastq.gz B8-nao_2.fastq.gz
# B2-1nao_1.fastq.gz B3-1nao_2.fastq.gz B5-nao_1.fastq.gz B7-nao_2.fastq.gz md5.txt
# verify the MD5 checksums
md5sum -c md5.txt
Build the index
hisat2-build -p 8 genome/pig.fa genome/pig
Run with a bash script
# Get the sample names
ls 0_rawdata/*.fastq.gz > list.txt
awk -F '/' '{print $2}' list.txt | awk -F '_' '{print $1}' | sort | uniq > list1.txt && mv list1.txt list.txt
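# e.g. 0_rawdata/B1-nao_1.fastq.gz -> B1-nao_1.fastq.gz -> B1-nao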
# Build the index (same as above; skip if already built)
hisat2-build -p 8 genome/pig.fa genome/pig
# Create the output directories before the loops write into them
mkdir -p 1_trim 2_alignment 3_featureCounts
for i in `cat list.txt`
do
nohup trim_galore -j 8 -o 1_trim/ --paired 0_rawdata/${i}_1.fastq.gz 0_rawdata/${i}_2.fastq.gz &
done
wait
for i in `cat list.txt`
do
nohup hisat2 -p 8 -x genome/pig -1 1_trim/${i}_1_val_1.fq.gz -2 1_trim/${i}_2_val_2.fq.gz | samtools view -bS - > 2_alignment/${i}.bam &
done
wait
for i in `cat list.txt`
do
nohup featureCounts -T 8 -p -t exon -g gene_id -a ./genome/pig.gtf -o 3_featureCounts/${i}.txt 2_alignment/${i}.bam &
done
wait
Write the Snakefile
config.yaml
gtf: "./genome/pig.gtf"
index: "./genome/pig"
threads: 20
Snakefile
shell.executable("/bin/bash")
configfile: "./config.yaml"
(SAMPLES,READS,) = glob_wildcards("0_rawdata/{sample}_{read}.fastq.gz")
SAMPLES = sorted(set(SAMPLES))  # glob_wildcards repeats each sample once per read file
READS = ["1","2"]
rule all: # rule all must list every output that no other rule consumes
input:
expand("3_featureCounts/{sample}.txt", sample=SAMPLES),
expand("4_coverage/{sample}.bw", sample=SAMPLES),
"pig.ht2.index.complete"
rule print_samples:
run:
print("SAMPLES:", SAMPLES)
rule trim_galore:
input:
"0_rawdata/{sample}_1.fastq.gz",
"0_rawdata/{sample}_2.fastq.gz"
output:
"1_trim/{sample}_1_val_1.fq.gz",
"1_trim/{sample}_2_val_2.fq.gz"
shell:
"trim_galore --cores {config[threads]} --output_dir 1_trim/ --paired {input}"
rule hisat2build: # rule names must not contain '-', or Snakemake raises: Expected name or colon after rule keyword.
input:
fa="genome/pig.fa"
output:
touch("pig.ht2.index.complete")
shell:
"hisat2-build -p {config[threads]} {input.fa} genome/pig && touch {output}"
rule hisat2:
input:
trimmed1="1_trim/{sample}_1_val_1.fq.gz",
trimmed2="1_trim/{sample}_2_val_2.fq.gz",
index="pig.ht2.index.complete"
output:
"2_alignment/{sample}.bam"
shell:
"hisat2 -p {config[threads]} -x genome/pig -1 {input.trimmed1} -2 {input.trimmed2} | samtools view -bS - > {output}"
rule featureCounts:
input:
"2_alignment/{sample}.bam"
output:
"3_featureCounts/{sample}.txt"
shell:
"""
featureCounts -T {config[threads]} -t exon -g gene_id -a {config[gtf]} -o {output[0]} -p {input[0]}
"""
rule bamsortindex:
input:
"2_alignment/{sample}.bam"
output:
sort="2_alignment/{sample}.sorted.bam",
index="2_alignment/{sample}.sorted.bam.bai"
shell:
"""
samtools sort -@ {config[threads]} -o {output.sort} {input} && samtools index {output.sort}
"""
rule bamCoverage:
input:
bam="2_alignment/{sample}.sorted.bam",
index="2_alignment/{sample}.sorted.bam.bai"
output:
"4_coverage/{sample}.bw"
shell:
"bamCoverage -b {input.bam} -o {output} --numberOfProcessors {config[threads]}"
Write the SLURM submission script to run the Snakefile
#!/bin/bash
#SBATCH --job-name=sbatch_rna_flow
#SBATCH --cpus-per-task=40
#SBATCH -o job.out
#SBATCH --nodelist=comput3
snakemake -j 40
# Alternatively, have Snakemake submit each job itself through a scheduler (here SGE qsub):
# snakemake --cluster "qsub -V -cwd -q v01" -j 10
--cluster: the command used to submit each job to the cluster
qsub -V -cwd -q: export the current environment variables (-V), run in the current working directory (-cwd), and submit to the given queue (-q); if no queue is specified, any available queue is used
--local-cores N: run local rules with at most N cores on the submission node
--cluster-config/-u FILE: a cluster configuration file
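A minimal sketch of the --cluster-config mechanism (the file name cluster.yaml and the queue v01 are assumptions; adapt to your site):
# cluster.yaml
__default__:
  queue: "v01"
# placeholders like {cluster.queue} in the --cluster string are filled from this file:
snakemake --cluster "qsub -V -cwd -q {cluster.queue}" --cluster-config cluster.yaml -j 10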
RNA-seq expression quantification
library(DESeq2)
counts <- read.table("counts.txt", header=TRUE, row.names=1)
countsTable <- as.matrix(counts[,(6:ncol(counts))])
# total read count per sample
totalReads <- t(as.matrix(colSums(countsTable)))
geneLength <- as.matrix(counts[,5])  # renamed from `length` to avoid shadowing base::length
# A gene's FPKM in a sample is its read count divided by the sample's total reads, divided by the gene length
## total reads per sample times gene length gives a genes x samples matrix
totalReadsLength <- geneLength %*% totalReads
## FPKM is countsTable divided element-wise by totalReadsLength
fpkm <- 10^9 * countsTable / totalReadsLength
# save, keeping row and column names
write.table(fpkm, "fpkm.txt", sep="\t", quote=FALSE, col.names=NA, row.names=TRUE)
Convert the fpkm row names from gene_id to gene symbol
library(clusterProfiler) # does not include pig Ensembl IDs
library(org.Ss.eg.db)
# Install biomaRt
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("biomaRt")
library(biomaRt)
# list the BioMart databases you can connect to
listMarts()
# choose which database to use
Mart <- useMart('ENSEMBL_MART_ENSEMBL')
# list the datasets and pick the pig genome annotation
listDatasets(Mart)
# 188 Pig genes (Sscrofa11.1)
Mart <- useDataset('sscrofa_gene_ensembl', mart = Mart)
# list the gene ID types accepted as input (filters)
listFilters(Mart)
# list the gene ID types available as output (attributes)
listAttributes(Mart)
fpkm <- read.table("fpkm.txt", header=TRUE, row.names=1)
gene_id <- rownames(fpkm)
# > head(gene_id)
# [1] "ENSSSCG00000028996"
# [2] "ENSSSCG00000005267"
# [3] "ENSSSCG00000005268"
# [4] "ENSSSCG00000005269"
# [5] "ENSSSCG00000031382"
# [6] "ENSSSCG00000005271"
# convert the gene IDs with getBM(), matching the input and output types
geneconvert <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name'),
filters='ensembl_gene_id',
values=gene_id,
mart=Mart)
# save
write.table(geneconvert, "geneconvert.txt", sep="\t", quote=FALSE, col.names=NA, row.names=TRUE)
# getBM(attributes, filters = "", values = "", mart, curl = NULL, checkFilters = TRUE, verbose = FALSE, uniqueRows = TRUE, bmHeader = FALSE, quote = "\"", useCache = TRUE)
# attributes sets the output gene ID type(s) (several may be requested at once), filters sets the input gene ID type, values supplies the input gene IDs, and mart is the database to use. Note that the attributes should include the input gene ID type, otherwise the results cannot be matched back to the inputs.
# Replace the fpkm row names with symbols; fall back to the gene_id where the symbol is missing, and suffix duplicated row names
# geneconvert is assumed to hold two columns: ensembl_gene_id and external_gene_name
# Match the fpkm row names against geneconvert$ensembl_gene_id and pull the corresponding external_gene_name
matching_names <- geneconvert$external_gene_name[match(rownames(fpkm), geneconvert$ensembl_gene_id)]
# pre-allocate a vector for the new row names
new_row_names <- character(length = length(matching_names))
# walk over the matched names
for (i in 1:length(matching_names)) {
# If a matching external_gene_name exists (and is not an empty string), use it as the new row name
if (!is.na(matching_names[i]) && nzchar(matching_names[i])) {
new_row_names[i] <- matching_names[i]
} else {
# Otherwise fall back to the original gene_id
new_row_names[i] <- rownames(fpkm)[i]
}
}
# make duplicated row names unique
unique_names <- make.unique(new_row_names, sep = "_")
# set the new row names on the fpkm matrix
rownames(fpkm) <- unique_names
# save
write.table(fpkm, "fpkm_symbol.txt", sep="\t", quote=FALSE, col.names=NA, row.names=TRUE)
Write the WGBS Snakefile
config.yaml
threads: 8
index: "./genome"
Snakefile
shell.executable("/bin/bash")
configfile: "./config.yaml"
(SAMPLES,READS,) = glob_wildcards("0_rawdata/{sample}_{read}.fastq.gz")
SAMPLES = sorted(set(SAMPLES))
rule all:
input:
expand("5_bismark_methylation_extractor/{sample}_1_val_1_bismark_bt2_pe.deduplicated.bismark.cov.gz", sample=SAMPLES),
"1_fastqc/multiqc_report.zip",
"2_fastqc/multiqc_report.zip"
rule fastqc1:
input:
"0_rawdata/{sample}_1.fastq.gz",
"0_rawdata/{sample}_2.fastq.gz"
output:
"1_fastqc/{sample}_1_fastqc.zip",
"1_fastqc/{sample}_1_fastqc.html",
"1_fastqc/{sample}_2_fastqc.zip",
"1_fastqc/{sample}_2_fastqc.html"
shell:
"fastqc -o 1_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"
rule multiqc1:
input:
expand("1_fastqc/{sample}_1_fastqc.zip", sample=SAMPLES),
expand("1_fastqc/{sample}_2_fastqc.zip", sample=SAMPLES)
output:
"1_fastqc/multiqc_report.zip"
shell:
"multiqc 1_fastqc/ -o 1_fastqc/"
rule trim_galore:
input:
"0_rawdata/{sample}_1.fastq.gz",
"0_rawdata/{sample}_2.fastq.gz"
output:
"2_trim/{sample}_1_val_1.fq.gz",
"2_trim/{sample}_2_val_2.fq.gz"
shell:
"trim_galore -j {config[threads]} -o 2_trim/ --paired {input[0]} {input[1]}"
rule fastqc2:
input:
"2_trim/{sample}_1_val_1.fq.gz",
"2_trim/{sample}_2_val_2.fq.gz"
output:
"2_fastqc/{sample}_1_fastqc.zip",
"2_fastqc/{sample}_1_fastqc.html",
"2_fastqc/{sample}_2_fastqc.zip",
"2_fastqc/{sample}_2_fastqc.html"
shell:
"fastqc -o 2_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"
rule multiqc2:
input:
expand("2_fastqc/{sample}_1_fastqc.zip", sample=SAMPLES),
expand("2_fastqc/{sample}_2_fastqc.zip", sample=SAMPLES)
output:
"2_fastqc/multiqc_report.zip"
shell:
"multiqc 2_fastqc/ -o 2_fastqc/"
# rule bismark_genome_preparation:
# input:
# "genome"
# output:
# touch("genome/Bisulfite_Genome/CT_conversion/genome_mfa.CT_conversion.fa")
# shell:
# "bismark_genome_preparation --bowtie2 --parallel {config[threads]} {input}"
rule bismark:
input:
fq1="2_trim/{sample}_1_val_1.fq.gz",
fq2="2_trim/{sample}_2_val_2.fq.gz",
index="genome/Bisulfite_Genome/CT_conversion/genome_mfa.CT_conversion.fa"
output:
"3_alignment/{sample}_1_val_1_bismark_bt2_pe.bam"
shell:
"bismark --bowtie2 --genome genome -1 {input.fq1} -2 {input.fq2} -o 3_alignment/"
rule deduplicate:
input:
"3_alignment/{sample}_1_val_1_bismark_bt2_pe.bam"
output:
"4_deduplicate/{sample}_1_val_1_bismark_bt2_pe.deduplicated.bam"
shell:
"deduplicate_bismark -p --bam {input} --output_dir 4_deduplicate/"
rule bismark_methylation_extractor:
input:
"4_deduplicate/{sample}_1_val_1_bismark_bt2_pe.deduplicated.bam"
output:
"5_bismark_methylation_extractor/{sample}_1_val_1_bismark_bt2_pe.deduplicated.bismark.cov.gz"
shell:
"bismark_methylation_extractor -p --bedGraph --multicore {config[threads]} --output 5_bismark_methylation_extractor/ {input}"
Write the ChIP-seq Snakefile
config.yaml
threads: 8
index: "./genome"
Snakefile
shell.executable("/bin/bash")
configfile: "./config.yaml"
prefix = "SRR13289"
start = 578
end = 593
SAMPLES = [f"{prefix}{i}" for i in range(start, end + 1)]
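# SAMPLES now holds the 16 accessions SRR13289578 through SRR13289593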
rule all:
input:
"1_fastqc/multiqc_report.html",
"2_fastqc/multiqc_report.html",
expand("3_bamqc/{sample}.sorted.bam.stat", sample=SAMPLES),
expand("5_dedup/{sample}.sorted.dedup.bam.bai", sample=SAMPLES),
expand("6_bigwig/{sample}.bw", sample=SAMPLES),
"genome/pig.complete"
rule fastqc1:
input:
"0_rawdata/{sample}_1.fastq.gz",
"0_rawdata/{sample}_2.fastq.gz"
output:
"1_fastqc/{sample}_1_fastqc.zip",
"1_fastqc/{sample}_2_fastqc.zip"
shell:
"fastqc -o 1_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"
rule multiqc1:
input:
expand("1_fastqc/{sample}_1_fastqc.zip", sample=SAMPLES),
expand("1_fastqc/{sample}_2_fastqc.zip", sample=SAMPLES)
output:
"1_fastqc/multiqc_report.html"
shell:
"multiqc 1_fastqc/ -o 1_fastqc/"
rule trim_galore:
input:
"0_rawdata/{sample}_1.fastq.gz",
"0_rawdata/{sample}_2.fastq.gz"
output:
"2_trim/{sample}_1_val_1.fq.gz",
"2_trim/{sample}_2_val_2.fq.gz"
shell:
"trim_galore -j {config[threads]} -o 2_trim/ --paired {input[0]} {input[1]}"
rule fastqc2:
input:
"2_trim/{sample}_1_val_1.fq.gz",
"2_trim/{sample}_2_val_2.fq.gz"
output:
"2_fastqc/{sample}_1_val_1_fastqc.zip",
"2_fastqc/{sample}_2_val_2_fastqc.zip"
shell:
"fastqc -o 2_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"
rule multiqc2:
input:
expand("2_fastqc/{sample}_1_val_1_fastqc.zip", sample=SAMPLES),
expand("2_fastqc/{sample}_2_val_2_fastqc.zip", sample=SAMPLES)
output:
"2_fastqc/multiqc_report.html"
shell:
"multiqc 2_fastqc/ -o 2_fastqc/"
rule bowtie2build:
input:
fa="genome/pig.fa"
output:
touch("genome/pig.complete")
shell:
"bowtie2-build {input.fa} genome/pig && touch {output}"
rule bowtie2:
input:
"2_trim/{sample}_1_val_1.fq.gz",
"2_trim/{sample}_2_val_2.fq.gz"
output:
"3_sort/{sample}.sam"
shell:
"bowtie2 -p 8 -x ./genome/pig -1 {input[0]} -2 {input[1]} -S {output}"
rule sam2bam:
input:
"3_sort/{sample}.sam"
output:
"3_sort/{sample}.bam"
shell:
"samtools view -@ {config[threads]} -bS {input} > {output}"
rule samtoolssort1:
input:
"3_sort/{sample}.bam"
output:
"3_sort/{sample}.sorted.bam"
shell:
"samtools sort -@ {config[threads]} -n -o {output} {input}"
rule bamqc:
input:
"3_sort/{sample}.sorted.bam"
output:
"3_bamqc/{sample}.sorted.bam.stat"
shell:
"samtools flagstat {input} > {output}"
rule fixmate:
input:
"3_sort/{sample}.sorted.bam"
output:
"4_fixmate/{sample}.sorted.fixmate.bam"
shell:
"samtools fixmate -m -@ {config[threads]} {input} {output}"
rule samtoolssort2:
input:
"4_fixmate/{sample}.sorted.fixmate.bam"
output:
"4_sort/{sample}.sorted.bam"
shell:
"samtools sort -@ {config[threads]} -o {output} {input}"
rule deduplicate:
input:
"4_sort/{sample}.sorted.bam"
output:
"5_dedup/{sample}.sorted.dedup.bam"
shell:
"samtools markdup -@ {config[threads]} {input} {output}"
rule bamindex:
input:
"5_dedup/{sample}.sorted.dedup.bam"
output:
"5_dedup/{sample}.sorted.dedup.bam.bai"
shell:
"samtools index {input}"
rule bam2bigwig:
input:
"5_dedup/{sample}.sorted.dedup.bam",
"5_dedup/{sample}.sorted.dedup.bam.bai"
output:
"6_bigwig/{sample}.bw"
shell:
"bamCoverage -b {input[0]} -o {output} -bs 10 --normalizeUsing BPM"
Write the CUT&Tag Snakefile
config.yaml
threads: 8
index: "./genome"
Snakefile
shell.executable("/bin/bash")
configfile: "./config.yaml"
(SAMPLES,READS,) = glob_wildcards("0_rawdata/{sample}_{read}.fq.gz")
SAMPLES = sorted(set(SAMPLES))
READS = ["1","2"]
rule all:
input:
"1_fastqc/multiqc_report.zip",
"genome/pig.complete",
expand("6_bed/{sample}.clean.bed", sample=SAMPLES),
expand("7_bigwig/{sample}.bw", sample=SAMPLES)
rule fastqc:
input:
"0_rawdata/{sample}_1.fq.gz",
"0_rawdata/{sample}_2.fq.gz"
output:
"1_fastqc/{sample}_1_fastqc.zip",
"1_fastqc/{sample}_2_fastqc.zip"
shell:
"fastqc -o 1_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"
rule multiqc:
input:
expand("1_fastqc/{sample}_1_fastqc.zip", sample=SAMPLES),
expand("1_fastqc/{sample}_2_fastqc.zip", sample=SAMPLES)
output:
"1_fastqc/multiqc_report.zip"
shell:
"multiqc 1_fastqc/ -o 1_fastqc/"
rule bowtie2build:
input:
fa="genome/pig.fa"
output:
touch("genome/pig.complete")
shell:
"bowtie2-build {input.fa} genome/pig && touch {output}"
rule bowtie2:
input:
"0_rawdata/{sample}_1.fq.gz",
"0_rawdata/{sample}_2.fq.gz"
output:
"2_bowtie2/{sample}.sam"
shell:
"""
touch {output.log}
bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x genome/pig -1 {input[0]} -2 {input[1]} -S {output}
"""
rule sam2bam:
input:
"2_bowtie2/{sample}.sam"
output:
"3_bam/{sample}.bam"
shell:
"samtools view -@ {config[threads]} -F 0x04 -bS {input} > {output}"
rule bam2bed:
input:
"3_bam/{sample}.bam"
output:
"4_bed/{sample}.bedpe"
shell:
"bedtools bamtobed -i {input} -bedpe > {output}"
rule cleanbed:
input:
"4_bed/{sample}.bedpe"
output:
"5_bed/{sample}.clean.bedpe"
shell:
"awk '$1==$4 && $6-$2 < 1000 {print $0}' {input} > {output}"
rule bedpe2bed:
input:
"5_bed/{sample}.clean.bedpe"
output:
"6_bed/{sample}.clean.bed"
shell:
"cut -f 1,2,6 {input} | sort -k1,1 -k2,2n -k3,3n > {output}"
rule bamsort:
input:
"3_bam/{sample}.bam"
output:
"3_bam/{sample}.sorted.bam"
shell:
"samtools sort -@ {config[threads]} -o {output} {input}"
rule bamindex:
input:
"3_bam/{sample}.sorted.bam"
output:
"3_bam/{sample}.sorted.bam.bai"
shell:
"samtools index {input}"
rule bam2bigwig:
input:
"3_bam/{sample}.sorted.bam",
"3_bam/{sample}.sorted.bam.bai"
output:
"7_bigwig/{sample}.bw"
shell:
"bamCoverage -b {input[0]} -o {output}"