cuttag分析流程（部分存疑）

参考博客

https://mp.weixin.qq.com/s/uwO9G_71h8kU3lTWsW_zPw
https://www.jianshu.com/p/1a23656a0713
https://zhuanlan.zhihu.com/p/520071927

数据质控

fastqc -o 0_fastqc/ -f fastq -t 10 --noextract ./0_rawdata/*_1.fq
fastqc -o 0_fastqc/ -f fastq -t 10 --noextract ./0_rawdata/*_2.fq

bowtie2建立索引

wget ftp://ftp.ensembl.org/pub/release-102/fasta/sus_scrofa/dna/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz
gunzip Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz && mv Sus_scrofa.Sscrofa11.1.dna.toplevel.fa pig.fa

bowtie2-build genome/pig.fa genome/pig

slurm提交

#!/bin/bash
#SBATCH --job-name=sbatch_cuttag
#SBATCH --cpus-per-task=40
#SBATCH -o 1_bowtie2.out
#SBATCH --nodelist=comput3

bowtie2比对

比对到猪基因组

cat sample.txt | while read id
do
echo "bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x genome/pig -1 ./0_rawdata/${id}_1.fq -2 ./0_rawdata/${id}_2.fq -S ./1_bowtie2/${id}.sam &"
done > bowtie2.sh

SpikeIn处理

不确定在猪的cuttag中是否需要进行spikeIn，如果需要，可以参考以下代码

比对到大肠杆菌基因组

# 下载大肠杆菌基因组
wget ftp://ftp.ensemblgenomes.org/pub/bacteria/release-48/fasta/bacteria_0_collection/escherichia_coli_str_k_12_substr_mg1655/dna/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa.gz
gunzip Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa.gz && mv Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa ecoli.fa

# 建立索引
bowtie2-build genome/e.coli/ecoli.fa genome/e.coli/ecoli

# 比对
cat sample.txt | while read id
do
echo "bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x genome/e.coli/ecoli -1 ./0_rawdata/${id}_1.fq -2 ./0_rawdata/${id}_2.fq -S ./1_bowtie2/ecoli/${id}.sam & > ${id}_bowtie2_spikeIn.txt"
done > bowtie2_ecoli.sh

获取测序深度值

while read id
do
{
    seqDepthDouble=`samtools view -F 0x04 ./1_bowtie2/ecoli/${id}.sam | wc -l`
    seqDepth=$((seqDepthDouble/2))
    echo $seqDepth > ./1_bowtie2/ecoli/${id}_bowtie2_spikeIn.seqDepth
} &
done < sample.txt
wait

文件格式的转换，获得bed文件

while read i
do
{
## 转换为 BAM 文件格式，并且仅保留成功映射到参考序列上的读取
samtools view -bS -F 0x04 ./1_bowtie2/pig/${i}.sam > ./1_mapped/${i}.bam

## 将 BAM 文件转换为 bedpe 文件格式
bedtools bamtobed -i ./1_mapped/${i}.bam -bedpe > ./2_bed/${i}.bedpe
## 将 BAM 文件转换为 bed 文件格式
bedtools bamtobed -i ./1_mapped/${i}.bam > ./2_bed/${i}.bed

## 保留那些在同一条染色体且片段长度小于 1000bp 的双端 reads
awk '$1==$4 && $6-$2 < 1000 {print $0}' ./2_bed/${i}.bed > ./2_bed/${i}.clean.bed

## 仅提取片段相关的列 
cut -f 1,2,6 ./2_bed/${i}.clean.bed | sort -k1,1 -k2,2n -k3,3n  > ./3_fragments/${i}.fragments.bed
} &
done < sample.txt
wait

评估重复性

## 评估重复性
## 为了研究重复之间和不同条件下的可重复性，基因组被分成 500 bp bin，每个 bin reads 计数的 log2 转换值的皮尔逊相关性在重复数据集之间计算。多个重复和 IgG 对照数据集显示在层次化聚类关联矩阵中。
while read i
do
{
binLen=500
awk -v w=$binLen '{print $1, int(($2 + $3)/(2*w))*w + w/2}' ./3_fragments/${i}.fragments.bed \
sort -k1,1V -k2,2n \
uniq -c \
awk -v OFS="\t" '{print $2, $3, $1}' \
sort -k1,1V -k2,2n  > ./4_bin/${i}.fragmentsCount.bin$binLen.bed
} &
done < sample.txt
wait

加入测序深度对bed文件进行标准化处理

需要有大肠杆菌的测序深度值，如没有不进行标准化处理

# 标准化
samtools faidx ./genome/pig/pig.fa ##后面的bedtools -g的内容通过samtools的faidx来进行构建获得索引文件

# 这里的seqDepth是前面的数据比对处理当中获得测序深度值，可以调取txt文件，来进行后续的循环处理
while read i
do
{
# 读取测序深度值
seqDepth=`cat ./1_bowtie2/ecoli/${i}_bowtie2_spikeIn.seqDepth`
# 根据测序深度值进行标准化处理
if [[ "$seqDepth" -gt "1" ]]; then
scale_factor=`echo "10000 / $seqDepth" | bc -l`
echo "Scaling factor for $histName is: $scale_factor!"
bedtools genomecov -bg -scale $scale_factor -i ./3_fragments/${i}.fragments.bed -g ./genome/pig/pig.fa.fai > ./5_normalized/${i}.fragments.normalized.bedgraph
fi  
} &
done < sample.txt
wait

call peak

# 用SEACR进行peak calling
seacr="seacr.sh"

cat sample.txt | while read i
do
bash $seacr ./5_normalized/${i}.fragments.normalized.bedgraph 0.01 non stringent ./6_peak/${i}_seacr_top0.01.peaks.bed
done

snakemake

shell.executable("/bin/bash")
configfile: "./config.yaml"

SAMPLES=['RD29C36_N501_701', 'RD29C36-N501-702', 'RD29C36-N501-702', 'RD29C36-N501-703', 'RD29C36-N501-703', 'RD29C36-N501-704', 'RD29C36-N501-704', 'RD29C36-N502-701', 'RD29C36-N502-701', 'RD29-N502-702', 'RD29-N502-702', 'RD29-N502-703', 'RD29-N502-703', 'RD29-N502-704', 'RD29-N502-704', 'M2-oocyte-501-704', 'M2-oocyte-501-704', 'M2-oocyte-501-706', 'M2-oocyte-501-706', 'no-oocyte-501-705', 'no-oocyte-501-705', 'RD29C36_N501_701']

rule all:
    input:
        "1_fastqc/multiqc_report.html",
        "genome/pig.complete",
        expand("6_bed/{sample}.clean.bed", sample=SAMPLES),
        expand("7_bigwig/{sample}.bw", sample=SAMPLES)

rule fastqc1:
    input:
        "0_rawdata/{sample}_1.fq.gz",
        "0_rawdata/{sample}_2.fq.gz"
    output:
        "1_fastqc/{sample}_1_fastqc.html",
        "1_fastqc/{sample}_2_fastqc.html"
    shell:
        "fastqc -o 1_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"

rule multiqc1:
    input:
        expand("1_fastqc/{sample}_1_fastqc.html", sample=SAMPLES),
        expand("1_fastqc/{sample}_2_fastqc.html", sample=SAMPLES)
    output:
        "1_fastqc/multiqc_report.html"
    shell:
        "multiqc 1_fastqc/ -o 1_fastqc/"

rule bowtie2build:
    input:
        fa="genome/pig.fa"
    output:
        touch("genome/pig.complete")
    shell:
        "bowtie2-build {input.fa} genome/pig && touch {output}"

rule bowtie2:
    input:
        "0_rawdata/{sample}_1.fq.gz",
        "0_rawdata/{sample}_2.fq.gz"
    output:
        "2_bowtie2/{sample}.sam"
    shell:
        """
        bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x genome/pig -1 {input[0]} -2 {input[1]} -S {output}
        """

rule sam2bam:
    input:
        "2_bowtie2/{sample}.sam"
    output:
        "3_bam/{sample}.bam"
    shell:
        "samtools view -@ {config[threads]} -F 0x04 -bS {input} > {output}"

rule bam2bed:
    input:
        "3_bam/{sample}.bam"
    output:
        "4_bed/{sample}.bedpe"
    shell:
        "bedtools bamtobed -i {input} -bedpe > {output}"

rule cleanbed:
    input:
        "4_bed/{sample}.bedpe"
    output:
        "5_bed/{sample}.clean.bedpe"
    shell:
        "awk '$1==$4 && $6-$2 < 1000 {{print $0}}' {input} > {output}"

rule bedpe2bed:
    input:
        "5_bed/{sample}.clean.bedpe"
    output:
        "6_bed/{sample}.clean.bed"
    shell:
        "cut -f 1,2,6 {input} | sort -k1,1 -k2,2n -k3,3n  > {output}"

rule bamsort:
    input:
        "3_bam/{sample}.bam"
    output:
        "3_bam/{sample}.sorted.bam"
    shell:
        "samtools sort -@ {config[threads]} -o {output} {input}"

rule bamindex:
    input:
        "3_bam/{sample}.sorted.bam"
    output:
        "3_bam/{sample}.sorted.bam.bai"
    shell:
        "samtools index {input}"

rule bam2bigwig:
    input:
        "3_bam/{sample}.sorted.bam",
        "3_bam/{sample}.sorted.bam.bai"
    output:
        "7_bigwig/{sample}.bw"
    shell:
        "bamCoverage -b {input[0]} -o {output}"

posted @ 2024-04-15 23:09 CASTWJ 阅读(448) 评论(0) 收藏举报

刷新页面返回顶部

cauwj

cuttag分析流程（部分存疑）

参考博客

数据质控

bowtie2建立索引

slurm提交

bowtie2比对

比对到猪基因组

SpikeIn处理

比对到大肠杆菌基因组

获取测序深度值

文件格式的转换，获得bed文件

评估重复性

加入测序深度对bed文件进行标准化处理

call peak

snakemake

公告