#!/usr/bin/env bash # # Pipeline to reproduce the data analysis for the paper: # # Genome-Wide Mapping of Binding Sites Reveals Multiple Biological Functions of the Transcription Factor Cst6p in Saccharomyces cerevisiae # # http://mbio.asm.org/content/7/3/e00559-16.long#F1 # # # This script requires Entrez-Direct, bwa, bamUtils and GEM installed. # # Bail on errors. set -ueo pipefail # All shortcuts to programs and data are defined here. GEM=~/src/gem/gem.jar DIST=~/src/gem/Read_Distribution_ChIP-exo.txt # Reference genome. REF=refs/saccer3.fa # Glucose samples. GLU1=bam/trimmed-SRR3033154.bam GLU2=bam/trimmed-SRR3033155.bam # Ethanol samples. ETH1=bam/trimmed-SRR3033156.bam ETH2=bam/trimmed-SRR3033157.bam # Remote urls. URL=http://hgdownload.cse.ucsc.edu/goldenPath/sacCer3/bigZips/chromFa.tar.gz # Directories to organize data in. mkdir -p refs mkdir -p data mkdir -p bam # Download the chromosomes and build a genome. # The sequences are by chromosome. curl $URL | tar zxv curl http://hgdownload.cse.ucsc.edu/goldenPath/sacCer3/bigZips/sacCer3.chrom.sizes > refs/sacCer3.chrom.sizes # Move the files mv *.fa refs # Create genome. cat refs/chr*.fa > $REF # Index the reference. bwa index $REF samtools faidx $REF # Download the sequencing data in FASTQ format. # Get run info from SRA. esearch -db sra -query PRJNA306490 | efetch -format runinfo > runinfo.csv # Isolate just the run ids for the run. cat runinfo.csv | cut -f 1 -d , | grep SRR > runids.txt # Download the fastq files. cat runids.txt | parallel --eta --verbose "fastq-dump -O data --split-files -F {}" # Run the alignments in paired end mode. #cat runids.txt | parallel --eta --verbose "bwa mem -t 8 $REF data/{}_1.fastq data/{}_2.fastq | samtools sort -@ 8 > bam/{}.bam" # The data is aligned in single end mode and only file 1 is used. cat runids.txt | parallel --eta --verbose "bwa mem -t 8 $REF data/{}_1.fastq | samtools sort -@ 8 > bam/{}.bam" # Index the results in BAM format. cat runids.txt | parallel --eta --verbose "samtools index bam/{}.bam" # Trim the BAM file to keep left/right borders. cat runids.txt | parallel --eta --verbose "bam trimBam bam/{}.bam bam/temp-{}.bam -R 70 --clip" # Sort trimmed BAM file. cat runids.txt | parallel --eta --verbose "samtools sort -@ 8 bam/temp-{}.bam > bam/trimmed-{}.bam" # Get rid of temporary BAM files. rm -f bam/temp* # Reindex trimmed bam files. cat runids.txt | parallel --eta --verbose "samtools index bam/trimmed-{}.bam" # Create the coverage files for all BAM files. ls bam/*.bam | parallel --eta --verbose "bedtools genomecov -ibam {} -g $REF.fai -bg | sort -k1,1 -k2,2n > {.}.bedgraph" # Generate all bigwig coverages from bedgraphs. ls bam/*.bedgraph | parallel --eta --verbose "bedGraphToBigWig {} $REF.fai {.}.bw" # Run GEM on glucose samples. java -Xmx3G -jar $GEM --d $DIST --g refs/sacCer3.chrom.sizes --genome refs --exptCond1 $GLU1 --exptCond1 $GLU2 --f SAM --out glucose --outBED --k 5 --smooth 3 --mrc 20 --k_win 30 # Run GEM on ethanol samples. java -Xmx3G -jar $GEM --d $DIST --g refs/sacCer3.chrom.sizes --genome refs --exptCond1 $ETH1 --exptCond1 $ETH2 --f SAM --out ethanol --outBED --k 5 --smooth 3 --mrc 20 --k_win 30