#
# While this script does run as a single command a typical RNA-Seq analysis
# takes two steps.
#
# 1. Create the count matrix.
# 2. Run statistical methods on the count matrix.
#
set -uex

# The location of the reference genome.
GENOME_URL=http://data.biostarhandbook.com/books/rnaseq/data/golden.genome.tar.gz

# The location of the sequencing reads.
READS_URL=http://data.biostarhandbook.com/books/rnaseq/data/golden.reads.tar.gz

# Download and unpack both data
wget -nc $GENOME_URL
wget -nc $READS_URL

# Unpack the gzipped data
# Creates the reads and refs folder.
tar xzvf golden.genome.tar.gz
tar xzvf golden.reads.tar.gz

# Run sequence statistics on the sequencing reads.
seqkit stats reads/*.fq

#  Run sequence statistics on the genome and transcripts.
seqkit stats refs/*.fa

# Create the sample names, three BORED three EXCITED labels.
parallel echo {1}_{2} ::: BORED EXCITED ::: 1 2 3 > ids

# The name of indexed reference genome.
IDX=refs/genome.fa

# Index the reference genome. Needs to be done only once and can be reused.
hisat2-build $IDX $IDX

# Index the reference genome with samtools. Needs to be done only once and can be reused.
samtools faidx $IDX

# Create the folder that will store the alignment files.
mkdir -p bam

# Align the FASTQ files to the reference genome and create BAM files.
cat ids | parallel "hisat2 -x $IDX -1 reads/{}_R1.fq -2 reads/{}_R2.fq | samtools sort > bam/{}.bam"

# Index each BAM file.
cat ids | parallel  "samtools index bam/{}.bam"

# Run the featureCounts program to summarize the number of reads that overlap with features.
featureCounts -p -a refs/features.gff -o counts.txt bam/BORED_?.bam bam/EXCITED_?.bam

#
# Above we created the count matrix. That is step one of the RNA-Seq process.
#

#
# This section requires the presence of the statistics package.
#
# Typically we run it separately to form different experimental designs.

# Store the analysis scripts separately.
mkdir -p code

# Get the data analysis scripts.
wget -P code -qnc  http://data.biostarhandbook.com/books/rnaseq/code/deseq2.r
wget -P code -qnc  http://data.biostarhandbook.com/books/rnaseq/code/edger.r
wget -P code -qnc  http://data.biostarhandbook.com/books/rnaseq/code/heatmap.r

# Generate the results with method 2.
cat counts.txt | Rscript code/deseq2.r 3x3 > deseq2-results.csv
cat deseq2-results.csv | Rscript code/heatmap.r > deseq2-heatmap.pdf

# Generate the results with method 3.
cat counts.txt | Rscript code/edger.r 3x3 > edger-results.csv
cat  edger-results.csv | Rscript code/heatmap.r > edge-heatmap.pdf