# # While this script does run as a single command a typical RNA-Seq analysis # takes two steps. # # 1. Create the count matrix. # 2. Run statistical methods on the count matrix. # set -uex # The location of the reference genome. GENOME_URL=http://data.biostarhandbook.com/books/rnaseq/data/golden.genome.tar.gz # The location of the sequencing reads. READS_URL=http://data.biostarhandbook.com/books/rnaseq/data/golden.reads.tar.gz # Download and unpack both data wget -nc $GENOME_URL wget -nc $READS_URL # Unpack the gzipped data # Creates the reads and refs folder. tar xzvf golden.genome.tar.gz tar xzvf golden.reads.tar.gz # Run sequence statistics on the sequencing reads. seqkit stats reads/*.fq # Run sequence statistics on the genome and transcripts. seqkit stats refs/*.fa # Create the sample names, three BORED three EXCITED labels. parallel echo {1}_{2} ::: BORED EXCITED ::: 1 2 3 > ids # The name of indexed reference genome. IDX=refs/genome.fa # Index the reference genome. Needs to be done only once and can be reused. hisat2-build $IDX $IDX # Index the reference genome with samtools. Needs to be done only once and can be reused. samtools faidx $IDX # Create the folder that will store the alignment files. mkdir -p bam # Align the FASTQ files to the reference genome and create BAM files. cat ids | parallel "hisat2 -x $IDX -1 reads/{}_R1.fq -2 reads/{}_R2.fq | samtools sort > bam/{}.bam" # Index each BAM file. cat ids | parallel "samtools index bam/{}.bam" # Run the featureCounts program to summarize the number of reads that overlap with features. featureCounts -p -a refs/features.gff -o counts.txt bam/BORED_?.bam bam/EXCITED_?.bam # # Above we created the count matrix. That is step one of the RNA-Seq process. # # # This section requires the presence of the statistics package. # # Typically we run it separately to form different experimental designs. # Store the analysis scripts separately. mkdir -p code # Get the data analysis scripts. wget -P code -qnc http://data.biostarhandbook.com/books/rnaseq/code/deseq2.r wget -P code -qnc http://data.biostarhandbook.com/books/rnaseq/code/edger.r wget -P code -qnc http://data.biostarhandbook.com/books/rnaseq/code/heatmap.r # Generate the results with method 2. cat counts.txt | Rscript code/deseq2.r 3x3 > deseq2-results.csv cat deseq2-results.csv | Rscript code/heatmap.r > deseq2-heatmap.pdf # Generate the results with method 3. cat counts.txt | Rscript code/edger.r 3x3 > edger-results.csv cat edger-results.csv | Rscript code/heatmap.r > edge-heatmap.pdf