# Stop on any error.
set -uex

# Accession number for the reference genome
ACC=AF086833

# SRA run number.
SRR=SRR1972739

# How many reads to extract from the dataset.
N=10000

# The reference genome in three different formats: GenBank, FASTA and GFF
GB=refs/$ACC.gb
FA=refs/$ACC.fa
GFF=refs/$ACC.gff

# Make the reference directory.
mkdir -p refs

# Get a genbank file.
efetch -db nucleotide -format gb -id $ACC > $GB

# Convert the GenBank file into GFF3.
cat $GB | seqret -filter -feature -osformat gff3 > $GFF

# Convert the GenBank file into FASTA.
cat $GB | seqret -filter -feature -osformat fasta > $FA

# Create an index of the FASTA file
samtools faidx $FA

# Obtain the dataset.
fastq-dump -X $N --split-files $SRR

# Index reference with bwa
bwa index $FA

# Index the reference with samtools
samtools faidx $FA

# Shortcuts to read names
R1=${SRR}_1.fastq
R2=${SRR}_2.fastq

# Align with bwa mem.
bwa mem $FA $R1 $R2 | samtools sort > $SRR.bwa.bam

# Index the BAM file generated with bwa.
samtools index $SRR.bwa.bam

# Index reference with bowtie2.
bowtie2-build $FA $FA

# Align the same data with bowtie2.
bowtie2 -x $FA -1 $R1 -2 $R2 | samtools sort > $SRR.bowtie.bam

# Index the BAM file produced with bowtie2.
samtools index $SRR.bowtie.bam