#!/usr/bin/env bash

#
# This script assumes that you have already downloaded the zika data
# and that the data and indices are ready.
#

# Stop on any error, on any use of an unset variable, and when any stage of
# a pipeline fails.
set -ueo pipefail

# These will store the various SRR ids.
RUNINFO=info/zika.csv
SINGLE=info/singles.csv
PAIRED=info/paired.csv

# Store bam files here.
mkdir -p bam

# How many CPU cores on the system.
CPUS=4

# The hisat index name.
IDX=refs/grch38/genome

# Keep track of what the aligners printed out.
RUNLOG=runlog.txt

# Start a fresh log for this run. Every later write to the log must append,
# otherwise this header and the output of earlier samples would be lost.
echo "Run started by $(whoami) on $(date)" > "$RUNLOG"

#######################################
# Align every sample named in a list file with hisat2, then sort and index
# the result with samtools.
# Globals:   CPUS, IDX, RUNLOG (read)
# Arguments: $1 - read layout, either "paired" or "single"
#            $2 - file holding one sample id per line
# Outputs:   progress messages on stdout; stderr of hisat2 and samtools sort
#            is appended to $RUNLOG; writes bam/<id>.bam plus its index
# Returns:   0 on success or when the list file is absent, 1 on a bad layout
#######################################
align_samples() {
    local layout=$1
    local list=$2
    local sample bam
    local -a reads

    case "$layout" in
        paired | single) ;;
        *)
            echo "*** Unknown read layout: $layout" >&2
            return 1
            ;;
    esac

    # An absent list means there is nothing to align for this layout; say so
    # instead of skipping silently, and carry on with the rest of the run.
    if [[ ! -r "$list" ]]; then
        echo "*** Sample list not readable, skipping: $list" >&2
        return 0
    fi

    # The list is read on descriptor 3 so the commands in the loop body cannot
    # consume it through stdin. The extra test keeps a final line that lacks a
    # trailing newline.
    while read -r sample <&3 || [[ -n "$sample" ]]; do
        # Tolerate CRLF line endings and blank lines.
        sample=${sample%$'\r'}
        [[ -n "$sample" ]] || continue

        if [[ "$layout" == paired ]]; then
            reads=(-1 "reads/${sample}_1.fastq" -2 "reads/${sample}_2.fastq")
        else
            reads=(-U "reads/${sample}.fastq")
        fi
        bam=bam/${sample}.bam

        echo "*** Running Hisat2 on $layout end sample: $sample"
        # Both stages append (2>>) to the shared log; a plain 2> here would
        # truncate the log on every sample.
        hisat2 -p "$CPUS" -x "$IDX" "${reads[@]}" 2>> "$RUNLOG" \
            | samtools sort > "$bam" 2>> "$RUNLOG"
        samtools index "$bam"
    done 3< "$list"
}

# Align the paired end samples.
align_samples paired "$PAIRED"

# Align the single end samples.
align_samples single "$SINGLE"