# Bail on any error.
set -uexo pipefail

# The accession number for the genome.
ACC=AF086833

# The ebola sequencing run to process.
SRR=SRR1972739

# The reference genome in two formats.
REF=db/$ACC.fa
GBK=db/$ACC.gb
GFF=db/$ACC.gff

# The alignment.
BAM=$ACC.bam

# The resulting VCF file.
VCF=$ACC.vcf

# Make the directory for the reference.
mkdir -p db

# Fetch the sequence from NCBI.
efetch -db nuccore -format gb -id $ACC > $GBK

# Convert genbank to fasta.
cat $GBK | readseq -p -format=FASTA > $REF

# Convert genbank to GFF.
cat $GBK | readseq -p -format=GFF> $GFF

# Generate the bwa index.
bwa index $REF

# Obtain the sequencing data.
fastq-dump -X 2000 --split-files $SRR

# Align to reference.
bwa mem $REF SRR1972739_1.fastq SRR1972739_2.fastq | samtools sort > $BAM

# Index the BAM file.
samtools index $BAM

# Call variants from the alignment.
freebayes -f $REF $BAM > $VCF