# Accession number for reference genome.
ACC ?= NC_045512

# Accession number for the SRA run.
SRR ?= SRR12053850

# Number of compute cores.
CPU ?= 8

# Number of test reads used when running testdata.
LIMIT ?=100000

# GenBank reference file.
GBK=${ACC}.gbk

# Fasta reference file.
REF=refs/${ACC}.fa

# GFF reference file.
GFF=refs/${ACC}.gff

# Temporary directory used only with fasterq-dump.
# See also: https://www.moritzs.de/linux/thank_you_sra/
TMP=tmp

# All names will have this common element.
ROOT=${SRR}-${ACC}

# Alignment file.
BAM=bam/${ROOT}.bam

# Variant file.
VCF=vcf/${ROOT}.vcf.gz

# The annotated VCF file.
ANN=vcf/${ROOT}.annotated.vcf

# The SNPeff database.
SNPEFF = data/${ACC}/snpEffectPredictor.bin

# Read names.
R1=reads/${SRR}_1.fastq
R2=reads/${SRR}_2.fastq

# The BWA index.
IDX=${REF}.amb

# Phony targets instruct make to use these as rules only.
.PHONY: run

# Check that executables exist.
EXE = bio bwa samtools bcftools snpEff
K := $(foreach exec,$(EXE),\
        $(if $(shell which $(exec)),some string,$(error "Program: $(exec) not found.")))

# Print usage.
all:
	@echo "USAGE:"
	@echo
	@echo "  make test vcf"
	@echo "  make clean"
	@echo "  make vcf"
	@echo
	@echo "Additional parameters:"
	@echo
	@echo    ACC=${ACC}, SRR=${SRR}, CPU=${CPU}  LIMIT=${LIMIT}

# Removes all files created by operations in this file.
clean:
	rm -rf reads ${TMP}

# Removes all files created by operations in this file.
realclean:
	rm -rf refs reads ${TMP} vcf bam data tmp snpEff.config

# Rule to make the reference genome.
${REF}:
	mkdir -p refs
	bio fetch ${ACC} -format fasta > ${REF}

# Rule to make the GFF file.
${GFF}:
	bio fetch ${ACC} -format gff > ${GFF}

# The index depends on the reference genome.
${IDX}: ${REF}
	bwa index ${REF}
	echo ${IDX}

# Download a subset of the reads.
${R1} ${R2}:
	mkdir -p reads
	fasterq-dump -e ${CPU} -t ${TMP} -f -x -p  -O reads --split-files ${SRR}

# Download a subset of the reads.
test:
	mkdir -p reads
	fastq-dump -F -X ${LIMIT} --split-files -O reads ${SRR}

# BAM file depends on the BWA index and reads.
${BAM}: ${IDX} ${R1} ${R2}
	mkdir -p bam
	# Note how we filter alignment for mapped reads only.
	bwa mem -t ${CPU} ${REF} ${R1} ${R2} | samtools view -b -F 4 | samtools sort -@ ${CPU} > ${BAM}
	samtools index ${BAM}
	samtools flagstat ${BAM}

# The VCF file depends on the BAM file.
${VCF}: ${BAM}
	mkdir -p vcf
	bcftools mpileup -O v -f ${REF} ${BAM}  | bcftools call --ploidy 1 -mv -O z -o ${VCF}
	bcftools index ${VCF}

# Building a custom snpEff database.
${SNPEFF}:
	# Snpeff needs the files in specific folders.
	mkdir -p data/${ACC}

	# Download the GenBank file, has to be called genes.gbk.
	bio fetch ${ACC} > data/${ACC}/genes.gbk

	# Set the snpEff config file.
	echo "${ACC}.genome : ${ACC}" > snpEff.config

	# Build the snpEff database.
	snpEff build -v ${ACC}

# Runs the snpEff tool.
${ANN}: ${SNPEFF} ${VCF}
	snpEff -v ${ACC} ${VCF} > ${ANN}
	mv snpEff_*.*  vcf/

# Download all reads for the SRR run.
vcf: ${ANN}