# Accession number for reference genome. ACC ?= NC_045512 # Accession number for the SRA run. SRR ?= SRR12053850 # Number of compute cores. CPU ?= 8 # Number of test reads used when running testdata. LIMIT ?=100000 # GenBank reference file. GBK=${ACC}.gbk # Fasta reference file. REF=refs/${ACC}.fa # GFF reference file. GFF=refs/${ACC}.gff # Temporary directory used only with fasterq-dump. # See also: https://www.moritzs.de/linux/thank_you_sra/ TMP=tmp # All names will have this common element. ROOT=${SRR}-${ACC} # Alignment file. BAM=bam/${ROOT}.bam # Variant file. VCF=vcf/${ROOT}.vcf.gz # The annotated VCF file. ANN=vcf/${ROOT}.annotated.vcf # The SNPeff database. SNPEFF = data/${ACC}/snpEffectPredictor.bin # Read names. R1=reads/${SRR}_1.fastq R2=reads/${SRR}_2.fastq # The BWA index. IDX=${REF}.amb # Phony targets instruct make to use these as rules only. .PHONY: run # Check that executables exist. EXE = bio bwa samtools bcftools snpEff K := $(foreach exec,$(EXE),\ $(if $(shell which $(exec)),some string,$(error "Program: $(exec) not found."))) # Print usage. all: @echo "USAGE:" @echo @echo " make test vcf" @echo " make clean" @echo " make vcf" @echo @echo "Additional parameters:" @echo @echo ACC=${ACC}, SRR=${SRR}, CPU=${CPU} LIMIT=${LIMIT} # Removes all files created by operations in this file. clean: rm -rf reads ${TMP} # Removes all files created by operations in this file. realclean: rm -rf refs reads ${TMP} vcf bam data tmp snpEff.config # Rule to make the reference genome. ${REF}: mkdir -p refs bio fetch ${ACC} -format fasta > ${REF} # Rule to make the GFF file. ${GFF}: bio fetch ${ACC} -format gff > ${GFF} # The index depends on the reference genome. ${IDX}: ${REF} bwa index ${REF} echo ${IDX} # Download a subset of the reads. ${R1} ${R2}: mkdir -p reads fasterq-dump -e ${CPU} -t ${TMP} -f -x -p -O reads --split-files ${SRR} # Download a subset of the reads. test: mkdir -p reads fastq-dump -F -X ${LIMIT} --split-files -O reads ${SRR} # BAM file depends on the BWA index and reads. ${BAM}: ${IDX} ${R1} ${R2} mkdir -p bam # Note how we filter alignment for mapped reads only. bwa mem -t ${CPU} ${REF} ${R1} ${R2} | samtools view -b -F 4 | samtools sort -@ ${CPU} > ${BAM} samtools index ${BAM} samtools flagstat ${BAM} # The VCF file depends on the BAM file. ${VCF}: ${BAM} mkdir -p vcf bcftools mpileup -O v -f ${REF} ${BAM} | bcftools call --ploidy 1 -mv -O z -o ${VCF} bcftools index ${VCF} # Building a custom snpEff database. ${SNPEFF}: # Snpeff needs the files in specific folders. mkdir -p data/${ACC} # Download the GenBank file, has to be called genes.gbk. bio fetch ${ACC} > data/${ACC}/genes.gbk # Set the snpEff config file. echo "${ACC}.genome : ${ACC}" > snpEff.config # Build the snpEff database. snpEff build -v ${ACC} # Runs the snpEff tool. ${ANN}: ${SNPEFF} ${VCF} snpEff -v ${ACC} ${VCF} > ${ANN} mv snpEff_*.* vcf/ # Download all reads for the SRR run. vcf: ${ANN}