# Bail on any error. set -uexo pipefail # The accession number for the genome. ACC=AF086833 # The ebola sequencing run to process. SRR=SRR1972739 # The reference genome in two formats. REF=db/$ACC.fa GBK=db/$ACC.gb GFF=db/$ACC.gff # The alignment. BAM=$ACC.bam # The resulting VCF file. VCF=$ACC.vcf # Make the directory for the reference. mkdir -p db # Fetch the sequence from NCBI. efetch -db nuccore -format gb -id $ACC > $GBK # Convert genbank to fasta. cat $GBK | readseq -p -format=FASTA > $REF # Convert genbank to GFF. cat $GBK | readseq -p -format=GFF> $GFF # Generate the bwa index. bwa index $REF # Obtain the sequencing data. fastq-dump -X 2000 --split-files $SRR # Align to reference. bwa mem $REF SRR1972739_1.fastq SRR1972739_2.fastq | samtools sort > $BAM # Index the BAM file. samtools index $BAM # Call variants from the alignment. freebayes -f $REF $BAM > $VCF