#!/usr/bin/env bash # # This script downloads SRR numbers specified in an input file # and aligns and calls varianst to reference. # # Prepare a file that contains SRR numbers. # # Run the code as: # # bash find-ebola-varianst.sh srrfilename.txt # set -ue # Accession number for the reference that we wish to align against. ACC="AF086833" # Reads the content of a file spefified by the first parameter # into the SAMPLES array. SRR_IDS=($(awk -F= '{print $1}' $1)) echo "*** Samples: ${SRR_IDS[@]}" # Make the genome directory mkdir -p refs # This will be the reference. REF=refs/${ACC}.fa # How many reads per sample. # For a realistic scenario raise this # (or remove the parameter from where it is used) READNUM=10000 # The runlog.txt file will keep track of all messages printed during the run. RUNLOG=runlog.txt # Helps keep track of when an analysis was run. echo "*** Logging to: $RUNLOG" echo "*** Run by `whoami` on `date`" > $RUNLOG # Obtain the reference file if it is not present. if [ ! -f $REF.fai ]; then echo "*** Obtain data for $ACC and save it as $REF". efetch -db=nuccore -format=fasta -id=$ACC | seqret -filter -sid $ACC > $REF # Index reference with different tools needed later. echo "*** Index $REF with several tools." bwa index $REF 2>> $RUNLOG samtools faidx $REF else echo "*** Found reference in $REF" fi # Place the SRR files into a subdirectory mkdir -p sra for SRR in ${SRR_IDS[@]} do echo "*** Downloading data for: $SRR" fastq-dump -O sra -X $READNUM --split-files $SRR 1>> $RUNLOG # These are the read names from fastq-dump. R1=sra/${SRR}_1.fastq R2=sra/${SRR}_2.fastq TAG="@RG\tID:$SRR\tSM:$SRR\tLB:$SRR" # Run the aligner and tag by SRR number. echo "*** Aligning reads: $SRR.bam" bwa mem -R $TAG $REF $R1 $R2 2>>$RUNLOG | samtools sort > $SRR.bam 2>> $RUNLOG samtools index $SRR.bam # Call variants via bcftools. echo "*** Calling variants: $SRR.vcf" samtools mpileup -uvf $REF $SRR.bam 2>> $RUNLOG | bcftools call --ploidy 1 -vm -Ov > $SRR.vcf done # Concatentate all SRR bam files into a single string. BAMS="" for SRR in ${SRR_IDS[@]} do BAMS+="${SRR}.bam " done echo "*** All BAM files: $BAMS" echo "*** Calling variants from all runs: samples.vcf" samtools mpileup -uvf $REF $BAMS 2>> $RUNLOG | bcftools call --ploidy 1 -vm -Ov > samples.vcf