#!/usr/bin/env bash # # This script downloads SRR numbers specified in an input file # and aligns and calls variants to reference. # # Prepare a file that contains SRR numbers. # # Run the code as: # # bash find-variants.sh ACCESSION_NUMBER SRRFILE.txt # # Stop this script on any error. set -uexo pipefail # How many reads per sample. # For a realistic scenario raise this # (or remove the parameter from where it is used) READNUM=100000 # Accesion number for the reference that we wish to align against is the first parameter. ACC=$1 # Reads the content of a file specfified by the second parameter into the SAMPLES array. SAMPLES=($(cut -f 1 $2)) echo "*** Samples: ${SAMPLES[@]}" # Make the genome directory mkdir -p refs # This will be the reference. REF=refs/${ACC}.fa # The runlog.txt file will keep track of all messages printed during the run. RUNLOG=runlog.txt # Helps keep track of when an analysis was run. echo "*** Logging to: $RUNLOG" echo "*** Run by `whoami` on `date`" > $RUNLOG # Obtain the reference file if it is not present. if [ ! -f $REF.fai ]; then echo "*** Obtain data for $ACC and save it as $REF". efetch -db nuccore -format fasta -id $ACC | seqret -filter -sid $ACC > $REF # Index reference with different tools needed later. echo "*** Index $REF with several tools." bwa index $REF 2>> $RUNLOG samtools faidx $REF else echo "*** Found reference in $REF" fi # Place the SRR files into a subdirectory mkdir -p reads # Loop over all for SRR in ${SAMPLES[@]} do echo "*** Downloading data for: $SRR" fastq-dump -O reads -X $READNUM --split-files $SRR 1>> $RUNLOG # These are the read names from fastq-dump. R1=reads/${SRR}_1.fastq R2=reads/${SRR}_2.fastq TAG="@RG\tID:$SRR\tSM:$SRR\tLB:$SRR" # Run the aligner and tag by SRR number. echo "*** Aligning reads: $SRR.bam" bwa mem -R $TAG $REF $R1 $R2 2>>$RUNLOG | samtools sort > $SRR.bam 2>> $RUNLOG samtools index $SRR.bam # Call variants via bcftools. echo "*** Calling variants: $SRR.vcf" bcftools mpileup -f $REF $SRR.bam 2>> $RUNLOG | bcftools call --ploidy 1 -vm -Ov > $SRR.vcf done # Concatentate all SRR bam file names into a single string. BAMS="" for SRR in ${SAMPLES[@]} do BAMS+="${SRR}.bam " done echo "*** All BAM files: $BAMS" echo "*** Calling variants from all runs: combined.vcf" bcftools mpileup -f $REF $BAMS 2>> $RUNLOG | bcftools call --ploidy 1 -vm -Ov > combined.vcf