# # This script generates simulated data from # a genome file that the user mutates. # # It then runs three different variant callers # to allow the user the compare their output. # # As written the script will run samtools, freebayes and GATK # and requires samtools, bwa and picard to be installed. # See the book on how to install them all. # # Users are expected to modify the line # that mutates the genome at known locations and observe the # effects that this has on the variants that are called. # set -uex # Create a directory to store # the reference and all indices. mkdir -p refs # The accession number. ACC=AF086833 # This will be the reference file. # obtained from the accession number. REF=refs/REFERENCE.fa # This is the genome file that gets mutated from the reference. GENOME=GENOME.fa # Modify the genome from the command line like so # # # cat $REF | biosed -filter -target AGGGTGGACAACAGAAGAACA -replace AGGGTGGACAACAGAAGAA > $GENOME # # # This is how I found a region of known location that I could mutate # cat $REF | seqret -filter -sbegin 1980 -send 2000 # The read alignment file. BAM=align.bam # Genome wide alignment file. GLOBAL=global.bam # How to tag the alignments. TAG="@RG\tID:variants\tSM:${ACC}\tLB:simulation" # Error rate for the simulation. ERR=0.02 # This file will keep track of messages printed during the run. echo "*** Run by `whoami` on `date`" > runlog.txt # Obtain the reference file if it does not already exist. if [ ! -f $REF.fai ]; then echo "*** Obtain data for $ACC and save it as $REF". efetch -db nuccore -format fasta -id $ACC | seqret -filter -sid $ACC > $REF # Create the GENOME so that it exists. cat $REF > $GENOME # Index reference with different tools needed later. echo "*** Index $REF with several tools." bwa index $REF 2>> runlog.txt samtools faidx $REF 2>> runlog.txt picard CreateSequenceDictionary REFERENCE=$REF OUTPUT=refs/REFERENCE.dict 2> /dev/null else echo "*** Found reference genome: $REF" fi # # Simulate data with dwgsim. NO OTHER mutations are allowed. # We have allow errors otherwise variant callers may not work properly. # echo "*** Generating simulated reads from: $GENOME" dwgsim -e $ERR -E $ERR -r 0 $GENOME data 2>> runlog.txt # # This is how dwgsim will name its data. # R1=data.bwa.read1.fastq.gz R2=data.bwa.read2.fastq.gz # # Align the reads and generate athe BAM file. # echo "*** Aligning the reads into: $BAM" bwa mem -R $TAG $REF $R1 $R2 2>> runlog.txt | samtools sort > $BAM # # Also create a semiglobal alignment for the mutated genome. # echo "*** Genome alignment into: global.bam" bwa mem $REF $GENOME 2>> runlog.txt | samtools sort > $GLOBAL # # Index the BAM files. # samtools index $BAM samtools index $GLOBAL # # Remove previous vcf in case a failure masks that. # rm -f samtools.vcf freebayes.vcf gatk.vcf # # Call variants with samtools. # echo "*** Calling variants with samtools." bcftools mpileup -f $REF $BAM 2>> runlog.txt | bcftools call -vm -Ov > samtools.vcf 2>> runlog.txt # # Call variants with freebayes. # echo "*** Calling variants with freebayes." freebayes -f $REF $BAM > freebayes.vcf 2>> runlog.txt # Call variants with GATK haplotype caller. echo "*** Calling variants with GATK." gatk -T HaplotypeCaller -R $REF -I $BAM -o gatk.vcf 2>> runlog.txt 1>> runlog.txt # Print a short summary at the end. echo "*** Run completed successfully." echo "*** VCF files: samtools.vcf freebayes.vcf gatk.vcf" # cat samtools.vcf freebayes.vcf gatk.vcf | grep AF08 | grep -v '#' # ATGAGGAAGAA # TATGA---AGA # ATGAGGAAGAA # AT---GAAGAA