#!/usr/bin/env bash # # This script simulates a sequencing run from an accession number. # # You cam mutate the genome genome by hand - or as it is in the script. # Then run this tool generate alignments and visualize these alignments in IGV. # set -ue # Accession number for the reference. ACC="AF086833" # How many reads to generate. N=10000 # How long should each read be. L=100 # What is the DNA fragment size. D=500 # Error rates. E=0.02 # How to tag the alignments. TAG="@RG\tID:variants\tSM:${ACC}\tLB:simulation" # Make the genome directory mkdir -p refs # This will be the reference. REF=refs/REFERENCE.fa # This is will be real genome under study GENOME=GENOME.fa # Results will go here. BAM=align.bam # This file will keep track of messages printed during the run. echo "*** Run by `whoami` on `date`" > runlog.txt # Obtain the reference file if it does not already exists. if [ ! -f $REF ]; then echo "*** Obtain data for $ACC and save it as $REF". efetch -db=nuccore -format=fasta -id=$ACC | seqret -filter -sid $ACC > $REF # Index reference with different tools needed later. echo "*** Index $REF with several tools." bwa index $REF 2>> runlog.txt samtools faidx $REF echo "*** Mutating $REF to $GENOME" # # Mutate the $GENOME file by hand or by biosed like so # # The base at position 2000 is mutated from an A to C # cat $REF | biosed -filter -target AGGGTGGACAACAGAAGAACA -replace AGGGTGGACAACAGAAGAACT > $GENOME else echo "*** Found reference in $REF" fi # # Generate a simulated genome. echo "*** Generating simulated reads from: $GENOME" dwgsim -c 0 -N $N -e $E -d $D -1 $L -2 $L -r 0 -R 0 $GENOME data > /dev/null 2> /dev/null # Read pair names that are the result of the simulation R1=data.bwa.read1.fastq R2=data.bwa.read2.fastq # # This is how dwgsim will name its data. # R1=data.bwa.read1.fastq R2=data.bwa.read2.fastq # # Align the reads and generate athe BAM file. # echo "*** Aligning the reads into: $BAM" bwa mem -R $TAG $REF $R1 $R2 2>> runlog.txt | samtools sort > $BAM samtools index $BAM