# Stop on any error. set -uex # Accession number for the reference genome ACC=AF086833 # SRA run number. SRR=SRR1972739 # How many reads to extract from the dataset. N=10000 # The reference genome in three different formats: GenBank, FASTA and GFF GB=refs/$ACC.gb FA=refs/$ACC.fa GFF=refs/$ACC.gff # Make the reference directory. mkdir -p refs # Get a genbank file. efetch -db nucleotide -format gb -id $ACC > $GB # Convert the GenBank file into GFF3. cat $GB | seqret -filter -feature -osformat gff3 > $GFF # Convert the GenBank file into FASTA. cat $GB | seqret -filter -feature -osformat fasta > $FA # Create an index of the FASTA file samtools faidx $FA # Obtain the dataset. fastq-dump -X $N --split-files $SRR # Index reference with bwa bwa index $FA # Index the reference with samtools samtools faidx $FA # Shortcuts to read names R1=${SRR}_1.fastq R2=${SRR}_2.fastq # Align with bwa mem. bwa mem $FA $R1 $R2 | samtools sort > $SRR.bwa.bam # Index the BAM file generated with bwa. samtools index $SRR.bwa.bam # Index reference with bowtie2. bowtie2-build $FA $FA # Align the same data with bowtie2. bowtie2 -x $FA -1 $R1 -2 $R2 | samtools sort > $SRR.bowtie.bam # Index the BAM file produced with bowtie2. samtools index $SRR.bowtie.bam