#!/usr/bin/env bash
#
# Fetches the sequencing runs of SRA project PRJNA313294 and the reference
# files needed to analyze them.
#
# Steps:
#   1. Query Entrez for the project's run information (info/zika.csv).
#   2. Split the run accessions into single end and paired end lists.
#   3. Unpack at most READNUM reads per run into reads/.
#   4. Download the prebuilt HISAT index into refs/grch38 unless present.
#   5. Download the GTF annotation to refs/GRCh38.gtf unless present.
#
# Tools used: esearch, efetch, fastq-dump, awk, xargs, curl, tar, gunzip.
# wonderdump is optional and only used when it is installed.

# Stop on any error, on unset variables and on failures inside pipelines.
set -ueo pipefail

# reads: unpacked reads; refs: indices and reference sequences; info: runinfo.
mkdir -p reads refs info

# Set the project name.
PROJECT=PRJNA313294

# These will store the various SRR ids.
RUNINFO=info/zika.csv
SINGLE=info/single.csv
PAIRED=info/paired.csv

# Limit the read numbers to this.
READNUM=1000000

#######################################
# Prints the first comma separated column of every runinfo line that
# contains "SRR" followed later by the given layout keyword.
# A single awk is used instead of grep | cut: awk exits with status 0 when
# nothing matches, so a project without runs of one layout produces an empty
# list instead of aborting the script through `set -e -o pipefail`.
# Globals:   RUNINFO (read)
# Arguments: $1 - layout keyword to look for (SINGLE or PAIRED)
# Outputs:   matching run accessions on stdout, one per line
#######################################
list_runs() {
  local layout=$1
  awk -F, -v pattern="SRR.*${layout}" '$0 ~ pattern { print $1 }' "$RUNINFO"
}

#######################################
# Runs a command once per line of a run list, appending that line as the
# last argument of the command.
# Arguments: $1 - file with one run accession per line
#            $2... - command and its leading arguments
# Returns:   0 for an empty list, otherwise the exit status of xargs
#######################################
for_each_run() {
  local list=$1
  shift
  # GNU xargs runs the command once even when its input is empty, which
  # would start the tool without any accession; skip empty lists instead.
  [[ -s "$list" ]] || return 0
  xargs -n 1 "$@" < "$list"
}

echo "*** Searching ENTREZ for project $PROJECT."
esearch -db sra -query "$PROJECT" | efetch -format runinfo > "$RUNINFO"

echo "*** Separating single end and paired end runs."
list_runs SINGLE > "$SINGLE"
list_runs PAIRED > "$PAIRED"

# This will run only if you have installed wonderdump,
# which is necessary only for Bash on Windows.
# wonderdump is invoked directly rather than by piping generated command
# lines into a second shell, so a failed run now stops the script.
if command -v wonderdump > /dev/null 2>&1; then
  for_each_run "$SINGLE" wonderdump
  for_each_run "$PAIRED" wonderdump
fi

echo "*** Unpacking single end reads."
for_each_run "$SINGLE" fastq-dump -X "$READNUM" -O reads

echo "*** Unpacking paired end reads."
for_each_run "$PAIRED" fastq-dump -X "$READNUM" --split-files -O reads

# This will be a file that comes with a prebuilt index.
IDX_FILE=refs/grch38/genome.1.ht2

# Check to see if we need to download the prebuilt index.
if [[ ! -f "$IDX_FILE" ]]; then
  # The URL to the hisat index.
  IDX_URL=ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz
  echo "*** Downloading the INDEX from: $IDX_URL"
  curl "$IDX_URL" | tar zxv
  # Move the index directory to refs only after the archive was unpacked
  # completely, so an interrupted download is not mistaken for a full index.
  mv grch38 ./refs
else
  echo "*** Found HISAT index: $IDX_FILE"
fi

# This is the local GTF file.
GTF=refs/GRCh38.gtf

# Check to see whether we need to download the Gene Transfer file.
if [[ ! -f "$GTF" ]]; then
  # The URL to the GTF file.
  GTF_URL=ftp://ftp.ensembl.org/pub/release-86/gtf/homo_sapiens/Homo_sapiens.GRCh38.86.gtf.gz
  echo "*** Downloading the GTF from: $GTF_URL"
  # Unpack into a temporary name first: the output redirection creates its
  # target even when the download fails, and an empty or truncated file under
  # the final name would be reported as "Found GTF file" on the next run.
  curl "$GTF_URL" | gunzip -c > "$GTF.part"
  mv -- "$GTF.part" "$GTF"
else
  echo "*** Found GTF file: $GTF"
fi