#!/usr/bin/env bash
#
# This project operates on realistic data
# and will download the entire human genome indices.
#
# What this section does:
#   1. Fetches the SRA run information for PROJECT into RUNINFO.
#   2. Splits the run accessions into single-end and paired-end lists.
#   3. Downloads and unpacks up to READNUM reads per run into ./reads,
#      appending the tools' standard output to log.txt.
#
# External tools used: esearch, efetch, parallel, fastq-dump.
#

# Stop on any error, on use of an unset variable, and when any stage of a
# pipeline fails (so a failing esearch is not masked by efetch).
# The -x flag echoes each command as it runs.
set -euxo pipefail

# This is where the unpacked reads will be stored.
mkdir -p reads

# Make a directory for indices and reference sequences.
# Since we will be using entire full databases we make sure
# to only download them once and reuse the data if already exists.
mkdir -p ~/refs

# Set the project name.
PROJECT=PRJNA313294

# Full run information.
RUNINFO=runinfo.all.csv

# Run info for single end reads.
SINGLE=runinfo.single.csv

# Run information for paired end reads.
PAIRED=runinfo.paired.csv

# Limit the read numbers if necessary.
READNUM=1000000

# Downloading the run information.
esearch -db sra -query "$PROJECT" | efetch -format runinfo > "$RUNINFO"

# Separate the single end files: keep the first comma-separated field of
# every line matching SRR.*SINGLE. awk is used instead of grep | cut so that
# a project with no single-end runs produces an empty list rather than a
# non-zero exit status that would abort the script.
awk -F, '/SRR.*SINGLE/ { print $1 }' "$RUNINFO" > "$SINGLE"

# Separate the paired end files (same approach, matching SRR.*PAIRED).
awk -F, '/SRR.*PAIRED/ { print $1 }' "$RUNINFO" > "$PAIRED"

# Download and unpack single end reads.
# parallel reads one accession per line from stdin and appends it to the
# fastq-dump command line.
parallel fastq-dump -X "$READNUM" -O reads < "$SINGLE" >> log.txt

# Download and unpack paired end reads; --split-files is passed so that
# fastq-dump writes the reads of each run into separate files.
parallel fastq-dump -X "$READNUM" --split-files -O reads < "$PAIRED" >> log.txt