#
# This project operates on realistic data
# and will download the full set of human genome indices.
#

# Shell options: abort on the first failing command (errexit), treat
# unset variables as errors (nounset) and echo each command as it is
# executed (xtrace).
set -o errexit -o nounset -o xtrace

#
# Configuration.
#

# Name of the project.
PROJECT="PRJNA313294"

# File holding the full run information.
RUNINFO="runinfo.all.csv"

# File holding the run info for single end reads.
SINGLE="runinfo.single.csv"

# File holding the run info for paired end reads.
PAIRED="runinfo.paired.csv"

# Limit on the number of reads; adjust if necessary.
READNUM="1000000"

#
# Working directories.
#

# The unpacked reads will be stored here.
mkdir -p reads

# Directory for indices and reference sequences. Entire databases are
# used, so they are kept in one place where data that already exists
# can be reused rather than downloaded again.
mkdir -p ~/refs

# Download the run information for the project.
esearch -db sra -query "$PROJECT" | efetch -format runinfo > "$RUNINFO"

# The exit status of a pipeline is that of its last command, so a failing
# esearch can go unnoticed. Treat an empty result file as an error instead
# of silently continuing with empty run lists.
if [ ! -s "$RUNINFO" ]; then
    echo "error: no run information retrieved for $PROJECT" >&2
    exit 1
fi

# Separate the single end files: keep the first comma separated field of
# every line matching the pattern. The pattern is quoted so that the shell
# hands it to grep verbatim instead of attempting pathname expansion on it.
# A list with no matches simply yields an empty file, because the status
# of the pipeline is that of cut.
grep 'SRR.*SINGLE' "$RUNINFO" | cut -f 1 -d , > "$SINGLE"

# Separate the paired end files in the same manner.
grep 'SRR.*PAIRED' "$RUNINFO" | cut -f 1 -d , > "$PAIRED"

# Single end reads: the list is fed to parallel on standard input, which
# runs fastq-dump with the read limit and the reads output directory.
# Standard output is appended to log.txt.
parallel fastq-dump -X "$READNUM" -O reads < "$SINGLE" >> log.txt

# Paired end reads: same as above, with --split-files passed to fastq-dump.
parallel fastq-dump -X "$READNUM" --split-files -O reads < "$PAIRED" >> log.txt