#
# An example script to prepare reference genome data.
#

# Shell options for the whole script:
#   nounset - expanding an unset variable is an error
#   errexit - abort as soon as a command fails
#   xtrace  - echo each command before it runs
set -o nounset -o errexit -o xtrace

# The directory that stores the global references.
# It is meant to be shared by multiple projects.
DIR=~/refs

# Create DIR if it does not exist yet (-p makes this a no-op when it does).
# The expansion is quoted so a home directory containing spaces or glob
# characters is passed to mkdir as a single path.
mkdir -p "$DIR"

# The URL for the prebuilt HISAT2 indices.
URL1=ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz

# Download the prebuilt HISAT2 indices into DIR.
# -nc (no-clobber) skips the download when the archive is already present.
(cd "$DIR" && wget -nc "$URL1")

# Unpack the archive inside DIR.
# NOTE: tar extracts on every run and overwrites previously extracted files.
# The archive name is the last path component of the URL, as explained in:
# https://unix.stackexchange.com/questions/325490/how-to-get-last-part-of-http-link-in-bash
(cd "$DIR" && tar xzvf "${URL1##*/}")

# The Ensembl annotation file (GTF format).
URL2=ftp://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.chr.gtf.gz

# Download the compressed GTF file into DIR.
# -nc (no-clobber) skips the download when the file is already present.
(cd "$DIR" && wget -nc "$URL2")

# Unpack the GTF file. The file name is the last path component of the URL.
# -k keeps the .gz so that wget -nc can skip the download on later runs;
# -f overwrites the output of a previous decompression.
(cd "$DIR" && gunzip -k -f "${URL2##*/}")

# The Ensembl cDNA (transcript) FASTA file.
URL3=ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz

# Download the compressed cDNA file into DIR.
# -nc (no-clobber) skips the download when the file is already present.
(cd "$DIR" && wget -nc "$URL3")

# Unpack the cDNA file. The file name is the last path component of the URL.
# -k keeps the .gz so that wget -nc can skip the download on later runs;
# -f overwrites the output of a previous decompression.
(cd "$DIR" && gunzip -k -f "${URL3##*/}")