#!/usr/bin/env bash

#
# This recipe obtains published data on the Coronavirus
#
# Need more help? See: https://www.biostarhandbook.com
#

# Stringent error checking.
set -uex

# Ensure the BiostarHandbook code is installed
pip install pyyaml
pip install --upgrade BiostarHandbook

# Make the taxonkit data directory
mkdir -p ~/.taxonkit

# Download the taxonomy data.
(cd ~/.taxonkit && wget -nc ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz)

# Unpack taxonomy data.
(cd ~/.taxonkit && tar zxvf taxdump.tar.gz)

# Store the blast database here.
mkdir -p db

# Less stringent error checking: update_blastdb.pl is not well behaved
set +e

# Download the latest blast database.
(cd db && update_blastdb.pl --decompress Betacoronavirus)

# Download the taxonomy BLAST database.
(cd db && update_blastdb.pl --decompress taxdb)

# Turn the strict error checking back on.
set -e

# Create a report of what is in the Blast database file.
# Accession number, taxonomy id, sequence title.
blastdbcmd -db db/Betacoronavirus -entry all -outfmt "%a %T %t" > blastinfo.txt

# Strict error checking
set -uex

# Get the YAML specification of the data.
curl -s https://www.ncbi.nlm.nih.gov/core/assets/genbank/files/ncov-sequences.yaml > metadata.yaml

# Create a tabular GenBank file.
cat metadata.yaml | bio metadata -a > metadata.txt

# Select the accession numbers (with no version number) for the complete genomes
cat blastinfo.txt | grep 2697049 | grep "complete genome" | cut -f 1 -d ' ' | cut -f 1 -d '.'  > nCov.ids

# Create a reference directory.
mkdir -p refs

# Store all SARS-COV2 genomes in the reference directory.
cat nCov.ids | parallel -j 1  blastdbcmd -db db/Betacoronavirus -entry {} > refs/nCov-genomes.fa

# RefSeq sequence genome for SARS-Cov2
ACC=NC_045512

# Get the representative sequences for nCov, SARS and batSARS
parallel -j 1 "efetch -db nuccore -format gb -id {} > refs/{}.gb" ::: NC_045512 KT444582 MG772933

# Format all GenBank files as FASTA
find refs -name '*.gb' | parallel "cat {} | seqret -filter -osformat fasta > {.}.fa"

# Format all GenBank files as GFF keeping only coding sequences.
find refs  -name '*.gb' | parallel "cat {} | seqret -filter -feature -osformat gff3 | grep '[[:space:]]CDS' > {.}.gff"

# Create the taxomy directory
mkdir -p taxa

# Create the lineages.
cat blastinfo.txt | cut -f 2 -d ' ' | taxonkit lineage | sort -u | sort -k 2,2 > taxa/lineages.txt

# Select the accession numbers for bat SARS viruses
bio filter -i blastinfo.txt -t taxa/lineages.txt -k "bat sars" | grep "complete genome" > taxa/batSARS.txt

# Extract the sequences for bat SARS viruses.
cat taxa/batSARS.txt | cut -f 1 -d ' ' |  parallel -j 1 blastdbcmd -db db/Betacoronavirus -entry {} > refs/batSARS-genomes.fa

# Select the accession numbers for human SARS virus genomes
bio filter -i blastinfo.txt -t taxa/lineages.txt -k "SARS" -r "bat" | grep "complete genome" > taxa/SARS.txt

# Extract accession numbers human SARS viruses.
cat taxa/SARS.txt | cut -f 1 -d ' ' |  parallel -j 1 blastdbcmd -db db/Betacoronavirus -entry {} > refs/SARS-genomes.fa

# Build blast databases for each nCov, batSARS and SARS viruses.
makeblastdb -in refs/nCov-genomes.fa -dbtype nucl -out db/nCov -parse_seqids
makeblastdb -in refs/SARS-genomes.fa -dbtype nucl -out db/SARS -parse_seqids
makeblastdb -in refs/batSARS-genomes.fa -dbtype nucl -out db/batSARS -parse_seqids

# Generate a samtools faidx for each fasta file.
find refs -name '*.fa' | parallel "samtools faidx {}"