#!/usr/bin/env bash # # This recipe obtains published data on the Coronavirus # # Need more help? See: https://www.biostarhandbook.com # # Stringent error checking. set -uex # Ensure the BiostarHandbook code is installed pip install pyyaml pip install --upgrade BiostarHandbook # Make the taxonkit data directory mkdir -p ~/.taxonkit # Download the taxonomy data. (cd ~/.taxonkit && wget -nc ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz) # Unpack taxonomy data. (cd ~/.taxonkit && tar zxvf taxdump.tar.gz) # Store the blast database here. mkdir -p db # Less stringent error checking: update_blastdb.pl is not well behaved set +e # Download the latest blast database. (cd db && update_blastdb.pl --decompress Betacoronavirus) # Download the taxonomy BLAST database. (cd db && update_blastdb.pl --decompress taxdb) # Turn the strict error checking back on. set -e # Create a report of what is in the Blast database file. # Accession number, taxonomy id, sequence title. blastdbcmd -db db/Betacoronavirus -entry all -outfmt "%a %T %t" > blastinfo.txt # Strict error checking set -uex # Get the YAML specification of the data. curl -s https://www.ncbi.nlm.nih.gov/core/assets/genbank/files/ncov-sequences.yaml > metadata.yaml # Create a tabular GenBank file. cat metadata.yaml | bio metadata -a > metadata.txt # Select the accession numbers (with no version number) for the complete genomes cat blastinfo.txt | grep 2697049 | grep "complete genome" | cut -f 1 -d ' ' | cut -f 1 -d '.' > nCov.ids # Create a reference directory. mkdir -p refs # Store all SARS-COV2 genomes in the reference directory. cat nCov.ids | parallel -j 1 blastdbcmd -db db/Betacoronavirus -entry {} > refs/nCov-genomes.fa # RefSeq sequence genome for SARS-Cov2 ACC=NC_045512 # Get the representative sequences for nCov, SARS and batSARS parallel -j 1 "efetch -db nuccore -format gb -id {} > refs/{}.gb" ::: NC_045512 KT444582 MG772933 # Format all GenBank files as FASTA find refs -name '*.gb' | parallel "cat {} | seqret -filter -osformat fasta > {.}.fa" # Format all GenBank files as GFF keeping only coding sequences. find refs -name '*.gb' | parallel "cat {} | seqret -filter -feature -osformat gff3 | grep '[[:space:]]CDS' > {.}.gff" # Create the taxomy directory mkdir -p taxa # Create the lineages. cat blastinfo.txt | cut -f 2 -d ' ' | taxonkit lineage | sort -u | sort -k 2,2 > taxa/lineages.txt # Select the accession numbers for bat SARS viruses bio filter -i blastinfo.txt -t taxa/lineages.txt -k "bat sars" | grep "complete genome" > taxa/batSARS.txt # Extract the sequences for bat SARS viruses. cat taxa/batSARS.txt | cut -f 1 -d ' ' | parallel -j 1 blastdbcmd -db db/Betacoronavirus -entry {} > refs/batSARS-genomes.fa # Select the accession numbers for human SARS virus genomes bio filter -i blastinfo.txt -t taxa/lineages.txt -k "SARS" -r "bat" | grep "complete genome" > taxa/SARS.txt # Extract accession numbers human SARS viruses. cat taxa/SARS.txt | cut -f 1 -d ' ' | parallel -j 1 blastdbcmd -db db/Betacoronavirus -entry {} > refs/SARS-genomes.fa # Build blast databases for each nCov, batSARS and SARS viruses. makeblastdb -in refs/nCov-genomes.fa -dbtype nucl -out db/nCov -parse_seqids makeblastdb -in refs/SARS-genomes.fa -dbtype nucl -out db/SARS -parse_seqids makeblastdb -in refs/batSARS-genomes.fa -dbtype nucl -out db/batSARS -parse_seqids # Generate a samtools faidx for each fasta file. find refs -name '*.fa' | parallel "samtools faidx {}"