#!/usr/bin/env bash
#
# This recipe clusters nCov genomes.
#
# Need more help? See: https://www.biostarhandbook.com
#

# Strict error checking:
#   -e           stop at the first command that fails
#   -u           treat the use of an unset variable as an error
#   -x           print each command before it is executed
#   -o pipefail  a pipeline fails if any stage fails, not only the last one,
#                so a broken early stage is no longer hidden by a later one
set -euxo pipefail

# Print summary statistics for the input genomes before clustering.
seqkit stats refs/nCov-genomes.fa

# Cluster the genomes with a 0.7 (70%) identity threshold.
# Output files use the prefix "out"; the cluster listing read by the
# next step is out.clstr.
cd-hit -i refs/nCov-genomes.fa -o out -c 0.7

# What is the minimum, maximum and median similarity?
#
# Only cluster member lines carry a similarity value; they look like
#   1   29882aa, >name... at 99.95%
# i.e. the fourth field is exactly "at" and the fifth is the percentage.
# Selecting on that field (instead of grepping for the substring "at")
# keeps out representative lines whose sequence name merely contains
# "at" and which have no fifth field to report.
#
# The value is reduced to a bare number: the trailing "%" is removed, as
# is any prefix ending in "/" (NOTE(review): cd-hit-est style output such
# as "+/99.95%" -- harmless when no such prefix is present).
awk '$4 == "at" { gsub(/^.*\/|%$/, "", $5); print $5 }' out.clstr \
  | datamash min 1 max 1 median 1