#!/usr/bin/env bash
#
# Draws a random sample of sequences from refs/sars2.fa, clusters the sample
# with cd-hit, and prints the seed followed by the minimum, maximum and median
# of the similarity values found in the resulting clustering file (out.clstr).
#
# Files written in the current directory: subset.fa, out, out.clstr, log.
# External tools used: seqtk, cd-hit, awk, datamash.

# Error handling: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails (not only the last one).
set -euo pipefail

# Summarizes the similarity values in a clustering file.
#
# Input:  contents of the clustering file on stdin.
# Output: minimum, maximum and median of the extracted values on stdout.
stats() {
  # Keep only lines whose 4th whitespace-separated field is exactly "at" and
  # print the 5th field with any '%' removed. Matching the substring "at"
  # anywhere on the line would also select lines whose sequence name merely
  # contains "at", and those have no value in field 5.
  # NOTE(review): assumes member lines look like "... at 99.50%" - confirm
  # against the cd-hit output actually produced here.
  awk '$4 == "at" { gsub(/%/, "", $5); print $5 }' \
    | datamash min 1 max 1 median 1
}

# Sample size.
N=300

# Random seed.
SEED=$RANDOM

# Select a random sample.
seqtk sample -s "$SEED" refs/sars2.fa "$N" > subset.fa

# Cluster the sequences; cd-hit's own output is appended to the log file.
cd-hit -i subset.fa -o out >> log

# Print the seed (so we know it in case we want to repeat the selection).
# The seed is passed as an argument, never used as the format string.
printf '%s\t' "$SEED"

# Print the minimum, maximum and median similarity.
stats < out.clstr