#
# Usage:
#
# bash virus-discovery.sh 10000
#

# Fail fast: -e aborts on the first failing command, -u treats unset
# variables as errors, -x echoes every command before it runs.
set -uex

# How many reads (spots) to download and process. Required first argument.
# Increase this number as needed.
N=${1:?"usage: bash virus-discovery.sh NUMBER_OF_READS [SRA_ACCESSION]"}

# Refuse anything that is not a plain non-negative integer before it is
# handed to fastq-dump.
case $N in
  *[!0-9]*)
    echo "error: NUMBER_OF_READS must be a non-negative integer, got '$N'" >&2
    exit 1
    ;;
esac

# SRA run to analyze. Optional second argument; defaults to the run this
# script was written for. The run must be paired-end, because the steps
# below expect both a _1 and a _2 FASTQ file.
SRR=${2:-SRR10971381}

# Directory for the downloaded and trimmed reads.
READS=reads

# Directory for the assembly output.
OUT=out

# Create the reads directory (no error if it already exists).
mkdir -p -- "$READS"

# Obtain the data. Only the first N spots of the run are downloaded.
fastq-dump -X "$N" "$SRR" --split-files --origfmt --outdir "$READS"

# Run FastQC on the original data.
#fastqc "$READS/$SRR"*.fastq

# Shortcuts to the raw read files written by fastq-dump.
F1=$READS/${SRR}_1.fastq
F2=$READS/${SRR}_2.fastq

# Trim the reads by quality.
trimmomatic PE "$F1" "$F2" -baseout "$READS/read.fq" SLIDINGWINDOW:4:30

# Rerun the quality control.
# fastqc "$READS"/*.fq

# Shortcuts to the trimmed reads that are passed on to the assembler.
R1=$READS/read_1P.fq
R2=$READS/read_2P.fq

# This is needed if you rerun the code.
# Delete the results of previous assembly (!)
# The :? guard aborts instead of expanding to an empty path.
rm -rf -- "${OUT:?}"

# Run the megahit assembler.
megahit -1 "$R1" -2 "$R2" -o "$OUT"

# Find the statistics
seqkit stats "$OUT/final.contigs.fa"