#!/usr/bin/env bash
#
# Download a subset of reads for one SRA run, quality-trim them,
# assemble the surviving read pairs and print contig statistics.
#
# Usage:
#
#   bash virus-discovery.sh 10000
#
# The single argument is the number of reads to process.
# Requires fastq-dump, trimmomatic, megahit and seqkit on PATH.
#

# Stop on error, reject unset variables, trace every command.
# pipefail is a guard for any pipeline added later.
set -euxo pipefail

# Validate the argument up front so a missing or non-numeric value gives a
# usage message instead of a bare "unbound variable" or a late tool failure.
if [[ $# -lt 1 || ! "$1" =~ ^[0-9]+$ ]]; then
  printf 'Usage: bash %s <number-of-reads>\n' "${0##*/}" >&2
  exit 1
fi

# Fail fast, before any download starts, if a required tool is missing.
for tool in fastq-dump trimmomatic megahit seqkit; do
  if ! command -v "$tool" >/dev/null; then
    printf 'error: required tool not found on PATH: %s\n' "$tool" >&2
    exit 1
  fi
done

# How many reads to process.
# Increase this number as needed.
N=$1

# SRA run accession to analyze.
readonly SRR=SRR10971381

# Directory for downloaded and trimmed reads.
readonly READS_DIR=reads

# Directory for the assembly output.
readonly OUT_DIR=out

# Create a directory
mkdir -p -- "$READS_DIR"

# Obtain the data. Only the first N reads are requested (-X).
fastq-dump -X "$N" "$SRR" --split-files --origfmt --outdir "$READS_DIR"

# Run FastQC on the original data.
#fastqc reads/SRR10971381*.fastq

# Shortcuts to read names
F1="${READS_DIR}/${SRR}_1.fastq"
F2="${READS_DIR}/${SRR}_2.fastq"

# Trim the reads by quality.
# -baseout derives the output names (read_1P.fq, read_2P.fq, ...) from read.fq.
trimmomatic PE "$F1" "$F2" -baseout "${READS_DIR}/read.fq" SLIDINGWINDOW:4:30

# Rerun the quality control.
# fastqc reads/*.fq

# Shortcuts to read names (the paired reads that survived trimming).
R1="${READS_DIR}/read_1P.fq"
R2="${READS_DIR}/read_2P.fq"

# This is needed if you rerun the code.
# Delete the results of previous assembly (!)
# The :? guard aborts rather than expanding to an empty path.
rm -rf -- "${OUT_DIR:?}"

# Run the megahit assembler.
megahit -1 "$R1" -2 "$R2" -o "$OUT_DIR"

# Find the statistics
seqkit stats "${OUT_DIR}/final.contigs.fa"