set -ue

# set directories and shortcuts

mkdir -p refs reads bam diff
BIOPROJECT=PRJNA257197
OUT=diff
RUN=SRR1972969
ACC=AF086833
REF=refs/${ACC}.fa
GENOME=refs/${ACC}.txt
GENBANK=refs/${ACC}.gb
GFF=refs/${ACC}.gff

echo "getting reference ad annotations"
# get reference and annotations
efetch -db nuccore -id $ACC -format fasta  > $REF
efetch -db nuccore -id $ACC -format gb  > $GENBANK
readseq -informat=2 -format=24 -output=$GFF $GENBANK

echo "creating genome file"
# create genome file
bioawk -c fastx '{ print $name, length($seq) }' $REF > $GENOME

echo "creating genome index"
# index reference
bwa index $REF

echo "getting genomes"
# get genomes
esearch -db nucleotide -query $BIOPROJECT | efetch -format acc | head -8 >ids.txt
for id in $(cat ids.txt)
do
   efetch -db nuccore -id $id -format fasta  > $id.fa
done

echo "getting reads"
#get reads to align
fastq-dump --split-files -O reads $RUN

echo "making genome alignments"
# genome alignment
for id in $(cat ids.txt)
do
   echo -e "aligning  $id"
   lastz $REF $id.fa --output=$OUT/${id}.differences --format=differences --ambiguous=iupac
   cat $OUT/${id}.differences | awk 'BEGIN{FS="\t";}{print $1,$2+1,$3,$12}' >$OUT/${id}_diff.txt
done

echo "mapping reads"
# read mapping
R1=reads/${RUN}_1.fastq
R2=reads/${RUN}_2.fastq
bwa mem $REF $R1 $R2 |samtools sort >bam/reads.bam
samtools index bam/reads.bam

echo "generating coverage"
# generate coverage
bedtools genomecov -d -ibam bam/reads.bam -g $GENOME >bam/cov.txt
cat bam/cov.txt |  awk 'BEGIN{OFS=" "}{print $1,$2-1,$2,$3}' >bam/read_cov.txt

echo "getting gene coordinates"
# get gene coordinates from annotation
cat $GFF  | grep -v "#" |awk 'BEGIN{OFS=" "}$3=="gene" {print $1".2",$4,$5}' >genes.txt

echo "creating karyotype"
# generate karyotype file
bioawk -c fastx '{ print $name, length($seq) }' $REF | awk 'BEGIN{OFS=" ";}{print "chr - ",$1,$1,0,$2,"green"}'  >karyotype.txt