set -ue # set directories and shortcuts mkdir -p refs reads bam diff BIOPROJECT=PRJNA257197 OUT=diff RUN=SRR1972969 ACC=AF086833 REF=refs/${ACC}.fa GENOME=refs/${ACC}.txt GENBANK=refs/${ACC}.gb GFF=refs/${ACC}.gff echo "getting reference ad annotations" # get reference and annotations efetch -db nuccore -id $ACC -format fasta > $REF efetch -db nuccore -id $ACC -format gb > $GENBANK readseq -informat=2 -format=24 -output=$GFF $GENBANK echo "creating genome file" # create genome file bioawk -c fastx '{ print $name, length($seq) }' $REF > $GENOME echo "creating genome index" # index reference bwa index $REF echo "getting genomes" # get genomes esearch -db nucleotide -query $BIOPROJECT | efetch -format acc | head -8 >ids.txt for id in $(cat ids.txt) do efetch -db nuccore -id $id -format fasta > $id.fa done echo "getting reads" #get reads to align fastq-dump --split-files -O reads $RUN echo "making genome alignments" # genome alignment for id in $(cat ids.txt) do echo -e "aligning $id" lastz $REF $id.fa --output=$OUT/${id}.differences --format=differences --ambiguous=iupac cat $OUT/${id}.differences | awk 'BEGIN{FS="\t";}{print $1,$2+1,$3,$12}' >$OUT/${id}_diff.txt done echo "mapping reads" # read mapping R1=reads/${RUN}_1.fastq R2=reads/${RUN}_2.fastq bwa mem $REF $R1 $R2 |samtools sort >bam/reads.bam samtools index bam/reads.bam echo "generating coverage" # generate coverage bedtools genomecov -d -ibam bam/reads.bam -g $GENOME >bam/cov.txt cat bam/cov.txt | awk 'BEGIN{OFS=" "}{print $1,$2-1,$2,$3}' >bam/read_cov.txt echo "getting gene coordinates" # get gene coordinates from annotation cat $GFF | grep -v "#" |awk 'BEGIN{OFS=" "}$3=="gene" {print $1".2",$4,$5}' >genes.txt echo "creating karyotype" # generate karyotype file bioawk -c fastx '{ print $name, length($seq) }' $REF | awk 'BEGIN{OFS=" ";}{print "chr - ",$1,$1,0,$2,"green"}' >karyotype.txt