# # A simple tool to generate reads with perfect coverage out a fasta stream. # # It creates paired end output from fragments of the same with half the reads # coming from forward and half from reverse strands. # # It loads the whole genome into memory so might not work all that # well for large genomes. # # Usage: # # cat genome.fa | python perfect_coverage.py # # Will create the files called R1.fq and R2.fq # import sys, random # How many reads per genomic coordinate. COV = 1 # Read length LEN = 50 # Fragment size FRAG = 500 REVCOMP = dict(A='T', T='A', C='G', G='C') def rc(x): return REVCOMP[x] def revcomp(seq): out = ''.join(reversed(list(map(rc, seq)))) return out # Concatenate all lines of a FASTA file into # a list of strings where each element represents # a single full sequence. genome, chrom = [], [] for line in sys.stdin: if line[0] == '>': if chrom: genome.append(''.join(chrom)) chrom = [] else: chrom.append(line.strip()) # Append last chromosome. genome.append(''.join(chrom)) # Generate reads with the expected coverage. read1, read2 = open('R1.fq', 'wt'), open('R2.fq', 'wt') # Need unique names within a file. pattern = "@P{}-{}\n{}\n+\n" + 'I' * LEN + '\n' # Paired end reads for chrom in genome: for index in range(len(chrom) - FRAG): # This is the first fragment. frag = chrom[index: index + FRAG] fwd_start, fwd_end = frag[:LEN], revcomp(frag[-LEN:]) # Its reverse complement. rev_start, rev_end = revcomp(fwd_start), revcomp(fwd_end) # Generate the reads. # Turn indices into one based coordinates to read them more easily. for count in range(COV): read1.write(pattern.format(index + 1, count, fwd_start)) read2.write(pattern.format(index + 1, count, fwd_end))