#!/usr/bin/env python

HELP = """
This program combines kallisto and salmon abundance files scattered across directories.

The output of the program is similar to the count file produced by featureCounts

Usage:
    
    cat IDS | python combine-abundance.py RESULTS 
    
Where:

 - RESULTS is a directory
 - IDS a row oriented file with one identifier per line
 
For each id the IDS file the program will attempt to load the file named as 

   RESULTS/id/abundances.tsv

Once all files are loaded these are combined into a result that has six columns to start each row, 
then the estimated abundance from each file labeled with a sample name.

"""

import csv
import os
import sys


def parse(workdir, sample):
    """
    Returns a generator over a file
    """

    # The path to the abundnace file.
    path = os.path.join(workdir, sample, "abundance.tsv")

    # Try the salmon abundance file.
    if not os.path.isfile(path):
        path = os.path.join(workdir, sample, "quant.sf")

    # Stop on missing files.
    if not os.path.isfile(path):
        print("ERROR: unable to read either kallisto or salmon file: %s" % path)
        sys.exit()

    # Open the stream.
    stream = csv.DictReader(open(path), delimiter="\t")

    # Generate the data
    for row in stream:
        yield row


def process(workdir, stream=sys.stdin):
    """
    Processes and combines tsv files.
    """

    # The samples in the work directory.
    samples = [line.strip() for line in stream]

    collect = {}
    for sample in samples:
        rows = list(parse(workdir, sample))

        for row in rows:

            try:
                target_id, length, eff_length = row['target_id'], row['length'], row['eff_length']
            except KeyError as exc:
                target_id, length, eff_length = row['Name'], row['Length'], row['EffectiveLength']

            # Fill in the first six columns.
            if target_id not in collect:
                collect[target_id] = [target_id, length, eff_length, "0", "0", "0"]

            try:
                value = row['est_counts']
            except KeyError as exc:
                value = row['NumReads']

            value = str(round(float(value), 1))
            collect[target_id].append(value)

    # Print the collected data
    header = [ "target_id", "length", "eff_length", "X1", "X2", "X3" ] + samples
    header = "\t".join(header)
    print (header)

    # Print the header

    for key, values in collect.items():
        line = "\t".join(values)
        print(line)

if __name__ == '__main__':

    if len(sys.argv) != 2:
        print(HELP)
        sys.exit()

    # Get the work directory
    workdir = sys.argv[1]

    if not os.path.isdir(workdir):
        print("ERROR: %s is not a directory" % workdir)
        sys.exit(-1)

    process(workdir=workdir, stream=sys.stdin)