"""
File transformer. Takes a tabuolar file (bed or gff) and 
transforms them to a file used as genetrack input.
Optional parameters may be used to shift the 5' ends with 
a specified amount. This is useful if the file corresponds to data 
with fixed fragment widths you can move each fragment to the 
center location. The program may be invoked in multiple ways. 
As a standalone script::

    python tabs2genetrack.py

As a python module::

    python -m genetrack.scripts.tabs2genetrack

Or in other python scripts::

>>>
>>> from genetrack.scripts import tabs2genetrack
>>> tabs2genetrack.transform( inpname, outname, format='bed', shift=0)
>>>

Run the script with no parameters to see the options that it takes.

A genetrack input file format is a tab delimited text file
described in the API documentation of the PositionalData 
class: `genetrack.hdflib.PositionalData`
The transformation is a three step process, *transform*, 
*sort* and *consolidate*. It will create files in the 
genetrack temporary data directory and it will remove the intermediate files 
when the process is complete.

**Observed runtime**: tranformation rate of 2 million lines per minute

**Note1**: The script will invoke the system `sort` command to sort the file 
that is substantially faster under Unix than Windows.

"""
import os, sys, csv, shutil
from itertools import *
from genetrack import logger, conf, util, hdflib

def consolidate( inpname, outname, format):
    """
    Consolidates an input file. 
    Merges multiple indentical indices into one line
    """
    fp = open(outname, 'wt')

    # recover the original basename
    basename = os.path.basename(inpname).replace('.sorted', '')

    # create a few information headers
    fp.write("#\n# created with tabs2genetrack\n")
    fp.write("# source: %s, format %s\n#\n" % (basename, format) )
    fp.write("chrom\tindex\tforward\treverse\tvalue\n")
    
    # the main reader
    reader = csv.reader(open(inpname, 'rb'), delimiter='\t')

    # will duplicate some code to avoid an extra conditional internally
    chrom, index, fwd, rev, val = reader.next()
    lastindex, fwd, rev, val = int(index), int(fwd), int(rev), int(val)
    collect = [ chrom, lastindex, fwd, rev, val ]

    for row in reader:
        chrom, index, fwd, rev, val = row
        index, fwd, rev, val = int(index), int(fwd), int(rev), int(val)
        if index == lastindex:
            collect[2] += fwd
            collect[3] += rev
            collect[4] += val
        else:
            # write out the collected data
            fp.write( '%s\t%s\t%s\t%s\t%s\n' % tuple(collect) )
            collect = [ chrom, index, fwd, rev, val ]
            lastindex = index

    fp.close()

def transform(inpname, outname, format, shift=0, index=False, options=None):
    """
    Transforms reads stored in bedfile to a genetrack input file.
    Requires at least 6 bed columns to access the strand.
    """

    # detect file formats
    if format == "BED":
        CHROM, START, END, STRAND = 0, 1, 2, 5
    elif format == "GFF":
        CHROM, START, END, STRAND = 0, 3, 4, 6
    else:
        raise Exception('Invalid file format' % format)

    # two sanity checks, one day someone will thank me
    if format == 'BED' and inpname.endswith('gff'):
        raise Exception('BED format on a gff file?')
    if format == 'GFF' and inpname.endswith('bed'):
        raise Exception('GFF format on a bed file?')

    # find the basename of the outputname
    basename = os.path.basename(outname)
   
    # two files store intermediate results
    flat = conf.tempdata( '%s.flat' % basename )
    sorted  = conf.tempdata( '%s.sorted' % basename )

    # check for track information on first line, 
    # much faster this way than conditional checking on each line
    fp = file(inpname, 'rU')
    first = fp.readline()
    fp.close()

    # create the reader
    reader = csv.reader(file(inpname, 'rU'), delimiter='\t')

    # skip if trackline exists
    if first.startswith == 'track':
        reader.next()

    # unwind the comments
    list(takewhile(lambda x: x[0].startswith('#'), reader))

    # copious timing info for those who enjoy these
    timer, full = util.Timer(), util.Timer()

    logger.debug("parsing '%s'" % inpname)
    logger.debug("output to '%s'" % outname)

    # create the unsorted output file and apply corrections
    logger.debug("unsorted flat file '%s'" % flat)

    fp = file(flat, 'wt')
    for linec, row in enumerate(reader):
        try:
            chrom, start, end, strand = row[CHROM], row[START], row[END], row[STRAND]
        except Exception, exc:
            first = row[0][0]
            # may be hitting the end of the file with other comments
            if  first == '>':
                break # hit the sequence content of the gff file
            elif first == '#':
                continue # hit upon some comments
            else:
                logger.error(row)
                raise Exception(exc) 

        if strand == '+':
            # on forward strand, 5' is at start
            idx = int(start) + shift
            fwd, rev, val = 1, 0, 1
        elif strand == '-':
            # on reverse strand, 5' is at end
            idx = int(end) - shift
            fwd, rev, val = 0, 1, 1
        else:
            # no strand specified, generate interval centers
            idx = (int(start)+int(end))/2
            fwd, rev, val = 0, 0, 1

        # it is essential be able to sort the index as a string! 
        fp.write('%s\t%012d\t%s\t%s\t%s\n' % (chrom, idx, fwd, rev, val))

    fp.close()
    linet = util.commify(linec)
    logger.debug("parsing %s lines finished in %s" % (linet, timer.report()))

    # if it is producing coverage then it will expand reads into full intervaals

    # now let the sorting commence
    cmd = "sort %s > %s" % (flat, sorted)
    logger.debug("sorting into '%s'" % sorted)
    os.system(cmd)
    logger.debug("sorting finished in %s" % timer.report() )

    logger.debug("consolidating into '%s'" % outname)
    consolidate( sorted, outname, format=format)
    logger.debug("consolidate finished in %s" % timer.report() )
    logger.debug("output saved to '%s'" % outname)
    logger.debug("full conversion finished in %s" % full.report() )

    # attempting to cleanup the remaining files
    for name in (flat, sorted):
        logger.debug("removing temporary file '%s'" % name )
        os.remove(name)

    # also runs the indexing on it
    if index:
        logger.debug("loading the index from '%s'" % outname)
        # create the work directory
        if options.workdir and not os.path.isdir(options.workdir):
            os.mkdir(options.workdir)
        result = hdflib.PositionalData(fname=outname, update=True, workdir=options.workdir)
        logger.debug("indexing finished in %s" % timer.report() )
        result.close()

        logger.debug("moving index to main output '%s'" % outname )

        # remove the intermediate file
        os.remove(outname)

        # move the index as the output file
        shutil.move(result.index, outname)

def option_parser():
    "The option parser may be constructed in other tools invoking this script"
    import optparse

    usage = "usage: %prog -i inputfile -o outputfile -f format"

    parser = optparse.OptionParser(usage=usage)

    # setting the input file name
    parser.add_option(
        '-i', '--input', action="store", 
        dest="inpname", type='str', default=None,
        help="the input file name (required)"
    )

    # setting the output file name
    parser.add_option(
        '-o', '--output', action="store", 
        dest="outname", type='str', default=None,
        help="output file name (required)"
    )

    # file formats
    parser.add_option(
        '-f', '--format', action="store", 
        dest="format", type="str", default='', 
        help="input file format, bed or gff (required)",
    )
    
    # correction shift added in 5' direction for start/end coordinates
    parser.add_option(
        '-s', '--shift', action="store", 
        dest="shift", type="int", default=0, 
        help="shift for the 5' end on each strand (default=0)",
    )

    # verbosity can be 0,1 and 2 (increasing verbosity)
    parser.add_option(
        '-v', '--verbosity', action="store", 
        dest="verbosity", type="int", default=1, 
        help="sets the verbosity (0, 1) (default=1)",
    )

    parser.add_option("-x", "--index",
        action="store_true", dest="index", default=False,
        help="also creates an hdf index for the file")

    parser.add_option(
        '-w', '--workdir', action="store", 
        dest="workdir", type='str', default=None,
        help="work directory (optional)"
    )

    return parser

if __name__ == '__main__':
    
    parser = option_parser()

    options, args = parser.parse_args()

    # uppercase the format
    options.format = options.format.upper()

    if options.format not in ('BED', 'GFF'):
        sys.stdout = sys.stderr
        parser.print_help()
        sys.exit(-1)

    logger.disable(options.verbosity)

    # missing file names
    if not (options.inpname and options.outname and options.format):
        parser.print_help()
        sys.exit(-1)
    else:
        transform(inpname=options.inpname, outname=options.outname,\
            format=options.format, shift=options.shift, index=options.index, options=options)