#!/usr/bin/python3
# -*- python -*-
# PiGx -- Pipelines in Genomics.
#
# Copyright © 2017-2021 Akalin lab.
#
# This file is part of PiGx.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

if 'yes' == 'yes':
    # Clear R_LIBS_SITE and PYTHONPATH for this process and for every
    # child process it starts.
    import os, sys
    os.environ["R_LIBS_SITE"] = ""
    os.environ["PYTHONPATH"] = ""
    # Setting PYTHONPATH does not help with dependencies needed for
    # this script, so extend sys.path directly.  Empty entries are
    # skipped: an empty string on sys.path stands for the current
    # working directory, which must not become an import location just
    # because the configured list of directories is empty.
    for directory in "".split(":"):
        if directory and directory not in sys.path:
            sys.path.insert(1, directory)

# Short license notice.
# NOTE(review): not referenced anywhere in the visible part of this
# file; confirm whether it is still needed.
license_blurb = """
License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.

This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
"""

# Text printed by the --version option.
version = """
PiGx RNAseq Pipeline.
Version: 0.1.1

Copyright © 2017-2021 Bora Uyar, Jona Ronen, Ricardo Wurmus.
"""

# Description handed to the argument parser.
description = """
PiGx RNAseq Pipeline.

PiGx RNAseq is a data processing pipeline for RNAseq read data.
"""

# Epilog handed to the argument parser.
epilog = "This pipeline was developed by the Akalin group at MDC in Berlin in 2017-2021."


import argparse
from os import path
import os, sys, json, csv, yaml, errno
from snakemake.utils import update_config
import shutil, filecmp
from glob import glob
import subprocess

def formatter(prog):
    """Return the help formatter used by the argument parser.

    PROG is the program name handed over by argparse.  The formatter
    keeps the line breaks written in the help strings and is created
    with a maximum help position of 80.
    """
    layout = {'max_help_position': 80}
    return argparse.RawTextHelpFormatter(prog, **layout)

parser = argparse.ArgumentParser(
    description=description,
    epilog=epilog,
    formatter_class=formatter,
)

# Print the version text and exit.
parser.add_argument('-v', '--version', action='version', version=version)

# Optional positional argument; falls back to 'sample_sheet.csv'.
parser.add_argument(
    'sample_sheet',
    nargs='?',
    default='sample_sheet.csv',
    help="""\
The sample sheet containing sample data in CSV format.\
""",
)

# Exactly one of --init and --settings has to be given.
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
    '--init',
    dest='init',
    nargs='?',
    const='both',
    choices=['settings', 'sample-sheet', 'both'],
    help='Generate a template SETTINGS file, a SAMPLE-SHEET.  Leave empty for both.',
)
group.add_argument(
    '-s', '--settings',
    dest='settings',
    help='A YAML file for settings that deviate from the defaults.',
)

parser.add_argument(
    '-c', '--configfile',
    dest='configfile',
    default='./config.json',
    help="""\
The config file used for calling the underlying snakemake process.  By
default the file 'config.json' is dynamically created from the sample
sheet and the settings file.
""",
)

# May be given several times; the values are collected in a list.
parser.add_argument(
    '--target',
    dest='target',
    action='append',
    help="""\
Stop when the named target is completed instead of running the whole
pipeline.  The default target is "final-report".  Pass "--target=help"
to describe all available targets.""",
)

parser.add_argument(
    '-n', '--dry-run',
    dest='dry_run',
    action='store_true',
    help="""\
Only show what work would be performed.  Do not actually run the
pipeline.""",
)

parser.add_argument(
    '--graph',
    dest='graph',
    help="""\
Output a graph in PDF format showing the relations between rules of
this pipeline.  You must specify a graph file name such as
"graph.pdf".""",
)

# Plain on/off switches whose destination equals the option name.  They
# are registered in the order in which they appear in the help output.
for _flag, _flag_help in (
    ('force', """\
Force the execution of rules, even though the outputs are considered
fresh."""),
    ('reason', 'Print the reason why a rule is executed.'),
    ('unlock', 'Recover after a snakemake crash.'),
    ('verbose', 'Print supplementary info on job execution.'),
    ('printshellcmds', 'Print commands being executed by snakemake.'),
):
    parser.add_argument('--' + _flag, dest=_flag, action='store_true',
                        help=_flag_help)


args = parser.parse_args()


# Generate config file
def bail(msg):
    """Print the error message MSG to stderr and exit with status 1.

    Raises SystemExit; this function never returns.
    """
    print(msg, file=sys.stderr)
    # Use sys.exit instead of the `exit' helper: the latter is installed
    # by the site module for interactive use and is not guaranteed to
    # exist (for example under "python -S").
    sys.exit(1)

# FIXME: only needed for bsseq
def get_filenames(mylist):
    """Return a list with the base name of every file name in MYLIST.

    The base name is the first element of the pair returned by
    splitext_fqgz for that file name.
    """
    return [splitext_fqgz(filename)[0] for filename in mylist]

# FIXME: only needed for bsseq
def splitext_fqgz(string):
    """Split the file name STRING into a (base, extension) pair.

    A trailing ".gz" or ".bz2" is split off first; what remains has to
    end in ".fq", ".fastq" or ".fasta".  The returned extension is that
    suffix followed by the compression suffix, if there was one.  Calls
    bail() when the remaining name has none of the accepted suffixes.
    """
    compression_suffixes = (".gz", ".bz2")
    read_suffixes = (".fq", ".fastq", ".fasta")

    zipext = ""
    if string.endswith(compression_suffixes):
        string, zipext = os.path.splitext(string)

    if not string.endswith(read_suffixes):
        bail("Input files are not fastq files!")
    else:
        base, ext = os.path.splitext(string)
        return (base, ext + zipext)

# FIXME: only needed for bsseq
def parse_samples(lines):
    """
    Parse csv table with information about samples, eg:

    Read1,Read2,SampleID,Protocol,Treatment
    sampleB.pe1.fq.gz,sampleB.pe2.fq.gz,sampleB,WGBS,B
    pe1.single.fq.gz,,sampleB1,WGBS,B

    LINES is an iterable of text lines; empty lines are skipped and
    surrounding whitespace is removed from every field.

    It returns a dictionary required for the config file: the key
    'SAMPLES' maps every sample id to a dictionary holding the columns
    from 'SampleID' onwards plus the entries 'files' and 'fastq_name'.

    Raises Exception when the table is empty, when the header does not
    start with the required columns, when sample ids are not unique,
    when a sample names no read file, or when a row has fewer values
    than the header has columns.  Calls bail() when a row does not
    consist of exactly five columns.
    """
    sreader = csv.reader(lines, delimiter=',')
    all_rows = [row for row in sreader if row]
    if not all_rows:
        raise Exception("The sample sheet is empty.")

    header = [column.strip() for column in all_rows[0]]
    minimal_header = ['Read1', 'Read2', 'SampleID', 'Protocol', 'Treatment']

    if header[:5] != minimal_header:
        raise Exception("First columns of the input table have to be " +
                        ",".join(minimal_header) + ".")

    # Validate the shape of every row before any column is accessed by
    # index, and strip the fields once so that all later checks see the
    # same values that end up in the result.
    rows = []
    for row in all_rows[1:]:
        if len(row) != 5:
            bail("Invalid row format in Samplesheet. Each row should have five columns.")
        rows.append([field.strip() for field in row])

    sample_ids = [row[2] for row in rows]
    if len(set(sample_ids)) != len(sample_ids):
        raise Exception("Column 'SampleID' has non-unique values.")

    # Create a dictionary with all params, keys are samples ids
    fields = header[2:]
    outputdict = {}
    for number, row in enumerate(rows, start=1):
        files = list(filter(None, row[0:2]))
        if not files:
            raise Exception("Each sample has to have an entry in at least one of the columns 'Read1' or 'Read2'.")

        values = row[2:]
        if len(values) < len(fields):
            raise Exception("Number of columns in row %d doesn't match number of elements in header." % number)

        sampleid_dict = dict(zip(fields, values))
        sampleid_dict['files']      = files
        sampleid_dict['fastq_name'] = get_filenames(files)
        outputdict[row[2]] = sampleid_dict
    return { 'SAMPLES': outputdict }

def generate_config(configfile, sample_sheet, settingsfile, dirs):
    """Generate a new configuration file CONFIGFILE using SAMPLE_SHEET and
SETTINGSFILE as inputs.  Use the locations in DIRS to find default
settings.

The default settings are loaded first and then overridden by the values
in SETTINGSFILE, if given.  The requested targets are taken from the
module-level ARGS.  Every non-empty entry of the 'locations' section is
joined onto the directory of SAMPLE_SHEET (itself taken relative to
$srcdir or the current directory) and normalized before the result is
written to CONFIGFILE as JSON.  Calls bail() when the default settings,
SETTINGSFILE or SAMPLE_SHEET cannot be found.
    """
    # Load defaults
    if os.getenv('PIGX_UNINSTALLED'):
        where = os.getenv('srcdir') if os.getenv('srcdir') else '.'
        defaults = path.join(where, 'etc/settings.yaml')
    else:
        defaults = path.join(dirs['locations']['pkgdatadir'], 'settings.yaml')

    if not path.exists(defaults):
        bail("Could not find default settings.  Did you forget to set PIGX_UNINSTALLED?")

    with open(defaults, 'r') as stream:
        settings = yaml.safe_load(stream)

    # Load user overrides
    if settingsfile:
        if not os.path.isfile(settingsfile):
            bail('ERROR: settings.yaml file not found')
        with open(settingsfile, 'r') as stream:
            overrides = yaml.safe_load(stream)
        # An empty or comment-only YAML document (such as an unedited
        # template written by --init) loads as None; there is nothing
        # to merge in that case.
        if overrides:
            update_config(settings, overrides)

    settings['execution']['target'] = args.target

    # Record the location of the sample sheet.
    if not os.path.isfile(path.abspath(sample_sheet)):
        bail("ERROR: Failed to find sample sheet provided. No such file: " + sample_sheet)
    settings['locations']['sample-sheet'] = path.abspath(sample_sheet)

    # FIXME: special case for bsseq:
    # Load parameters specific to samples
    if "PiGx_rnaseq".lower() == 'pigx_bsseq':
        with open(sample_sheet, 'r') as f:
            lines = f.read().splitlines()
        sample_params = parse_samples(lines)
        settings.update(sample_params)

    settings['locations'].update(dirs['locations'])

    # Resolve relative paths in the locations section
    root = path.dirname(sample_sheet)
    here = os.getenv('srcdir') if os.getenv('srcdir') else os.getcwd()

    for key in settings['locations']:
        if settings['locations'][key]:
            settings['locations'][key] = path.normpath(path.join(here, root, settings['locations'][key]))

    # FIXME: special case for scrnaseq: resolve genome fasta / gtf files
    if "PiGx_rnaseq".lower() == 'pigx_scrnaseq':
        for annotation in ['primary', 'secondary']:
            if settings['annotation'][annotation]:
                if settings['annotation'][annotation]['gtf']:
                    settings['annotation'][annotation]['gtf'] = path.normpath(path.join(here, root, settings['annotation'][annotation]['gtf']))
                if settings['annotation'][annotation]['genome'] and settings['annotation'][annotation]['genome']['fasta']:
                    settings['annotation'][annotation]['genome']['fasta'] = path.normpath(path.join(here, root, settings['annotation'][annotation]['genome']['fasta']))

    # Write the config file
    with open(configfile, 'w') as outfile:
        dumps = json.dumps(settings,
                           indent=4, sort_keys=True,
                           separators=(",",": "), ensure_ascii=True)
        outfile.write(dumps)

def generate_cluster_configuration():
    """Write the per-rule cluster settings to "cluster_conf.json".

    For every rule in the 'execution' section of the module-level CONFIG
    an entry with the keys 'nthreads', 'MEM' and 'h_stack' is created.
    A 'queue' key is added when a queue is defined either for the whole
    cluster or for that rule.  Calls bail() when a queue is defined both
    globally and for a rule, or for a rule while the '__default__' rule
    has none.

    Returns the name of the file that was written.
    """
    execution = config['execution']
    rules = execution['rules']

    cluster_conf = {}
    for name in rules:
        rule_settings = rules[name]
        cluster_settings = execution['cluster']

        # Decide where the queue name of this rule comes from, if anywhere.
        if 'queue' in cluster_settings:
            # --- One general queue name for all rules.
            if 'queue' in rule_settings:
                # ...AND one for this rule -> error.
                bail("ERROR: 'queue' multiply defined in settings file.")
                continue
            queue_source = cluster_settings
        elif 'queue' in rule_settings:
            # --- A queue for this specific rule.
            if 'queue' not in rules['__default__']:
                bail("ERROR: submission queue specified per rule with no default.")
            queue_source = rule_settings
        else:
            # --- No information on the queue for this rule -> default.
            queue_source = None

        entry = {
            'nthreads': rule_settings['threads'],
            'MEM': rule_settings['memory'],
        }
        if queue_source is not None:
            entry['queue'] = queue_source['queue']
        entry['h_stack'] = cluster_settings['stack']
        cluster_conf[name] = entry

    cluster_config_file = "cluster_conf.json"
    with open(cluster_config_file, 'w') as outfile:
        outfile.write(json.dumps(cluster_conf,
                                 indent=4,
                                 sort_keys=True,
                                 separators=(",", ": "),
                                 ensure_ascii=True))

    return cluster_config_file


# Create symbolic links to the inputs and reference genome

# Create links within the output folder that point directly to the
# reference genome, as well as to each sample input file so that it's
# clear where the source data came from.

# N.B. Any previously existing links will be kept in place, and no
# warning will be issued if this is the case.

def makelink(src, target):
    """Create the symbolic link TARGET pointing to the file SRC.

    Calls bail() when SRC is not an existing file or when the directory
    that is supposed to hold TARGET does not exist.  A TARGET that
    already exists is left untouched.
    """
    if not path.isfile(src):
        bail("Refusing to link non-existent file %s" % src)
    elif not path.isdir(path.dirname(target)):
        # Both values have to be handed to the format operator as one
        # tuple; otherwise the formatting itself fails.
        bail("%s or subdirectory does not exist for linking %s"
             % (config['locations']['output-dir'], target))
    else:
        try:
            os.symlink(src, target)
        except FileExistsError:
            pass

def maybe_copy(source, target_dir):
    """Copy SOURCE into TARGET_DIR unless an equal copy is already there.

    The copy is skipped when TARGET_DIR holds a file with the same base
    name that filecmp.cmp reports as equal to SOURCE.
    """
    destination = path.join(target_dir, path.basename(source))
    up_to_date = path.exists(destination) and filecmp.cmp(source, destination)
    if up_to_date:
        return
    shutil.copy(source, target_dir)

def prepare_links():
    """Prepare the 'pigx_work' directory below the configured output directory.

    Reads the module-level CONFIG.  Creates 'pigx_work' and a fresh
    'pigx_work/bin' directory with symbolic links to the tools the
    pipeline is supposed to use, puts that directory at the front of
    PATH, stores the resulting search path in PIGX_PATH and points
    R_LIBS_USER and R_LIBS to /dev/null.
    """
    output_dir = config['locations']['output-dir']
    os.makedirs(path.join(output_dir, 'pigx_work'), exist_ok=True)

    # FIXME: special case for bsseq
    if "PiGx_rnaseq".lower() == 'pigx_bsseq':
        os.makedirs(path.join(config['locations']['output-dir'], 'pigx_work/input'),
                    exist_ok=True)

        # copy documentation file to the output work directory.
        if os.getenv('PIGX_UNINSTALLED'):
            where = os.getenv('srcdir') if os.getenv('srcdir') else '.'
            contents = path.join(where, 'etc/CONTENTS.txt')
        else:
            contents = path.join(config['locations']['pkgdatadir'], 'CONTENTS.txt')
        shutil.copyfile(contents, path.join(config['locations']['output-dir'], 'pigx_work/CONTENTS.txt'))

        # Link the reference genome
        try:
            os.symlink(os.path.dirname(config['locations']['genome-fasta']),
                       path.join(config['locations']['output-dir'], 'pigx_work/refGenome'))
        except FileExistsError:
            pass

        # Create file links
        for sample in config['SAMPLES']:
            flist = config['SAMPLES'][sample]['files']
            single_end = len(flist) == 1

            for idx, f in enumerate(flist):
                if not f.endswith(".gz"):
                    # FIXME: Future versions should handle unzipped .fq or .bz2.
                    bail("Input files must be gzipped: %s." % f)

                tag = "" if single_end else '_' + str(idx + 1)
                linkname = config['SAMPLES'][sample]['SampleID'] + tag + ".fq.gz"
                makelink(path.join(config['locations']['input-dir'], f),
                         path.join(config['locations']['output-dir'], "pigx_work/input/", linkname))

    # Ensure that we use the configured Pandoc, pandoc-citeproc
    # ...and the configured Rscript
    bin_dir = path.join(output_dir, 'pigx_work/bin')
    if path.exists(bin_dir):
        shutil.rmtree(bin_dir)
    os.makedirs(bin_dir, exist_ok=True)
    os.symlink('/usr/bin/pandoc', path.join(bin_dir, "pandoc"))
    # NOTE(review): the link target below looks like a configure
    # placeholder that was never substituted, so this link presumably
    # dangles -- confirm against the build system.
    os.symlink('@PANDOC_CITEPROC@', path.join(bin_dir, "pandoc-citeproc"))
    os.symlink('/usr/bin/Rscript', path.join(bin_dir, "Rscript"))
    os.environ['PATH'] = path.abspath(bin_dir) + ":" + os.environ['PATH']
    # PATH already starts with bin_dir at this point; prefixing it a
    # second time would only duplicate the entry.
    os.environ['PIGX_PATH'] = os.environ['PATH']
    os.environ['R_LIBS_USER'] = "/dev/null"
    os.environ['R_LIBS'] = "/dev/null"

def display_logo():
    """Write the contents of 'pretty.txt' to standard output.

    The file is looked up in '$srcdir/etc' (or './etc') when
    PIGX_UNINSTALLED is set, and in the 'pkgdatadir' location of the
    module-level CONFIG otherwise.  Nothing is printed when PIGX_UGLY is
    set.
    """
    if os.getenv('PIGX_UNINSTALLED'):
        where = os.getenv('srcdir') if os.getenv('srcdir') else '.'
        pretty = path.join(where, 'etc/pretty.txt')
    else:
        pretty = path.join(config['locations']['pkgdatadir'], 'pretty.txt')

    if not os.getenv('PIGX_UGLY'):
        # Ensure that this is printed without errors, even if the user's
        # locale is not a UTF-8 locale.
        with open(pretty, 'r', encoding='utf-8') as logo:
            p = logo.read()
        sys.stdout.buffer.write(p.encode('utf-8'))
        # Bytes written to the underlying buffer are not flushed
        # automatically; flush now so that they are emitted before any
        # output of processes started afterwards.
        sys.stdout.flush()



# Determine locations
dirs = {}
if os.getenv('PIGX_UNINSTALLED'):
    # Running from a source checkout: every location is the source
    # directory, or the current directory when $srcdir is not set.
    here = os.getenv('srcdir') or os.getcwd()
    dirs['locations'] = dict.fromkeys(
        ('prefix', 'exec_prefix', 'libexecdir',
         'pkglibexecdir', 'datarootdir', 'pkgdatadir'),
        here)
else:
    # Expand and store autoconf directory variables.  The templates
    # starting with '$' lose that first character and are then filled
    # in with str.format.
    prefix = '/usr'
    exec_prefix = '${prefix}'[1:].format(prefix=prefix)
    datarootdir = '${prefix}/share'[1:].format(prefix=prefix)
    libexecdir = '${exec_prefix}/libexec'[1:].format(exec_prefix=exec_prefix)
    pkgdatadir = '${datarootdir}/pigx_rnaseq'[1:].format(datarootdir=datarootdir)
    pkglibexecdir = '{libexecdir}/pigx_rnaseq'.format(libexecdir=libexecdir)

    dirs['locations'] = dict(
        prefix='/usr',
        exec_prefix=exec_prefix,
        libexecdir=libexecdir,
        pkglibexecdir=pkglibexecdir,
        datarootdir=datarootdir,
        pkgdatadir=pkgdatadir,
    )

# Init?
if args.init:
    # Locate the templates.  When running uninstalled they are taken
    # from the 'etc' directory of the source tree, which is looked up
    # through $srcdir like the default settings are.
    if os.getenv('PIGX_UNINSTALLED'):
        where = os.getenv('srcdir') if os.getenv('srcdir') else '.'
        base = path.join(where, 'etc')
    else:
        base = dirs['locations']['pkgdatadir']

    init_settings = args.init in ('settings', 'both')
    init_sample_sheet = args.init in ('sample-sheet', 'both')

    if init_settings:
        name = 'settings.yaml'
        if path.exists(name):
            print('Refusing to overwrite existing {}.'.format(name))
        else:
            # The template is opened first so that no empty settings
            # file is left behind when the template cannot be read.
            # Every line is written as a comment.
            with open(path.join(base, name), 'r') as infile, \
                 open(name, 'w') as outfile:
                for line in infile:
                    outfile.write('# ' + line)
            print('Generated {} template.'.format(name))

    if init_sample_sheet:
        name = 'sample_sheet.csv'
        if path.exists(name):
            print('Refusing to overwrite existing sample_sheet.csv.')
        else:
            shutil.copy(path.join(base, name + '.example'), name)
            print('Generated {} template.'.format(name))
    sys.exit(0)

# Run snakemake!
generate_config(args.configfile,
                args.sample_sheet,
                args.settings,
                dirs)

# Read the generated configuration back; the file is closed again as
# soon as it has been parsed.
with open(args.configfile, 'r') as config_stream:
    config = json.load(config_stream)

# Base snakemake invocation; further options are appended below.
command = [
    "/usr/bin/snakemake",
    "--snakefile={}/snakefile.py".format(config['locations']['pkglibexecdir']),
    "--configfile={}".format(args.configfile),
    "--directory={}".format(config['locations']['output-dir']),
    "--jobs={}".format(config['execution']['jobs']),
    "--rerun-incomplete"
]

if config['execution']['submit-to-cluster']:
    cluster_config_file = generate_cluster_configuration()
    print("Commencing snakemake run submission to cluster", flush=True, file=sys.stderr)

    # Check if a particular queue has been specified:
    if 'queue' in config['execution']['cluster'] or 'queue' in config['execution']['rules']['__default__']:
        # User has defined queue name(s) -- either generally, or
        # rule-specific.  In either case generate_cluster_configuration()
        # (defined above) has already written the queue names to the
        # cluster configuration.
        queue_selection_string = " -q {cluster.queue} "
    else:
        # User has supplied no queue value -> let SGE pick the default.
        queue_selection_string = ""
    # --- done checking if ( queue name(s) supplied by user)

    # Check if a contact email has been specified.  A missing or empty
    # value is treated like the string 'none'.
    contact_email = config['execution']['cluster']['contact-email']
    if not contact_email or str(contact_email).lower() == 'none':
        contact_email_string = ""
    else:
        contact_email_string = "-m a -M %s" % contact_email


    # Check if machine allows for queue submission at all
    try:
        subprocess.call(["qsub", "-help"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError as e:
        if e.errno == errno.ENOENT:
            print("Error: Your system does not seem to support cluster submission.\nReason: Could not find qsub executable.", file=sys.stderr)
            sys.exit(1)
        else:
            raise

    qsub = "qsub -v R_LIBS_USER -v R_LIBS -v PIGX_PATH -v GUIX_LOCPATH %s -l h_stack={cluster.h_stack}  -l h_vmem={cluster.MEM} %s -b y -pe smp {cluster.nthreads} -cwd" % (queue_selection_string, contact_email_string)
    if config['execution']['cluster']['log-dir']:
        qsub += " -o %s/ " % config['execution']['cluster']['log-dir']
        qsub += " -e %s/ " % config['execution']['cluster']['log-dir']
        os.makedirs(config['execution']['cluster']['log-dir'], exist_ok=True)

    if config['execution']['cluster']['args']:
        qsub += " " + config['execution']['cluster']['args']
    command += [
        "--cluster-config={}".format(cluster_config_file),
        "--cluster={}".format(qsub),
        "--jobscript={}/qsub-template.sh".format(config['locations']['pkglibexecdir']),
        "--latency-wait={}".format(config['execution']['cluster']['missing-file-timeout'])
    ]
else:
    print("Commencing snakemake run submission locally", flush=True, file=sys.stderr)

if args.graph:
    # Pipe the rule graph printed by snakemake through dot and store the
    # resulting PDF in the requested file.
    command.append("--dag")
    with open(args.graph, "w") as outfile:
        p1 = subprocess.Popen(command, stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['dot','-Tpdf'], stdin=p1.stdout, stdout=outfile)
        # Close our copy of the pipe so that snakemake is notified when
        # dot exits early, then wait for both processes.
        p1.stdout.close()
        p2.communicate()
        p1.wait()
    if p1.returncode != 0 or p2.returncode != 0:
        sys.exit(1)
else:
    prepare_links()
    display_logo()
    if args.force:
        command.append("--forceall")
    if args.dry_run:
        command.append("--dryrun")
    if args.reason:
        command.append("--reason")
    if args.unlock:
        command.append("--unlock")
    if args.verbose:
        command.append("--verbose")
    if args.printshellcmds:
        command.append("--printshellcmds")
    if args.target and 'help' in args.target:
        command.append("help")
    # Hand the exit status of snakemake on to the caller so that a
    # failed run is not reported as a success.
    completed = subprocess.run(command)
    sys.exit(completed.returncode)
