"""check_arcce_clean - Re-runs failed cleanup tasks."""

import os
import random
import time
from arcnagios.arcutils import arcstat, J_UNDEFINED
from arcnagios.ce.jobutils import JobNagiosPlugin
from arcnagios.nagutils import OK, WARNING, CRITICAL, ServiceReport, ServiceOk
from arcnagios.utils import counted_adjectives, log_process_error, \
        ResultOk, ResultError

class Check_arcce_clean(JobNagiosPlugin):
    """Nagios probe which runs scheduled cleanup work and prunes ARC jobs.

    The probe first hands a share of its time budget to ``self.cleaner`` and
    then asks ``arcstat`` for the known jobs, cleaning those which are
    unavailable (and not active) or older than ``--max-age``.  Both phases
    are summarised in a single service report.
    """

    def __init__(self):
        """Register the probe's command-line options and start the clock."""
        JobNagiosPlugin.__init__(self)
        ap = self.argparser.add_argument_group('Job Cleaner Options')
        ap.add_argument('--timeout', dest = 'timeout',
            type = int, default = 120,
            help = 'Overall timeout for probe, but currently does not limit '
                   'scheduled cleanup.')
        ap.add_argument('--max-age', dest = 'max_age',
            type = int, default = 604800,
            help = 'Max age before jobs info is cleaned.')
        ap.add_argument('--arcstat-timeout', dest = 'arcstat_timeout',
            type = int, default = 5, metavar = 'T',
            help = 'Passed to arcstat --timeout.')
        ap.add_argument('-w', dest = 'warning_load',
            type = float, default = 10,
            help = 'Ratio of remaining work to processed work above which \
                    to issue a warning alert.')
        ap.add_argument('-c', dest = 'critical_load',
            type = float, default = 20,
            help = 'Ratio of remaining work to processed work above which \
                    to issue a critical alert.')
        # Reference point for time_left() and for computing job ages.
        self._t_start = time.time()

    def time_left(self):
        """Return the seconds remaining of ``--timeout`` since construction.

        The value goes negative once the timeout has been exceeded.
        """
        return self.opts.timeout - time.time() + self._t_start

    def prune_jobs(self):
        """Clean unavailable and over-age jobs reported by ``arcstat``.

        Jobs in state ``J_UNDEFINED`` are cleaned unless they are listed
        among the active job ids; other jobs with a submission time are
        cleaned when they are older than ``--max-age`` relative to the probe
        start.  Jobs are visited in random order, and a job is only counted
        as postponed when it qualified for cleaning but less than one second
        of the time budget remained.

        Returns ``ResultError(RuntimeError('Timeout'))`` if less than one
        second is left before ``arcstat`` can be queried, otherwise
        ``ResultOk((pruned_count, failed_count, rest_count))``.
        """
        active_jobids = self.collect_active_jobids()

        t_left = self.time_left()
        if t_left < 1:
            self.log.warning('Timeout before querying probes to prune.')
            return ResultError(RuntimeError('Timeout'))
        jobstats = list(arcstat(log = self.log,
                           timeout = min(t_left, self.opts.arcstat_timeout),
                           show_unavailable = True).items())
        # Randomise the order so that a short time budget does not always
        # favour the same jobs.
        random.shuffle(jobstats)

        pruned_count = 0
        failed_count = 0
        rest_count = 0
        for jobid, jobstat in jobstats:
            t_left = self.time_left()

            # Decide whether the job qualifies for cleaning, and which
            # messages to use when reporting the outcome.
            if jobstat.state == J_UNDEFINED:
                if jobid in active_jobids:
                    self.log.info('Skipping unavailable but active %s.', jobid)
                    continue
                success_log = ('Cleaned unavailable job %s.', jobid)
                synopsis = 'failed to clean unavailable job %s' % jobid
            elif jobstat.submitted:
                try:
                    tm_sub = time.strptime(jobstat.submitted,
                                           '%Y-%m-%d %H:%M:%S')
                    t_sub = time.mktime(tm_sub)
                except (ValueError, OverflowError) as exn:
                    # A single malformed timestamp must not abort pruning of
                    # the remaining jobs.
                    self.log.warning(
                        'Skipping %s with unparsable submission time %r: %s',
                        jobid, jobstat.submitted, exn)
                    continue
                if not self._t_start - t_sub > self.opts.max_age:
                    continue
                success_log = ('Cleaned %s submitted %s.',
                               jobid, jobstat.submitted)
                synopsis = 'failed to clean %s' % jobid
            else:
                continue

            # The job should be cleaned; postpone it if we are out of time.
            if t_left < 1:
                rest_count += 1
                continue
            arcclean_result = \
                self.arcclient.arcclean(jobid, force = True,
                                        timeout = t_left)
            if arcclean_result.is_ok():
                self.log.info(*success_log)
                pruned_count += 1
            else:
                log_process_error(self.log, arcclean_result.error,
                                  synopsis = synopsis,
                                  prefix = 'arcclean')
                failed_count += 1

        return ResultOk((pruned_count, failed_count, rest_count))

    def _check_load(self, load, msg):
        """Map a load ratio to a service state and finish the message.

        Returns a pair of the state and ``msg`` with a suffix appended:
        ``CRITICAL`` if ``load`` is strictly above ``-c``, ``WARNING`` if it
        is strictly above ``-w``, and ``OK`` otherwise.
        """
        if load > self.opts.critical_load:
            return (CRITICAL, msg + ', critical load!')
        if load > self.opts.warning_load:
            return (WARNING, msg + ', high load!')
        return (OK, msg + '.')

    def check(self):
        """Run scheduled cleanup and job pruning, and report the outcome.

        Returns ``ServiceOk`` right away if the top work directory does not
        exist.  Otherwise returns a ``ServiceReport`` carrying the worse of
        the two phase states and both phase messages.
        """
        if not os.path.exists(self.top_workdir):
            self.log.info('The work directory is %s.', self.top_workdir)
            return ServiceOk('No jobs to clean since the working directory '
                             'has not yet been created.')
        self.require_voms_proxy()

        # Run scheduled work, passing it two thirds of the remaining time.
        s_ok, s_retry, s_failed, s_postponed = \
                self.cleaner.run(self.time_left() * 2 / 3)
        # Load is postponed work relative to processed work; the +1 avoids
        # division by zero when nothing was processed.
        s_load = s_postponed / float(s_ok + s_failed + 1)
        s_msg = 'Sched: ' + counted_adjectives(
            [(s_ok, 'ok'),
             (s_retry, 'to retry'),
             (s_failed, 'failed'),
             (s_postponed, 'postponed')], if_empty = 'no work')
        s_service_state, s_msg = self._check_load(s_load, s_msg)

        # Prune ARC jobs if there is time.
        j_result = self.prune_jobs()
        if j_result.is_ok():
            j_cleaned, j_failed, j_postponed = j_result.get()
            j_load = j_postponed / float(j_cleaned + j_failed + 1)
            j_msg = 'Jobfile: ' + counted_adjectives(
                [(j_cleaned, 'cleaned'),
                 (j_failed, 'failed'),
                 (j_postponed, 'postponed')], if_empty = 'no work')
            j_service_state, j_msg = self._check_load(j_load, j_msg)
        else:
            # prune_jobs() ran out of time before it could query arcstat.
            j_service_state = CRITICAL
            j_msg = "No time left for checking ARC jobs."

        # Announce result, using the worse of the two states.
        return ServiceReport(max(s_service_state, j_service_state),
                             s_msg + ' ' + j_msg)
