#!/usr/bin/env python
##
#
# Vaisala software source code file
#
# Copyright (c) Vaisala Oyj 2015. All rights reserved.
#
##
"""This program goes through hybrid scans, records how long it typically takes
to do the whole task and then goes through combined hybrids which took either
too long or too short a time compared to most hybrids.

The problem with hybrid scans is that there is no way to identify an execution
of a scan and because a hybrid scan consists of multiple files, we may end up
creating a hybrid scan from files which are not part of the same execution of
a task.

Let's consider IRIS hybrid task executions A and A'.

Happy path:
 1. scan-updater-service notices a file (A1) added and adds it to scans.
 2. scan-updater-service runs through its hybrid matching logic and sees scan
    A has the cardinality (how many files it consists of) of 2. It only has
    A1, but not A2 so it cannot create a hybrid scan out of them (in the scan
    database).
 3. scan-updater-service notices a file (A2) added and adds it to scans.
 4. scan-updater-service runs through its hybrid matching logic and sees the
    hybrid parts A1 and A2 are there. They are combined into a hybrid A* and
    are now visible through scan-http-service.

Hybrid parts missing path:
 1. scan-updater-service notices a file (A1) added and adds it to scans.
 2. scan-updater-service runs through its hybrid matching logic and sees scan
    A has the cardinality of 2. It only has A1, but no A2 so it cannot create
    a hybrid scan yet.
 3. scan-updater-service notices a file (A'2) added and adds it to scans.
 4. scan-updater-service runs through its hybrid matching logic and sees the
    hybrid parts A1 and A'2 are there. Because scan-updater-service has no
    metadata linking hybrid scan parts, it creates a hybrid, A*.

The only distinguishing factor for the parts missing anomaly is the time it
takes for the radar to execute the whole scan. I.e. if it takes the radar
normally 10 minutes to execute the scan, but some of the hybrids actually took
20 minutes, there clearly must be something fishy going on. This tool is made
for that.
"""
from __future__ import print_function

import collections
import datetime
import sqlite3
import sys
import time

from scan_service import scan_db


def pr(*args, **kwargs):
    """Print ``args`` like the built-in ``print``, with different defaults.

    Unless the caller passes them explicitly, ``file`` defaults to
    ``sys.stderr`` and ``sep`` defaults to a tab character. All other keyword
    arguments are forwarded to ``print`` untouched.

    Returns whatever ``print`` returns (``None``).
    """
    kwargs.setdefault('file', sys.stderr)
    kwargs.setdefault('sep', '\t')
    return print(*args, **kwargs)


def median(items):
    """Return the median of ``items``.

    The input is sorted into a new list first, so any iterable of mutually
    comparable numbers is accepted and the caller's data is left untouched.

    Returns 0 for empty input, the middle element for an odd number of items
    and the (float) average of the two middle elements for an even number.
    """
    ordered = sorted(items)
    count = len(ordered)
    if not count:
        return 0

    upper_middle, is_odd = divmod(count, 2)
    if is_odd:
        # For an odd count the floor of count / 2 is exactly the middle index.
        return ordered[upper_middle]
    return (ordered[upper_middle - 1] + ordered[upper_middle]) / 2.0


def mean(items):
    """Return the arithmetic mean of ``items`` as a float.

    Any iterable of numbers is accepted; it is materialised into a list so
    that one-shot iterators work as well.

    Returns 0 for empty input, mirroring what ``median`` does, instead of
    raising ``ZeroDivisionError``.
    """
    values = list(items)
    if not values:
        return 0
    # float() keeps the division true division under Python 2 as well.
    return sum(values) / float(len(values))


def _fetch_hybrid_scans(cursor, hybrid_id):
    """Return all scan rows belonging to ``hybrid_id``, oldest timestamp first."""
    return cursor.execute("SELECT * FROM scans WHERE hybrid_id = :id ORDER BY timestamp ASC",
                          {'id': hybrid_id}).fetchall()


def _sweep_span_seconds(scan_rows):
    """Return the seconds between the timestamps of the first and last row.

    ``scan_rows`` must be non-empty and ordered by ascending timestamp.
    """
    return (scan_rows[-1].timestamp - scan_rows[0].timestamp).total_seconds()


def main(args):
    """Print a duration summary per (radar, task) and list suspicious hybrids.

    For every row in ``hybrids`` the time between the first and the last of
    its scan rows is computed. The mean and median of those durations are
    printed per (radar_id, task_name). After that, every hybrid whose duration
    differs from the selected statistical measure of its group by more than
    10% of that measure is printed together with its scan rows.

    All output goes through ``pr``.

    Args:
        args: object with the attributes ``db_file`` (path of the SQLite
            database) and ``statistical_measure`` (``'mean'`` or ``'median'``).

    Raises:
        ValueError: if ``args.statistical_measure`` is not a supported measure.
    """
    measure_functions = {'mean': mean, 'median': median}
    if args.statistical_measure not in measure_functions:
        # Fail early and clearly instead of hitting an unbound local later on.
        raise ValueError("Unsupported statistical measure: {!r}".format(args.statistical_measure))
    selected_measure = measure_functions[args.statistical_measure]

    db_conn = sqlite3.connect(args.db_file, detect_types=sqlite3.PARSE_DECLTYPES)
    try:
        scan_db.create_db(db_conn)
        db_conn.row_factory = scan_db.namedtuple_factory
        # http://stackoverflow.com/a/27165929
        db_conn.execute("PRAGMA busy_timeout = %i" % 5000)
        cursor = db_conn.cursor()

        # Durations grouped by (radar_id, task_name), used for the statistics.
        times = collections.defaultdict(list)
        # (hybrid row, duration) pairs in query order, so the second pass
        # judges exactly the durations the statistics were computed from.
        hybrid_spans = []

        rows = cursor.execute("SELECT * FROM hybrids ORDER BY timestamp DESC").fetchall()
        for row in rows:
            scan_rows = _fetch_hybrid_scans(cursor, row.id)
            if not scan_rows:
                # Without scans there is no duration to measure.
                pr("Skipping hybrid without scans:", row.id)
                continue

            span = _sweep_span_seconds(scan_rows)
            times[(row.radar_id, row.task_name)].append(span)
            hybrid_spans.append((row, span))

        pr("Statistical summary (time (s) between first and last sweep start):")
        pr("")
        pr("", "RADAR", "TASK", "MEAN", "MEDIAN")
        # Iterating the dict directly works on both Python 2 and Python 3.
        for key in times:
            radar_id, task_name = key
            pr("", radar_id, task_name, mean(times[key]), median(times[key]))

        time.sleep(2)

        # Compute the reference value once per group rather than once per hybrid.
        measures = {key: selected_measure(spans) for key, spans in times.items()}

        suspicious_count = 0
        pr(u"\nSuspicious hybrid scans\n"
           u"    (time between first and last sweep start more than 10% off {}):"
           .format(args.statistical_measure))
        for row, span in hybrid_spans:
            measure = measures[(row.radar_id, row.task_name)]
            distance = abs(span - measure)
            if distance > measure / 10.0:
                pr(row.radar_id, row.task_name, row.timestamp, span, distance)
                # Scan rows are only needed for reporting, so fetch them lazily.
                for scan_row in _fetch_hybrid_scans(cursor, row.id):
                    pr(" ", scan_row.hybrid_minor_task_id, scan_row.timestamp)
                suspicious_count += 1

        pr("")
        pr(u"Number of suspicious scans: {}".format(suspicious_count))
    finally:
        db_conn.close()

if __name__ == '__main__':
    # Script entry point: parse the command line and run the analysis.
    import argparse

    arg_parser = argparse.ArgumentParser()
    # Path of the SQLite scan database to analyse (mandatory).
    arg_parser.add_argument(
        '--db-file',
        dest='db_file',
        required=True,
    )
    # Which statistic a hybrid's duration is compared against.
    arg_parser.add_argument(
        '--statistical-measure',
        type=str,
        choices=("mean", "median"),
        default="median",
    )
    args = arg_parser.parse_args()
    main(args)
