#!/usr/bin/python3
#-*- coding: utf-8 -*-
"""
o2locktop is a top-like tool to monitor OCFS2 DLM lock usage in the cluster,
and can be used to detect hot files/directories, which intensively acquire DLM
locks.
"""

from __future__ import print_function
import sys
import signal
import argparse
import multiprocessing
import os
import time
from tempfile import TemporaryFile

# PID of the main process, captured at import time. It is handed to the test
# worker processes so they can send SIGUSR1 back to the main process.
PID = os.getpid()
# Set to 1 after the first failed import, so the sys.path fallback below is
# attempted only once before giving up.
ATTEMPT = 0
while True:
    try:
        from o2locktoplib import util
        from o2locktoplib import dlm
        from o2locktoplib import printer
        from o2locktoplib import keyboard
        from o2locktoplib import config
        from o2locktoplib.retry import retry
        break
    except ImportError:
        if ATTEMPT:
            # Second failure: the extra search paths did not help either.
            print("\no2locktop: error: can't find the o2locktoplib, installed error!\n",
                  file=sys.stderr)
            sys.exit(0)
        else:
            ATTEMPT = 1
        # First failure: append the site-packages directories matching the
        # running interpreter's major.minor version to sys.path, then let the
        # loop retry the imports.
        PYTHON_VERSION = "{0}.{1}".format(str(sys.version_info[0]), str(sys.version_info[1]))
        PACKAGE_PATH_64 = "/usr/lib64/python{0}/site-packages".format(PYTHON_VERSION)
        PACKAGE_PATH = "/usr/lib/python{0}/site-packages".format(PYTHON_VERSION)
        LOCAL_PACKAGE_PATH_64 = "/usr/local/lib64/python{0}/site-packages".format(PYTHON_VERSION)
        LOCAL_PACKAGE_PATH = "/usr/local/lib/python{0}/site-packages".format(PYTHON_VERSION)

        for pk_path in PACKAGE_PATH_64, PACKAGE_PATH, LOCAL_PACKAGE_PATH_64, LOCAL_PACKAGE_PATH:
            if pk_path not in sys.path:
                sys.path.append(pk_path)


def parse_args(args=None):
    """ Parse the command line arguments of o2locktop
    Parameters:
        args(list): The argument strings to parse. If args is None, argparse
                    reads the parameters from sys.argv
    Returns:
        (dict): The parse result. It always contains the keys "mode",
                "mount_point", "log", "display_len" and "debug". If at least
                one '-n' node is given, "mode" is "remote" and the dictionary
                also contains "mount_node" (the first node given) and
                "node_list"; otherwise "mode" is "local"
    Note:
        The function ends the process with sys.exit(0) after printing the
        version ('-V'), when DISPLAY_LENGTH is not greater than 0, or when
        MOUNT_POINT is missing. argparse itself also exits on '-h' and on
        arguments it cannot parse
    """

    description = r"""
It is a top-like tool to monitor OCFS2 DLM lock usage in the cluster, and can
be used to detect hot files/directories, which intensively acquire DLM locks.
"""

    notes = r"""
The average/maximal wait time for DLM lock acquisitions likely gives hints to
the administrator when concern about OCFS2 performance, for example,
- if the workload is unbalanced among nodes.
- if a file is too hot, then maybe need check the related applications above.
- if a directory is too hot, then maybe split it to smaller with less number
  of files underneath.

OUTPUT ANNOTATION:
  - The output is refreshed every 5 seconds, and sorted by the sum of 
    DLM EX(exclusive) and PR(protected read) lock average wait time
  - One row, one inode (including the system meta files if with '-d' argument)
  - Columns:
    "TYPE" is DLM lock types,
      'M' -> Meta data lock for the inode
      'W' -> Write lock for the inode
      'O' -> Open lock for the inode

    "INO" is the inode number of the file

    "EX NUM" is the number of EX lock acquisitions
    "EX TIME" is the maximal wait time to get EX lock
    "EX AVG" is the average wait time to get EX lock

    "PR NUM" is the number of PR(read) lock acquisitions
    "PR TIME" is the maximal wait time to get PR lock
    "PR AVG" is the average wait time to get PR lock

SHORTCUTS:
  - Type "d" to display DLM lock statistics for each node
  - Type "Ctrl+C" or "q" to exit o2locktop process

PREREQUISITES:
  o2locktop reads OCFS2_FS_STATS statistics from /sys/kernel/debug/. That says,
  for all cluster nodes, the kernel option must be set(enabled). Check it out:
      grep OCFS2_FS_STATS < /boot/config-\`uname -r\`

  o2locktop uses the passwordless SSH to OCFS2 nodes as root. Set it up if not:
      ssh-keygen; ssh-copy-id root@node1

EXAMPLES:
  - At any machine within or outside of the cluster:

    o2locktop -n node1 -n node2 -n node3 /mnt/shared

    To find the absolute path according to the inode number:
    find <MOUNT_POINT> -inum <INO>
 
"""

    # RawDescriptionHelpFormatter keeps the line breaks of description/epilog
    parser = argparse.ArgumentParser(description=description,
                                     prog='o2locktop',
                                     epilog=notes,
                                     formatter_class=argparse.RawDescriptionHelpFormatter
                                    )

    # '-n' may be repeated; every occurrence is appended to host_list
    parser.add_argument('-n', metavar='NODE_IP',
                        dest='host_list', action='append',
                        help='OCFS2 node IP address for ssh')

    parser.add_argument('-o', metavar='LOG_FILE', dest='log',
                        action='store',
                        help='log path')

    parser.add_argument('-l', metavar='DISPLAY_LENGTH',
                        dest='display_len', type=int,
                        help='number of lock records to display')

    parser.add_argument('-V', '--version', action="store_true",
                        help='the current version of o2locktop')

    parser.add_argument('-d', '--debug', action="store_true",
                        help='show all the inode including the system inode number')

    # Declared optional so that '-V' works without a mount point; the
    # presence of the mount point is enforced by hand below
    parser.add_argument('mount_point', metavar='MOUNT_POINT', nargs='?',
                        help='OCFS2 mount point, e.g. /mnt/shared')

    args = parser.parse_args(args=args)

    if args.version:
        print(config.VERSION)
        sys.exit(0)
    if args.display_len is not None and args.display_len <= 0:
        util.eprint("\no2locktop: error: The length of the line to show must be greater than 0\n")
        sys.exit(0)
    # The mount point is mandatory in the remote mode as well as in the local
    # mode, so it is checked once before the mode is decided
    if not args.mount_point:
        util.eprint("\no2locktop: error: ocfs2 mount point is needed\n")
        parser.print_usage()
        sys.exit(0)

    if args.host_list:
        # Copy the list so the result does not alias the argparse namespace
        node_list = list(args.host_list)
        return {"mode":"remote",
                "mount_node" : node_list[0],
                "mount_point" : args.mount_point,
                "node_list" : node_list,
                "log" : args.log,
                "display_len" : args.display_len,
                "debug" : args.debug}

    return {"mode":"local",
            "mount_point" : args.mount_point,
            "log" : args.log,
            "display_len" : args.display_len,
            "debug" : args.debug}

def _connection_test_worker(node, mount_point, queue, pid):
    """ The worker of connection_test
    On success the uuid returned by util.get_dlm_lockspace_mp is put into the
    queue. On failure an error message is printed with util.eprint, SIGUSR1 is
    sent to the main process and nothing is put into the queue.
    Parameters:
        node(str): The node name that to be tested
        mount_point(str): mount_point in the node
        queue(multiprocessing.Queue): To send message
        pid: PID of the main process
    """
    start = time.time()
    uuid = util.get_dlm_lockspace_mp(node, mount_point)
    if uuid:
        queue.put(uuid)
        return

    # No uuid: work out the most likely reason, in this order:
    #   1. the lookup took more than 5 seconds -> reported as a network failure
    #   2. passwordless ssh to the node is not set up
    #   3. otherwise the mount point is reported as not found
    if (time.time() - start) > 5:
        message = "\no2locktop: error: network connection to {0} failed\n".format(node)
    elif not util.is_passwdless_ssh_set(node):
        message = ("\no2locktop: error: o2locktop uses the passwordless SSH to OCFS2 nodes"
                   " as root. Set it up if not: \nssh-keygen; ssh-copy-id root@{node}\n"
                   .format(node=node))
    else:
        message = ("\no2locktop: error: can't find the mount point: {0}, "
                   "please cheack and retry\n"
                   .format(mount_point))
    util.eprint(message)

    try:
        os.kill(pid, signal.SIGUSR1)
    except OSError:
        # Best effort: the main process may already be gone or may not be
        # signalable. Only OSError is ignored, so KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        pass


def connection_test(nodes, mount_point):
    """ Run one _connection_test_worker per node in the remote mode and make
    sure that every node reports the same uuid for the mount point
    Parameters:
        nodes(list): Node list of the cluster
        mount_point(str): mount_point in the node
    """
    assert(nodes is not None and nodes)
    uuid_queue = multiprocessing.Queue()
    workers = []
    for node in nodes:
        worker = multiprocessing.Process(target=_connection_test_worker,
                                         args=(node, mount_point, uuid_queue, PID))
        worker.daemon = True
        worker.start()
        workers.append(worker)

    # Wait for all the workers before reading their results
    for worker in workers:
        worker.join()

    # The first uuid is the reference, all the other ones must be equal to it
    reference = uuid_queue.get()
    remaining = len(workers) - 1
    while remaining > 0:
        candidate = uuid_queue.get()
        if reference != candidate:
            util.eprint("\no2locktop: error: can't find the shared storage in the cluster, "\
                        "check if the node in the command line has input errors\n")
            sys.exit(0)
        remaining -= 1

def _connection_ocfs2_debug_test_worker(node, pid):
    """ The worker of connection_ocfs2_debug_test
    If util.is_kernel_ocfs2_fs_stats_enabled reports a false value for the
    node, an error is printed, SIGUSR1 is sent to the main process and the
    worker exits.
    Parameters:
        node(str): The node name that to be tested
        pid: PID of the main process
    """
    if util.is_kernel_ocfs2_fs_stats_enabled(node):
        return

    template = ("\no2locktop: error: the node({0}) do not support ocfs2 debug, "
                "please cheack and retry\n")
    util.eprint(template.format(node))
    os.kill(pid, signal.SIGUSR1)
    sys.exit(0)

# Worker processes started by connection_ocfs2_debug_test() and
# remote_cmd_test(); main() joins every entry before starting the display.
PROCESS_LIST = []
def connection_ocfs2_debug_test(nodes):
    """ Start one worker process per node that checks if the ocfs2 debug flag
    is set on that node
    Parameters:
        nodes(list): The node list that to be tested, must not be empty
    Raises:
        AssertionError: If nodes is None or empty
    """
    # Same precondition as connection_test and remote_cmd_test: the list
    # itself must be non-empty
    assert(nodes is not None and nodes)
    # The workers are not joined here, they are only recorded in PROCESS_LIST;
    # the caller must wait for the children processes to finish
    for node in nodes:
        process = multiprocessing.Process(target=_connection_ocfs2_debug_test_worker,
                                          args=(node, PID))
        process.daemon = True
        process.start()
        PROCESS_LIST.append(process)

def _remote_cmd_test_worker(node, pid, uuid):
    """ The worker of remote_cmd_test
    Checks the commands of config.CMDS on the node; on failure an error is
    printed, SIGUSR1 is sent to the main process and the worker exits.
    Parameters:
        node(str): The node name that to be tested
        pid: PID of the main process
        uuid: The uuid of ocfs2 device
    """
    check = util.cmd_is_exist(config.CMDS, node)
    if not check[0]:
        # check[1] is reported to the user as the command that is missing
        template = ("\no2locktop: error: the node({0}) do not have the command {1}, "
                    "please install and retry\n")
        util.eprint(template.format(node, check[1]))
        os.kill(pid, signal.SIGUSR1)
        sys.exit(0)

    # if the remote node supports ocfs2 debug v4, set the v4 filter interval
    if util.check_support_debug_v4_and_get_interval(uuid, node):
        util.set_debug_v4_interval(uuid, node, config.INTERVAL*2+1)

def remote_cmd_test(nodes, mount_point):
    """ Start one worker process per node that tests if all the required
    commands are available in the node environment
    Parameters:
        nodes(list): The node list that to be tested
        mount_point(str): mount_point in the node
    """
    assert(nodes is not None and nodes)
    # The uuid is looked up once, on the first node, and shared by all workers
    first_node = nodes[0]
    lockspace_uuid = util.get_dlm_lockspace_mp(first_node, mount_point)
    for node in nodes:
        worker = multiprocessing.Process(target=_remote_cmd_test_worker,
                                         args=(node, PID, lockspace_uuid))
        worker.daemon = True
        worker.start()
        # The worker is not joined here, only recorded in PROCESS_LIST
        PROCESS_LIST.append(worker)


def local_test(mount_point):
    """ Exit with an error message if util.get_dlm_lockspace_mp returns no
    uuid for the local mount_point
    Parameters:
        mount_point(str): The local mount point to be tested
    """
    if util.get_dlm_lockspace_mp(None, mount_point):
        return

    template = ("\no2locktop: error: can't find the mount point: {0}, "
                "please cheack and retry\n")
    util.eprint(template.format(mount_point))
    sys.exit(0)

def local_ocfs2_debug_test():
    """ The local version of connection_ocfs2_debug_test: exit with an error
    message if util.is_kernel_ocfs2_fs_stats_enabled reports a false value
    """
    if util.is_kernel_ocfs2_fs_stats_enabled():
        return

    template = ("\no2locktop: error: the node({0}) do not support ocfs2 debug, "
                "please cheack and retry\n")
    util.eprint(template.format("localhost"))
    sys.exit(0)

def local_cmd_test(mount_point):
    """ The local version of remote_cmd_test: check the commands of
    config.CMDS on the local node and exit with an error message on failure
    Parameters:
        mount_point(str): The local mount point, used to look up the uuid
    """
    check = util.cmd_is_exist(config.CMDS)
    if not check[0]:
        # check[1] is reported to the user as the command that is missing
        template = ("\no2locktop: error: the local node do not have the command {0}, "
                    "please install and retry\n")
        util.eprint(template.format(check[1]))
        sys.exit(0)

    # if the local node supports ocfs2 debug v4, set the v4 filter interval
    lockspace_uuid = util.get_dlm_lockspace_mp(None, mount_point)
    if util.check_support_debug_v4_and_get_interval(lockspace_uuid, None):
        util.set_debug_v4_interval(lockspace_uuid, None, config.INTERVAL*2+1)

def main():
    """
    The main function of o2locktop

    Parses the command line, runs the remote or the local precondition tests,
    waits for the test worker processes, then starts the printer process and
    the dlm process and runs keyboard.worker in the main process. When
    keyboard.worker returns, the dlm process is terminated, the printer
    process is joined and the program exits with status 0.
    """
    args = parse_args()

    # can also use dup2
    # Unless config.DEBUG is set, everything written to sys.stderr from here
    # on goes to an anonymous temporary file instead of the terminal.
    if not config.DEBUG:
        tmp_stderr = TemporaryFile('w+t')
        sys.stderr = tmp_stderr

    # NOTE(review): retry comes from o2locktoplib.retry and is not visible
    # here; presumably it re-runs the handler up to 10 times on failure -
    # confirm against its implementation.
    @retry(10)
    def sigcont_handler(signum, frame):
        """
        SIGCONT handler, for the state of Ctrl+z
        """
        keyboard.set_terminal()
        # the pass is useful for test
        pass

    @retry(10)
    def sigusr1_handler(signum, frame):
        """
        SIGUSR1 handler
        """
        # SIGUSR1 is what the test workers send to the main process after
        # they printed an error: reset the terminal and leave immediately.
        keyboard.reset_terminal()
        os._exit(0)

    # Installed before the tests below start their worker processes.
    signal.signal(signal.SIGUSR1, sigusr1_handler)

    log = args["log"]
    display_len = args["display_len"]
    debug = args["debug"]

    if args['mode'] == "remote":
        mount_host, mount_point = args["mount_node"], args["mount_point"]
        nodes = args["node_list"]
        # connection_test waits for its workers itself; the two tests after it
        # only start workers and record them in PROCESS_LIST.
        connection_test(nodes, mount_point)
        connection_ocfs2_debug_test(nodes)
        remote_cmd_test(nodes, mount_point)
        lock_space_str = util.get_dlm_lockspace_mp(mount_host, mount_point)
        max_sys_inode_num = util.get_dlm_lockspace_max_sys_inode_number(mount_host, mount_point)
        mount_info = ':'.join([mount_host, mount_point])
    elif args['mode'] == "local":
        mount_point = args["mount_point"]
        local_test(mount_point)
        local_ocfs2_debug_test()
        local_cmd_test(mount_point)
        lock_space_str = util.get_dlm_lockspace_mp(None, mount_point)
        max_sys_inode_num = util.get_dlm_lockspace_max_sys_inode_number(None, mount_point)
        mount_info = mount_point

    # Exits silently (status 0) when no lockspace was returned.
    if lock_space_str is None:
        #print("Error while getting lockspace")
        sys.exit(0)

    # In the local mode there is no node list; dlm.worker receives None.
    if args["mode"] == "local":
        nodes = None

    # Wait for the workers recorded by the remote tests before going on.
    for process in PROCESS_LIST:
        process.join()

    # The queue is shared by the printer process, the dlm process and the
    # keyboard worker.
    printer_queue = multiprocessing.Queue()
    printer_process = multiprocessing.Process(target=printer.worker,
                                              args=(printer_queue, log),
                                              kwargs={"mount_info":mount_info})
    printer_process.daemon = True
    lock_space_process = multiprocessing.Process(target=dlm.worker,
                                                 args=(lock_space_str,
                                                       max_sys_inode_num,
                                                       debug,
                                                       display_len,
                                                       nodes,
                                                       printer_queue))

    lock_space_process.daemon = True
    printer_process.start()
    lock_space_process.start()

    # The SIGCONT handler is installed only now, after both processes were
    # started and right before the keyboard worker takes over.
    signal.signal(signal.SIGCONT, sigcont_handler)
    # Runs in the main process; the code below is reached when it returns.
    keyboard.worker(printer_queue)

    lock_space_process.terminate()
    lock_space_process.join()

    #printer_process will exit on quit message
    #printer_process.terminate()
    printer_process.join()


    sys.exit(0)

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C ends the program quietly; only the terminal is reset
        keyboard.reset_terminal()
