#!/usr/bin/env python
# Import perf.data into a pandas DataFrame
#
# Copyright (c) 2013-2014, Intel Corporation
# Author: Andi Kleen
#
# This program is free software; you can redistribute it and/or modify it
# under the terms and conditions of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
#
# This program is distributed in the hope it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.

import pandas as pd
import numpy as np
import perfdata
from collections import defaultdict, Counter
import elf
import mmap

ignored = {'type', 'start', 'end', '__recursion_lock__', 'ext_reserved',
           'header_end', 'end_event', 'offset', 'callchain', 'branch',
           'branch_stack', 'end_id', 'size', 'cpumode', 'caller', 'time',
           # skip attr for now, as it is too complex
           # XXX simple representation
           'attr'}

bool_fields = {'kernel', 'hv', 'guest'}

def resolve_list(j, ip, mm, need_line):
     filename, _, foffset = mm.resolve(j.pid, ip)
     sym, soffset, line = elf.resolve_ip(filename, foffset, ip, need_line)
     return [filename, sym, soffset, line]

def resolve_chain(cc, j, mm, need_line):
    if not cc:
        return []
    res = []
    for ip in cc.caller:
        r = [ip,]
        r += resolve_list(j, ip, mm, need_line)
        res.append(r)
    return res

def resolve_branch(branch, j, mm, need_line):
    res = []
    for br in branch:
        # XXX flags
        r = [br['from'], br['to']]
        r += resolve_list(j, br['from'], mm, need_line)
        r += resolve_list(j, br['to'], mm, need_line)
        res.append(r)
    return res

class Path:
    """Store either a callchain or a branch stack as a list with id."""
    def __init__(self, val, id):
        self.val = val
        self.id = id

class Aux:
    """Store auxilliary data to the main pandas perf array, like call chains
       or branch stacks. The data is deduped and a unique id generated."""

    def __init__(self):
        self.ids = dict()
        self.paths = dict()
        self.next_id = 0

    def alloc_id(self):
        id = self.next_id
        self.next_id += 1
        return id

    def add(self, h, create):
        h = tuple(h)
        if h in self.paths:
            return self.paths[h].id
        id = self.alloc_id()
        path = Path(create(), id)
        self.paths[h] = path
        self.ids[id] = path
        return id

    def getid(self, id):
        return self.ids[id]

    def __getitem__(self, id):
	return self.ids[id]

cpumodes = {
    'UNKNOWN': (0, 0, 0),
    'KERNEL': (1, 0, 0),
    'USER': (0, 0, 0),
    'HYPERVISOR': (0, 1, 0),
    'GUEST_KERNEL': (1, 0, 1),
    'GUEST_USER': (0, 0, 1),
}

def samples_to_df(h, need_line):
    """Convert a parsed perf event list to a pandas table.
       The pandas table contains all events in a easily to process format.
       The pandas table has callchain_aux and branch_aux fields pointing
       to Aux object defining the callchains/branches."""
    ev = perfdata.get_events(h)
    index = []
    data = defaultdict(list)
    callchains = Aux()
    branches = Aux()

    used = Counter()
    mm = mmap.MmapTracker()

    for n in range(0, len(ev)):
        mm.lookahead_mmap(ev, n)

        j = ev[n]
        if j.type != "SAMPLE":
            continue

        mm.update_sample(j)
	def add(k, i):
	    data[k].append(i)
	    used[k] += 1

        filename, mmap_base, foffset = mm.resolve(j.pid, j.ip)
        if filename == "[kernel.kallsyms]_text":
            filename = None
        add('filename', filename)
        sym, soffset, line = elf.resolve_ip(filename, foffset, j.ip, need_line)
        add('symbol', sym)
        add('line', line)
        add('soffset', soffset)
        if 'callchain' in j and j.callchain:
            id = callchains.add(j.callchain.caller,
                    lambda: resolve_chain(j.callchain, j, mm, need_line))
            add('callchain', id)
        if 'branch_stack' in j and j.branch_stack:
            branch = j.branch_stack.branch
            id = branches.add(map(lambda x: (x['from'], x.to), branch),
                    lambda: resolve_branch(branch, j, mm, need_line))
            add('branch', id)
        kernel, guest, hv = cpumodes[j['cpumode']]
        add('kernel', kernel)
        add('guest', guest)
        add('hv', hv)
        for name in j:
            if name not in ignored:
                if j[name]:
                    used[name] += 1
                data[name].append(j[name])
        index.append(int(j["time"]))
    for j in data.keys():
        if used[j] == 0:
            del data[j]
    df = pd.DataFrame(data, index=index, dtype=np.uint64)
    for i in bool_fields:
        df[i] = df[i].astype('bool')
    df.branch_aux = branches
    df.callchain_aux = callchains
    return df

def read_samples(fn, need_line=True):
    with open(fn, "rb") as f:
        h = perfdata.perf_file.parse_stream(f)
        df = samples_to_df(h, need_line)
        return df, h.attrs.perf_file_attr.f_attr, h.features

if __name__ == '__main__':
    import argparse
    import sys

    args = argparse.ArgumentParser()
    args.add_argument('file', nargs='?', help='perf.data file to read', 
                          default='perf.data')
    args.add_argument('--repl', action='store_true',
                             help='start python shell with data')
    args.add_argument('--ipython', action='store_true',
			     help='start ipython shell with data')
    p = args.parse_args()
    df, _, _ = read_samples(p.file)
    if p.repl:
        import code
        print df
        code.interact(banner='perf.data is in df', local=locals())
        sys.exit(0)

    if p.ipython:
        try:
            from IPython.terminal.embed import InteractiveShellEmbed
        except NameError:
            sys.exit("Ipython not installed")
        print df
        ipshell = InteractiveShellEmbed(banner1="perf.data is in df")
        ipshell()
        sys.exit(0)

    print df
    print df['filename'].value_counts()
    print df['symbol'].value_counts()
    print df['line'].value_counts()