Source code for node_metrics

# Copyright (c) 2017, System Engineering Software Society
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the System Engineering Software Society nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.
# IN NO EVENT SHALL SYSTEM ENGINEERING SOFTWARE SOCIETY BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Some of the docstrings for this module have been automatically
extracted from the `scikit-learn <http://scikit-learn.org/>`_ library
and are covered by their respective licenses.
"""

from __future__ import (print_function, division, unicode_literals,
                        absolute_import)
import sklearn
import sklearn.base
import sklearn.metrics
import sklearn.exceptions
import sklearn.model_selection
import sklearn.feature_selection
import scipy.sparse

from sympathy.api import node
from sympathy.api.nodeconfig import Port, Ports, Tag, Tags
from sympathy.api.exceptions import SyNodeError, SyDataError, sywarn

from sylib.machinelearning.model import ModelPort
from sylib.machinelearning.utility import *
from sylib.machinelearning.descriptors import *
from sylib.machinelearning.abstract_nodes import SyML_abstract


class ROCFromProb(SyML_abstract, node.Node):
    # Sympathy node that feeds a table of predicted probabilities and a table
    # of true targets to sklearn.metrics.roc_curve and writes the resulting
    # false positive rates, true positive rates and thresholds to a table.
    #
    # No literal class docstring is used here: ``__doc__`` is assigned
    # explicitly further down from the descriptor and port definitions.
    name = 'ROC from Probabilities'
    author = 'Mathias Broxvall'
    copyright = '(C) 2017 System Engineering Software Society'
    version = '0.1'
    icon = 'roc_curve.svg'
    description = (
        'Computes Receiver operating characteristics (ROC) based on '
        'calculated Y-probabilities and from true Y.')
    nodeid = 'org.sysess.sympathy.machinelearning.roc_prob'
    tags = Tags(Tag.MachineLearning.Metrics)

    # Parameter descriptions. Entries flagged 'no-kw' are read directly in
    # execute() rather than forwarded as keyword arguments.
    descriptor = Descriptor()
    descriptor.name = name
    descriptor.set_info(
        [
            {
                'name': 'pos_label',
                'type': UnionType(
                    [NoneType(), IntType(), StringType()], default=None),
            },
            {
                'name': 'drop_intermediate',
                'type': BoolType(default=True),
            },
            {
                'name': 'header as label',
                'desc': 'Use header of Y-prob as the target label',
                'no-kw': True,
                'type': BoolType(default=True),
            },
        ],
        doc_class=sklearn.metrics.roc_curve)

    parameters = node.parameters()
    SyML_abstract.generate_parameters(parameters, descriptor)

    inputs = Ports([
        Port.Table('Y-prob', name='Y-prob'),
        Port.Table('Y-true', name='Y-true'),
    ])
    outputs = Ports([
        Port.Table('roc', name='roc'),
    ])
    __doc__ = SyML_abstract.generate_docstring(
        description, descriptor.info, descriptor.attributes, inputs, outputs)

    def execute(self, node_context):
        """Compute the ROC curve and store it in the 'roc' output table.

        The 'Y-prob' and 'Y-true' input tables are converted to arrays and
        passed to ``sklearn.metrics.roc_curve`` as ``y_score`` and
        ``y_true``, together with the keyword parameters collected from the
        descriptor. When 'header as label' is enabled, the name of the
        first 'Y-prob' column replaces ``pos_label`` (as an int when the
        name parses as one, otherwise as the raw name).
        """
        prob_table = node_context.input['Y-prob']
        true_table = node_context.input['Y-true']
        out_table = node_context.output['roc']
        use_header = node_context.parameters['header as label'].value

        scores = table_to_array(prob_table)
        truth = table_to_array(true_table)

        roc_args = type(self).descriptor.get_parameters(
            node_context.parameters)
        roc_args['y_true'] = truth
        roc_args['y_score'] = scores

        if BoolType().from_string(use_header):
            header = prob_table.column_names()[0]
            try:
                positive = int(header)
            except ValueError:
                # Non-numeric header: use the column name itself as label.
                positive = header
            roc_args['pos_label'] = positive

        fpr, tpr, thresholds = sklearn.metrics.roc_curve(**roc_args)

        curve_columns = (
            ('false positive rate', fpr),
            ('true positive rate', tpr),
            ('threshold', thresholds),
        )
        for column_name, values in curve_columns:
            out_table.set_column_from_array(column_name, values)
        out_table.set_name('ROC')


class ConfusionFromPrediction(SyML_abstract, node.Node):
    # Sympathy node that computes a confusion matrix with
    # sklearn.metrics.confusion_matrix from a table of predictions and a
    # table of true targets, writing one output column per class.
    #
    # No literal class docstring is used here: ``__doc__`` is assigned
    # explicitly further down from the descriptor and port definitions.
    name = 'Confusion Matrix'
    author = 'Mathias Broxvall'
    copyright = '(C) 2017 System Engineering Software Society'
    version = '0.1'
    icon = 'confusion_matrix.svg'
    description = (
        'Computes the confusion matrix given predictions and true Y-values.')
    nodeid = 'org.sysess.sympathy.machinelearning.confusion'
    tags = Tags(Tag.MachineLearning.Metrics)

    # Parameter descriptions. 'include heading' is flagged 'no-kw' and is
    # read directly in execute() instead of being forwarded to scikit-learn.
    descriptor = Descriptor()
    descriptor.name = name
    descriptor.set_info([
        {'name': 'labels',
         'type': UnionType([
             NoneType(), IntListType(), StringListType()], default="")},
        {'name': 'include heading',
         'desc': 'Adds a columns with used class names',
         'no-kw': True,
         'type': BoolType(default=True)},
    ], doc_class=sklearn.metrics.confusion_matrix)

    parameters = node.parameters()
    SyML_abstract.generate_parameters(parameters, descriptor)

    inputs = Ports([Port.Table('Y-pred', name='Y-pred'),
                    Port.Table('Y-true', name='Y-true')])
    outputs = Ports([Port.Table('confusion-matrix', name='confusion-matrix')])
    __doc__ = SyML_abstract.generate_docstring(
        description, descriptor.info, descriptor.attributes, inputs, outputs)

    def execute(self, node_context):
        """Compute the confusion matrix and write it to the output table.

        The output gets one column per class, named ``str(class)``, holding
        the matching column of the confusion matrix. When 'include heading'
        is enabled, a leading 'label' column lists the classes in row order.
        The classes are the configured ``labels`` or, when none are given,
        the sorted union of the values found in 'Y-true' and 'Y-pred'.
        """
        Y_pred_tbl = node_context.input['Y-pred']
        Y_true_tbl = node_context.input['Y-true']
        cm_tbl = node_context.output['confusion-matrix']
        heading = BoolType().from_string(
            node_context.parameters['include heading'].value)

        Y_pred = table_to_array(Y_pred_tbl)
        Y_true = table_to_array(Y_true_tbl)

        kwargs = self.__class__.descriptor.get_parameters(
            node_context.parameters)
        kwargs['y_true'] = Y_true
        kwargs['y_pred'] = Y_pred
        # An empty list means "no labels configured"; let scikit-learn infer.
        if kwargs['labels'] == []:
            kwargs['labels'] = None
        cm = sklearn.metrics.confusion_matrix(**kwargs)

        if kwargs['labels'] is not None:
            cols = kwargs['labels']
        else:
            # Without explicit labels scikit-learn uses every value that
            # occurs in y_true *or* y_pred, in sorted order. Mirror that so
            # the names line up with the matrix even when a class is
            # predicted but never present in the true targets.
            cols = sorted(set(Y_true.ravel()) | set(Y_pred.ravel()))

        if heading:
            cm_tbl.set_column_from_array('label', np.array(cols))
        for i, col in enumerate(cols):
            cm_tbl.set_column_from_array(str(col), cm[:, i])

        # NOTE(review): assumes get_name() may return None for an unnamed
        # table; fall back to an empty prefix instead of failing on None + str.
        base_name = Y_true_tbl.get_name() or ''
        cm_tbl.set_name(base_name + " confusion matrix")

class LearningCurve(node.Node):
    """Score a model on incrementally larger subsets of the data.

    Wraps ``sklearn.model_selection.learning_curve``. The 'results' output
    holds one row per (training size, cross-validation fold) pair and the
    'statistics' output holds the per-size mean and median scores.
    """

    name = 'Learning Curve'
    author = 'Mathias Broxvall'
    copyright = '(C) 2017 System Engineering Software Society'
    version = '0.1'
    icon = 'learning_curve.svg'
    description = (
        'Generates a learning curve by training model multiple times '
        'on incrementally larger subsets of the data and using '
        'cross validation for scoring. '
        'Plot performance of train-mean vs. test-mean for curve.')
    nodeid = 'org.sysess.sympathy.machinelearning.learningcurve'
    tags = Tags(Tag.MachineLearning.Metrics)

    parameters = node.parameters()
    parameters.set_boolean(
        'shuffle', value=True, label='Shuffle',
        description='Randomizes the input dataset before passed to '
        'internal cross validation')
    parameters.set_float(
        'smallest', value=0.1, label='Smallest fraction',
        description='Size of the smallest dataset as fraction of total')
    parameters.set_integer(
        'steps', value=10, label='Steps',
        description='Number of different sizes of training/test data measured')
    parameters.set_integer(
        'cv', value=3, label='Cross validation folds',
        description='Number of fold of cross-validation (minimum 2)')

    inputs = Ports([
        ModelPort('Model', 'model'),
        Port.Table('X', name='X'),
        Port.Table('Y', name='Y')
    ])
    outputs = Ports([
        Port.Table('results', name='results'),
        Port.Table('statistics', name='statistics')
    ])

    def execute(self, node_context):
        """Run the learning curve and fill the two output tables.

        Only the first column of the 'Y' table is used as target. The
        training sizes are ``steps`` evenly spaced fractions between the
        'smallest' parameter and 1.0.
        """
        X_tbl = node_context.input['X']
        Y_tbl = node_context.input['Y']
        results = node_context.output['results']
        statistics = node_context.output['statistics']
        shuffle = node_context.parameters['shuffle'].value
        smallest = node_context.parameters['smallest'].value
        steps = node_context.parameters['steps'].value
        cv = node_context.parameters['cv'].value

        # Clamp the configured values: fraction into [0, 1], at least 2 folds.
        # NOTE(review): a fraction of exactly 0 is passed on unchanged;
        # scikit-learn documents fractional train sizes as (0, 1], so it is
        # expected to reject that value -- confirm whether a clearer error
        # should be raised here.
        smallest = max(0, min(smallest, 1.0))
        cv = max(2, cv)

        X = table_to_array(X_tbl)
        Y = table_to_array(Y_tbl)

        if shuffle:
            # Apply the same random row order to X and Y so pairs stay aligned.
            perm = np.random.permutation(X.shape[0])
            X = X[perm]
            Y = Y[perm]

        in_model = node_context.input['model']
        in_model.load()
        skl = in_model.get_skl()

        sizes, train_scores, test_scores = (
            sklearn.model_selection.learning_curve(
                skl, X, Y[:, 0], cv=cv,
                train_sizes=np.linspace(smallest, 1.0, steps)))

        # Scores have one row per size and one column per fold. Repeat each
        # size once per fold so it lines up with the row-major ravel() below.
        N = train_scores.shape[1]
        sizes_all = np.repeat(sizes, N)
        results.set_column_from_array("sizes", sizes_all)
        results.set_column_from_array("train", train_scores.ravel())
        results.set_column_from_array("test", test_scores.ravel())

        # Aggregate over the folds (axis 1) to get one value per size.
        statistics.set_column_from_array("sizes", sizes)
        statistics.set_column_from_array("train_mean",
                                         np.mean(train_scores, axis=1))
        statistics.set_column_from_array("test_mean",
                                         np.mean(test_scores, axis=1))
        statistics.set_column_from_array("train_median",
                                         np.median(train_scores, axis=1))
        statistics.set_column_from_array("test_median",
                                         np.median(test_scores, axis=1))


class ConditionalFromCategories(node.Node):
    """Group rows by all X columns and output the mean of each Y column.

    The 'results' output has one row per distinct combination of X values.
    It contains the X columns (the group keys) followed by the per-group
    mean of every Y column that does not share its name with an X column.
    """

    # NOTE(review): 'Probabilty' looks like a typo for 'Probability'. It is
    # left untouched because the name is user-visible and may be referenced
    # elsewhere.
    name = 'Conditional Probabilty from Categories'
    author = 'Mathias Broxvall'
    copyright = '(C) 2017 System Engineering Software Society'
    version = '0.1'
    icon = 'cond_prob_cat.svg'
    description = (
        'Creates groups of all (categorical) features and gives '
        'probabilities for Y. All of X must be categorical and all Y binary')
    nodeid = 'org.sysess.sympathy.machinelearning.cond_prob_cat'
    tags = Tags(Tag.MachineLearning.Metrics)

    parameters = node.parameters()
    inputs = Ports([
        Port.Table('X', name='X'),
        Port.Table('Y', name='Y')
    ])
    outputs = Ports([
        Port.Table('results', name='results'),
    ])

    def execute(self, node_context):
        """Compute the per-group Y means and write them to 'results'.

        Y columns whose name also exists in X are reported with a warning
        and left out of both the computation and the output.
        """
        X_tbl = node_context.input['X']
        Y_tbl = node_context.input['Y']
        results = node_context.output['results']

        X_names = X_tbl.column_names()
        Y_names = Y_tbl.column_names()

        dfXY = X_tbl.to_dataframe().copy()

        # Track which Y columns were actually added, so that the output
        # loop below never touches a skipped column.
        used_y_names = []
        for col_y in Y_names:
            if col_y in X_names:
                sywarn('Column {} exists in both X, Y'.format(col_y))
            else:
                dfXY.loc[:, col_y] = Y_tbl.get_column_to_series(col_y)
                used_y_names.append(col_y)

        # reset_index() turns the group keys back into ordinary columns, so
        # every key column keeps its own dtype instead of being coerced to a
        # common one, and single and multiple key columns are handled alike.
        means = dfXY.groupby(X_names).mean().reset_index()

        for col in list(X_names) + used_y_names:
            # Go through a plain list so numpy picks a native dtype for the
            # column rather than handing over a pandas object array.
            results.set_column_from_array(col, np.array(means[col].tolist()))