# Source code for node_application

# This file is part of Sympathy for Data.
# Copyright (c) 2017, Combine Control Systems AB
#
# SYMPATHY FOR DATA COMMERCIAL LICENSE
# You should have received a link to the License with Sympathy for Data.
import inspect
import numpy as np
import warnings

import sklearn.exceptions
import sklearn.feature_selection
import scipy.sparse

from sympathy.api import node
from sympathy.api.nodeconfig import Port, Ports, Tag, Tags
from sympathy.api.exceptions import SyDataError, sywarn

from sylib.machinelearning.model import ModelPort
from sylib.machinelearning.abstract_nodes import SyML_abstract
from sylib.machinelearning.descriptors import Descriptor

from sylib.machinelearning.descriptors import FloatType
from sylib.machinelearning.descriptors import NoneType
from sylib.machinelearning.descriptors import StringSelectionType
from sylib.machinelearning.descriptors import UnionType

from sylib.machinelearning.utility import data_to_table
from sylib.machinelearning.utility import table_to_array
from sylib.machinelearning.utility import array_to_table


class Fit(node.Node):
    """Trains a machine learning model using the input data.

    The node relies on the model class implementation of the ``.fit``
    method, which adjusts the internal parameters of the model based on the
    patterns found in the training dataset.

    It is recommended to first use the
    :ref:`org.sysess.sympathy.machinelearning.simple_split` node to separate
    the data before training, for unbiased verification.

    (Note that some advanced models will not work with this node but must be
    used with the Fit Dataset node from Advanced Machine Learning extension.)

    For unsupervised learning the input port Y can be removed. However, some
    models might not be defined to be trained unsupervised and need to have
    the Y supplied.
    """

    name = 'Fit'
    author = 'Mathias Broxvall'
    icon = 'fit.svg'
    description = (
        'Trains a model. Use "Ports > Input > Y" for '
        'supervised/unsupervised training')
    nodeid = 'org.sysess.sympathy.machinelearning.fit'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([
        ModelPort('Input model', 'in-model'),
        Port.Table('X', name='X'),
        Port.Custom('table', 'Y', name='Y0', n=(0, 1, 1)),
        Port.Custom('table', 'Sample weights',
                    name='sample_weights', n=(0, 1)),
    ])
    outputs = Ports([ModelPort('Output model', 'out-model')])

    def execute(self, node_context):
        """Fit the input model on X (and optional Y / sample weights)."""
        X_tbl = node_context.input['X']
        y_group = node_context.input.group('Y0')
        in_model = node_context.input['in-model']
        out_model = node_context.output['out-model']
        weight_group = node_context.input.group('sample_weights')

        # Both Y and the sample weights live on optional ports.
        Y_tbl = y_group[0] if len(y_group) > 0 else None
        Y = None if Y_tbl is None else table_to_array(Y_tbl, unitary=True)

        sample_weight = None
        if len(weight_group) > 0:
            sample_weight = table_to_array(weight_group[0], unitary=True)

        out_model.source(in_model)
        out_model.load()
        model = out_model.get_skl()

        X = table_to_array(X_tbl)

        # Only hand over a progress callback to models whose fit accepts it.
        fit_kwargs = {}
        if 'progress_fn' in inspect.signature(model.fit).parameters:
            fit_kwargs['progress_fn'] = (
                lambda value: self.set_progress(value))
        if sample_weight is not None:
            fit_kwargs['sample_weight'] = sample_weight

        fit_args = (X,) if Y is None else (X, Y)

        with warnings.catch_warnings():
            warnings.simplefilter(
                'ignore', sklearn.exceptions.ConvergenceWarning)
            model.fit(*fit_args, **fit_kwargs)

        # Record column names on the descriptor before saving the model.
        desc = out_model.get_desc()
        desc.set_x_names(X_tbl.column_names())
        if Y_tbl is not None:
            desc.set_y_names(Y_tbl.column_names())
        desc.post_fit(model)

        out_model.save()
class FitText(node.Node):
    # Fits a model on a list of texts, optionally supervised by a Y table.
    # (Documented with comments so that the class keeps having no docstring.)

    name = 'Fit Texts'
    author = 'Mathias Broxvall'
    icon = 'fit_text.svg'
    description = (
        'Fits a model using lists of texts. '
        'Use "Create Input Port > Y" for supervised training')
    nodeid = 'org.sysess.sympathy.machinelearning.fit_text'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([
        ModelPort('Input model', 'in-model'),
        Port.Custom('[text]', 'X', name='X'),
        Port.Custom('table', 'Y', name='Y0', n=(0, 1, 1)),
    ])
    outputs = Ports([ModelPort('Output model', 'out-model')])

    def execute(self, node_context):
        y_group = node_context.input.group('Y0')
        in_model = node_context.input['in-model']
        out_model = node_context.output['out-model']
        X = [text.get() for text in node_context.input['X']]

        # The Y port is optional.
        Y_tbl = y_group[0] if len(y_group) > 0 else None

        out_model.source(in_model)
        out_model.load()
        model = out_model.get_skl()

        with warnings.catch_warnings():
            warnings.simplefilter(
                'ignore', sklearn.exceptions.ConvergenceWarning)
            fit_args = [X]
            if Y_tbl is not None:
                fit_args.append(table_to_array(Y_tbl, unitary=True))
            model.fit(*fit_args)

        # The whole text list is treated as one input feature.
        desc = out_model.get_desc()
        desc.set_x_names(['corpus'])
        if Y_tbl is not None:
            desc.set_y_names(Y_tbl.column_names())
        desc.post_fit(model)

        out_model.save()
class Predict(node.Node):
    """Applies a model to data.

    Uses the model's implementation of the .predict method to predict Y
    given X. If the model was trained, X must be in the same format as the
    train data.

    (Note that some advanced models will not work with this node but must be
    used with the Predict Dataset node.)
    """

    name = 'Predict'
    author = 'Mathias Broxvall'
    icon = 'predict.svg'
    description = 'Uses a model to predict Y given X'
    nodeid = 'org.sysess.sympathy.machinelearning.predict'
    tags = Tags(Tag.MachineLearning.Apply)

    parameters = node.parameters()
    parameters.set_boolean(
        'pass_x', label='Pass through X', value=False,
        description=(
            'Passes through a copy of X in addition to the predicted values'))

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Table('X', name='X')])
    outputs = Ports([Port.Table('Y', name='Y')])

    def execute(self, node_context):
        """Predict Y from X and write the result to the output table.

        Raises:
            SyDataError: if the descriptor's predict call raises TypeError.
        """
        X_tbl = node_context.input['X']
        Y_tbl = node_context.output['Y']
        in_model = node_context.input['in-model']
        pass_x = node_context.parameters['pass_x'].value

        in_model.load()
        model = in_model.get_skl()
        desc = in_model.get_desc()

        X = table_to_array(X_tbl)
        try:
            Y = desc.predict(model, X)
        except TypeError as exc:
            raise SyDataError(
                "Model does not implement the 'predict' function"
            ) from exc

        # Copy the X columns first so that predicted columns with the same
        # name are written afterwards.
        if pass_x:
            for col in X_tbl.cols():
                Y_tbl.set_column_from_array(col.name, col.data)

        # Treat a 1-D prediction as a single output column.
        if len(Y.shape) < 2:
            Y = Y.reshape(Y.shape + (1,))

        y_names = desc.y_names
        if y_names is None:
            y_names = ["y{0}".format(i) for i in range(Y.shape[1])]
        for i, name in enumerate(y_names):
            Y_tbl.set_column_from_array(name, Y[:, i])
class PredictProbabilities(node.Node):
    """Applies a model to data.

    Uses the model's implementation of the .predict_proba method to predict
    Y given X. But instead of returning the value for Y, the estimated
    probabilities for each class are returned.
    """

    name = 'Predict Probabilities'
    author = 'Mathias Broxvall'
    icon = 'probabilities.svg'
    description = (
        'Uses a model to predict Y given X and returns the estimated '
        'probabilities for each class in Y')
    nodeid = 'org.sysess.sympathy.machinelearning.predict_proba'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Table('X', name='X')])
    outputs = Ports([Port.Table('Y', name='Y')])

    parameters = node.parameters()
    parameters.set_string(
        'names method', label='Output names', value='From classes',
        description='Method used to generate output names',
        editor=node.editors.combo_editor(options=[
            'From classes', 'By index', 'From model Y names']))

    def execute(self, node_context):
        """Write one probability column per class to the output table.

        Raises:
            SyDataError: if calling ``predict_proba`` raises TypeError.
        """
        X_tbl = node_context.input['X']
        Y_tbl = node_context.output['Y']
        in_model = node_context.input['in-model']
        names = node_context.parameters['names method'].value

        in_model.load()
        model = in_model.get_skl()

        X = table_to_array(X_tbl)
        try:
            Y = model.predict_proba(X)
        except TypeError as exc:
            raise SyDataError(
                "Model does not implement the 'predict_proba' function"
            ) from exc

        # A list result is concatenated column-wise into a single matrix.
        if isinstance(Y, list):
            Y = np.concatenate(Y, axis=1)
        if len(Y.shape) < 2:
            Y = Y.reshape(Y.shape + (1,))
        n_cols = Y.shape[1]

        # Index based names are the fallback for every naming method.
        y_names = ["Y{0}".format(i) for i in range(n_cols)]
        if names == 'From classes':
            try:
                class_names = [str(classname)
                               for classname in model.classes_]
            except AttributeError:
                pass
            else:
                # Only trust the class names when there is exactly one per
                # output column; otherwise columns would be dropped or
                # indexed out of range.
                if len(class_names) == n_cols:
                    y_names = class_names
        elif names == 'From model Y names':
            model_names = in_model.get_desc().y_names
            if model_names is not None:
                # Never replace more names than there are output columns.
                count = min(len(model_names), n_cols)
                y_names[:count] = list(model_names)[:count]

        for i, name in enumerate(y_names):
            Y_tbl.set_column_from_array(name, Y[:, i])
class DecisionFunction(node.Node):
    """Predict confidence scores for samples with a model.

    Uses the model's implementation of the .decision_function. The decision
    function is typically used in classification, to get the raw scores or
    distances from the decision boundary for each sample, rather than the
    predicted class labels. In multi-class classification, it returns either
    one score per class or a score matrix, depending on the model and
    configuration.
    """

    name = 'Decision Function'
    author = 'Mathias Broxvall'
    icon = 'decision_function.svg'
    description = (
        'Applies the decision function (if available) of a trained model '
        'to return a scalar for each class of outputs')
    nodeid = 'org.sysess.sympathy.machinelearning.decision_function'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Table('X', name='X')])
    outputs = Ports([Port.Table('Y', name='Y')])

    def execute(self, node_context):
        """Write the decision function scores for X to the output table.

        Column names are taken, in order of preference, from the model
        descriptor's Y names, the model's ``classes_`` attribute, or
        generated as ``y0, y1, ...``. A name source is only used when it
        provides exactly one name per score column.

        Raises:
            SyDataError: if calling ``decision_function`` raises TypeError.
        """
        X_tbl = node_context.input['X']
        Y_tbl = node_context.output['Y']
        in_model = node_context.input['in-model']

        in_model.load()
        model = in_model.get_skl()

        X = table_to_array(X_tbl)
        try:
            Y = model.decision_function(X)
        except TypeError as exc:
            raise SyDataError(
                "Model does not implement 'decision_function'"
            ) from exc

        # Treat a 1-D score vector as a single output column.
        if len(Y.shape) < 2:
            Y = Y.reshape(Y.shape + (1,))
        n_cols = Y.shape[1]

        y_names = in_model.get_desc().y_names
        if y_names is not None and len(y_names) != n_cols:
            y_names = None
        if y_names is None:
            try:
                class_names = [str(classname)
                               for classname in model.classes_]
            except AttributeError:
                # Well, we tried - use fallback names
                pass
            else:
                # The number of score columns need not equal the number of
                # classes (e.g. a single column of scores for a two-class
                # model), so only use the labels when the counts agree.
                if len(class_names) == n_cols:
                    y_names = class_names
        if y_names is None:
            y_names = ["y{0}".format(i) for i in range(n_cols)]

        for i, name in enumerate(y_names):
            Y_tbl.set_column_from_array(name, Y[:, i])
class FitTransform(node.Node):
    """Apply transform model to data.

    This node uses the model's implementation of the .fit_transform()
    method, which is usually a combination of .fit() and .transform(). When
    you call .fit_transform(X), the method first learns from the data
    (.fit()), and then applies the transformation to X (.transform()),
    returning the modified version of the data.

    This node should not be used with test data since it also trains the
    model.

    Class labels can be supplied by enabling the Y port:
    Ports > Input > Y in the configuration GUI.
    """

    name = 'Fit Transform'
    author = 'Mathias Broxvall'
    icon = 'transform.svg'
    description = (
        'Fits a transform model to the given data and computes '
        'the transformed data. ')
    nodeid = 'org.sysess.sympathy.machinelearning.fit_transform'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Table('Input table', name='input'),
                    Port.Custom('table', 'Y', name='Y', n=(0, 1, 0))])
    outputs = Ports([ModelPort('Output model', 'out-model'),
                     Port.Table('Output table', name='output')])

    parameters = node.parameters()
    parameters.set_string(
        'names method', label='Output names', value='From model',
        description='Method used to generate output names',
        editor=node.editors.combo_editor(options=[
            'Copy from input', 'By index', 'From model']))
    parameters.set_boolean(
        'transpose', value=False, label='Transpose output',
        description='Transposes output data, suitable for large '
        'number of features (eg. word counts)')

    def execute(self, node_context):
        """Fit the transform on the input and output the transformed data.

        Raises:
            SyDataError: if calling ``fit_transform`` raises TypeError.
        """
        in_tbl = node_context.input['input']
        y_tbls = node_context.input.group('Y')
        out_tbl = node_context.output['output']
        in_model = node_context.input['in-model']
        out_model = node_context.output['out-model']
        names = node_context.parameters['names method'].value
        transpose = node_context.parameters['transpose'].value

        out_model.source(in_model)
        out_model.load()
        transform = out_model.get_skl()

        X = table_to_array(in_tbl)
        # The Y port is optional.
        has_y = len(y_tbls) > 0
        Y = table_to_array(y_tbls[0], unitary=True) if has_y else None

        try:
            with warnings.catch_warnings():
                warnings.simplefilter(
                    'ignore', sklearn.exceptions.ConvergenceWarning)
                warnings.simplefilter(
                    'ignore', sklearn.exceptions.DataConversionWarning)
                if has_y:
                    Xprim = transform.fit_transform(X, Y)
                else:
                    Xprim = transform.fit_transform(X)
        except TypeError as e:
            sywarn(e)
            raise SyDataError(
                "Model does not implement transforms with only one input"
            ) from e

        desc = out_model.get_desc()
        desc.set_x_names(in_tbl.column_names())
        if has_y:
            desc.set_y_names(y_tbls[0].column_names())
        desc.post_fit(transform)

        out_model.save()

        if names == 'Copy from input':
            cols = in_tbl.column_names()
        elif names == 'From model' and desc.xout_names is not None:
            cols = desc.xout_names
        else:
            # An empty list is passed when no names are available.
            cols = []

        data_to_table(Xprim, cols, out_tbl, transpose=transpose)
class FitTransformText(node.Node):
    # Fits a transform model on a list of texts and outputs the transformed
    # data together with the fitted model.

    name = 'Fit Transform Text'
    author = 'Mathias Broxvall'
    icon = 'transform_text.svg'
    description = (
        'Fits a transform model to the given text data and computes '
        'the transformed data. '
        'Use "Create Input Port > Y" for supervised training')
    nodeid = 'org.sysess.sympathy.machinelearning.fit_transform_text'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Custom('[text]', 'X', name='X')])
    outputs = Ports([ModelPort('Output model', 'out-model'),
                     Port.Table('Output table', name='output')])

    parameters = node.parameters()
    parameters.set_string(
        'names method', label='Output names', value='From model',
        description='Method used to generate output names',
        editor=node.editors.combo_editor(options=[
            'By index', 'From model']))
    parameters.set_boolean(
        'transpose', value=False, label='Transpose output',
        description='Transposes output data, suitable for large '
        'number of features (eg. word counts)')

    def execute(self, node_context):
        """Fit the transform on the texts and output the transformed data.

        Raises:
            SyDataError: if calling ``fit_transform`` raises TypeError.
        """
        out_tbl = node_context.output['output']
        in_model = node_context.input['in-model']
        out_model = node_context.output['out-model']
        names = node_context.parameters['names method'].value
        transpose = node_context.parameters['transpose'].value
        X = [x.get() for x in node_context.input['X']]

        out_model.source(in_model)
        out_model.load()
        transform = out_model.get_skl()

        try:
            Xprim = transform.fit_transform(X)
        except TypeError as exc:
            raise SyDataError(
                "Model does not implement transforms with only one input"
            ) from exc

        desc = out_model.get_desc()
        desc.post_fit(transform)

        out_model.save()

        # Guard on the attribute that is actually used for the names; this
        # node never sets any Y names on the descriptor.
        if names == 'From model' and desc.xout_names is not None:
            cols = desc.xout_names
        else:
            cols = []

        data_to_table(Xprim, cols, out_tbl, transpose=transpose)
class Transform(node.Node):
    """Apply transformation model to data.

    The model's implementation of .transform() is used to transform the
    given data. A transformation model does not predict anything, it only
    transforms the data (e.g. scaling, encoding, dimensionality reduction).
    It returns a modified version of the input data.
    """

    name = 'Transform'
    author = 'Mathias Broxvall'
    icon = 'transform.svg'
    description = 'Applies a transformation model to the given data'
    nodeid = 'org.sysess.sympathy.machinelearning.transform'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Table('Input table', name='input')])
    outputs = Ports([Port.Table('Output table', name='output')])

    parameters = node.parameters()
    parameters.set_string(
        'names method', label='Output names', value='Copy from input',
        description='Method used to generate output names',
        editor=node.editors.combo_editor(options=[
            'Copy from input', 'By index', 'From model']))
    parameters.set_boolean(
        'transpose', value=False, label='Transpose output',
        description='Transposes output data, suitable for large '
        'number of features (eg. word counts)')

    def execute(self, node_context):
        """Transform the input table with the model and output the result.

        Raises:
            SyDataError: if the descriptor's transform call raises TypeError
                or sklearn's NotFittedError.
        """
        in_tbl = node_context.input['input']
        out_tbl = node_context.output['output']
        in_model = node_context.input['in-model']
        names = node_context.parameters['names method'].value
        transpose = node_context.parameters['transpose'].value

        in_model.load()
        transform = in_model.get_skl()
        desc = in_model.get_desc()

        X = table_to_array(in_tbl)
        try:
            Xprim = desc.transform(transform, X)
        except TypeError as exc:
            raise SyDataError(
                "Given model does not implement transforms (one input)"
            ) from exc
        except sklearn.exceptions.NotFittedError as e:
            raise SyDataError(repr(e)) from e

        if names == 'Copy from input':
            cols = in_tbl.column_names()
        elif names == 'From model' and desc.xout_names is not None:
            # Guard on the attribute that is actually used for the names.
            cols = desc.xout_names
        else:
            cols = []

        data_to_table(Xprim, cols, out_tbl, transpose=transpose)
class TransformText(node.Node):
    # Applies an already fitted transformation model to a list of texts.

    name = 'Transform Text'
    author = 'Mathias Broxvall'
    icon = 'transform_text.svg'
    description = 'Applies a transformation model to the given text data'
    nodeid = 'org.sysess.sympathy.machinelearning.transform_text'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Custom('[text]', 'X', name='X')])
    outputs = Ports([Port.Table('Output table', name='output')])

    parameters = node.parameters()
    parameters.set_string(
        'names method', label='Output names', value='From model',
        description='Method used to generate output names',
        editor=node.editors.combo_editor(options=[
            'By index', 'From model']))
    parameters.set_boolean(
        'transpose', value=False, label='Transpose output',
        description='Transposes output data, suitable for large '
        'number of features (eg. word counts)')

    def execute(self, node_context):
        """Transform the input texts with the model and output the result.

        Raises:
            SyDataError: if calling ``transform`` raises TypeError or
                sklearn's NotFittedError.
        """
        out_tbl = node_context.output['output']
        in_model = node_context.input['in-model']
        names = node_context.parameters['names method'].value
        transpose = node_context.parameters['transpose'].value
        X = [x.get() for x in node_context.input['X']]

        in_model.load()
        transform = in_model.get_skl()
        desc = in_model.get_desc()

        try:
            Xprim = transform.transform(X)
        except TypeError as exc:
            raise SyDataError(
                "Given model does not implement transforms (one input)"
            ) from exc
        except sklearn.exceptions.NotFittedError as e:
            raise SyDataError(repr(e)) from e

        # Densify sparse results and make sure the result is 2-D.
        if scipy.sparse.issparse(Xprim):
            Xprim = Xprim.toarray()
        if len(Xprim.shape) < 2:
            Xprim = Xprim.reshape(Xprim.shape + (1,))

        # The output holds transformed features, so it is named after the
        # descriptor's transform output names (as in the fit-transform text
        # node) rather than after the Y names.
        if names == 'From model' and desc.xout_names is not None:
            cols = desc.xout_names
        else:
            cols = []

        data_to_table(Xprim, cols, out_tbl, transpose=transpose)
class InverseTransform(node.Node):
    """Applies the inverse of a transformation model to the given data.

    Some transform models (e.g.
    :ref:`org.sysess.sympathy.machinelearning.standard_scaler`,
    :ref:`org.sysess.sympathy.machinelearning.pca` or
    :ref:`org.sysess.sympathy.machinelearning.label_encoder`) have the
    possibility to reverse the transformation. Given previously transformed
    data, the original can be reconstructed by reversing the transformation.

    The model's implementation of the .inverse_transform() method is used.
    """

    name = 'Inverse Transform'
    author = 'Mathias Broxvall'
    icon = 'inverse_transform.svg'
    description = (
        'Applies the inverse of a transformation model to the given data')
    nodeid = 'org.sysess.sympathy.machinelearning.inverse_transform'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Table('Input table', name='input')])
    outputs = Ports([Port.Table('Output table', name='output')])

    parameters = node.parameters()
    parameters.set_string(
        'names method', label='Output names', value='Copy from input',
        description='Method used to generate output names',
        editor=node.editors.combo_editor(options=[
            'Copy from input', 'By index', 'From model']))
    parameters.set_boolean(
        'transpose', value=False, label='Transpose output',
        description='Transposes output data, suitable for large '
        'number of features (eg. word counts)')

    def execute(self, node_context):
        """Inverse transform the input table and output the result.

        Raises:
            SyDataError: if calling ``inverse_transform`` raises TypeError
                or sklearn's NotFittedError.
        """
        in_tbl = node_context.input['input']
        out_tbl = node_context.output['output']
        in_model = node_context.input['in-model']
        names = node_context.parameters['names method'].value
        transpose = node_context.parameters['transpose'].value

        in_model.load()
        transform = in_model.get_skl()
        desc = in_model.get_desc()

        X = table_to_array(in_tbl)
        try:
            Xprim = transform.inverse_transform(X)
        except TypeError as exc:
            raise SyDataError(
                "Given model does not implement inverse transforms"
            ) from exc
        except sklearn.exceptions.NotFittedError as e:
            raise SyDataError(repr(e)) from e

        if names == 'Copy from input':
            cols = in_tbl.column_names()
        elif names == 'From model' and desc.x_names is not None:
            # The inverse maps back to the model's inputs, so the guard must
            # test the X names that are used for the output columns.
            cols = desc.x_names
        else:
            cols = []

        data_to_table(Xprim, cols, out_tbl, transpose=transpose)
class Score(node.Node):
    # Scores a model against given X and Y data, either with the model's own
    # ``score`` method or by comparing predictions with Y element-wise.

    name = 'Score'
    author = 'Mathias Broxvall'
    icon = 'score.svg'
    description = (
        'Scores the model using given X and Y data. Exact semantics\n'
        'depends on the type of model (classifier, regressor, etc).\n'
        'Typically, for classifiers the score will be the mean accuracy\n'
        '(fraction of correct predictions out of all predictions).')
    nodeid = 'org.sysess.sympathy.machinelearning.score'
    tags = Tags(Tag.MachineLearning.Metrics)

    parameters = node.parameters()
    parameters.set_boolean(
        'default method', label='Use built-in default scoring', value=True,
        description=(
            'Use the default scoring method defined by the used model.\n'
            'Otherwise the problem is assumed to be a classification problem,\n'
            'a single predict call is made and extended information is given\n'
            'for each target. If model does not implement the predict function\n'
            'then a transform is used instead.'))

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Table('X', name='X'),
                    Port.Table('Y', name='Y')])
    outputs = Ports([Port.Table('Score', name='Score')])

    def execute(self, node_context):
        """Compute the score and write it to the 'score' output column.

        With the default method the model's own ``score`` is used. Otherwise
        predictions (or, failing that, transforms) are compared with Y: the
        total score is the fraction of rows where every target matches, and
        with more than one target a per-target column is added as well.

        Raises:
            SyDataError: if the required model method raises TypeError, or
                if predicted and actual Y have different shapes.
        """
        X_tbl = node_context.input['X']
        Y_tbl = node_context.input['Y']
        in_model = node_context.input['in-model']
        score_tbl = node_context.output['Score']
        default_method = node_context.parameters['default method'].value

        in_model.load()
        model = in_model.get_skl()
        desc = in_model.get_desc()

        X = table_to_array(X_tbl)
        Y = table_to_array(Y_tbl)

        # The descriptor may hold no Y names at all; in that case there is
        # nothing to compare against and the Y table's names are used.
        y_names = desc.y_names
        if y_names is None:
            y_names = Y_tbl.column_names()
        elif list(y_names) != list(Y_tbl.column_names()):
            sywarn('Column names for Y does not match those in model')

        if default_method:
            try:
                score = model.score(X, Y)
            except TypeError as exc:
                raise SyDataError(
                    'Given model does not implement the "score" function'
                ) from exc
            else:
                score_tbl.set_column_from_array('score', np.array([score]))
        else:
            try:
                Y_pred = model.predict(X)
            except TypeError:
                try:
                    Y_pred = model.transform(X)
                except TypeError as exc:
                    raise SyDataError(
                        "Given model does not implement neither predict "
                        "nor transform"
                    ) from exc

            # Compare as 2-D arrays, one column per target.
            if len(Y_pred.shape) == 1:
                Y_pred = Y_pred.reshape(Y_pred.shape + (1,))
            if len(Y.shape) == 1:
                Y = Y.reshape(Y.shape + (1,))
            if Y.shape != Y_pred.shape:
                raise SyDataError(
                    'Shape of predicted Y-data {} does not match actual Y {}'
                    .format(Y_pred.shape, Y.shape))

            correct = Y == Y_pred
            score = np.all(correct, axis=1).mean()
            score_tbl.set_column_from_array('score', np.array([score]))
            if len(y_names) > 1:
                # zip stops at the shorter sequence, so surplus names can
                # never index past the available target columns.
                for name, col in zip(y_names, correct.T):
                    score_tbl.set_column_from_array(
                        name, np.array([col.mean()]))
class SelectFromModel(SyML_abstract, node.Node):
    """Model based feature selection.

    Important features are selected based on a trained estimator's
    importance scores. It is typically used with models that have a coef or
    feature_importances attribute, such as linear models or tree-based
    models (e.g., LogisticRegression or RandomForestClassifier). It uses the
    model's learned coefficients or importances to determine which features
    to keep. You can specify a threshold (default or custom) to decide which
    features are considered important enough to retain.

    The given data is transformed to get a reduced feature set. This helps
    improve model performance by removing less useful or redundant features.
    It's particularly useful in pipelines and when you're trying to automate
    feature selection based on model insights.
    """

    name = 'Select Features from Model'
    author = 'Mathias Broxvall'
    icon = 'select_model.svg'
    description = (
        'Meta-transformer for selecting features based on importance weight.\n'
        'Only works for models with coef or feature_importances attributes.'
    )
    nodeid = 'org.sysess.sympathy.machinelearning.select_from_model'
    tags = Tags(Tag.MachineLearning.Apply)

    descriptor = Descriptor()
    descriptor.name = name
    descriptor.set_info([
        {'name': 'threshold',
         'type': UnionType([
             StringSelectionType(['median', 'mean']),
             FloatType(),
             NoneType()], default=None)},
    ], doc_class=sklearn.feature_selection.SelectFromModel)

    parameters = node.parameters()
    SyML_abstract.generate_parameters(parameters, descriptor)

    inputs = Ports([ModelPort('Input model', 'model'),
                    Port.Table('Input data', name='in-data')])
    outputs = Ports([Port.Table('Output data', name='out-data'),
                     Port.Table('Features', name='features')])
    # Append the generated parameter/port documentation to the docstring.
    __doc__ += SyML_abstract.generate_docstring(
        description, descriptor.info, descriptor.attributes, inputs, outputs)

    def execute(self, node_context):
        """Select features from the input data using the given model.

        Writes the selected columns to 'out-data' and a single-row table
        with the selection mask per feature to 'features'.
        """
        model = node_context.input['model']
        in_data = node_context.input['in-data']
        out_data = node_context.output['out-data']
        features = node_context.output['features']

        model.load()
        skl = model.get_skl()
        desc = model.get_desc()

        kwargs = self.__class__.descriptor.get_parameters(
            node_context.parameters)
        kwargs['estimator'] = skl
        # prefit=True is passed since the selector is never fitted here;
        # only transform and get_support are called on it.
        kwargs['prefit'] = True
        sfm = sklearn.feature_selection.SelectFromModel(**kwargs)

        X = table_to_array(in_data)
        Xsel = sfm.transform(X)
        indices = sfm.get_support(indices=True)

        x_names = desc.x_names
        if x_names is None:
            # Generate one fallback name per input column.
            x_names = ["X{}".format(i) for i in range(X.shape[1])]

        array_to_table(np.array(x_names)[indices], Xsel, tbl=out_data)

        support = sfm.get_support()
        array_to_table(x_names, support.reshape((1, len(support))),
                       tbl=features)