# This file is part of Sympathy for Data.
# Copyright (c) 2017, Combine Control Systems AB
#
# SYMPATHY FOR DATA COMMERCIAL LICENSE
# You should have received a link to the License with Sympathy for Data.
import inspect
import numpy as np
import warnings
import sklearn.exceptions
import sklearn.feature_selection
import scipy.sparse
from sympathy.api import node
from sympathy.api.nodeconfig import Port, Ports, Tag, Tags
from sympathy.api.exceptions import SyDataError, sywarn
from sylib.machinelearning.model import ModelPort
from sylib.machinelearning.abstract_nodes import SyML_abstract
from sylib.machinelearning.descriptors import Descriptor
from sylib.machinelearning.descriptors import FloatType
from sylib.machinelearning.descriptors import NoneType
from sylib.machinelearning.descriptors import StringSelectionType
from sylib.machinelearning.descriptors import UnionType
from sylib.machinelearning.utility import data_to_table
from sylib.machinelearning.utility import table_to_array
from sylib.machinelearning.utility import array_to_table
[docs]
class Fit(node.Node):
    """Trains a machine learning model on the input data.

    The node calls the ``fit`` method of the model on the input port, which
    adjusts the internal parameters of the model based on the patterns found
    in the training dataset. It is recommended to first use the
    :ref:`org.sysess.sympathy.machinelearning.simple_split` node to separate
    the data before training, for unbiased verification. (Note that some
    advanced models will not work with this node but must be used with the
    Fit Dataset node from the Advanced Machine Learning extension.)

    For unsupervised learning the input port Y can be removed. However, some
    models cannot be trained unsupervised and need to have Y supplied.
    """

    name = 'Fit'
    author = 'Mathias Broxvall'
    icon = 'fit.svg'
    description = (
        'Trains a model. Use "Ports > Input > Y" for supervised/unsupervised training')
    nodeid = 'org.sysess.sympathy.machinelearning.fit'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([
        ModelPort('Input model', 'in-model'),
        Port.Table('X', name='X'),
        Port.Custom('table', 'Y', name='Y0', n=(0, 1, 1)),
        Port.Custom('table', 'Sample weights',
                    name='sample_weights', n=(0, 1)),
    ])
    outputs = Ports([ModelPort('Output model', 'out-model')])

    def execute(self, node_context):
        """Fit the input model on X (and optionally Y / sample weights).

        The fitted model, together with the X and Y column names recorded
        in its descriptor, is saved to the output model port.
        """
        x_table = node_context.input['X']
        y_group = node_context.input.group('Y0')
        source_model = node_context.input['in-model']
        target_model = node_context.output['out-model']
        weight_group = node_context.input.group('sample_weights')

        # Both the Y port and the sample weight port are optional.
        y_table = y_group[0] if len(y_group) > 0 else None
        Y = (None if y_table is None
             else table_to_array(y_table, unitary=True))
        sample_weight = (
            table_to_array(weight_group[0], unitary=True)
            if len(weight_group) > 0 else None)

        target_model.source(source_model)
        target_model.load()
        estimator = target_model.get_skl()
        X = table_to_array(x_table)

        # Only hand over a progress callback to fit methods that declare
        # a ``progress_fn`` parameter.
        extra = {}
        if 'progress_fn' in inspect.signature(estimator.fit).parameters:
            extra['progress_fn'] = lambda i: self.set_progress(i)

        # Assemble one call instead of branching on every combination of
        # present/absent Y and sample weights.
        fit_args = (X,) if Y is None else (X, Y)
        fit_kwargs = {}
        if sample_weight is not None:
            fit_kwargs['sample_weight'] = sample_weight
        fit_kwargs.update(extra)

        with warnings.catch_warnings():
            warnings.simplefilter(
                'ignore', sklearn.exceptions.ConvergenceWarning)
            estimator.fit(*fit_args, **fit_kwargs)

        descriptor = target_model.get_desc()
        descriptor.set_x_names(x_table.column_names())
        if y_table is not None:
            descriptor.set_y_names(y_table.column_names())
        descriptor.post_fit(estimator)
        target_model.save()
[docs]
class FitText(node.Node):
    """Fits a model using a list of texts as X.

    The texts on the X port are passed as a plain list to the ``fit`` method
    of the model. If the optional Y port is present its table is converted
    to an array and passed as the second argument (supervised training).
    """

    name = 'Fit Texts'
    author = 'Mathias Broxvall'
    icon = 'fit_text.svg'
    description = (
        'Fits a model using lists of texts. '
        'Use "Create Input Port > Y" for supervised training')
    nodeid = 'org.sysess.sympathy.machinelearning.fit_text'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([
        ModelPort('Input model', 'in-model'),
        Port.Custom('[text]', 'X', name='X'),
        Port.Custom('table', 'Y', name='Y0', n=(0, 1, 1)),
    ])
    outputs = Ports([ModelPort('Output model', 'out-model')])

    def execute(self, node_context):
        """Fit the input model on the texts and save it to the output."""
        y_group = node_context.input.group('Y0')
        source_model = node_context.input['in-model']
        target_model = node_context.output['out-model']
        texts = [item.get() for item in node_context.input['X']]

        # The Y port is optional; without it the model is fitted on X only.
        y_table = y_group[0] if len(y_group) > 0 else None
        supervised = y_table is not None

        target_model.source(source_model)
        target_model.load()
        estimator = target_model.get_skl()

        with warnings.catch_warnings():
            warnings.simplefilter(
                'ignore', sklearn.exceptions.ConvergenceWarning)
            if supervised:
                estimator.fit(texts, table_to_array(y_table, unitary=True))
            else:
                estimator.fit(texts)

        descriptor = target_model.get_desc()
        # The text input is recorded as a single X column named 'corpus'.
        descriptor.set_x_names(['corpus'])
        if supervised:
            descriptor.set_y_names(y_table.column_names())
        descriptor.post_fit(estimator)
        target_model.save()
[docs]
class Predict(node.Node):
    """Applies a model to data.

    Uses the model's implementation of the ``predict`` method (called
    through the model descriptor) to predict Y given X. If the model was
    trained, X must be in the same format as the training data. (Note that
    some advanced models will not work with this node but must be used with
    the Predict Dataset node.)

    Output columns are named after the Y names stored in the model. When the
    model has no Y names, or their number does not match the number of
    predicted columns, the names ``y0``, ``y1``, ... are used instead.
    """

    name = 'Predict'
    author = 'Mathias Broxvall'
    icon = 'predict.svg'
    description = 'Uses a model to predict Y given X'
    nodeid = 'org.sysess.sympathy.machinelearning.predict'
    tags = Tags(Tag.MachineLearning.Apply)

    parameters = node.parameters()
    parameters.set_boolean(
        'pass_x', label='Pass through X', value=False,
        description=(
            'Passes through a copy of X in addition to the predicted values'))

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Table('X', name='X')])
    outputs = Ports([Port.Table('Y', name='Y')])

    def execute(self, node_context):
        """Predict Y for the X table and write the result to the Y port.

        Raises:
            SyDataError: if the prediction call raises ``TypeError``.
        """
        X_tbl = node_context.input['X']
        Y_tbl = node_context.output['Y']
        in_model = node_context.input['in-model']
        pass_x = node_context.parameters['pass_x'].value

        in_model.load()
        model = in_model.get_skl()
        desc = in_model.get_desc()

        X = table_to_array(X_tbl)
        try:
            Y = desc.predict(model, X)
        except TypeError as exc:
            raise SyDataError(
                "Model does not implement the 'predict' function"
            ) from exc

        if pass_x:
            for col in X_tbl.cols():
                Y_tbl.set_column_from_array(col.name, col.data)

        # Treat a 1-D prediction as a single output column.
        if len(Y.shape) < 2:
            Y = Y.reshape(Y.shape + (1,))

        n_outputs = Y.shape[1]
        y_names = desc.y_names
        # Too many names would index past the last column of Y and too few
        # would silently drop columns, so only trust names that match.
        if y_names is None or len(y_names) != n_outputs:
            y_names = ["y{0}".format(i) for i in range(n_outputs)]

        for i, name in enumerate(y_names):
            Y_tbl.set_column_from_array(name, Y[:, i])
[docs]
class PredictProbabilities(node.Node):
    """Applies a model to data and returns class probabilities.

    Uses the model's implementation of the ``predict_proba`` method to
    predict Y given X. Instead of returning the value for Y, the estimated
    probabilities for each class are returned, one output column per class.

    The *Output names* parameter selects how the columns are named:

    - *From classes*: the model's ``classes_`` attribute, when it exists and
      has one entry per output column.
    - *From model Y names*: the Y names stored in the model, for as many
      leading columns as there are names.
    - *By index*: ``Y0``, ``Y1``, ...

    Index names are used for any column that the selected method cannot
    name.
    """

    name = 'Predict Probabilities'
    author = 'Mathias Broxvall'
    icon = 'probabilities.svg'
    # Fixed: the two literals were concatenated without a separating space.
    description = (
        'Uses a model to predict Y given X and returns the estimated '
        'probabilities for each class in Y')
    nodeid = 'org.sysess.sympathy.machinelearning.predict_proba'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Table('X', name='X')])
    outputs = Ports([Port.Table('Y', name='Y')])

    parameters = node.parameters()
    parameters.set_string(
        'names method',
        label='Output names',
        value='From classes',
        description='Method used to generate output names',
        editor=node.editors.combo_editor(options=[
            'From classes', 'By index', 'From model Y names']))

    def execute(self, node_context):
        """Write the predicted class probabilities for X to the Y port.

        Raises:
            SyDataError: if calling ``predict_proba`` raises ``TypeError``.
        """
        X_tbl = node_context.input['X']
        Y_tbl = node_context.output['Y']
        in_model = node_context.input['in-model']
        names = node_context.parameters['names method'].value

        in_model.load()
        model = in_model.get_skl()

        X = table_to_array(X_tbl)
        # NOTE(review): only TypeError is translated into a SyDataError; a
        # model without predict_proba would presumably raise AttributeError
        # instead - confirm whether that should be reported the same way.
        try:
            Y = model.predict_proba(X)
        except TypeError as exc:
            raise SyDataError(
                "Model does not implement the 'predict_proba' function"
            ) from exc

        # A list result holds one probability array per output; they are
        # placed side by side.
        if isinstance(Y, list):
            Y = np.concatenate(Y, axis=1)
        if len(Y.shape) < 2:
            Y = Y.reshape(Y.shape + (1,))

        n_cols = Y.shape[1]
        # Index based names are the fallback for every naming method.
        y_names = ["Y{0}".format(i) for i in range(n_cols)]

        if names == 'From classes':
            classes = getattr(model, 'classes_', None)
            # Only use the class labels when there is exactly one label per
            # column; a list of per-output label arrays does not qualify.
            if (classes is not None
                    and not isinstance(classes, list)
                    and len(classes) == n_cols):
                y_names = [str(classname) for classname in classes]
        elif names == 'From model Y names':
            cols = in_model.get_desc().y_names
            if cols is not None:
                # Never take more names than there are columns, otherwise
                # the loop below would index past the end of Y.
                cols = list(cols)[:n_cols]
                y_names[:len(cols)] = cols

        for i, name in enumerate(y_names):
            Y_tbl.set_column_from_array(name, Y[:, i])
[docs]
class DecisionFunction(node.Node):
    """Predict confidence scores for samples with a model.

    Uses the model's implementation of ``decision_function``. The decision
    function is typically used in classification, to get the raw scores or
    distances from the decision boundary for each sample, rather than the
    predicted class labels. In multi-class classification, it returns either
    one score per class or a score matrix, depending on the model and
    configuration.

    Output columns are named, in order of preference, after the Y names
    stored in the model, the model's ``classes_`` attribute, or
    ``y0``, ``y1``, ... A source of names is only used when it provides
    exactly one name per output column.
    """

    name = 'Decision Function'
    author = 'Mathias Broxvall'
    icon = 'decision_function.svg'
    description = (
        'Applies the decision function (if available) of a trained model '
        'to return a scalar for each class of outputs')
    nodeid = 'org.sysess.sympathy.machinelearning.decision_function'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Table('X', name='X')])
    outputs = Ports([Port.Table('Y', name='Y')])

    def execute(self, node_context):
        """Write the decision function values for X to the Y port.

        Raises:
            SyDataError: if calling ``decision_function`` raises
                ``TypeError``.
        """
        X_tbl = node_context.input['X']
        Y_tbl = node_context.output['Y']
        in_model = node_context.input['in-model']

        in_model.load()
        model = in_model.get_skl()

        X = table_to_array(X_tbl)
        try:
            Y = model.decision_function(X)
        except TypeError as exc:
            raise SyDataError(
                "Model does not implement 'decision_function'"
            ) from exc

        # Treat a 1-D result as a single output column.
        if len(Y.shape) < 2:
            Y = Y.reshape(Y.shape + (1,))
        n_cols = Y.shape[1]

        # Names from the model are only usable with one name per column;
        # surplus names would index past the last column of Y.
        y_names = in_model.get_desc().y_names
        if y_names is not None and len(y_names) != n_cols:
            y_names = None

        if y_names is None:
            classes = getattr(model, 'classes_', None)
            # A two-class model can have two labels but a single score
            # column, so the class labels need the same length check.
            if classes is not None and len(classes) == n_cols:
                y_names = [str(classname) for classname in classes]

        if y_names is None:
            # Well, we tried - use fallback names
            y_names = ["y{0}".format(i) for i in range(n_cols)]

        for i, name in enumerate(y_names):
            Y_tbl.set_column_from_array(name, Y[:, i])
[docs]
class FitTransformText(node.Node):
    """Fits a transform model to a list of texts and transforms them.

    The texts on the X port are passed to the ``fit_transform`` method of
    the model. The fitted model is written to the output model port and the
    transformed data to the output table.
    """

    name = 'Fit Transform Text'
    author = 'Mathias Broxvall'
    icon = 'transform_text.svg'
    description = (
        'Fits a transform model to the given text data and computes '
        'the transformed data. '
        'Use "Create Input Port > Y" for supervised training')
    nodeid = 'org.sysess.sympathy.machinelearning.fit_transform_text'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Custom('[text]', 'X', name='X')])
    outputs = Ports([ModelPort('Output model', 'out-model'),
                     Port.Table('Output table', name='output')])

    parameters = node.parameters()
    parameters.set_string(
        'names method', label='Output names', value='From model',
        description='Method used to generate output names',
        editor=node.editors.combo_editor(options=[
            'By index', 'From model']))
    parameters.set_boolean(
        'transpose', value=False, label='Transpose output',
        description='Transposes output data, suitable for large '
        'number of features (eg. word counts)')

    def execute(self, node_context):
        """Fit the model on the texts and output model and transformed data.

        Raises:
            SyDataError: if calling ``fit_transform`` raises ``TypeError``.
        """
        out_tbl = node_context.output['output']
        in_model = node_context.input['in-model']
        out_model = node_context.output['out-model']
        names = node_context.parameters['names method'].value
        transpose = node_context.parameters['transpose'].value
        X = [x.get() for x in node_context.input['X']]

        out_model.source(in_model)
        out_model.load()
        transform = out_model.get_skl()

        try:
            Xprim = transform.fit_transform(X)
        except TypeError as exc:
            raise SyDataError(
                "Model does not implement transforms with only one input"
            ) from exc

        desc = out_model.get_desc()
        desc.post_fit(transform)
        out_model.save()

        # NOTE(review): the guard checks desc.y_names but the names are read
        # from desc.xout_names - confirm that this is intentional.
        if names == 'From model' and desc.y_names is not None:
            cols = desc.xout_names
        else:
            cols = []
        data_to_table(Xprim, cols, out_tbl, transpose=transpose)
[docs]
class TransformText(node.Node):
    """Applies a transformation model to a list of texts.

    The texts on the X port are passed to the ``transform`` method of the
    model and the result is written to the output table. Sparse results are
    converted to dense arrays first.
    """

    name = 'Transform Text'
    author = 'Mathias Broxvall'
    icon = 'transform_text.svg'
    description = 'Applies a transformation model to the given text data'
    nodeid = 'org.sysess.sympathy.machinelearning.transform_text'
    tags = Tags(Tag.MachineLearning.Apply)

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Custom('[text]', 'X', name='X')])
    outputs = Ports([Port.Table('Output table', name='output')])

    parameters = node.parameters()
    parameters.set_string(
        'names method', label='Output names', value='From model',
        description='Method used to generate output names',
        editor=node.editors.combo_editor(options=[
            'By index', 'From model']))
    parameters.set_boolean(
        'transpose', value=False, label='Transpose output',
        description='Transposes output data, suitable for large '
        'number of features (eg. word counts)')

    def execute(self, node_context):
        """Transform the texts with the model and write the output table.

        Raises:
            SyDataError: if calling ``transform`` raises ``TypeError`` or
                ``sklearn.exceptions.NotFittedError``.
        """
        out_tbl = node_context.output['output']
        in_model = node_context.input['in-model']
        names = node_context.parameters['names method'].value
        transpose = node_context.parameters['transpose'].value
        X = [x.get() for x in node_context.input['X']]

        in_model.load()
        transform = in_model.get_skl()
        desc = in_model.get_desc()

        try:
            Xprim = transform.transform(X)
        except TypeError as exc:
            raise SyDataError(
                "Given model does not implement transforms (one input)"
            ) from exc
        except sklearn.exceptions.NotFittedError as e:
            raise SyDataError(repr(e)) from e

        if scipy.sparse.issparse(Xprim):
            Xprim = Xprim.toarray()
        # Treat a 1-D result as a single output column.
        if len(Xprim.shape) < 2:
            Xprim = Xprim.reshape(Xprim.shape + (1,))

        if names == 'From model' and desc.y_names is not None:
            cols = desc.y_names
        else:
            cols = []
        data_to_table(Xprim, cols, out_tbl, transpose=transpose)
[docs]
class Score(node.Node):
    """Scores a model using the given X and Y data.

    With *Use built-in default scoring* enabled the ``score`` method of the
    model is used. Otherwise a single ``predict`` call is made (falling back
    to ``transform`` when ``predict`` raises ``TypeError``) and the score is
    the fraction of rows where every predicted value equals the actual
    value. In that mode, when there is more than one target, the fraction
    of correct values is also given per target in a column of its own.
    """

    name = 'Score'
    author = 'Mathias Broxvall'
    icon = 'score.svg'
    description = (
        'Scores the model using given X and Y data. Exact semantics\n'
        'depends on the type of model (classifier, regressor, etc).\n'
        'Typically, for classifiers the score will be the mean accuracy\n'
        '(fraction of correct predictions out of all predictions).')
    nodeid = 'org.sysess.sympathy.machinelearning.score'
    tags = Tags(Tag.MachineLearning.Metrics)

    parameters = node.parameters()
    parameters.set_boolean(
        'default method', label='Use built-in default scoring',
        value=True,
        description=(
            'Use the default scoring method defined by the used model.\n'
            'Otherwise the problem is assumed to be a classification problem,\n'
            'a single predict call is made and extended information is given\n'
            'for each target. If model does not implement the predict function\n'
            'then a transform is used instead.'))

    inputs = Ports([ModelPort('Input model', 'in-model'),
                    Port.Table('X', name='X'),
                    Port.Table('Y', name='Y')])
    outputs = Ports([Port.Table('Score', name='Score')])

    def execute(self, node_context):
        """Compute the score and write it to the 'score' output column.

        Raises:
            SyDataError: if the required model method raises ``TypeError``,
                or if the predicted and actual Y data differ in shape.
        """
        X_tbl = node_context.input['X']
        Y_tbl = node_context.input['Y']
        in_model = node_context.input['in-model']
        score_tbl = node_context.output['Score']
        default_method = node_context.parameters['default method'].value

        in_model.load()
        model = in_model.get_skl()
        desc = in_model.get_desc()

        X = table_to_array(X_tbl)
        Y = table_to_array(Y_tbl)

        # The model may have no recorded Y names; in that case there is
        # nothing to compare the Y table columns against.
        y_columns = list(Y_tbl.column_names())
        model_y_names = None if desc.y_names is None else list(desc.y_names)
        if model_y_names is not None and model_y_names != y_columns:
            sywarn('Column names for Y does not match those in model')

        if default_method:
            try:
                score = model.score(X, Y)
            except TypeError as exc:
                raise SyDataError(
                    'Given model does not implement the "score" function'
                ) from exc
            score_tbl.set_column_from_array('score', np.array([score]))
            return

        try:
            Y_pred = model.predict(X)
        except TypeError:
            try:
                Y_pred = model.transform(X)
            except TypeError as exc:
                raise SyDataError(
                    "Given model does not implement neither predict "
                    "nor transform"
                ) from exc

        # Compare predicted and actual data as 2-D arrays.
        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape(Y_pred.shape + (1,))
        if len(Y.shape) == 1:
            Y = Y.reshape(Y.shape + (1,))
        if Y.shape != Y_pred.shape:
            raise SyDataError(
                'Shape of predicted Y-data {} does not match actual Y {}'
                .format(Y_pred.shape, Y.shape))

        correct = Y == Y_pred
        # A row only counts as correct when all of its targets are correct.
        score = np.all(correct, axis=1).mean()
        score_tbl.set_column_from_array('score', np.array([score]))

        # Prefer the model's Y names for the per-target columns, but fall
        # back to the Y table's own column names when the model has none or
        # their number does not match the compared data.
        target_names = model_y_names
        if target_names is None or len(target_names) != correct.shape[1]:
            target_names = y_columns
        if len(target_names) > 1:
            for name, col in zip(target_names, correct.T):
                score_tbl.set_column_from_array(
                    name, np.array([col.mean()]))
[docs]
class SelectFromModel(SyML_abstract, node.Node):
    """Model based feature selection.

    Important features are selected based on a trained estimator's
    importance scores. It is typically used with models that have a coef or
    feature_importances attribute, such as linear models or tree-based
    models (e.g., LogisticRegression or RandomForestClassifier).
    It uses the model's learned coefficients or importances to determine
    which features to keep. You can specify a threshold (default or custom)
    to decide which features are considered important enough to retain.
    The given data is transformed to get a reduced feature set.
    This helps improve model performance by removing less useful or
    redundant features. It's particularly useful in pipelines and when
    you're trying to automate feature selection based on model insights.
    """

    name = 'Select Features from Model'
    author = 'Mathias Broxvall'
    icon = 'select_model.svg'
    description = (
        'Meta-transformer for selecting features based on importance weight.\n'
        'Only works for models with coef or feature_importances attributes.'
    )
    nodeid = 'org.sysess.sympathy.machinelearning.select_from_model'
    tags = Tags(Tag.MachineLearning.Apply)

    descriptor = Descriptor()
    descriptor.name = name
    descriptor.set_info([
        {'name': 'threshold',
         'type': UnionType([
             StringSelectionType(['median', 'mean']),
             FloatType(), NoneType()], default=None)},
    ], doc_class=sklearn.feature_selection.SelectFromModel)

    parameters = node.parameters()
    SyML_abstract.generate_parameters(parameters, descriptor)

    inputs = Ports([ModelPort('Input model', 'model'),
                    Port.Table('Input data', name='in-data')])
    outputs = Ports([Port.Table('Output data', name='out-data'),
                     Port.Table('Features', name='features')])

    # Append the documentation generated from the descriptor to the class
    # docstring above.
    __doc__ += SyML_abstract.generate_docstring(
        description, descriptor.info, descriptor.attributes, inputs, outputs)

    def execute(self, node_context):
        """Select features of the input data using the already fitted model.

        Writes the selected columns to the 'out-data' port and a single row
        support mask, with one column per input feature, to the 'features'
        port.
        """
        model = node_context.input['model']
        in_data = node_context.input['in-data']
        out_data = node_context.output['out-data']
        features = node_context.output['features']

        model.load()
        skl = model.get_skl()
        desc = model.get_desc()

        kwargs = self.__class__.descriptor.get_parameters(
            node_context.parameters)
        kwargs['estimator'] = skl
        # The input model is used as is; the selector does not refit it.
        kwargs['prefit'] = True
        sfm = sklearn.feature_selection.SelectFromModel(**kwargs)

        X = table_to_array(in_data)
        Xsel = sfm.transform(X)
        indices = sfm.get_support(indices=True)

        x_names = desc.x_names
        if x_names is None:
            # Fixed: the original iterated over the integer X.shape[1]
            # itself, which raises TypeError.
            x_names = ["X{}".format(i) for i in range(X.shape[1])]

        array_to_table(np.array(x_names)[indices],
                       Xsel,
                       tbl=out_data)

        support = sfm.get_support()
        array_to_table(x_names,
                       support.reshape((1, len(support))),
                       tbl=features)