Source code for node_io

# This file is part of Sympathy for Data.
# Copyright (c) 2017, Combine Control Systems AB
#
# Sympathy for Data is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# Sympathy for Data is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Sympathy for Data.  If not, see <http://www.gnu.org/licenses/>.
import numpy as np
import PySide6.QtWidgets as QtWidgets
import PySide6.QtCore as QtCore

import sklearn
import sklearn.base
import sklearn.datasets
import sklearn.exceptions

from sympathy.api import node
from sympathy.api import ParameterView
from sympathy.api.nodeconfig import Port
from sympathy.api.nodeconfig import Ports
from sympathy.api.nodeconfig import Tag
from sympathy.api.nodeconfig import Tags

from sylib.machinelearning.model import ModelPort, encode, decode

from sylib.machinelearning.descriptors import BoolType
from sylib.machinelearning.descriptors import FloatType
from sylib.machinelearning.descriptors import IntType


class Export(node.Node):
    name = 'Export Model'
    author = 'Mathias Broxvall'
    version = '0.1'
    icon = 'export.svg'
    description = 'Exports a model to disk'
    nodeid = 'org.sysess.sympathy.machinelearning.export'
    tags = Tags(Tag.MachineLearning.IO)

    inputs = Ports([Port.Datasource('Datasource', name='filename'),
                    ModelPort('Input model', 'in-model')])
    outputs = Ports([Port.Datasource('Datasource', name='filename')])

    def execute(self, node_context):
        in_model = node_context.input['in-model']
        datasource = node_context.input['filename']
        pathname = datasource.decode_path()
        in_model.load()
        with open(pathname, 'wb') as f:
            f.write(encode(in_model.get()).encode('ascii'))
        node_context.output['filename'].encode_path(pathname)
class Import(node.Node):
    name = 'Import Model'
    author = 'Mathias Broxvall'
    version = '0.1'
    icon = 'import.svg'
    description = 'Imports a model from disk'
    nodeid = 'org.sysess.sympathy.machinelearning.import'
    tags = Tags(Tag.MachineLearning.IO)

    inputs = Ports([Port.Datasource('Datasource', name='filename')])
    outputs = Ports([ModelPort('Output model', 'out-model')])

    def execute(self, node_context):
        out_model = node_context.output['out-model']
        datasource = node_context.input['filename']
        pathname = datasource.decode_path()
        with open(pathname, 'rb') as f:
            out_model.set(decode(f.read()))
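
# A minimal sketch (not part of the original module) of the serialization
# round trip that Export and Import above perform, assuming only what their
# code shows: encode() returns an ASCII str and decode() accepts the raw file
# bytes (both from sylib.machinelearning.model). The function name and the
# model_object argument are hypothetical.
def _example_model_roundtrip(model_object, pathname):
    # Write, mirroring Export.execute()
    with open(pathname, 'wb') as f:
        f.write(encode(model_object).encode('ascii'))
    # Read back, mirroring Import.execute()
    with open(pathname, 'rb') as f:
        return decode(f.read())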
datasets = {
    'Diabetes': sklearn.datasets.load_diabetes,
    'Digits': sklearn.datasets.load_digits,
    'Iris': sklearn.datasets.load_iris,
    'LFW Faces': None,
    'linnerud': sklearn.datasets.load_linnerud,
}

# Datasets only available in sklearn 0.19+
try:
    datasets['wine'] = sklearn.datasets.load_wine
except AttributeError:
    pass


lfw_description = """
WARNING: Loading this dataset for the first time will download circa 230 MB
of images from the internet.

Loader for the Labeled Faces in the Wild (LFW) people dataset.

This dataset is a collection of JPEG pictures of famous people collected on
the internet; all details are available on the official website:
http://vis-www.cs.umass.edu/lfw/

Each picture is centered on a single face. Each pixel of each channel
(color in RGB) is encoded by a float in range 0.0 - 1.0.

The task is called Face Recognition (or Identification): given the picture
of a face, find the name of the person given a training set (gallery).

The original images are 250 x 250 pixels, but the default slice and resize
arguments reduce them to 74 x 62.
"""


class ExampleParameterWidget(ParameterView):
    def __init__(self, node_context, parent=None):
        super(ParameterView, self).__init__(parent=parent)
        self._parameters = node_context.parameters
        self._validator = None

        self.example_sel = QtWidgets.QComboBox()
        self.description_text = QtWidgets.QTextEdit("")
        self.description_text.setReadOnly(True)
        self.description_text.setMinimumSize(480, 300)
        self.lfw_group = QtWidgets.QGroupBox("Labeled Faces in the Wild")
        self.lfw_vbox = QtWidgets.QVBoxLayout()
        self.lfw_group.setLayout(self.lfw_vbox)

        self.names = sorted(datasets.keys())
        for pos, name in enumerate(self.names):
            self.example_sel.addItem(name)

        # Set initially selected example
        index = self.example_sel.findText(
            self._parameters['dataset'].value,
            QtCore.Qt.MatchFlag.MatchFixedString)
        if index < 0:
            index = 0
        self.example_sel.setCurrentIndex(index)
        self.example_sel.currentIndexChanged.connect(self.example_selected)
        self.example_selected(index)

        # Add to layout
        self.options_layout = QtWidgets.QVBoxLayout()
        self.options_layout.addWidget(self.example_sel)
        self.options_layout.addWidget(self.description_text)
        self.options_layout.addWidget(self.lfw_group)
        self.options_layout.addStretch(1)
        self.setLayout(self.options_layout)
        self.options_layout.setStretchFactor(self.description_text, 100)

        def param_widget(label, _name, _type):
            widget = QtWidgets.QWidget()
            hbox = QtWidgets.QHBoxLayout()
            widget.setLayout(hbox)
            label_widget = QtWidgets.QLabel(label)
            if isinstance(_type, BoolType):
                editor_widget = QtWidgets.QCheckBox()
            else:
                editor_widget = QtWidgets.QLineEdit()
            hbox.addWidget(label_widget)
            hbox.addStretch(1)
            hbox.addWidget(editor_widget)

            def text_updated(widget_=editor_widget, _type=_type):
                txt = widget_.text()
                try:
                    value = _type.from_string(txt)
                    self._parameters[_name].value = value
                except ValueError:
                    # Restore the previous valid value on parse failure
                    editor_widget.setText(
                        _type.to_string(node_context.parameters[_name].value))

            def bool_updated(state, widget_=editor_widget, _type=_type):
                self._parameters[_name].value = bool(state)

            if isinstance(_type, BoolType):
                editor_widget.stateChanged.connect(bool_updated)
                if node_context.parameters[_name].value:
                    editor_widget.setCheckState(QtCore.Qt.CheckState.Checked)
                else:
                    editor_widget.setCheckState(
                        QtCore.Qt.CheckState.Unchecked)
            else:
                editor_widget.editingFinished.connect(text_updated)
                editor_widget.setText(
                    _type.to_string(node_context.parameters[_name].value))
            editor_widget.setToolTip(_type.description())
            return widget

        class_names = param_widget('Class names', 'classnames', BoolType())
        self.options_layout.addWidget(class_names)
        class_names.setToolTip('Attempts to use class names as Y')

        self.lfw_vbox.addWidget(
            param_widget('color', 'color', BoolType()))
        self.lfw_vbox.addWidget(
            param_widget('Resize factor: ', 'resize',
                         FloatType(min_value=0.0)))
        self.lfw_vbox.addWidget(
            param_widget('Min pictures per person: ', 'min_pictures',
                         IntType(min_value=0)))

    def example_selected(self, index):
        name = self.names[index]
        self._parameters['dataset'].value = name
        if name == 'LFW Faces':
            self.description_text.setText(lfw_description)
            self.lfw_group.show()
        else:
            dataset_fn = datasets[name]
            dataset = dataset_fn()
            try:
                self.description_text.setText(dataset.DESCR)
            except AttributeError:
                self.description_text.setText('Dataset lacks a description')
            self.lfw_group.hide()
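
# A minimal sketch (hypothetical stand-in, not the real descriptor classes)
# of the duck-typed interface that param_widget() above expects from
# BoolType/FloatType/IntType: from_string() raising ValueError on bad input,
# to_string(), and description().
class _ExampleFloatType:
    def __init__(self, min_value=None):
        self._min = min_value

    def from_string(self, txt):
        value = float(txt)  # raises ValueError on malformed input
        if self._min is not None and value < self._min:
            raise ValueError('value below minimum')
        return value

    def to_string(self, value):
        return str(value)

    def description(self):
        return 'A float value'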
class ExampleDatasets(node.Node):
    name = 'Example datasets'
    author = 'Mathias Broxvall'
    version = '0.1'
    icon = 'example_datasets.svg'
    description = 'Exposes the example datasets from sklearn'
    nodeid = 'org.sysess.sympathy.machinelearning.example_datasets'
    tags = Tags(Tag.MachineLearning.IO)

    inputs = Ports([])
    outputs = Ports([Port.Table('X', name='X'), Port.Table('Y', name='Y')])

    parameters = node.parameters()
    parameters.set_string(
        'dataset', value='Iris', label='Dataset',
        description='Choose from one of the default toy datasets')
    parameters.set_boolean(
        'classnames', value=True, label='Class names',
        description='Attempts to use class names as Y')
    parameters.set_boolean(
        'color', value=False, label='color')
    parameters.set_integer(
        'min_pictures', value=10, label='min_pictures',
        description=(
            'Minimum number of pictures per person required for including '
            'a person in the LFW dataset'))
    parameters.set_float(
        'resize', value=0.5, label='resize',
        description=(
            'Resize LFW pictures; the default 0.5 gives 62x47 images'))

    def exec_parameter_view(self, node_context):
        return ExampleParameterWidget(node_context)

    def execute(self, node_context):
        out_X = node_context.output['X']
        out_Y = node_context.output['Y']
        dataset_name = node_context.parameters['dataset'].value
        min_pictures = node_context.parameters['min_pictures'].value
        color = node_context.parameters['color'].value
        resize = node_context.parameters['resize'].value

        if dataset_name != 'LFW Faces':
            dataset = datasets[dataset_name]()
        else:
            dataset = sklearn.datasets.fetch_lfw_people(
                min_faces_per_person=min_pictures, resize=resize,
                color=color)

        classnames = node_context.parameters['classnames'].value
        X = dataset.data
        Y = dataset.target
        out_X.set_name(dataset_name + " X")
        out_Y.set_name(dataset_name + " Y")

        try:
            X_names = dataset.feature_names
        except AttributeError:
            X_names = ["X{0}".format(i) for i in range(X.shape[1])]
        for i, name in enumerate(X_names):
            out_X.set_column_from_array(name, X[:, i])

        if classnames:
            try:
                target_names = dataset.target_names
                Y = target_names[Y]
            except (TypeError, AttributeError):
                pass
        if len(Y.shape) < 2:
            out_Y.set_column_from_array("Y", Y)
        else:
            for i in range(Y.shape[1]):
                out_Y.set_column_from_array("Y{0}".format(i), Y[:, i])
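
# A minimal sketch (hypothetical helper, not in the original module) of the
# sklearn Bunch fields that ExampleDatasets.execute() relies on, shown for
# the default 'Iris' dataset.
def _example_iris_fields():
    dataset = sklearn.datasets.load_iris()
    X = dataset.data                   # ndarray of shape (150, 4)
    Y = dataset.target                 # integer class ids, shape (150,)
    names = dataset.feature_names      # e.g. 'sepal length (cm)', ...
    labels = dataset.target_names[Y]   # class ids mapped to names, as done
                                       # by the classnames option above
    return X, Y, names, labels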
class MakeBlobs(node.Node):
    name = 'Generate dataset blobs'
    author = 'Mathias Broxvall'
    version = '0.1'
    icon = 'dataset_blobs.svg'
    description = (
        'Generates an artificial dataset useful for testing '
        'clustering algorithms')
    nodeid = 'org.sysess.sympathy.machinelearning.generate_blobs'
    tags = Tags(Tag.MachineLearning.IO)

    inputs = Ports([])
    outputs = Ports([Port.Table('X', name='X'), Port.Table('Y', name='Y')])

    parameters = node.parameters()
    parameters.set_integer(
        'n_samples', value=100, label='n_samples',
        description=(
            'The total number of points equally divided among clusters.'))
    parameters.set_integer(
        'n_features', value=2, label='n_features',
        description='The number of features for each sample.')
    parameters.set_integer(
        'centers', value=3, label='centers',
        description='Number of clusters.')
    parameters.set_float(
        'cluster_std', value=2.0, label='cluster_std',
        description='Standard deviation of the clusters.')
    parameters.set_float(
        'center_min', value=-10.0, label='center_min',
        description='Smallest allowed coordinate for the generated centers')
    parameters.set_float(
        'center_max', value=10.0, label='center_max',
        description='Largest allowed coordinate for the generated centers')
    parameters.set_boolean(
        'shuffle', value=True, label='shuffle',
        description='Shuffle datapoints (otherwise given in cluster order)')

    def execute(self, node_context):
        out_X = node_context.output['X']
        out_Y = node_context.output['Y']
        out_X.set_name("X")
        out_Y.set_name("Y")

        kwargs = {}
        kwargs['center_box'] = (node_context.parameters['center_min'].value,
                                node_context.parameters['center_max'].value)
        kn = ['n_samples', 'n_features', 'centers', 'cluster_std', 'shuffle']
        for name in kn:
            kwargs[name] = node_context.parameters[name].value

        X, Y = sklearn.datasets.make_blobs(**kwargs)
        X_names = ["X{0}".format(i) for i in range(X.shape[1])]
        for i, name in enumerate(X_names):
            out_X.set_column_from_array(name, X[:, i])
        out_Y.set_column_from_array("Y", Y)
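
# A minimal sketch (hypothetical) of the single sklearn call that MakeBlobs
# wraps, using the node's default parameter values. make_blobs returns the
# sample matrix X of shape (n_samples, n_features) and one integer cluster
# label per sample.
def _example_make_blobs():
    return sklearn.datasets.make_blobs(
        n_samples=100, n_features=2, centers=3, cluster_std=2.0,
        center_box=(-10.0, 10.0), shuffle=True)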
class MakeBlobsFromTable(node.Node):
    name = 'Generate dataset blobs from table'
    author = 'Mathias Broxvall'
    version = '0.1'
    icon = 'dataset_blobs.svg'
    description = (
        'Takes a table describing blob center positions and, optionally, '
        'standard deviations, and generates a random dataset. Rows in the '
        'table correspond to cluster numbers, and columns to the features '
        'in the dataset.')
    nodeid = 'org.sysess.sympathy.machinelearning.generate_blobs_from_table'
    tags = Tags(Tag.MachineLearning.IO)

    inputs = Ports([Port.Table('C', name='C')])
    outputs = Ports([Port.Table('X', name='X'), Port.Table('Y', name='Y')])

    parameters = node.parameters()
    parameters.set_integer(
        'n_samples', value=100, label='n_samples',
        description=(
            'The total number of points equally divided among clusters.'))
    parameters.set_integer(
        'n_features', value=2, label='n_features',
        description='The number of features for each sample.')
    parameters.set_string(
        'cluster_std', value="2.0", label='cluster_std',
        description=('Column name used to give the standard deviation for '
                     'each cluster. If empty or a float number, the same '
                     'value is used for every cluster'))
    parameters.set_boolean(
        'shuffle', value=True, label='shuffle',
        description='Shuffle datapoints (otherwise given in cluster order)')

    def execute(self, node_context):
        in_C = node_context.input['C']
        out_X = node_context.output['X']
        out_Y = node_context.output['Y']

        kwargs = {}
        try:
            std = float(node_context.parameters['cluster_std'].value)
            ignore_col = None
        except ValueError:
            name = node_context.parameters['cluster_std'].value
            std = in_C.get_column_to_array(name)
            ignore_col = name

        cols = []
        X_names = []
        for name in in_C.column_names():
            if name != ignore_col:
                cols.append(in_C.get_column_to_array(name))
                X_names.append(name)
        kwargs['centers'] = np.array(cols).T
        kwargs['cluster_std'] = std
        for name in ['n_samples', 'n_features', 'shuffle']:
            kwargs[name] = node_context.parameters[name].value

        X, Y = sklearn.datasets.make_blobs(**kwargs)
        for i, name in enumerate(X_names):
            out_X.set_column_from_array(name, X[:, i])
        out_Y.set_column_from_array("Y", Y)
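
# A minimal sketch (hypothetical) of the explicit-centers form of make_blobs
# that MakeBlobsFromTable builds from its input table: one row per cluster,
# one column per feature, and either a single float or one standard
# deviation per cluster.
def _example_make_blobs_from_centers():
    centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, 5.0]])
    return sklearn.datasets.make_blobs(
        n_samples=100, centers=centers, cluster_std=[0.5, 1.0, 2.0],
        shuffle=True)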
class MakeClassification(node.Node):
    name = 'Generate classification dataset'
    author = 'Mathias Broxvall'
    version = '0.1'
    icon = 'dataset_classes.svg'
    description = """
    Generates an artificial dataset useful for testing classification
    algorithms by creating a random n-class classification problem.

    This initially creates clusters of points normally distributed (std=1)
    about the vertices of a 2 * class_sep-sided hypercube, and assigns an
    equal number of clusters to each class. It introduces interdependence
    between these features and adds various types of further noise to the
    data.

    Prior to shuffling, X stacks a number of these primary 'informative'
    features, 'redundant' linear combinations of these, 'repeated' duplicates
    of sampled features, and arbitrary noise for any remaining features.
    """
    nodeid = 'org.sysess.sympathy.machinelearning.generate_classification'
    tags = Tags(Tag.MachineLearning.IO)

    inputs = Ports([])
    outputs = Ports([Port.Table('X', name='X'), Port.Table('Y', name='Y')])

    parameters = node.parameters()
    parameters.set_integer(
        'n_samples', value=100, label='n_samples',
        description='The total number of samples generated.')
    parameters.set_integer(
        'n_features', value=20, label='n_features',
        description='The number of features for each sample.')
    parameters.set_integer(
        'n_informative', value=2, label='n_informative',
        description="""The number of informative features.

        Each class is composed of a number of gaussian clusters each located
        around the vertices of a hypercube in a subspace of dimension
        n_informative. For each cluster, informative features are drawn
        independently from N(0, 1) and then randomly linearly combined within
        each cluster in order to add covariance. The clusters are then placed
        on the vertices of the hypercube.""")
    parameters.set_integer(
        'n_redundant', value=2, label='n_redundant',
        description=(
            'The number of redundant features. These features are generated '
            'as random linear combinations of the informative features.'))
    parameters.set_integer(
        'n_repeated', value=0, label='n_repeated',
        description=(
            'The number of duplicated features, drawn randomly from the '
            'informative and the redundant features.'))
    parameters.set_integer(
        'n_classes', value=2, label='n_classes',
        description=('The number of classes (labels) for the '
                     'classification problem'))
    parameters.set_integer(
        'n_clusters_per_class', value=2, label='n_clusters_per_class',
        description='The number of clusters per class.')
    parameters.set_string(
        'weights', value="None", label='weights',
        description="""Comma separated list of float weights for each class.

        Determines the proportions of samples assigned to each class. If
        None, then classes are balanced. Note that if
        len(weights) == n_classes - 1, then the last class weight is
        automatically inferred. More than n_samples samples may be returned
        if the sum of weights exceeds 1.""")
    parameters.set_float(
        'flip_y', value=0.01, label='flip_y',
        description=('The fraction of samples whose class is '
                     'randomly exchanged'))
    parameters.set_float(
        'class_sep', value=1.0, label='class_sep',
        description='Factor multiplying the hypercube dimension')
    parameters.set_string(
        'shift', value="0.0", label='shift',
        description=(
            'Shift features by the specified comma separated value(s). '
            'If None, then features are shifted by a random value drawn in:'
            ' [-class_sep, class_sep].'))
    parameters.set_string(
        'scale', value="1.0", label='scale',
        description=(
            'Multiply features by the specified comma separated value(s). '
            'If None, then features are scaled by a random value drawn in:'
            ' [1, 100].\nNote that scaling happens after shifting.'))
    parameters.set_boolean(
        'hypercube', value=True, label='hypercube',
        description=(
            'If true, clusters are put on the vertices of a hypercube; '
            'otherwise on the vertices of a random polytope'))
    parameters.set_boolean(
        'shuffle', value=True, label='shuffle',
        description='Shuffle datapoints (otherwise given in cluster order)')

    def execute(self, node_context):
        out_X = node_context.output['X']
        out_Y = node_context.output['Y']

        kwargs = {}
        for name in node_context.parameters.keys():
            kwargs[name] = node_context.parameters[name].value

        # Parse the comma separated string parameters
        for name in ['weights', 'shift', 'scale']:
            string = kwargs[name]
            if string.lower() == 'none':
                kwargs[name] = None
                continue
            string = string.replace(',', ' ')
            arg = [float(word) for word in string.split()]
            if len(arg) == 1:
                kwargs[name] = arg
            elif name == 'weights':
                kwargs[name] = arg
            else:
                kwargs[name] = np.array(arg)

        X, Y = sklearn.datasets.make_classification(**kwargs)
        X_names = ["X{0}".format(i) for i in range(X.shape[1])]
        for i, name in enumerate(X_names):
            out_X.set_column_from_array(name, X[:, i])
        out_Y.set_column_from_array("Y", Y)
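
# A minimal sketch (hypothetical) of what the string parsing in
# MakeClassification.execute() hands to sklearn: 'none' becomes None, a
# single number stays a one-element list, and several comma separated
# numbers become a per-class list (weights) or a per-feature array
# (shift/scale).
def _example_make_classification():
    return sklearn.datasets.make_classification(
        n_samples=100, n_features=20, n_informative=2, n_redundant=2,
        n_repeated=0, n_classes=2, n_clusters_per_class=2,
        weights=[0.75],    # parsed from the string '0.75'
        flip_y=0.01, class_sep=1.0, hypercube=True,
        shift=None,        # parsed from the string 'None'
        scale=[1.0],       # parsed from the string '1.0'
        shuffle=True)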