# Source code for node_decomposition

# This file is part of Sympathy for Data.
# Copyright (c) 2017, Combine Control Systems AB
#
# SYMPATHY FOR DATA COMMERCIAL LICENSE
# You should have received a link to the License with Sympathy for Data.
import sklearn.decomposition
import sklearn.cross_decomposition

from sympathy.api import node
from sympathy.api.nodeconfig import Ports, Tag, Tags
from sylib.machinelearning.model import ModelPort
from sylib.machinelearning.abstract_nodes import SyML_abstract
from sylib.machinelearning.utility import names_from_x
from sylib.machinelearning.utility import names_from_y
from sylib.machinelearning.utility import names_from_prefix
from sylib.machinelearning.descriptors import Descriptor

from sylib.machinelearning.descriptors import BoolType
from sylib.machinelearning.descriptors import FloatType
from sylib.machinelearning.descriptors import IntType
from sylib.machinelearning.descriptors import NoneType
from sylib.machinelearning.descriptors import StringSelectionType
from sylib.machinelearning.descriptors import UnionType


class PrincipalComponentAnalysis(SyML_abstract, node.Node):
    """Principal Component Analysis (PCA) is used to decompose a
    multivariate dataset in a set of successive orthogonal components
    that explain a maximum amount of the variance. It is implemented as
    a transformer model that learns `n` components in its fit method,
    and can be used on new data to project it on these components.
    """

    name = 'Principal Component Analysis (PCA)'
    author = 'Mathias Broxvall'
    icon = 'PCA.svg'
    description = (
        'Linear dimensionality reduction using Singular Value Decomposition '
        'of the data to project it to a lower dimensional space.')
    nodeid = 'org.sysess.sympathy.machinelearning.pca'
    tags = Tags(Tag.MachineLearning.DimensionalityReduction)

    # Descriptor drives both the configuration GUI and the generated docs.
    descriptor = Descriptor()
    descriptor.name = name
    info = [
        {'name': 'n_components',
         'dispname': 'Number of components to keep',
         'type': UnionType([IntType(min_value=1),
                            FloatType(min_value=0, max_value=1),
                            StringSelectionType(['mle'])], default=1)},
        {'name': 'svd_solver',
         'dispname': 'Solver',
         'type': StringSelectionType(
             ['auto', 'full', 'arpack', 'randomized'], default='auto')},
        {'name': 'tol',
         'dispname': 'Tolerance for singular values',
         'type': FloatType(default=0.0)},
        {'name': 'iterated_power',
         # Fixed typo in the user-facing display name ("iteratins").
         'dispname': 'N. of iterations (for randomized solver)',
         'type': UnionType(
             [IntType(min_value=0), StringSelectionType(['auto'])],
             default='auto')},
        {'name': 'whiten',
         'dispname': 'Whiten',
         'type': BoolType(default=False)},
    ]
    descriptor.set_info(info, doc_class=sklearn.decomposition.PCA)

    descriptor.set_attributes([
        {'name': 'components_', 'cnames': names_from_x},
        {'name': 'explained_variance_'},
        {'name': 'explained_variance_ratio_'},
        {'name': 'mean_', 'cnames': names_from_x},
        {'name': 'n_components_'},
        {'name': 'noise_variance_'},
    ], doc_class=sklearn.decomposition.PCA)

    parameters = node.parameters()
    SyML_abstract.generate_parameters(parameters, descriptor)

    inputs = Ports([])
    outputs = Ports([ModelPort('Model', 'model')])
    __doc__ += SyML_abstract.generate_docstring(
        description, descriptor.info, descriptor.attributes, inputs, outputs)

    def execute(self, node_context):
        """Create an unfitted sklearn PCA estimator configured from the
        node parameters and store it on the output model port."""
        model = node_context.output['model']
        desc = self.__class__.descriptor
        model.set_desc(desc)

        kwargs = self.__class__.descriptor.get_parameters(
            node_context.parameters)
        skl = sklearn.decomposition.PCA(**kwargs)

        model.set_skl(skl)
        model.save()
class KernelPCA(SyML_abstract, node.Node):
    """KernelPCA is an extension of PCA which achieves non-linear
    dimensionality reduction through the use of kernels. It has many
    applications including denoising, compression and structured
    prediction (kernel dependency estimation). KernelPCA supports both
    transform and inverse_transform.
    """

    name = 'Kernel Principal Component Analysis (KPCA)'
    author = 'Mathias Broxvall'
    icon = 'PCA.svg'
    description = (
        'Non-linear dimensionality reduction through the use of kernels')
    nodeid = 'org.sysess.sympathy.machinelearning.kpca'
    tags = Tags(Tag.MachineLearning.DimensionalityReduction)

    # Descriptor drives both the configuration GUI and the generated docs.
    # Parameters are grouped into named tabs (nested lists).
    descriptor = Descriptor()
    descriptor.name = name
    info = [
        [
            "Model options",
            {'name': 'n_components',
             'dispname': 'Number of components',
             'type': UnionType(
                 [IntType(min_value=1), NoneType()], default=None)},
            {'name': 'kernel',
             'dispname': 'Kernel',
             'type': StringSelectionType(
                 ['linear', 'poly', 'rbf', 'sigmoid', 'cosine',
                  'precomputed'],
                 default='linear')},
            {'name': 'fit_inverse_transform',
             'dispname': 'Fit inverse-transform',
             'type': BoolType(default=False)},
        ],
        [
            "Advanced options",
            {'name': 'degree',
             'dispname': 'Poly kernel degree',
             'type': IntType(min_value=1, default=3)},
            {'name': 'gamma',
             'dispname': 'Kernel coefficient (poly, rbf, sigmoid)',
             'type': UnionType([
                 FloatType(min_value=0.0), NoneType()], default=None)},
            {'name': 'coef0',
             'dispname': 'Independent term (poly, sigmoid)',
             'type': FloatType(min_value=0.0, default=1)},
            {'name': 'alpha',
             'dispname': 'Ridge regression hyperparameter',
             'type': FloatType(min_value=0.0, default=1)},
            {'name': 'remove_zero_eig',
             'dispname': 'Remove components with zero eigenvalue',
             'type': BoolType(default=False)},
        ],
        [
            "Solver",
            {'name': 'eigen_solver',
             'dispname': 'Eigensolver',
             'type': StringSelectionType([
                 'auto', 'dense', 'arpack'], default='auto')},
            {'name': 'tol',
             'dispname': 'Tolerance',
             'type': FloatType(default=0.0)},
            {'name': 'max_iter',
             # Fixed typo in the user-facing display name ("iteratins").
             'dispname': 'Max iterations',
             'type': UnionType(
                 [IntType(min_value=1), NoneType()], default=None)},
            {'name': 'random_state',
             'dispname': 'Random seed',
             'type': UnionType([NoneType(), IntType()], default=None)},
            {'name': 'n_jobs',
             # Capitalized for consistency with the other display names.
             'dispname': 'Number of jobs',
             'type': IntType(min_value=-1, default=1)},
        ]
    ]
    descriptor.set_info(info, doc_class=sklearn.decomposition.KernelPCA)

    descriptor.set_attributes([
        {'name': 'lambdas_'},
        {'name': 'alphas_'},
        {'name': 'dual_coef_', 'cnames': names_from_x},
        {'name': 'X_transformed_fit_'},
        {'name': 'X_fit_'},
    ], doc_class=sklearn.decomposition.KernelPCA)

    parameters = node.parameters()
    SyML_abstract.generate_parameters(parameters, descriptor)

    inputs = Ports([])
    outputs = Ports([ModelPort('Model', 'model')])
    __doc__ += SyML_abstract.generate_docstring(
        description, descriptor.info, descriptor.attributes, inputs, outputs)

    def execute(self, node_context):
        """Create an unfitted sklearn KernelPCA estimator configured from
        the node parameters and store it on the output model port."""
        model = node_context.output['model']
        desc = self.__class__.descriptor
        model.set_desc(desc)

        kwargs = self.__class__.descriptor.get_parameters(
            node_context.parameters)
        # copy_X is not exposed as a user parameter; always keep a copy of
        # the training data on the fitted estimator.
        kwargs['copy_X'] = True
        skl = sklearn.decomposition.KernelPCA(**kwargs)

        model.set_skl(skl)
        model.save()
class PLSRegressionCrossDecomposition(SyML_abstract, node.Node):
    """Partial Least Squares (PLS) cross-decomposition is a statistical
    method used to find the fundamental relations between two matrices,
    typically predictors (X) and responses (Y). It projects both X and Y
    into a lower-dimensional subspace such that the covariance between
    transformed(X) and transformed(Y) is maximal.

    PLS draws similarities with Principal Component Regression (PCR),
    where the samples are first projected into a lower-dimensional
    subspace and the targets y are predicted using transformed(X). One
    issue with PCR is that the dimensionality reduction is unsupervised
    and may lose important variables: PCR keeps the features with the
    most variance, but a feature with small variance may still be
    relevant for predicting the target. In a way, PLS allows for the
    same kind of dimensionality reduction while taking the targets y
    into account.
    """

    name = 'Partial Least Squares cross-decomposition (PLS regression)'
    author = 'Mathias Broxvall'
    icon = 'PCA.svg'
    description = (
        'Finds the fundamental relations between two matrices X and Y, ie. '
        'it finds the (multidimensional) direction in X that best explains '
        'maximum multidimensional direction in Y. See also PCA-analysis')
    nodeid = 'org.sysess.sympathy.machinelearning.pls'
    tags = Tags(Tag.MachineLearning.DimensionalityReduction)

    # Descriptor drives both the configuration GUI and the generated docs.
    descriptor = Descriptor()
    descriptor.name = name
    info = [
        {'name': 'n_components',
         'dispname': 'Number of components to keep',
         'type': IntType(min_value=1, default=2)},
        {'name': 'scale',
         'dispname': 'Scale the data',
         'type': BoolType(default=True)},
        {'name': 'max_iter',
         'dispname': 'Max iterations',
         'type': IntType(min_value=1, default=500)},
        {'name': 'tol',
         'dispname': 'Tolerance',
         'type': FloatType(default=0.0)},
    ]
    descriptor.set_info(
        info, doc_class=sklearn.cross_decomposition.PLSRegression)

    # Row/column naming callbacks give the per-component attribute tables
    # readable headers in the attribute viewer.
    descriptor.set_attributes([
        {'name': 'x_weights_', 'rnames': names_from_x,
         'cnames': names_from_prefix('component ')},
        {'name': 'y_weights_', 'rnames': names_from_y,
         'cnames': names_from_prefix('component ')},
        {'name': 'x_loadings_', 'rnames': names_from_x,
         'cnames': names_from_prefix('component ')},
        {'name': 'y_loadings_', 'rnames': names_from_y,
         'cnames': names_from_prefix('component ')},
        {'name': 'x_scores_', 'cnames': names_from_prefix('component ')},
        {'name': 'y_scores_', 'cnames': names_from_prefix('component ')},
        {'name': 'x_rotations_', 'rnames': names_from_x,
         'cnames': names_from_prefix('component ')},
        {'name': 'y_rotations_', 'rnames': names_from_y},
        {'name': 'coef_'},
        {'name': 'n_iter_'},
    ], doc_class=sklearn.cross_decomposition.PLSRegression)

    parameters = node.parameters()
    SyML_abstract.generate_parameters(parameters, descriptor)

    inputs = Ports([])
    outputs = Ports([ModelPort('Model', 'model')])
    __doc__ += SyML_abstract.generate_docstring(
        description, descriptor.info, descriptor.attributes, inputs, outputs)

    def execute(self, node_context):
        """Attach this node's descriptor and a fresh (unfitted) sklearn
        PLSRegression estimator, built from the node parameters, to the
        output model port."""
        out_model = node_context.output['model']
        out_model.set_desc(self.__class__.descriptor)

        estimator_kwargs = self.__class__.descriptor.get_parameters(
            node_context.parameters)
        out_model.set_skl(
            sklearn.cross_decomposition.PLSRegression(**estimator_kwargs))
        out_model.save()