Source code for node_clustering2

# This file is part of Sympathy for Data.
# Copyright (c) 2023, Combine Control Systems AB
#
# SYMPATHY FOR DATA COMMERCIAL LICENSE
# You should have received a link to the License with Sympathy for Data.
"""
Some of the docstrings for this module have been automatically
extracted from the `scikit-learn <http://scikit-learn.org/>`_ library
and are covered by their respective licenses.
"""

import sklearn
import sklearn.cluster

from sympathy.api import node
from sympathy.api.nodeconfig import Ports, Tag, Tags

from sylib.machinelearning.model import ModelPort
from sylib.machinelearning.abstract_nodes import SyML_abstract
from sylib.machinelearning.descriptors import Descriptor
from sylib.machinelearning.descriptors import FloatType
from sylib.machinelearning.descriptors import IntType
from sylib.machinelearning.descriptors import StringSelectionType

from sylib_aml.clustering2 import DBScanDescriptor

from packaging import version as pversion
sklearn_version = pversion.parse(sklearn.__version__)


class DBScan(SyML_abstract, node.Node):
    name = 'Density based spatial clustering (DBSCAN)'
    author = 'Mathias Broxvall'
    icon = 'DBScan.svg'
    description = (
        'Finds core samples of high density and expands clusters from them. '
        'Good for data which contains clusters of similar density. '
        'This model can be given to predict directly without using fit.'
    )
    nodeid = 'com.sympathyfordata.advancedmachinelearning.dbscan'
    tags = Tags(Tag.MachineLearning.Unsupervised)

    descriptor = DBScanDescriptor()
    descriptor.name = name
    info = [
        [
            "Model",
            {'name': 'eps',
             'dispname': 'Max distance between two samples',
             'type': FloatType(min_value=0, default=0.5)},
            {'name': 'min_samples',
             'dispname': 'Min samples for core point',
             'type': IntType(default=5)},
            {'name': 'metric',
             'dispname': 'Metric',
             'type': StringSelectionType([
                 'euclidean', 'cityblock', 'cosine', 'l1', 'l2', 'manhattan',
                 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice',
                 'hamming', 'jaccard', 'mahalanobis', 'minkowski',
                 'rogerstanimoto', 'russellrao', 'seuclidean',
                 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'],
                 default='euclidean')},
        ],
        [
            "Advanced",
            {'name': 'algorithm',
             'dispname': 'Algorithm',
             'type': StringSelectionType(
                 ['auto', 'ball_tree', 'kd_tree', 'brute'], default='auto')},
            {'name': 'leaf_size',
             'dispname': 'Leaf size (for BallTree or cKDTree)',
             'type': IntType(default=30)},
            {'name': 'p',
             'dispname': 'Minkowski metric power',
             'type': FloatType(min_value=1.0, default=2.0)},
            {'name': 'n_jobs',
             'dispname': 'Number of jobs',
             'type': IntType(min_value=-1, default=1)},
        ]
    ]

    descriptor.set_info(info, doc_class=sklearn.cluster.DBSCAN)

    descriptor.set_attributes([
        {'name': attr_name} for attr_name in [
            'core_sample_indices_', 'components_', 'labels_'
        ]], doc_class=sklearn.cluster.DBSCAN)

    parameters = node.parameters()
    SyML_abstract.generate_parameters(parameters, descriptor)

    inputs = Ports([])
    outputs = Ports([ModelPort('Model', 'model')])
    __doc__ = SyML_abstract.generate_docstring(
        '', descriptor.info, descriptor.attributes, inputs, outputs)

    def execute(self, node_context):
        model = node_context.output['model']
        desc = self.__class__.descriptor
        model.set_desc(desc)

        kwargs = self.__class__.descriptor.get_parameters(
            node_context.parameters)
        skl = sklearn.cluster.DBSCAN(**kwargs)

        model.set_skl(skl)
        model.save()
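
# Illustrative sketch, not part of the shipped node: how the sklearn
# estimator configured by DBScan above can be used on its own, with the
# default values declared in `info`. The helper name and the toy data are
# hypothetical and exist only for this example.
def _example_dbscan_usage():
    import numpy as np

    # Six nearby points form one dense cluster; the last point is an outlier.
    X = np.array([
        [1.0, 2.0], [1.1, 2.1], [0.9, 1.9],
        [1.05, 2.05], [0.95, 1.95], [1.0, 2.1],
        [8.0, 8.0]])
    skl = sklearn.cluster.DBSCAN(
        eps=0.5, min_samples=5, metric='euclidean',
        algorithm='auto', leaf_size=30, p=2.0, n_jobs=1)
    labels = skl.fit_predict(X)  # noise points are labelled -1
    # After fitting, the attributes exposed by the node are available:
    # skl.core_sample_indices_, skl.components_ and skl.labels_.
    return labels
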
class AgglomerativeClustering(SyML_abstract, node.Node):
    name = 'Agglomerative Clustering'
    author = 'Emil Staf'
    icon = 'agglomerative_clustering.svg'
    description = (
        'Recursively merges the pair of clusters that minimally increases a '
        'given linkage distance.')
    nodeid = 'com.sympathyfordata.advancedmachinelearning' \
             '.agglomerativeclustering'
    tags = Tags(Tag.MachineLearning.Unsupervised)

    descriptor = Descriptor()
    descriptor.name = name
    info = [
        {'name': 'n_clusters',
         'dispname': 'Number of clusters',
         'type': IntType(default=2)},
        {'name': 'affinity',
         'dispname': 'Affinity',
         'type': StringSelectionType(
             ['euclidean', 'l1', 'l2', 'manhattan', 'cosine', 'precomputed'],
             default='euclidean')},
        # memory?
        # connectivity?
        # compute_full_tree (only useful when specifying a connectivity
        # matrix)
        {'name': 'linkage',
         'type': StringSelectionType(
             ['ward', 'complete', 'average', 'single'], default='ward')},
    ]

    descriptor.set_info(
        info, doc_class=sklearn.cluster.AgglomerativeClustering)

    descriptor.set_attributes([
        {'name': attr_name} for attr_name in [
            'labels_', 'n_leaves_', 'n_components', 'children_'
        ]], doc_class=sklearn.cluster.AgglomerativeClustering)

    parameters = node.parameters()
    SyML_abstract.generate_parameters(parameters, descriptor)

    inputs = Ports([])
    outputs = Ports([ModelPort('Model', 'model')])
    __doc__ = SyML_abstract.generate_docstring(
        '', descriptor.info, descriptor.attributes, inputs, outputs)

    def execute(self, node_context):
        model = node_context.output['model']
        desc = self.__class__.descriptor
        model.set_desc(desc)

        kwargs = self.__class__.descriptor.get_parameters(
            node_context.parameters)

        params_120 = sklearn_version >= pversion.Version('1.2.0')
        # Change parameter name from affinity to metric
        if params_120:
            kwargs['metric'] = kwargs.pop('affinity')

        skl = sklearn.cluster.AgglomerativeClustering(**kwargs)

        model.set_skl(skl)
        model.save()
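
# Illustrative sketch, not part of the shipped node: the same affinity-to-
# metric rename that execute() performs for scikit-learn >= 1.2, shown
# against a plain sklearn estimator. The helper name and the toy data are
# hypothetical and exist only for this example.
def _example_agglomerative_usage():
    import numpy as np

    # Two well-separated pairs of points, so two clusters are expected.
    X = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.0], [5.2, 4.9]])
    kwargs = {'n_clusters': 2, 'affinity': 'euclidean', 'linkage': 'ward'}
    if sklearn_version >= pversion.Version('1.2.0'):
        # scikit-learn >= 1.2 renamed the keyword argument to `metric`.
        kwargs['metric'] = kwargs.pop('affinity')
    skl = sklearn.cluster.AgglomerativeClustering(**kwargs)
    labels = skl.fit_predict(X)  # one cluster label per row of X
    return labels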