# Source code for node_analysis
# This file is part of Sympathy for Data.
# Copyright (c) 2017, Combine Control Systems AB
#
# SYMPATHY FOR DATA COMMERCIAL LICENSE
# You should have received a link to the License with Sympathy for Data.
import numpy as np
from sklearn.cluster import SpectralCoclustering
from sympathy.api import node
from sympathy.api.nodeconfig import Port
from sympathy.api.nodeconfig import Ports
from sympathy.api.nodeconfig import Tag
from sympathy.api.nodeconfig import Tags
from sympathy.api.exceptions import SyDataError
from sympathy.api import table
from sylib.machinelearning.utility import table_to_array
# [docs]
class SpectralCoClusteringNode(node.Node):
    """Reorder a square table using spectral co-clustering.

    The input table is converted to an array and must have as many columns
    as rows. The rows are partitioned with scikit-learn's
    ``SpectralCoclustering`` and then both rows and columns are permuted with
    the same ordering so that members of one cluster end up adjacent.

    Outputs:
        result: the permuted data, optionally preceded by a "name" column
            (original column name per row) and a "cluster" column (cluster
            ID per row).
        clusters: one table per cluster with the columns "idx", "name" and,
            when sub clustering is enabled, "subcluster".
    """

    author = 'Mathias Broxvall'
    icon = 'spectral_coclustering.svg'
    description = ('Rearranges rows/columns so that cluster intra-connections '
                   'become adjacent and move towards the diagonal.')
    name = 'Spectral co-clustering'
    tags = Tags(Tag.MachineLearning.Unsupervised)
    nodeid = 'com.sympathyfordata.advancedmachinelearning' \
             '.spectral_co_clustering'

    parameters = node.parameters()
    parameters.set_boolean(
        'name',
        value=False,
        label='Create name column',
        description='Creates an additional column first in the data showing '
                    'the original column name for each corresponding row')
    parameters.set_boolean(
        'clust_col',
        value=False,
        label='Create cluster column',
        description='Creates an additional column first in the data showing '
                    'cluster ID that each row belongs to')
    parameters.set_boolean(
        'sub_clustering',
        value=True,
        label='Sub clustering',
        description='Performs one level of additional clustering to order '
                    'data within a cluster')
    parameters.set_integer(
        'clusters',
        value=3,
        label='Clusters',
        description='Number of clusters to look for')

    inputs = Ports([
        Port.Table('data', name='data'),
        # Port.Table('weights', name='weights', n=(0, 1, 0)),
    ])
    outputs = Ports([
        Port.Table('Table with results', name='result'),
        Port.Tables('List of columns in each cluster',
                    name='clusters', n=(0, 1, 0)),
    ])

    def execute(self, node_context):
        """Cluster the input table and write the reordered result.

        Raises:
            SyDataError: if the input table has no rows, or if the number
                of columns differs from the number of rows.
        """
        data_tbl = node_context.input['data']
        output = node_context.output['result']
        # NOTE(review): the 'clusters' port is declared with n=(0, 1, 0);
        # confirm that this lookup behaves as intended when the port has
        # not been added to the node.
        out_clusters = node_context.output['clusters']

        create_name = node_context.parameters['name'].value
        n_clusters = node_context.parameters['clusters'].value
        sub_clustering = node_context.parameters['sub_clustering'].value
        # Kept under its own name: this flag must not be confused with the
        # per-row list of cluster IDs accumulated below.
        create_cluster_col = node_context.parameters['clust_col'].value

        if data_tbl.number_of_rows() == 0:
            raise SyDataError("Empty table")
        data = table_to_array(data_tbl)
        if data.shape[1] != data.shape[0]:
            raise SyDataError('Input table must have exactly as many columns '
                              'as rows')
        col_names = [col.name for col in data_tbl.cols()]

        clustering = SpectralCoclustering(n_clusters)
        clustering.fit(data)

        order = []        # final row/column permutation
        cluster_ids = []  # cluster ID for every entry of ``order``
        all_rows = np.arange(data.shape[0])

        for cluster in range(n_clusters):
            # ``rows_`` is indexed as a boolean membership mask per cluster.
            indices = all_rows[clustering.rows_[cluster, :]]
            sub_clusters = None

            if sub_clustering:
                if len(indices) > 2:
                    n_subclusters = max(2, len(indices) // 2)
                    # Square sub-matrix restricted to this cluster's members.
                    sub_data = data[np.ix_(indices, indices)]
                    subclustering = SpectralCoclustering(n_subclusters)
                    subclustering.fit(sub_data)

                    sub_order = []
                    sub_clusters = []
                    for sub_cluster in range(n_subclusters):
                        sub_indices = indices[
                            subclustering.rows_[sub_cluster, :]]
                        sub_order += list(sub_indices)
                        sub_clusters += [sub_cluster] * len(sub_indices)
                    # Members are now ordered by their sub-cluster.
                    indices = sub_order
                else:
                    # Too few members to split further.
                    sub_clusters = [0] * len(indices)

            order += list(indices)
            cluster_ids += [cluster] * len(indices)

            index_array = np.asarray(indices, dtype=int)
            tbl = table.File()
            tbl.set_column_from_array("idx", index_array)
            tbl.set_column_from_array(
                "name", np.array(col_names)[index_array])
            if sub_clustering:
                # Explicit dtype keeps an empty cluster from yielding a
                # float column.
                tbl.set_column_from_array(
                    "subcluster", np.array(sub_clusters, dtype=int))
            out_clusters.append(tbl)

        if create_name:
            output.set_column_from_array("name", np.array(col_names)[order])
        if create_cluster_col:
            output.set_column_from_array(
                "cluster", np.array(cluster_ids, dtype=int))
        # Apply the same permutation to columns and to the rows within them.
        for col in order:
            output.set_column_from_array(col_names[col], data[:, col][order])