# Source code for node_select_columns_in_table_dataset

# This file is part of Sympathy for Data.
# Copyright (c) 2021 Combine Control Systems
#
# SYMPATHY FOR DATA COMMERCIAL LICENSE
# You should have received a link to the License with Sympathy for Data.
"""
There are many situations where you may want to throw away some of the columns
of a table. Perhaps the amount of data is large and you want to trim it to
increase performance, or perhaps some column was just needed as an intermediary
step in some analysis.
"""

from sympathy.api import node as synode
from sympathy.api.nodeconfig import Ports, Tag, Tags, adjust

from sylib_aml.dataset import DatasetPort


# Node identifiers of Table column-selection nodes.
# NOTE(review): not referenced anywhere in the visible part of this module;
# presumably these are the standard Table "select columns" nodes that the
# dataset node below corresponds to -- confirm where this list is consumed.
BASE_SELECT_NODEIDS = [
    "org.sysess.sympathy.data.table.selecttablecolumns",
    "org.sysess.sympathy.data.table.selecttablecolumnstype",
    "org.sysess.sympathy.data.table.selecttablecolumnsregex",
    "org.sysess.sympathy.data.table.selecttablecolumnsfromtable",
]


class SelectColumnsTableDataset(synode.Node):
    """
    Select columns to propagate.

    The node loads the incoming dataset, removes entries from its
    ``column_config`` mapping according to the ``columns`` parameter and
    saves the result on the output port.
    """

    name = "Select columns in Table Dataset (Experimental)"
    nodeid = (
        "com.sympathyfordata.advancedmachinelearning."
        "selectcolumnsintabledataset")
    author = "Jannes Germishuys"
    icon = "convert_column_types_table_ds.svg"
    description = "Select columns from input to propagate to output."
    tags = Tags(Tag.MachineLearning.Processing)

    inputs = Ports([DatasetPort("Dataset", "dataset")])
    outputs = Ports([DatasetPort("Dataset", "dataset")])

    parameters = synode.parameters()
    editor = synode.editors.multilist_editor(edit=True, mode=True)
    parameters.set_list(
        "columns",
        label="Select columns",
        description="Select columns.",
        value=[],
        editor=editor,
    )

    def adjust_parameters(self, node_context):
        """Adjust the ``columns`` parameter against the first input port."""
        adjust(node_context.parameters["columns"], node_context.input[0])

    def update_parameters(self, old_params):
        """
        Migrate a previously stored parameter structure in place.

        Nothing is changed when the ``columns`` editor already has a truthy
        ``mode`` entry. Otherwise the selection mode is derived from the
        ``complement`` parameter, which is removed when present: a truthy
        value maps to ``"unselected"``, anything else (including a missing
        ``complement``) maps to ``"selected"``.
        """
        cols = old_params["columns"]
        if cols.editor.get("mode", False):
            return

        try:
            complement = old_params["complement"].value
            del old_params["complement"]
        except KeyError:
            complement = False

        cols.multiselect_mode = "unselected" if complement else "selected"

    def execute(self, node_context):
        """
        Drop the columns that are not selected and save the dataset.

        In ``"selected"`` mode the columns named by the parameter are kept;
        in ``"unselected"`` mode those columns are removed instead. Names
        that do not exist in the dataset are ignored in both modes.

        Raises:
            ValueError: If the parameter uses any other selection mode.
        """
        dataset_port = node_context.input["dataset"]
        dataset_port.load()
        dataset = dataset_port.get_ds()
        column_config = dataset["column_config"]
        # Snapshot the names first: entries are deleted from the mapping
        # below, so it must not be iterated directly.
        all_cols = list(column_config.keys())

        columns_param = node_context.parameters["columns"]
        mode = columns_param.multiselect_mode
        if mode == "selected":
            keep_cols = set(columns_param.value_names)
        elif mode == "unselected":
            keep_cols = set(all_cols) - set(columns_param.value_names)
        else:
            raise ValueError(
                "This method of selection is not currently supported for "
                "datasets. Please try selected or unselected.")

        # Walk the columns in their original order so removal is
        # deterministic rather than following set iteration order.
        for col in all_cols:
            if col not in keep_cols:
                del column_config[col]

        output_port = node_context.output["dataset"]
        output_port.set_ds(dataset)
        output_port.save()