# Source code for node_transformdataset

# This file is part of Sympathy for Data.
# Copyright (c) 2021 Combine Control Systems
#
# SYMPATHY FOR DATA COMMERCIAL LICENSE
# You should have received a link to the License with Sympathy for Data.
from sympathy.api import node, qt2, ParameterView
from sympathy.api.nodeconfig import Ports, Tag, Tags, adjust
from sympathy.api import exceptions

from sylib.imageprocessing.algorithm_selector import (
    ImageFiltering_abstract,
    AlgorithmParameterWidget,
)
from sylib_aml.dataset import DatasetPort

import ast
import numpy as np

QtGui = qt2.import_module("QtGui")
QtCore = qt2.import_module("QtCore")
QtWidgets = qt2.import_module("QtWidgets")


class TransformWidget(ParameterView):
    """
    Parameter view used to configure a dataset transformation.

    The view stacks two widgets vertically: the GUI of the ``columns``
    parameter and an ``AlgorithmParameterWidget`` built from the same
    parameter group.
    """

    def __init__(
        self, parameters, algorithms, options_list, options_types, parent=None
    ):
        """
        Build the view and lay out its child widgets.

        :param parameters: parameter group; must contain a ``columns`` entry.
        :param algorithms: forwarded to ``AlgorithmParameterWidget``.
        :param options_list: forwarded to ``AlgorithmParameterWidget``.
        :param options_types: forwarded to ``AlgorithmParameterWidget``.
        :param parent: optional parent widget, passed to the base class.
        """
        super().__init__(parent=parent)
        self._parameters = parameters

        # Children are created in display order: column picker first, then
        # the algorithm selector.
        children = (
            self._parameters["columns"].gui(),
            AlgorithmParameterWidget(
                self._parameters, algorithms, options_list, options_types
            ),
        )

        vbox = QtWidgets.QVBoxLayout()
        for child in children:
            vbox.addWidget(child)
        self.setLayout(vbox)


class TransformImageDataset(ImageFiltering_abstract, node.Node):
    # Node that appends one image transform (and its keyword values) to the
    # transform lists stored in an image dataset description. The class
    # docstring is generated below via ``generate_docstring`` and assigned to
    # ``__doc__``, so it is documented with comments rather than a literal
    # docstring.
    name = "Transform Image Dataset (Experimental)"
    nodeid = (
        "com.sympathyfordata.advancedmachinelearning.transformimagedataset")
    author = "Jannes Germishuys"
    icon = "image_ds_transform.svg"
    tags = Tags(Tag.MachineLearning.Processing)

    # Choose parameters for each transform
    PARAMS_DICT = {
        "Grayscale": ["Number of output channels"],
        "Center Crop": ["Width", "Height"],
        "Normalize": ["Mean", "Standard deviation"],
        "Pad": ["Padding size", "Fill", "Padding mode"],
        "Resize": ["Width", "Height", "Interpolation"],
        "To Tensor": [],
        "To PIL Image": [],
    }

    # Set-up structure for each algorithm (parameters, parameter types, default
    # parameter values)
    TRANSFORM_ALGS = {
        "Grayscale": {
            "description": (
                "Convert image to grayscale. If the image is torch "
                "Tensor, it is expected to have […, 3, H, W] shape, where … "
                "means an arbitrary number of leading dimensions"),
            "Number of output channels": (
                "(1 or 3) number of channels desired for output image"),
        },
        "Center Crop": {
            "description": (
                "Crops the given image at the center. If the image is torch "
                "Tensor, it is expected to have […, H, W] shape, where … "
                "means an arbitrary number of leading dimensions. "
                "If image size is smaller than output size along any edge, "
                "image is padded with 0 and then center cropped."),
            "Width": "Desired output width of the crop",
            "Height": "Desired output height of the crop",
        },
        "Normalize": {
            "description": (
                "Normalize a tensor image with mean and standard deviation. "
                "This transform does not support PIL Image. Given mean: "
                "(mean[1],...,mean[n]) and std: (std[1],..,std[n]) for n "
                "channels, this transform will normalize each channel of the "
                "input torch.*Tensor i.e., output[channel] = "
                "(input[channel] - mean[channel]) / std[channel]"),
            "Mean": "Sequence of means for each channel.",
            "Standard deviation": (
                "Sequence of standard deviations for each channel."),
        },
        "Pad": {
            "description": (
                "Pad the given image on all sides with the given “pad” value. "
                "If the image is torch Tensor, it is expected to have "
                "[…, H, W] shape, where … means at most 2 leading dimensions "
                "for mode reflect and symmetric, at most 3 leading dimensions "
                "for mode edge, and an arbitrary number of leading dimensions "
                "for mode constant"),
            "Padding size": (
                "Padding on each border. If a single int is provided this is "
                "used to pad all borders. If sequence of length 2 is provided "
                "this is the padding on left/right and top/bottom "
                "respectively. If a sequence of length 4 is provided this is "
                "the padding for the left, top, right and bottom borders "
                "respectively."),
            "Fill": (
                "Pixel fill value for constant fill. Default is 0. If a tuple "
                "of length 3, it is used to fill R, G, B channels "
                "respectively. This value is only used when the padding_mode "
                "is constant. Only number is supported for torch Tensor. Only "
                "int or str or tuple value is supported for PIL Image."),
            "Padding mode": (
                "Type of padding. Should be: constant, edge, reflect or "
                "symmetric. Default is constant. - constant: pads with a "
                "constant value, this value is specified with fill - edge: "
                "pads with the last value at the edge of the image. If input "
                "a 5D torch Tensor, the last 3 dimensions will be padded "
                "instead of the last 2 - reflect: pads with reflection of "
                "image without repeating the last value on the edge. For "
                "example, padding [1, 2, 3, 4] with 2 elements on both sides "
                "in reflect mode will result in [3, 2, 1, 2, 3, 4, 3, 2] "
                "- symmetric: pads with reflection of image repeating the "
                "last value on the edge. For example, padding [1, 2, 3, 4] "
                "with 2 elements on both sides in symmetric mode will result "
                "in [2, 1, 1, 2, 3, 4, 4, 3]"),
        },
        "Resize": {
            "description": (
                "Resize the input image to the given size. If the image is "
                "torch Tensor, it is expected to have […, H, W] shape, where "
                "… means an arbitrary number of leading dimensions"),
            "Width": "Desired output width.",
            "Height": "Desired output height.",
            "Interpolation": (
                "Desired interpolation enum defined by "
                "torchvision.transforms.InterpolationMode. Default is "
                "InterpolationMode.BILINEAR. If input is Tensor, only "
                "InterpolationMode.NEAREST, InterpolationMode.BILINEAR and "
                "InterpolationMode.BICUBIC are supported. For backward "
                "compatibility integer values (e.g. PIL.Image.NEAREST) are "
                "still acceptable."),
        },
        "To Tensor": {
            "description": (
                "Convert a PIL Image or numpy.ndarray to tensor. This "
                "transform does not support torchscript."),
        },
        "To PIL Image": {
            "description": (
                "Convert a tensor or an ndarray to PIL Image. This transform "
                "does not support torchscript."),
        },
    }

    # Unique parameter labels in first-seen order. ``dict.fromkeys`` is used
    # instead of ``set`` so the order does not depend on string hash
    # randomisation and is the same on every run.
    TRANSFORM_PARAMETERS = list(dict.fromkeys(
        label for labels in PARAMS_DICT.values() for label in labels
    ))

    # GUI label -> keyword name stored in the dataset's "transforms_values".
    TEXT_PARAMS = {
        "Number of output channels": "num_output_channels",
        "Width": "width",
        "Height": "height",
        "Mean": "mean",
        "Standard deviation": "std",
        "Padding size": "padding",
        "Fill": "fill",
        "Padding mode": "padding_mode",
        "Interpolation": "interpolation",
    }

    # GUI label -> parameter type (a list of strings denotes a fixed set of
    # choices).
    TRANSFORM_TYPES = {
        "Number of output channels": int,
        "Width": int,
        "Height": int,
        "Mean": int,
        "Standard deviation": int,
        "Padding size": int,
        "Fill": int,
        "Padding mode": str,
        "Interpolation": ["Bilinear", "Bicubic", "Nearest"],
    }

    # GUI label -> default value.
    # NOTE(review): a standard deviation of 0 would divide by zero in the
    # formula quoted in the "Normalize" description above -- confirm that
    # this default is intended.
    TRANSFORM_DEFAULTS = {
        "Width": 100,  # (100, 100),
        "Height": 100,
        "Standard deviation": 0,  # [],
        "Padding size": 1,
        "Mean": 0,  # [],
        "Interpolation": "Nearest",
        "Padding mode": "constant",
        "Fill": 1,
        "Number of output channels": 1,
    }

    algorithms = TRANSFORM_ALGS
    options_list = TRANSFORM_PARAMETERS
    options_types = TRANSFORM_TYPES
    options_default = TRANSFORM_DEFAULTS
    options_converted = TEXT_PARAMS

    parameters = node.parameters()
    parameters.set_string(
        "algorithm",
        value=next(iter(algorithms)),
        description="",
        label="Algorithm",
    )
    ImageFiltering_abstract.generate_parameters(
        parameters, options_types, options_default
    )

    description = "Transforms images within an image dataset"
    inputs = Ports([DatasetPort("Dataset", "dataset")])
    outputs = Ports([DatasetPort("Dataset", "dataset")])
    __doc__ = ImageFiltering_abstract.generate_docstring(
        description, algorithms, options_list, inputs, outputs
    )

    def execute(self, node_context):
        """
        Append the selected transform to the input image dataset.

        Reads the chosen algorithm and its parameter values from
        ``node_context.parameters``, renames the parameters to their keyword
        names (``options_converted``), appends the algorithm name to
        ``"transforms"`` and the keyword dict to ``"transforms_values"`` of
        the loaded dataset description, and writes the result to the
        ``dataset`` output port.

        :param node_context: node context giving access to ``parameters``,
            ``input`` and ``output``.
        :raises exceptions.SyDataError: if the input port yields no dataset
            ("Empty dataset") or its ``"dstype"`` is ``"table"``
            ("Incorrect data type").
        """
        node_cls = TransformImageDataset
        transform = node_context.parameters["algorithm"].value

        # Every key of the algorithm entry except "description" is the GUI
        # label of one of its parameters. Algorithms without parameters
        # simply yield an empty dict.
        transform_values = {}
        for label in node_cls.algorithms[transform]:
            if label == "description":
                continue
            value = node_context.parameters[label].value
            # Tuple-typed options are parsed from their literal text. No
            # entry in TRANSFORM_TYPES is currently ``tuple``, so this branch
            # is inactive for the present option table.
            if node_cls.options_types[label] is tuple:
                value = ast.literal_eval(value)
            transform_values[node_cls.options_converted[label]] = value

        # Combine width and height into the single "size" entry.
        # NOTE(review): torchvision documents ``size`` as (h, w) while this
        # builds (width, height) -- confirm what the consumer of
        # "transforms_values" expects before changing the order.
        if transform in ("Resize", "Center Crop"):
            transform_values["size"] = (
                transform_values.pop("width"),
                transform_values.pop("height"),
            )

        ds_obj = node_context.input["dataset"]
        ds_obj.load()
        ds_skl = ds_obj.get_ds()
        if ds_skl is None:
            raise exceptions.SyDataError("Empty dataset")
        if ds_skl["dstype"] == "table":
            raise exceptions.SyDataError("Incorrect data type")

        ds_skl["transforms"].append(transform)
        ds_skl["transforms_values"].append(transform_values)

        # Only these keys are forwarded to the output dataset.
        forwarded_keys = (
            "dstype", "paths", "transforms", "transforms_values", "labels")
        ds = {key: ds_skl[key] for key in forwarded_keys}

        out_port = node_context.output["dataset"]
        out_port.set_ds(ds)
        out_port.save()
class TransformTableDataset(ImageFiltering_abstract, node.Node):
    # Node that records one preprocessing transform (and its keyword values)
    # on the selected columns of a table dataset description. The class
    # docstring is generated below via ``generate_docstring`` and assigned to
    # ``__doc__``, so it is documented with comments rather than a literal
    # docstring.
    name = "Transform Table Dataset (Experimental)"
    nodeid = (
        "com.sympathyfordata.advancedmachinelearning.transformtabledataset")
    author = "Jannes Germishuys"
    icon = "table_ds_transform.svg"
    tags = Tags(Tag.MachineLearning.Processing)

    # Choose parameters for each transform
    PARAMS_DICT = {
        "SimpleImputer": [
            "Missing values",
            "Strategy for missing values",
            "Fill value for missing values",
            "Verbose",
            "Copy",
            "Add indicator",
        ],
        "Binarizer": ["threshold"],
        "LabelEncoder": ["Use categorical"],
        "OneHotEncoder": [
            "Categories",
            "Drop category",
            "Transformed array in sparse format",
            "Desired data type",
            "Handle Unknown",
        ],
        "PolynomialFeatures": [
            "Degree",
            "Only interaction features produced",
            "Include bias",
            "Order",
            "Preserve as dataframe",
        ],
        "RobustScaler": [
            "Center the data before scaling",
            "Scale to interquartile range",
            "IQR Quantile range - Lower",
            "IQR Quantile range - Upper",
            "Copy",
        ],
        "StandardScaler": [
            "Copy", "Center the data", "Scale to unit variance"],
    }

    # Set-up structure for each algorithm (parameters, parameter types, default
    # parameter values)
    TRANSFORM_ALGS = {
        "SimpleImputer": {
            "description": (
                "Simple imputation for missing data in tabular datasets"),
            "Missing values": (
                "The placeholder for the missing values. All occurrences "
                "of missing_values will be imputed. For pandas’ "
                "dataframes with nullable integer dtypes with missing "
                "values, missing_values should be set to np.nan, since "
                "pd.NA will be converted to np.nan."),
            "Strategy for missing values": (
                "The imputation strategy. If “mean”, then replace missing "
                "values using the mean along each column. Can only be "
                "used with numeric data. If “median”, then replace "
                "missing values using the median along each column. Can "
                "only be used with numeric data. If “most_frequent”, "
                "then replace missing using the most frequent value along "
                "each column. Can be used with strings or numeric data. "
                "If there is more than one such value, only the smallest "
                "is returned. If “constant”, then replace missing values "
                "with fill_value. Can be used with strings or numeric "
                "data."),
            "Fill value for missing values": (
                "When strategy == “constant”, fill_value is used to "
                "replace all occurrences of missing_values. If left to "
                "the default, fill_value will be 0 when imputing "
                "numerical data and “missing_value” for strings or object "
                "data types."),
            "Verbose": "Controls the verbosity of the imputer.",
            "Copy": (
                "If True, a copy of X will be created. If False, "
                "imputation will be done in-place whenever possible. Note "
                "that, in the following cases, a new copy will always be "
                "made, even if copy=False: If X is not an array of "
                "floating values; If X is encoded as a CSR matrix; If "
                "add_indicator=True."),
            "Add indicator": (
                "If True, a MissingIndicator transform will stack onto "
                "output of the imputer’s transform. This allows a "
                "predictive estimator to account for missingness despite "
                "imputation. If a feature has no missing values at "
                "fit/train time, the feature won’t appear on the missing "
                "indicator even if there are missing values at "
                "transform/test time."),
        },
        "Binarizer": {
            "description": (
                "Binarize data (set feature values to 0 or 1) according "
                "to a threshold."),
            "threshold": (
                "Feature values below or equal to this are replaced by 0, "
                "above it by 1. Threshold may not be less than 0 for "
                "operations on sparse matrices."),
        },
        "LabelEncoder": {
            "description": (
                "Encode target labels with value between 0 and "
                "n_classes-1."),
            "Use categorical": "",
        },
        "OneHotEncoder": {
            "description": (
                "Encode categorical features as a one-hot numeric array."),
            "Categories": (
                "Categories (unique values) per feature: ‘auto’ : "
                "Determine categories automatically from the training "
                "data. list : categories[i] holds the categories expected "
                "in the ith column. The passed categories should not mix "
                "strings and numeric values within a single feature, and "
                "should be sorted in case of numeric values. The used "
                "categories can be found in the `categories_` attribute."),
            "Drop category": (
                "Specifies a methodology to use to drop one of the "
                "categories per feature. This is useful in situations "
                "where perfectly collinear features cause problems, such "
                "as when feeding the resulting data into a neural network "
                "or an unregularized regression. However, dropping one "
                "category breaks the symmetry of the original "
                "representation and can therefore induce a bias in "
                "downstream models, for instance for penalized linear "
                "classification or regression models. None : retain all "
                "features (the default). ‘first’ : drop the first "
                "category in each feature. If only one category is "
                "present, the feature will be dropped entirely. "
                "‘if_binary’ : drop the first category in each feature "
                "with two categories. Features with 1 or more than 2 "
                "categories are left intact. array : drop[i] "
                "is the category in feature X[:, i] that should be "
                "dropped"),
            "Transformed array in sparse format": (
                "Will return sparse matrix if set True else will return "
                "an array."),
            "Desired data type": "Desired dtype of output.",
            "Handle Unknown": (
                "Whether to raise an error or ignore if an unknown "
                "categorical feature is present during transform (default "
                "is to raise). When this parameter is set to ‘ignore’ and "
                "an unknown category is encountered during transform, the "
                "resulting one-hot encoded columns for this feature will "
                "be all zeros. In the inverse transform, an unknown "
                "category will be denoted as None."),
        },
        "PolynomialFeatures": {
            "description": "Generate polynomial and interaction features.",
            "Degree": "The degree of the polynomial features.",
            "Only interaction features produced": (
                "If true, only interaction features are produced: "
                "features that are products of at most degree distinct "
                "input features (so not x[1] ** 2, x[0] * x[2] ** 3, "
                "etc.)."),
            "Include bias": (
                "If True (default), then include a bias column, the "
                "feature in which all polynomial powers are zero (i.e. "
                "a column of ones - acts as an intercept term in a linear "
                "model)."),
            "Order": (
                "Order of output array in the dense case. ‘F’ order is "
                "faster to compute, but may slow down subsequent "
                "estimators."),
            "Preserve as dataframe": "Preserve as Dask dataframe",
        },
        "RobustScaler": {
            "description": (
                "Scale features using statistics that are robust to "
                "outliers."),
            "Center the data before scaling": (
                "If True, center the data before scaling. This will cause "
                "transform to raise an exception when attempted on sparse "
                "matrices, because centering them entails building a "
                "dense matrix which in common use cases is likely to be "
                "too large to fit in memory."),
            "Scale to interquartile range": (
                "If True, scale the data to interquartile range."),
            "IQR Quantile range - Lower": (
                "Quantile range used to calculate `scale_`."),
            "IQR Quantile range - Upper": (
                "Quantile range used to calculate `scale_`."),
            "Copy": (
                "If False, try to avoid a copy and do inplace scaling "
                "instead. This is not guaranteed to always work inplace; "
                "e.g. if the data is not a NumPy array or scipy.sparse "
                "CSR matrix, a copy may still be returned."),
        },
        "StandardScaler": {
            "description": (
                "Standardize features by removing the mean and scaling to "
                "unit variance"),
            "Copy": (
                "If False, try to avoid a copy and do inplace scaling "
                "instead. This is not guaranteed to always work inplace; "
                "e.g. if the data is not a NumPy array or scipy.sparse "
                "CSR matrix, a copy may still be returned."),
            "Center the data": (
                "If True, center the data before scaling. This does not "
                "work (and will raise an exception) when attempted on "
                "sparse matrices, because centering them entails building "
                "a dense matrix which in common use cases is likely to be "
                "too large to fit in memory."),
            "Scale to unit variance": (
                "If True, scale the data to unit variance (or "
                "equivalently, unit standard deviation)."),
        },
    }

    # Unique parameter labels in first-seen order. ``dict.fromkeys`` is used
    # instead of ``set`` so the order does not depend on string hash
    # randomisation and is the same on every run.
    TRANSFORM_PARAMETERS = list(dict.fromkeys(
        label for labels in PARAMS_DICT.values() for label in labels
    ))

    # GUI label -> parameter type (a list of strings denotes a fixed set of
    # choices).
    TRANSFORM_TYPES = {
        "threshold": float,
        "maximum_categories": int,
        "Positive Label": int,
        "Negative Label": int,
        "Transformed array in sparse format": bool,
        "norm": ["l1", "l2", "max"],
        "Use categorical": bool,
        "Categories": list,
        "Drop category": str,
        "Desired data type": str,
        "Handle Unknown": ["error", "ignore"],
        "Degree": int,
        "Only interaction features produced": bool,
        "Include bias": bool,
        "Order": ["C", "F"],
        "Preserve as dataframe": bool,
        "Copy": bool,
        "Center the data before scaling": bool,
        "Center the data": bool,
        "Scale to interquartile range": bool,
        "IQR Quantile range - Lower": float,
        "IQR Quantile range - Upper": float,
        "Scale to unit variance": bool,
        "Missing values": float,
        "Strategy for missing values": str,
        "Fill value for missing values": int,
        "Verbose": int,
        "Add indicator": bool,
    }

    # GUI label -> default value.
    TRANSFORM_DEFAULTS = {
        "threshold": 0.0,
        "maximum_categories": None,
        "Positive Label": 1,
        "Negative Label": 0,
        "Transformed array in sparse format": False,
        "norm": "l2",
        "Use categorical": True,
        "Drop category": None,
        "Categories": "auto",
        "Desired data type": "float",
        "Handle Unknown": "error",
        "Degree": 2,
        "Only interaction features produced": False,
        "Include bias": True,
        "Order": "C",
        "Preserve as dataframe": False,
        "Copy": True,
        "Center the data before scaling": True,
        "Center the data": True,
        "Scale to interquartile range": True,
        "IQR Quantile range - Lower": 25.0,
        "IQR Quantile range - Upper": 75.0,
        "Scale to unit variance": True,
        "Missing values": np.nan,
        "Strategy for missing values": "mean",
        "Fill value for missing values": None,
        "Verbose": 0,
        "Add indicator": False,
    }

    # GUI label -> keyword name stored in a column's "transforms_values".
    TEXT_PARAMS = {
        "threshold": "threshold",
        "maximum_categories": "maximum_categories",
        "Positive Label": "Positive Label",
        "Negative Label": "Negative Label",
        "Transformed array in sparse format": "sparse",
        "norm": "norm",
        "Use categorical": "use_categorical",
        "Drop category": "drop",
        "Categories": "categories",
        "Desired data type": "dtype",
        "Handle Unknown": "handle_unknown",
        "Degree": "degree",
        "Only interaction features produced": "interaction_only",
        "Include bias": "include_bias",
        "Order": "order",
        "Preserve as dataframe": "preserve_dataframe",
        "Copy": "copy",
        "Center the data before scaling": "with_centering",
        "Center the data": "with_mean",
        "Scale to interquartile range": "with_scaling",
        "IQR Quantile range - Lower": "quantile_range_lower",
        "IQR Quantile range - Upper": "quantile_range_upper",
        "IQR Quantile range": "quantile_range",
        "Scale to unit variance": "with_std",
        "Missing values": "missing_values",
        "Strategy for missing values": "strategy",
        "Fill value for missing values": "fill_value",
        "Verbose": "verbose",
        "Add indicator": "add_indicator",
    }

    algorithms = TRANSFORM_ALGS
    options_list = TRANSFORM_PARAMETERS
    options_types = TRANSFORM_TYPES
    options_default = TRANSFORM_DEFAULTS
    options_converted = TEXT_PARAMS

    parameters = node.parameters()
    editor = node.editors.multilist_editor(edit=True, mode=True)
    parameters.set_list(
        "columns",
        label="Columns",
        description="Columns that should be converted.",
        value=[],
        editor=editor,
    )
    parameters.set_string(
        "algorithm",
        value=next(iter(algorithms)),
        description="",
        label="Algorithm",
    )
    ImageFiltering_abstract.generate_parameters(
        parameters, options_types, options_default
    )

    description = (
        "Transforms tabular dataset based on common preprocessing operations")
    inputs = Ports([DatasetPort("Dataset", "dataset")])
    outputs = Ports([DatasetPort("Dataset", "dataset")])
    __doc__ = ImageFiltering_abstract.generate_docstring(
        description, algorithms, options_list, inputs, outputs
    )

    def adjust_parameters(self, node_context):
        """
        Adjust the ``columns`` parameter against the first input port.

        :param node_context: node context giving access to ``parameters``
            and ``input``.
        """
        try:
            adjust(node_context.parameters["columns"], node_context.input[0])
        except Exception:
            # Deliberately best-effort: any failure while adjusting leaves
            # the parameter as it was instead of breaking configuration.
            pass

    def exec_parameter_view(self, node_context):
        """
        Return the configuration widget for this node.

        :param node_context: node context giving access to ``parameters``.
        :return: a ``TransformWidget`` built from the node parameters and
            this class's algorithm/option tables.
        """
        return TransformWidget(
            node_context.parameters,
            TransformTableDataset.algorithms,
            TransformTableDataset.options_list,
            TransformTableDataset.options_types,
        )

    def execute(self, ctx):
        """
        Record the selected transform on every selected column.

        Reads the chosen algorithm, the selected columns and the algorithm's
        parameter values from ``ctx.parameters``, renames the parameters to
        their keyword names (``options_converted``) and, for each selected
        column, appends the algorithm name to the column's ``"transforms"``
        and the keyword dict to its ``"transforms_values"`` inside
        ``ds["column_config"]``. The updated description is written to the
        ``dataset`` output port.

        :param ctx: node context giving access to ``parameters``, ``input``
            and ``output``.
        :raises exceptions.SyDataError: if the input port yields no dataset
            ("Empty dataset") or its ``"dstype"`` is ``"image"``
            ("Incorrect data type").
        """
        node_cls = TransformTableDataset
        transform = ctx.parameters["algorithm"].value
        all_cols = ctx.input[0].names(kind='cols')
        columns_choice = ctx.parameters['columns'].selected_names(all_cols)

        # Every key of the algorithm entry except "description" is the GUI
        # label of one of its parameters.
        transform_values = {}
        for label in node_cls.algorithms[transform]:
            if label == "description":
                continue
            option_type = node_cls.options_types[label]
            # NOTE(review): the defaults table holds the object ``None``,
            # never the string "None", so the last comparison is always true
            # for the current tables -- confirm the intent before changing.
            keep_raw = (
                option_type is not tuple
                and option_type is not list
                and node_cls.options_default[label] != "None"
            ) or label == "Categories"
            value = ctx.parameters[label].value
            if not keep_raw:
                value = ast.literal_eval(str(value))
            transform_values[node_cls.options_converted[label]] = value

        ds_obj = ctx.input["dataset"]
        ds_obj.load()
        ds = ds_obj.get_ds()
        if ds is None:
            raise exceptions.SyDataError("Empty dataset")
        if ds["dstype"] == "image":
            raise exceptions.SyDataError("Incorrect data type")

        if transform == "RobustScaler":
            # Combine upper and lower into single parameter entry as
            # expected. This is done once, before the column loop: doing it
            # per column removed the lower/upper keys on the first column and
            # raised KeyError on the second.
            transform_values["quantile_range"] = (
                transform_values.pop("quantile_range_lower"),
                transform_values.pop("quantile_range_upper"),
            )

        for col in columns_choice:
            if transform == "OneHotEncoder" and isinstance(
                transform_values["categories"], list
            ):
                new_cols = [
                    col + f"_{cat}"
                    for cat in transform_values["categories"][0]
                ]
                # Remove old column
                ds["column_config"][col]["exclude"] = 1
                # Add new columns to dataset
                for new_col in new_cols:
                    ds["column_config"][new_col] = {
                        "orig": False,
                        "exclude": 0,
                        "dtype": transform_values["dtype"],
                        "transforms": [],
                        "transforms_values": [],
                    }
            column_entry = ds["column_config"][col]
            column_entry["transforms"].append(transform)
            column_entry["transforms_values"].append(transform_values)

        out_port = ctx.output["dataset"]
        out_port.set_ds(ds)
        out_port.save()