# Source code for node_transformdataset

# This file is part of Sympathy for Data.
# Copyright (c) 2021 Combine Control Systems
#
# SYMPATHY FOR DATA COMMERCIAL LICENSE
# You should have received a link to the License with Sympathy for Data.
from sympathy.api import node, qt2, ParameterView
from sympathy.api.nodeconfig import Ports, Tag, Tags, adjust
from sympathy.api import exceptions

from sylib.imageprocessing.algorithm_selector import (
    ImageFiltering_abstract,
    AlgorithmParameterWidget,
)
from sylib_aml.dataset import DatasetPort

import ast
import numpy as np

QtGui = qt2.import_module("QtGui")
QtCore = qt2.import_module("QtCore")
QtWidgets = qt2.import_module("QtWidgets")


class TransformWidget(ParameterView):
    """
    Configuration view for dataset transform nodes.

    Shows the editor of the ``columns`` parameter above the algorithm
    parameter widget, stacked in one vertical layout.
    """

    def __init__(
        self, parameters, algorithms, options_list, options_types, parent=None
    ):
        super().__init__(parent=parent)
        self._parameters = parameters

        # Create the child widgets in display order (top to bottom).
        children = (
            self._parameters["columns"].gui(),
            AlgorithmParameterWidget(
                self._parameters, algorithms, options_list, options_types
            ),
        )

        vbox = QtWidgets.QVBoxLayout()
        for child in children:
            vbox.addWidget(child)
        self.setLayout(vbox)



# [docs]
class TransformImageDataset(ImageFiltering_abstract, node.Node):
    name = "Transform Image Dataset (Experimental)"
    nodeid = (
        "com.sympathyfordata.advancedmachinelearning.transformimagedataset")
    author = "Jannes Germishuys"
    icon = "image_ds_transform.svg"
    tags = Tags(Tag.MachineLearning.Processing)

    # Choose parameters for each transform
    PARAMS_DICT = {
        "Grayscale": ["Number of output channels"],
        "Center Crop": ["Width", "Height"],
        "Normalize": ["Mean", "Standard deviation"],
        "Pad": ["Padding size", "Fill", "Padding mode"],
        "Resize": ["Width", "Height", "Interpolation"],
        "To Tensor": [],
        "To PIL Image": [],
    }

    # Set-up structure for each algorithm (parameters, parameter types, default
    # parameter values)
    TRANSFORM_ALGS = {
        "Grayscale": {
            "description": (
                "Convert image to grayscale. If the image is torch "
                "Tensor, it is expected to have […, 3, H, W] shape, where … "
                "means an arbitrary number of leading dimensions"),
            "Number of output channels": (
                "(1 or 3) number of channels desired for output image"),
        },
        "Center Crop": {
            "description": (
                "Crops the given image at the center. If the image is torch "
                "Tensor, it is expected to have […, H, W] shape, where … "
                "means an arbitrary number of leading dimensions. "
                "If image size is smaller than output size along any edge, "
                "image is padded with 0 and then center cropped."),
            "Width": "Desired output width of the crop",
            "Height": "Desired output height of the crop",
        },
        "Normalize": {
            "description": (
                "Normalize a tensor image with mean and standard deviation. "
                "This transform does not support PIL Image. Given mean: "
                "(mean[1],...,mean[n]) and std: (std[1],..,std[n]) for n "
                "channels, this transform will normalize each channel of the "
                "input torch.*Tensor i.e., output[channel] = "
                "(input[channel] - mean[channel]) / std[channel]"),
            "Mean": "Sequence of means for each channel.",
            "Standard deviation": (
                "Sequence of standard deviations for each channel."),
        },
        "Pad": {
            "description": (
                "Pad the given image on all sides with the given “pad” value. "
                "If the image is torch Tensor, it is expected to have "
                "[…, H, W] shape, where … means at most 2 leading dimensions "
                "for mode reflect and symmetric, at most 3 leading dimensions "
                "for mode edge, and an arbitrary number of leading dimensions "
                "for mode constant"),
            "Padding size": (
                "Padding on each border. If a single int is provided this is "
                "used to pad all borders. If sequence of length 2 is provided "
                "this is the padding on left/right and top/bottom "
                "respectively. If a sequence of length 4 is provided this is "
                "the padding for the left, top, right and bottom borders "
                "respectively."),
            "Fill": (
                "Pixel fill value for constant fill. Default is 0. If a tuple "
                "of length 3, it is used to fill R, G, B channels "
                "respectively. This value is only used when the padding_mode "
                "is constant. Only number is supported for torch Tensor. Only "
                "int or str or tuple value is supported for PIL Image."),
            "Padding mode": (
                "Type of padding. Should be: constant, edge, reflect or "
                "symmetric. Default is constant. - constant: pads with a "
                "constant value, this value is specified with fill - edge: "
                "pads with the last value at the edge of the image. If input "
                "a 5D torch Tensor, the last 3 dimensions will be padded "
                "instead of the last 2 - reflect: pads with reflection of "
                "image without repeating the last value on the edge. For "
                "example, padding [1, 2, 3, 4] with 2 elements on both sides "
                "in reflect mode will result in [3, 2, 1, 2, 3, 4, 3, 2] "
                "- symmetric: pads with reflection of image repeating the "
                "last value on the edge. For example, padding [1, 2, 3, 4] "
                "with 2 elements on both sides in symmetric mode will result "
                "in [2, 1, 1, 2, 3, 4, 4, 3]"),
        },
        "Resize": {
            "description": (
                "Resize the input image to the given size. If the image is "
                "torch Tensor, it is expected to have […, H, W] shape, where "
                "… means an arbitrary number of leading dimensions"),
            "Width": "Desired output width.",
            "Height": "Desired output height.",
            "Interpolation": (
                "Desired interpolation enum defined by "
                "torchvision.transforms.InterpolationMode. Default is "
                "InterpolationMode.BILINEAR. If input is Tensor, only "
                "InterpolationMode.NEAREST, InterpolationMode.BILINEAR and "
                "InterpolationMode.BICUBIC are supported. For backward "
                "compatibility integer values (e.g. PIL.Image.NEAREST) are "
                "still acceptable."),
        },
        "To Tensor": {
            "description": (
                "Convert a PIL Image or numpy.ndarray to tensor. This "
                "transform does not support torchscript."),
        },
        "To PIL Image": {
            "description": (
                "Convert a tensor or an ndarray to PIL Image. This transform "
                "does not support torchscript."),
        },
    }

    # Flat list of every option name used by any transform in PARAMS_DICT,
    # without duplicates. dict.fromkeys keeps first-seen order, so the list
    # is identical on every run; going through a set would reorder it from
    # one interpreter start to the next because of string hash randomisation.
    TRANSFORM_PARAMETERS = list(dict.fromkeys(
        option for options in PARAMS_DICT.values() for option in options
    ))

    TEXT_PARAMS = (
        {
            "Number of output channels": "num_output_channels",
            "Width": "width",
            "Height": "height",
            "Mean": "mean",
            "Standard deviation": "std",
            "Padding size": "padding",
            "Fill": "fill",
            "Padding mode": "padding_mode",
            "Interpolation": "interpolation",
        }
    )

    TRANSFORM_TYPES = {
        "Number of output channels": int,
        "Width": int,
        "Height": int,
        "Mean": int,
        "Standard deviation": int,
        "Padding size": int,
        "Fill": int,
        "Padding mode": str,
        "Interpolation": ["Bilinear", "Bicubic", "Nearest"],
    }

    TRANSFORM_DEFAULTS = {
        "Width": 100,  # (100, 100),
        "Height": 100,
        "Standard deviation": 0,  # [],
        "Padding size": 1,
        "Mean": 0,  # [],
        "Interpolation": "Nearest",
        "Padding mode": "constant",
        "Fill": 1,
        "Number of output channels": 1,
    }

    algorithms = TRANSFORM_ALGS
    options_list = TRANSFORM_PARAMETERS
    options_types = TRANSFORM_TYPES
    options_default = TRANSFORM_DEFAULTS
    options_converted = TEXT_PARAMS

    parameters = node.parameters()
    parameters.set_string(
        "algorithm", value=next(iter(algorithms)), description="",
        label="Algorithm"
    )
    ImageFiltering_abstract.generate_parameters(
        parameters, options_types, options_default
    )
    description = "Transforms images within an image dataset"
    inputs = Ports([DatasetPort("Dataset", "dataset")])
    outputs = Ports([DatasetPort("Dataset", "dataset")])
    __doc__ = ImageFiltering_abstract.generate_docstring(
        description, algorithms, options_list, inputs, outputs
    )

    def execute(self, node_context):
        """
        Append the selected image transform to the dataset description.

        Only the transform name and its keyword values are recorded, in the
        dataset's ``transforms`` and ``transforms_values`` lists; no image
        data is touched here.

        :param node_context: node context providing the ``algorithm``
            parameter, one parameter per transform option, and the
            ``dataset`` input and output ports.
        :raises exceptions.SyDataError: if the input dataset is empty or its
            ``dstype`` is ``"table"``.
        """
        cls = TransformImageDataset
        algorithm_choice = node_context.parameters["algorithm"].value

        # Every key of the algorithm entry except its description names an
        # option of that transform.
        option_names = [
            name for name in cls.algorithms[algorithm_choice]
            if name != "description"
        ]

        # Map GUI option names to the keyword names stored in the dataset.
        transform_values = {}
        for name in option_names:
            value = node_context.parameters[name].value
            if cls.options_types[name] is tuple:
                # Tuple-typed options are parsed from their literal text.
                # No option in options_types is tuple-typed at the moment.
                value = ast.literal_eval(value)
            transform_values[cls.options_converted[name]] = value

        if algorithm_choice in ("Resize", "Center Crop"):
            # Merge width and height into the single "size" entry.
            # NOTE(review): torchvision documents ``size`` as (h, w); the
            # existing (width, height) order is kept here -- confirm against
            # the code that consumes transforms_values.
            transform_values["size"] = (
                transform_values.pop("width"),
                transform_values.pop("height"),
            )

        in_port = node_context.input["dataset"]
        in_port.load()
        dataset = in_port.get_ds()

        if dataset is None:
            raise exceptions.SyDataError("Empty dataset")
        if dataset["dstype"] == "table":
            raise exceptions.SyDataError("Incorrect data type")

        dataset["transforms"].append(algorithm_choice)
        dataset["transforms_values"].append(transform_values)

        # Forward only the keys that make up an image dataset description.
        forwarded_keys = (
            "dstype", "paths", "transforms", "transforms_values", "labels")
        out_ds = {key: dataset[key] for key in forwarded_keys}

        out_port = node_context.output["dataset"]
        out_port.set_ds(out_ds)
        out_port.save()




# [docs]
class TransformTableDataset(ImageFiltering_abstract, node.Node):
    name = "Transform Table Dataset (Experimental)"
    nodeid = (
        "com.sympathyfordata.advancedmachinelearning.transformtabledataset")
    author = "Jannes Germishuys"
    icon = "table_ds_transform.svg"
    tags = Tags(Tag.MachineLearning.Processing)

    # Choose parameters for each transform
    PARAMS_DICT = (
        {
            "SimpleImputer": [
                "Missing values",
                "Strategy for missing values",
                "Fill value for missing values",
                "Verbose",
                "Copy",
                "Add indicator",
            ],
            "Binarizer": ["threshold"],
            "LabelEncoder": ["Use categorical"],
            "OneHotEncoder": [
                "Categories",
                "Drop category",
                "Transformed array in sparse format",
                "Desired data type",
                "Handle Unknown",
            ],
            "PolynomialFeatures": [
                "Degree",
                "Only interaction features produced",
                "Include bias",
                "Order",
                "Preserve as dataframe",
            ],
            "RobustScaler": [
                "Center the data before scaling",
                "Scale to interquartile range",
                "IQR Quantile range - Lower",
                "IQR Quantile range - Upper",
                "Copy",
            ],
            "StandardScaler": [
                "Copy", "Center the data", "Scale to unit variance"],
        }
    )

    # Set-up structure for each algorithm (parameters, parameter types, default
    # parameter values)
    TRANSFORM_ALGS = (
        {
            "SimpleImputer": {
                "description": (
                    "Simple imputation for missing data in tabular datasets"),
                "Missing values": (
                    "The placeholder for the missing values. All occurrences "
                    "of missing_values will be imputed. For pandas’ "
                    "dataframes with nullable integer dtypes with missing "
                    "values, missing_values should be set to np.nan, since "
                    "pd.NA will be converted to np.nan."),
                "Strategy for missing values": (
                    "The imputation strategy. If “mean”, then replace missing "
                    "values using the mean along each column. Can only be "
                    "used with numeric data. If “median”, then replace "
                    "missing values using the median along each column. Can "
                    "only be used with numeric data. If “most_frequent”, "
                    "then replace missing using the most frequent value along "
                    "each column. Can be used with strings or numeric data. "
                    "If there is more than one such value, only the smallest "
                    "is returned. If “constant”, then replace missing values "
                    "with fill_value. Can be used with strings or numeric "
                    "data."),
                "Fill value for missing values": (
                    "When strategy == “constant”, fill_value is used to "
                    "replace all occurrences of missing_values. If left to "
                    "the default, fill_value will be 0 when imputing "
                    "numerical data and “missing_value” for strings or object "
                    "data types."),
                "Verbose": "Controls the verbosity of the imputer.",
                "Copy": (
                    "If True, a copy of X will be created. If False, "
                    "imputation will be done in-place whenever possible. Note "
                    "that, in the following cases, a new copy will always be "
                    "made, even if copy=False: If X is not an array of "
                    "floating values; If X is encoded as a CSR matrix; If "
                    "add_indicator=True."),
                "Add indicator": (
                    "If True, a MissingIndicator transform will stack onto "
                    "output of the imputer’s transform. This allows a "
                    "predictive estimator to account for missingness despite "
                    "imputation. If a feature has no missing values at "
                    "fit/train time, the feature won’t appear on the missing "
                    "indicator even if there are missing values at "
                    "transform/test time."),
            },
            "Binarizer": {
                "description": (
                    "Binarize data (set feature values to 0 or 1) according "
                    "to a threshold."),
                "threshold": (
                    "Feature values below or equal to this are replaced by 0, "
                    "above it by 1. Threshold may not be less than 0 for "
                    "operations on sparse matrices."),
            },
            "LabelEncoder": {
                "description": (
                    "Encode target labels with value between 0 and "
                    "n_classes-1."),
                "Use categorical": "",
            },
            "OneHotEncoder": {
                "description": (
                    "Encode categorical features as a one-hot numeric array."),
                "Categories": (
                    "Categories (unique values) per feature: ‘auto’ : "
                    "Determine categories automatically from the training "
                    "data. list : categories[i] holds the categories expected "
                    "in the ith column. The passed categories should not mix "
                    "strings and numeric values within a single feature, and "
                    "should be sorted in case of numeric values. The used "
                    "categories can be found in the `categories_` attribute."),
                "Drop category": (
                    "Specifies a methodology to use to drop one of the "
                    "categories per feature. This is useful in situations "
                    "where perfectly collinear features cause problems, such "
                    "as when feeding the resulting data into a neural network "
                    "or an unregularized regression. However, dropping one "
                    "category breaks the symmetry of the original "
                    "representation and can therefore induce a bias in "
                    "downstream models, for instance for penalized linear "
                    "classification or regression models. None : retain all "
                    "features (the default). ‘first’ : drop the first "
                    "category in each feature. If only one category is "
                    "present, the feature will be dropped entirely. "
                    "‘if_binary’ : drop the first category in each feature "
                    "with two categories. Features with 1 or more than 2 "
                    "categories are left intact. array : drop[i] "
                    "is the category in feature X[:, i] that should be "
                    "dropped"),
                "Transformed array in sparse format": (
                    "Will return sparse matrix if set True else will return "
                    "an array."),
                "Desired data type": "Desired dtype of output.",
                "Handle Unknown": (
                    "Whether to raise an error or ignore if an unknown "
                    "categorical feature is present during transform (default "
                    "is to raise). When this parameter is set to ‘ignore’ and "
                    "an unknown category is encountered during transform, the "
                    "resulting one-hot encoded columns for this feature will "
                    "be all zeros. In the inverse transform, an unknown "
                    "category will be denoted as None."),
            },
            "PolynomialFeatures": {
                "description": "Generate polynomial and interaction features.",
                "Degree": "The degree of the polynomial features.",
                "Only interaction features produced": (
                    "If true, only interaction features are produced: "
                    "features that are products of at most degree distinct "
                    "input features (so not x[1] ** 2, x[0] * x[2] ** 3, "
                    "etc.)."),
                "Include bias": (
                    "If True (default), then include a bias column, the "
                    "feature in which all polynomial powers are zero (i.e. "
                    "a column of ones - acts as an intercept term in a linear "
                    "model)."),
                "Order": (
                    "Order of output array in the dense case. ‘F’ order is "
                    "faster to compute, but may slow down subsequent "
                    "estimators."),
                "Preserve as dataframe": "Preserve as Dask dataframe",
            },
            "RobustScaler": {
                "description": (
                    "Scale features using statistics that are robust to "
                    "outliers."),
                "Center the data before scaling": (
                    "If True, center the data before scaling. This will cause "
                    "transform to raise an exception when attempted on sparse "
                    "matrices, because centering them entails building a "
                    "dense matrix which in common use cases is likely to be "
                    "too large to fit in memory."),
                "Scale to interquartile range": (
                    "If True, scale the data to interquartile range."),
                "IQR Quantile range - Lower": (
                    "Quantile range used to calculate `scale_`."),
                "IQR Quantile range - Upper": (
                    "Quantile range used to calculate `scale_`."),
                "Copy": (
                    "If False, try to avoid a copy and do inplace scaling "
                    "instead. This is not guaranteed to always work inplace; "
                    "e.g. if the data is not a NumPy array or scipy.sparse "
                    "CSR matrix, a copy may still be returned."),
            },
            "StandardScaler": {
                "description": (
                    "Standardize features by removing the mean and scaling to "
                    "unit variance"),
                "Copy": (
                    "If False, try to avoid a copy and do inplace scaling "
                    "instead. This is not guaranteed to always work inplace; "
                    "e.g. if the data is not a NumPy array or scipy.sparse "
                    "CSR matrix, a copy may still be returned."),
                "Center the data": (
                    "If True, center the data before scaling. This does not "
                    "work (and will raise an exception) when attempted on "
                    "sparse matrices, because centering them entails building "
                    "a dense matrix which in common use cases is likely to be "
                    "too large to fit in memory."),
                "Scale to unit variance": (
                    "If True, scale the data to unit variance (or "
                    "equivalently, unit standard deviation)."),
            },
        }
    )

    # Flat list of every option name used by any transform in PARAMS_DICT,
    # without duplicates (e.g. "Copy" is shared by several transforms).
    # dict.fromkeys keeps first-seen order, so the list is identical on every
    # run; going through a set would reorder it from one interpreter start to
    # the next because of string hash randomisation.
    TRANSFORM_PARAMETERS = list(dict.fromkeys(
        option for options in PARAMS_DICT.values() for option in options
    ))

    TRANSFORM_TYPES = (
        {
            "threshold": float,
            "maximum_categories": int,
            "Positive Label": int,
            "Negative Label": int,
            "Transformed array in sparse format": bool,
            "norm": ["l1", "l2", "max"],
            "Use categorical": bool,
            "Categories": list,
            "Drop category": str,
            "Desired data type": str,
            "Handle Unknown": ["error", "ignore"],
            "Degree": int,
            "Only interaction features produced": bool,
            "Include bias": bool,
            "Order": ["C", "F"],
            "Preserve as dataframe": bool,
            "Copy": bool,
            "Center the data before scaling": bool,
            "Center the data": bool,
            "Scale to interquartile range": bool,
            "IQR Quantile range - Lower": float,
            "IQR Quantile range - Upper": float,
            "Scale to unit variance": bool,
            "Missing values": float,
            "Strategy for missing values": str,
            "Fill value for missing values": int,
            "Verbose": int,
            "Add indicator": bool,
        }
    )

    TRANSFORM_DEFAULTS = (
        {
            "threshold": 0.0,
            "maximum_categories": None,
            "Positive Label": 1,
            "Negative Label": 0,
            "Transformed array in sparse format": False,
            "norm": "l2",
            "Use categorical": True,
            "Drop category": None,
            "Categories": "auto",
            "Desired data type": "float",
            "Handle Unknown": "error",
            "Degree": 2,
            "Only interaction features produced": False,
            "Include bias": True,
            "Order": "C",
            "Preserve as dataframe": False,
            "Copy": True,
            "Center the data before scaling": True,
            "Center the data": True,
            "Scale to interquartile range": True,
            "IQR Quantile range - Lower": 25.0,
            "IQR Quantile range - Upper": 75.0,
            "Scale to unit variance": True,
            "Missing values": np.nan,
            "Strategy for missing values": "mean",
            "Fill value for missing values": None,
            "Verbose": 0,
            "Add indicator": False,
        }
    )

    TEXT_PARAMS = (
        {
            "threshold": "threshold",
            "maximum_categories": "maximum_categories",
            "Positive Label": "Positive Label",
            "Negative Label": "Negative Label",
            "Transformed array in sparse format": "sparse",
            "norm": "norm",
            "Use categorical": "use_categorical",
            "Drop category": "drop",
            "Categories": "categories",
            "Desired data type": "dtype",
            "Handle Unknown": "handle_unknown",
            "Degree": "degree",
            "Only interaction features produced": "interaction_only",
            "Include bias": "include_bias",
            "Order": "order",
            "Preserve as dataframe": "preserve_dataframe",
            "Copy": "copy",
            "Center the data before scaling": "with_centering",
            "Center the data": "with_mean",
            "Scale to interquartile range": "with_scaling",
            "IQR Quantile range - Lower": "quantile_range_lower",
            "IQR Quantile range - Upper": "quantile_range_upper",
            "IQR Quantile range": "quantile_range",
            "Scale to unit variance": "with_std",
            "Missing values": "missing_values",
            "Strategy for missing values": "strategy",
            "Fill value for missing values": "fill_value",
            "Verbose": "verbose",
            "Add indicator": "add_indicator",
        }
    )

    algorithms = TRANSFORM_ALGS
    options_list = TRANSFORM_PARAMETERS
    options_types = TRANSFORM_TYPES
    options_default = TRANSFORM_DEFAULTS
    options_converted = TEXT_PARAMS

    parameters = node.parameters()
    editor = node.editors.multilist_editor(edit=True, mode=True)
    parameters.set_list(
        "columns",
        label="Columns",
        description="Columns that should be converted.",
        value=[],
        editor=editor,
    )
    parameters.set_string(
        "algorithm", value=next(iter(algorithms)), description="",
        label="Algorithm"
    )
    ImageFiltering_abstract.generate_parameters(
        parameters, options_types, options_default
    )
    description = (
        "Transforms tabular dataset based on common preprocessing operations")
    inputs = Ports([DatasetPort("Dataset", "dataset")])
    outputs = Ports([DatasetPort("Dataset", "dataset")])
    __doc__ = ImageFiltering_abstract.generate_docstring(
        description, algorithms, options_list, inputs, outputs
    )

    def adjust_parameters(self, node_context):
        """
        Adjust the ``columns`` parameter against the first input port.

        Best effort: if the adjustment fails for any reason the parameter is
        left as it is and the node keeps working.
        """
        try:
            adjust(node_context.parameters["columns"], node_context.input[0])
        except Exception:
            # Deliberately non-fatal, but record the cause so an unexpectedly
            # stale column list can be diagnosed instead of failing silently.
            import logging

            logging.getLogger(__name__).debug(
                "Could not adjust the 'columns' parameter", exc_info=True
            )

    def exec_parameter_view(self, node_context):
        """Return the configuration widget for this node."""
        node_cls = TransformTableDataset
        widget_args = (
            node_context.parameters,
            node_cls.algorithms,
            node_cls.options_list,
            node_cls.options_types,
        )
        return TransformWidget(*widget_args)

    def execute(self, ctx):
        """
        Append the selected transform to every chosen column of the dataset.

        The transform name and its keyword values are added to the
        ``transforms`` and ``transforms_values`` lists of each selected
        column in ``column_config``. For a OneHotEncoder with an explicit
        category list, the source column is also marked as excluded and one
        new column entry per category is registered.

        :param ctx: node context providing the ``algorithm`` and ``columns``
            parameters, one parameter per transform option, and the
            ``dataset`` input and output ports.
        :raises exceptions.SyDataError: if the input dataset is empty or its
            ``dstype`` is ``"image"``.
        """
        cls = TransformTableDataset
        algorithm_choice = ctx.parameters["algorithm"].value

        all_cols = ctx.input[0].names(kind='cols')
        columns_choice = ctx.parameters['columns'].selected_names(all_cols)

        # Every key of the algorithm entry except its description names an
        # option of that transform.
        option_names = [
            name for name in cls.algorithms[algorithm_choice]
            if name != "description"
        ]

        # Map GUI option names to the keyword names stored in the dataset.
        transform_values = {}
        for name in option_names:
            value = ctx.parameters[name].value
            # NOTE(review): options_default stores None objects, never the
            # string "None", and "Categories" is the only option typed as
            # ``list``; with the tables in this class every option therefore
            # takes the raw-value path. Confirm whether parsing was intended
            # for None-defaulted options.
            keep_raw = (
                cls.options_types[name] is not tuple
                and cls.options_types[name] is not list
                and cls.options_default[name] != "None"
            ) or name == "Categories"
            if not keep_raw:
                value = ast.literal_eval(str(value))
            transform_values[cls.options_converted[name]] = value

        in_port = ctx.input["dataset"]
        in_port.load()
        ds = in_port.get_ds()

        if ds is None:
            raise exceptions.SyDataError("Empty dataset")

        if ds["dstype"] == "image":
            raise exceptions.SyDataError("Incorrect data type")

        if algorithm_choice == "RobustScaler":
            # Merge the lower and upper quantile into the single
            # "quantile_range" entry. This must happen once, before the
            # column loop: the source keys are removed, so repeating it for
            # a second column would raise KeyError.
            transform_values["quantile_range"] = (
                transform_values.pop("quantile_range_lower"),
                transform_values.pop("quantile_range_upper"),
            )

        expand_one_hot = (
            algorithm_choice == "OneHotEncoder"
            and isinstance(transform_values["categories"], list)
        )

        column_config = ds["column_config"]
        for col in columns_choice:
            if expand_one_hot:
                # NOTE(review): categories[0] is used for every selected
                # column -- confirm that per-column category lists are not
                # expected here.
                new_cols = [
                    col + f"_{cat}"
                    for cat in transform_values["categories"][0]
                ]
                # Remove old column
                column_config[col]["exclude"] = 1
                # Add new columns to dataset
                for new_col in new_cols:
                    column_config[new_col] = {
                        "orig": False,
                        "exclude": 0,
                        "dtype": transform_values["dtype"],
                        "transforms": [],
                        "transforms_values": [],
                    }

            column_config[col]["transforms"].append(algorithm_choice)
            # Give each column its own copy so that later edits to one
            # column's values cannot leak into the others.
            column_config[col]["transforms_values"].append(
                dict(transform_values))

        out_port = ctx.output["dataset"]
        out_port.set_ds(ds)
        out_port.save()
# Source code for node_transformdataset
#
# Sympathy for Data
#
# Navigation
#
# Related Topics