# Source code for node_transformdataset
# This file is part of Sympathy for Data.
# Copyright (c) 2021 Combine Control Systems
#
# SYMPATHY FOR DATA COMMERCIAL LICENSE
# You should have received a link to the License with Sympathy for Data.
from sympathy.api import node, qt2, ParameterView
from sympathy.api.nodeconfig import Ports, Tag, Tags, adjust
from sympathy.api import exceptions
from sylib.imageprocessing.algorithm_selector import (
ImageFiltering_abstract,
AlgorithmParameterWidget,
)
from sylib_aml.dataset import DatasetPort
import ast
import numpy as np
QtGui = qt2.import_module("QtGui")
QtCore = qt2.import_module("QtCore")
QtWidgets = qt2.import_module("QtWidgets")
class TransformWidget(ParameterView):
    """
    Configuration GUI for the dataset transform nodes.

    Stacks the GUI of the ``columns`` parameter above an
    ``AlgorithmParameterWidget`` built from the supplied algorithm tables.
    """

    def __init__(
        self, parameters, algorithms, options_list, options_types, parent=None
    ):
        super().__init__(parent=parent)
        self._parameters = parameters

        # Children are created in display order: column selection first,
        # algorithm options second.
        child_widgets = (
            self._parameters["columns"].gui(),
            AlgorithmParameterWidget(
                self._parameters, algorithms, options_list, options_types
            ),
        )

        vbox = QtWidgets.QVBoxLayout()
        for child in child_widgets:
            vbox.addWidget(child)
        self.setLayout(vbox)
# [docs]
class TransformImageDataset(ImageFiltering_abstract, node.Node):
    # Node that appends one image transform (name + keyword values) to the
    # ``transforms`` / ``transforms_values`` lists of an image dataset.
    #
    # The class is described with comments rather than a docstring because
    # ``__doc__`` is generated and assigned further down in the class body,
    # which would overwrite a literal docstring.
    name = "Transform Image Dataset (Experimental)"
    nodeid = (
        "com.sympathyfordata.advancedmachinelearning.transformimagedataset")
    author = "Jannes Germishuys"
    icon = "image_ds_transform.svg"
    tags = Tags(Tag.MachineLearning.Processing)

    # Option labels (as shown in the GUI) used by each transform.
    PARAMS_DICT = {
        "Grayscale": ["Number of output channels"],
        "Center Crop": ["Width", "Height"],
        "Normalize": ["Mean", "Standard deviation"],
        "Pad": ["Padding size", "Fill", "Padding mode"],
        "Resize": ["Width", "Height", "Interpolation"],
        "To Tensor": [],
        "To PIL Image": [],
    }

    # Per-transform documentation: a "description" entry plus one help text
    # per option label. ``execute`` also uses the non-"description" keys of
    # the chosen transform as the list of options to read.
    TRANSFORM_ALGS = {
        "Grayscale": {
            "description": (
                "Convert image to grayscale. If the image is torch "
                "Tensor, it is expected to have […, 3, H, W] shape, where … "
                "means an arbitrary number of leading dimensions"),
            "Number of output channels": (
                "(1 or 3) number of channels desired for output image"),
        },
        "Center Crop": {
            "description": (
                "Crops the given image at the center. If the image is torch "
                "Tensor, it is expected to have […, H, W] shape, where … "
                "means an arbitrary number of leading dimensions. "
                "If image size is smaller than output size along any edge, "
                "image is padded with 0 and then center cropped."),
            "Width": "Desired output width of the crop",
            "Height": "Desired output height of the crop",
        },
        "Normalize": {
            "description": (
                "Normalize a tensor image with mean and standard deviation. "
                "This transform does not support PIL Image. Given mean: "
                "(mean[1],...,mean[n]) and std: (std[1],..,std[n]) for n "
                "channels, this transform will normalize each channel of the "
                "input torch.*Tensor i.e., output[channel] = "
                "(input[channel] - mean[channel]) / std[channel]"),
            "Mean": "Sequence of means for each channel.",
            "Standard deviation": (
                "Sequence of standard deviations for each channel."),
        },
        "Pad": {
            "description": (
                "Pad the given image on all sides with the given “pad” value. "
                "If the image is torch Tensor, it is expected to have "
                "[…, H, W] shape, where … means at most 2 leading dimensions "
                "for mode reflect and symmetric, at most 3 leading dimensions "
                "for mode edge, and an arbitrary number of leading dimensions "
                "for mode constant"),
            "Padding size": (
                "Padding on each border. If a single int is provided this is "
                "used to pad all borders. If sequence of length 2 is provided "
                "this is the padding on left/right and top/bottom "
                "respectively. If a sequence of length 4 is provided this is "
                "the padding for the left, top, right and bottom borders "
                "respectively."),
            "Fill": (
                "Pixel fill value for constant fill. Default is 0. If a tuple "
                "of length 3, it is used to fill R, G, B channels "
                "respectively. This value is only used when the padding_mode "
                "is constant. Only number is supported for torch Tensor. Only "
                "int or str or tuple value is supported for PIL Image."),
            "Padding mode": (
                "Type of padding. Should be: constant, edge, reflect or "
                "symmetric. Default is constant. - constant: pads with a "
                "constant value, this value is specified with fill - edge: "
                "pads with the last value at the edge of the image. If input "
                "a 5D torch Tensor, the last 3 dimensions will be padded "
                "instead of the last 2 - reflect: pads with reflection of "
                "image without repeating the last value on the edge. For "
                "example, padding [1, 2, 3, 4] with 2 elements on both sides "
                "in reflect mode will result in [3, 2, 1, 2, 3, 4, 3, 2] "
                "- symmetric: pads with reflection of image repeating the "
                "last value on the edge. For example, padding [1, 2, 3, 4] "
                "with 2 elements on both sides in symmetric mode will result "
                "in [2, 1, 1, 2, 3, 4, 4, 3]"),
        },
        "Resize": {
            "description": (
                "Resize the input image to the given size. If the image is "
                "torch Tensor, it is expected to have […, H, W] shape, where "
                "… means an arbitrary number of leading dimensions"),
            "Width": "Desired output width.",
            "Height": "Desired output height.",
            "Interpolation": (
                "Desired interpolation enum defined by "
                "torchvision.transforms.InterpolationMode. Default is "
                "InterpolationMode.BILINEAR. If input is Tensor, only "
                "InterpolationMode.NEAREST, InterpolationMode.BILINEAR and "
                "InterpolationMode.BICUBIC are supported. For backward "
                "compatibility integer values (e.g. PIL.Image.NEAREST) are "
                "still acceptable."),
        },
        "To Tensor": {
            "description": (
                "Convert a PIL Image or numpy.ndarray to tensor. This "
                "transform does not support torchscript."),
        },
        "To PIL Image": {
            "description": (
                "Convert a tensor or an ndarray to PIL Image. This transform "
                "does not support torchscript."),
        },
    }

    # Unique option labels across all transforms, in first-seen order.
    # ``dict.fromkeys`` de-duplicates while keeping the order deterministic
    # (a ``set`` would reorder the labels between interpreter runs) and the
    # generator keeps its loop variables out of the class namespace.
    TRANSFORM_PARAMETERS = list(dict.fromkeys(
        option
        for options in PARAMS_DICT.values()
        for option in options
    ))

    # GUI option label -> keyword name stored in ``transforms_values``.
    TEXT_PARAMS = {
        "Number of output channels": "num_output_channels",
        "Width": "width",
        "Height": "height",
        "Mean": "mean",
        "Standard deviation": "std",
        "Padding size": "padding",
        "Fill": "fill",
        "Padding mode": "padding_mode",
        "Interpolation": "interpolation",
    }

    # GUI option label -> parameter type (a list means "one of these").
    TRANSFORM_TYPES = {
        "Number of output channels": int,
        "Width": int,
        "Height": int,
        "Mean": int,
        "Standard deviation": int,
        "Padding size": int,
        "Fill": int,
        "Padding mode": str,
        "Interpolation": ["Bilinear", "Bicubic", "Nearest"],
    }

    # GUI option label -> default value.
    # NOTE(review): "Standard deviation" defaults to 0 while the Normalize
    # description divides by it — confirm this default is intended.
    TRANSFORM_DEFAULTS = {
        "Width": 100,
        "Height": 100,
        "Standard deviation": 0,
        "Padding size": 1,
        "Mean": 0,
        "Interpolation": "Nearest",
        "Padding mode": "constant",
        "Fill": 1,
        "Number of output channels": 1,
    }

    algorithms = TRANSFORM_ALGS
    options_list = TRANSFORM_PARAMETERS
    options_types = TRANSFORM_TYPES
    options_default = TRANSFORM_DEFAULTS
    options_converted = TEXT_PARAMS

    parameters = node.parameters()
    parameters.set_string(
        "algorithm", value=next(iter(algorithms)), description="",
        label="Algorithm"
    )
    ImageFiltering_abstract.generate_parameters(
        parameters, options_types, options_default
    )

    description = "Transforms images within an image dataset"
    inputs = Ports([DatasetPort("Dataset", "dataset")])
    outputs = Ports([DatasetPort("Dataset", "dataset")])
    __doc__ = ImageFiltering_abstract.generate_docstring(
        description, algorithms, options_list, inputs, outputs
    )

    def execute(self, node_context):
        """
        Append the configured transform to the input image dataset.

        Reads the chosen algorithm and its option values from the node
        parameters, converts the option labels to keyword names and writes
        a dataset to the output port whose ``transforms`` and
        ``transforms_values`` lists are those of the input with the new
        entry appended. The lists held by the input dataset are left
        untouched.

        Raises
        ------
        exceptions.SyDataError
            If the input dataset is empty (``get_ds()`` returns ``None``)
            or its ``dstype`` is ``"table"``.
        """
        cls = TransformImageDataset
        algorithm_choice = node_context.parameters["algorithm"].value

        # Every key of the algorithm entry except "description" names an
        # option of the chosen transform.
        option_labels = [
            label for label in cls.algorithms[algorithm_choice]
            if label != "description"
        ]

        transform_values = {}
        for label in option_labels:
            value = node_context.parameters[label].value
            # Tuple-typed options are stored as text and parsed back into
            # Python literals.
            if cls.options_types[label] is tuple:
                value = ast.literal_eval(value)
            transform_values[cls.options_converted[label]] = value

        # Combine width and height into the single "size" entry.
        # NOTE(review): the tuple is ordered (width, height); torchvision
        # documents ``size`` as (h, w) — confirm what the consumer of
        # ``transforms_values`` expects before relying on non-square sizes.
        if algorithm_choice in ("Resize", "Center Crop"):
            transform_values["size"] = (
                transform_values.pop("width"),
                transform_values.pop("height"),
            )

        in_dataset = node_context.input["dataset"]
        in_dataset.load()
        ds_in = in_dataset.get_ds()
        if ds_in is None:
            raise exceptions.SyDataError("Empty dataset")
        if ds_in["dstype"] == "table":
            raise exceptions.SyDataError("Incorrect data type")

        # Build new lists instead of extending the input's lists in place so
        # the input dataset is not modified as a side effect.
        ds_out = {
            "dstype": ds_in["dstype"],
            "paths": ds_in["paths"],
            "transforms": list(ds_in["transforms"]) + [algorithm_choice],
            "transforms_values": (
                list(ds_in["transforms_values"]) + [transform_values]),
            "labels": ds_in["labels"],
        }

        out_dataset = node_context.output["dataset"]
        out_dataset.set_ds(ds_out)
        out_dataset.save()
# [docs]
class TransformTableDataset(ImageFiltering_abstract, node.Node):
    # Node that appends one preprocessing transform (name + keyword values)
    # to the per-column ``transforms`` / ``transforms_values`` lists of a
    # tabular dataset.
    #
    # The class is described with comments rather than a docstring because
    # ``__doc__`` is generated and assigned further down in the class body,
    # which would overwrite a literal docstring.
    name = "Transform Table Dataset (Experimental)"
    nodeid = (
        "com.sympathyfordata.advancedmachinelearning.transformtabledataset")
    author = "Jannes Germishuys"
    icon = "table_ds_transform.svg"
    tags = Tags(Tag.MachineLearning.Processing)

    # Option labels (as shown in the GUI) used by each transform.
    PARAMS_DICT = {
        "SimpleImputer": [
            "Missing values",
            "Strategy for missing values",
            "Fill value for missing values",
            "Verbose",
            "Copy",
            "Add indicator",
        ],
        "Binarizer": ["threshold"],
        "LabelEncoder": ["Use categorical"],
        "OneHotEncoder": [
            "Categories",
            "Drop category",
            "Transformed array in sparse format",
            "Desired data type",
            "Handle Unknown",
        ],
        "PolynomialFeatures": [
            "Degree",
            "Only interaction features produced",
            "Include bias",
            "Order",
            "Preserve as dataframe",
        ],
        "RobustScaler": [
            "Center the data before scaling",
            "Scale to interquartile range",
            "IQR Quantile range - Lower",
            "IQR Quantile range - Upper",
            "Copy",
        ],
        "StandardScaler": [
            "Copy", "Center the data", "Scale to unit variance"],
    }

    # Per-transform documentation: a "description" entry plus one help text
    # per option label. ``execute`` also uses the non-"description" keys of
    # the chosen transform as the list of options to read.
    TRANSFORM_ALGS = {
        "SimpleImputer": {
            "description": (
                "Simple imputation for missing data in tabular datasets"),
            "Missing values": (
                "The placeholder for the missing values. All occurrences "
                "of missing_values will be imputed. For pandas’ "
                "dataframes with nullable integer dtypes with missing "
                "values, missing_values should be set to np.nan, since "
                "pd.NA will be converted to np.nan."),
            "Strategy for missing values": (
                "The imputation strategy. If “mean”, then replace missing "
                "values using the mean along each column. Can only be "
                "used with numeric data. If “median”, then replace "
                "missing values using the median along each column. Can "
                "only be used with numeric data. If “most_frequent”, "
                "then replace missing using the most frequent value along "
                "each column. Can be used with strings or numeric data. "
                "If there is more than one such value, only the smallest "
                "is returned. If “constant”, then replace missing values "
                "with fill_value. Can be used with strings or numeric "
                "data."),
            "Fill value for missing values": (
                "When strategy == “constant”, fill_value is used to "
                "replace all occurrences of missing_values. If left to "
                "the default, fill_value will be 0 when imputing "
                "numerical data and “missing_value” for strings or object "
                "data types."),
            "Verbose": "Controls the verbosity of the imputer.",
            "Copy": (
                "If True, a copy of X will be created. If False, "
                "imputation will be done in-place whenever possible. Note "
                "that, in the following cases, a new copy will always be "
                "made, even if copy=False: If X is not an array of "
                "floating values; If X is encoded as a CSR matrix; If "
                "add_indicator=True."),
            "Add indicator": (
                "If True, a MissingIndicator transform will stack onto "
                "output of the imputer’s transform. This allows a "
                "predictive estimator to account for missingness despite "
                "imputation. If a feature has no missing values at "
                "fit/train time, the feature won’t appear on the missing "
                "indicator even if there are missing values at "
                "transform/test time."),
        },
        "Binarizer": {
            "description": (
                "Binarize data (set feature values to 0 or 1) according "
                "to a threshold."),
            "threshold": (
                "Feature values below or equal to this are replaced by 0, "
                "above it by 1. Threshold may not be less than 0 for "
                "operations on sparse matrices."),
        },
        "LabelEncoder": {
            "description": (
                "Encode target labels with value between 0 and "
                "n_classes-1."),
            "Use categorical": "",
        },
        "OneHotEncoder": {
            "description": (
                "Encode categorical features as a one-hot numeric array."),
            "Categories": (
                "Categories (unique values) per feature: ‘auto’ : "
                "Determine categories automatically from the training "
                "data. list : categories[i] holds the categories expected "
                "in the ith column. The passed categories should not mix "
                "strings and numeric values within a single feature, and "
                "should be sorted in case of numeric values. The used "
                "categories can be found in the `categories_` attribute."),
            "Drop category": (
                "Specifies a methodology to use to drop one of the "
                "categories per feature. This is useful in situations "
                "where perfectly collinear features cause problems, such "
                "as when feeding the resulting data into a neural network "
                "or an unregularized regression. However, dropping one "
                "category breaks the symmetry of the original "
                "representation and can therefore induce a bias in "
                "downstream models, for instance for penalized linear "
                "classification or regression models. None : retain all "
                "features (the default). ‘first’ : drop the first "
                "category in each feature. If only one category is "
                "present, the feature will be dropped entirely. "
                "‘if_binary’ : drop the first category in each feature "
                "with two categories. Features with 1 or more than 2 "
                "categories are left intact. array : drop[i] "
                "is the category in feature X[:, i] that should be "
                "dropped"),
            "Transformed array in sparse format": (
                "Will return sparse matrix if set True else will return "
                "an array."),
            "Desired data type": "Desired dtype of output.",
            "Handle Unknown": (
                "Whether to raise an error or ignore if an unknown "
                "categorical feature is present during transform (default "
                "is to raise). When this parameter is set to ‘ignore’ and "
                "an unknown category is encountered during transform, the "
                "resulting one-hot encoded columns for this feature will "
                "be all zeros. In the inverse transform, an unknown "
                "category will be denoted as None."),
        },
        "PolynomialFeatures": {
            "description": "Generate polynomial and interaction features.",
            "Degree": "The degree of the polynomial features.",
            "Only interaction features produced": (
                "If true, only interaction features are produced: "
                "features that are products of at most degree distinct "
                "input features (so not x[1] ** 2, x[0] * x[2] ** 3, "
                "etc.)."),
            "Include bias": (
                "If True (default), then include a bias column, the "
                "feature in which all polynomial powers are zero (i.e. "
                "a column of ones - acts as an intercept term in a linear "
                "model)."),
            "Order": (
                "Order of output array in the dense case. ‘F’ order is "
                "faster to compute, but may slow down subsequent "
                "estimators."),
            "Preserve as dataframe": "Preserve as Dask dataframe",
        },
        "RobustScaler": {
            "description": (
                "Scale features using statistics that are robust to "
                "outliers."),
            "Center the data before scaling": (
                "If True, center the data before scaling. This will cause "
                "transform to raise an exception when attempted on sparse "
                "matrices, because centering them entails building a "
                "dense matrix which in common use cases is likely to be "
                "too large to fit in memory."),
            "Scale to interquartile range": (
                "If True, scale the data to interquartile range."),
            "IQR Quantile range - Lower": (
                "Quantile range used to calculate `scale_`."),
            "IQR Quantile range - Upper": (
                "Quantile range used to calculate `scale_`."),
            "Copy": (
                "If False, try to avoid a copy and do inplace scaling "
                "instead. This is not guaranteed to always work inplace; "
                "e.g. if the data is not a NumPy array or scipy.sparse "
                "CSR matrix, a copy may still be returned."),
        },
        "StandardScaler": {
            "description": (
                "Standardize features by removing the mean and scaling to "
                "unit variance"),
            "Copy": (
                "If False, try to avoid a copy and do inplace scaling "
                "instead. This is not guaranteed to always work inplace; "
                "e.g. if the data is not a NumPy array or scipy.sparse "
                "CSR matrix, a copy may still be returned."),
            "Center the data": (
                "If True, center the data before scaling. This does not "
                "work (and will raise an exception) when attempted on "
                "sparse matrices, because centering them entails building "
                "a dense matrix which in common use cases is likely to be "
                "too large to fit in memory."),
            "Scale to unit variance": (
                "If True, scale the data to unit variance (or "
                "equivalently, unit standard deviation)."),
        },
    }

    # Unique option labels across all transforms, in first-seen order.
    # ``dict.fromkeys`` de-duplicates while keeping the order deterministic
    # (a ``set`` would reorder the labels between interpreter runs) and the
    # generator keeps its loop variables out of the class namespace.
    TRANSFORM_PARAMETERS = list(dict.fromkeys(
        option
        for options in PARAMS_DICT.values()
        for option in options
    ))

    # GUI option label -> parameter type (a list means "one of these").
    TRANSFORM_TYPES = {
        "threshold": float,
        "maximum_categories": int,
        "Positive Label": int,
        "Negative Label": int,
        "Transformed array in sparse format": bool,
        "norm": ["l1", "l2", "max"],
        "Use categorical": bool,
        "Categories": list,
        "Drop category": str,
        "Desired data type": str,
        "Handle Unknown": ["error", "ignore"],
        "Degree": int,
        "Only interaction features produced": bool,
        "Include bias": bool,
        "Order": ["C", "F"],
        "Preserve as dataframe": bool,
        "Copy": bool,
        "Center the data before scaling": bool,
        "Center the data": bool,
        "Scale to interquartile range": bool,
        "IQR Quantile range - Lower": float,
        "IQR Quantile range - Upper": float,
        "Scale to unit variance": bool,
        "Missing values": float,
        "Strategy for missing values": str,
        "Fill value for missing values": int,
        "Verbose": int,
        "Add indicator": bool,
    }

    # GUI option label -> default value.
    TRANSFORM_DEFAULTS = {
        "threshold": 0.0,
        "maximum_categories": None,
        "Positive Label": 1,
        "Negative Label": 0,
        "Transformed array in sparse format": False,
        "norm": "l2",
        "Use categorical": True,
        "Drop category": None,
        "Categories": "auto",
        "Desired data type": "float",
        "Handle Unknown": "error",
        "Degree": 2,
        "Only interaction features produced": False,
        "Include bias": True,
        "Order": "C",
        "Preserve as dataframe": False,
        "Copy": True,
        "Center the data before scaling": True,
        "Center the data": True,
        "Scale to interquartile range": True,
        "IQR Quantile range - Lower": 25.0,
        "IQR Quantile range - Upper": 75.0,
        "Scale to unit variance": True,
        "Missing values": np.nan,
        "Strategy for missing values": "mean",
        "Fill value for missing values": None,
        "Verbose": 0,
        "Add indicator": False,
    }

    # GUI option label -> keyword name stored in ``transforms_values``.
    TEXT_PARAMS = {
        "threshold": "threshold",
        "maximum_categories": "maximum_categories",
        "Positive Label": "Positive Label",
        "Negative Label": "Negative Label",
        "Transformed array in sparse format": "sparse",
        "norm": "norm",
        "Use categorical": "use_categorical",
        "Drop category": "drop",
        "Categories": "categories",
        "Desired data type": "dtype",
        "Handle Unknown": "handle_unknown",
        "Degree": "degree",
        "Only interaction features produced": "interaction_only",
        "Include bias": "include_bias",
        "Order": "order",
        "Preserve as dataframe": "preserve_dataframe",
        "Copy": "copy",
        "Center the data before scaling": "with_centering",
        "Center the data": "with_mean",
        "Scale to interquartile range": "with_scaling",
        "IQR Quantile range - Lower": "quantile_range_lower",
        "IQR Quantile range - Upper": "quantile_range_upper",
        "IQR Quantile range": "quantile_range",
        "Scale to unit variance": "with_std",
        "Missing values": "missing_values",
        "Strategy for missing values": "strategy",
        "Fill value for missing values": "fill_value",
        "Verbose": "verbose",
        "Add indicator": "add_indicator",
    }

    algorithms = TRANSFORM_ALGS
    options_list = TRANSFORM_PARAMETERS
    options_types = TRANSFORM_TYPES
    options_default = TRANSFORM_DEFAULTS
    options_converted = TEXT_PARAMS

    parameters = node.parameters()
    editor = node.editors.multilist_editor(edit=True, mode=True)
    parameters.set_list(
        "columns",
        label="Columns",
        description="Columns that should be converted.",
        value=[],
        editor=editor,
    )
    parameters.set_string(
        "algorithm", value=next(iter(algorithms)), description="",
        label="Algorithm"
    )
    ImageFiltering_abstract.generate_parameters(
        parameters, options_types, options_default
    )

    description = (
        "Transforms tabular dataset based on common preprocessing operations")
    inputs = Ports([DatasetPort("Dataset", "dataset")])
    outputs = Ports([DatasetPort("Dataset", "dataset")])
    __doc__ = ImageFiltering_abstract.generate_docstring(
        description, algorithms, options_list, inputs, outputs
    )

    def adjust_parameters(self, node_context):
        """Update the ``columns`` parameter from the first input port."""
        try:
            adjust(node_context.parameters["columns"], node_context.input[0])
        except Exception:
            # Deliberately best-effort: a failure to adjust leaves the
            # parameter as it was instead of breaking configuration.
            # NOTE(review): presumably guards against missing or invalid
            # input data — narrow the exception type once that is confirmed.
            pass

    def exec_parameter_view(self, node_context):
        """Return the configuration widget for this node."""
        return TransformWidget(
            node_context.parameters,
            TransformTableDataset.algorithms,
            TransformTableDataset.options_list,
            TransformTableDataset.options_types,
        )

    def execute(self, ctx):
        """
        Append the configured transform to every selected column.

        Reads the chosen algorithm and its option values from the node
        parameters, converts the option labels to keyword names and appends
        the transform name and its values to the ``transforms`` /
        ``transforms_values`` lists of each selected column in the
        dataset's ``column_config``. For ``OneHotEncoder`` with an explicit
        list of categories, the source column is marked as excluded and one
        new column entry per category of ``categories[0]`` is added. The
        dataset dictionary returned by the input port is modified in place
        and then written to the output port.

        Raises
        ------
        exceptions.SyDataError
            If the input dataset is empty (``get_ds()`` returns ``None``)
            or its ``dstype`` is ``"image"``.
        """
        cls = TransformTableDataset
        algorithm_choice = ctx.parameters["algorithm"].value
        all_cols = ctx.input[0].names(kind='cols')
        columns_choice = ctx.parameters['columns'].selected_names(all_cols)

        # Every key of the algorithm entry except "description" names an
        # option of the chosen transform.
        option_labels = [
            label for label in cls.algorithms[algorithm_choice]
            if label != "description"
        ]

        transform_values = {}
        for label in option_labels:
            option_type = cls.options_types[label]
            # NOTE(review): no default in TRANSFORM_DEFAULTS is the string
            # "None" and the only ``list``-typed option is "Categories", so
            # with the current tables every value is kept raw and the
            # ``literal_eval`` branch is never taken. The comparison may
            # have been meant as ``is not None`` — confirm the intent
            # before changing it.
            keep_raw = label == "Categories" or (
                option_type is not tuple
                and option_type is not list
                and cls.options_default[label] != "None"
            )
            value = ctx.parameters[label].value
            if not keep_raw:
                value = ast.literal_eval(str(value))
            transform_values[cls.options_converted[label]] = value

        # Combine the lower and upper bounds into the single
        # "quantile_range" entry. This must happen exactly once: doing it
        # per column removed the source keys on the first column and raised
        # KeyError on the second.
        if algorithm_choice == "RobustScaler":
            transform_values["quantile_range"] = (
                transform_values.pop("quantile_range_lower"),
                transform_values.pop("quantile_range_upper"),
            )

        ds_obj = ctx.input["dataset"]
        ds_obj.load()
        ds = ds_obj.get_ds()
        if ds is None:
            raise exceptions.SyDataError("Empty dataset")
        if ds["dstype"] == "image":
            raise exceptions.SyDataError("Incorrect data type")

        # Only an explicit list of categories tells us the names of the
        # columns a one-hot encoding will produce.
        expand_one_hot = (
            algorithm_choice == "OneHotEncoder"
            and isinstance(transform_values["categories"], list)
        )

        for col in columns_choice:
            column_config = ds["column_config"]
            if expand_one_hot:
                new_cols = [
                    f"{col}_{cat}"
                    for cat in transform_values["categories"][0]
                ]
                # Remove old column
                column_config[col]["exclude"] = 1
                # Add new columns to dataset
                for new_col in new_cols:
                    column_config[new_col] = {
                        "orig": False,
                        "exclude": 0,
                        "dtype": transform_values["dtype"],
                        "transforms": [],
                        "transforms_values": [],
                    }
            column_config[col]["transforms"].append(algorithm_choice)
            # Each column gets its own copy so that the entries do not
            # alias one shared dictionary.
            column_config[col]["transforms_values"].append(
                dict(transform_values))

        out_dataset = ctx.output["dataset"]
        out_dataset.set_ds(ds)
        out_dataset.save()