# Source code for WORC.featureprocessing.OneHotEncoderWrapper

#!/usr/bin/env python

# Copyright 2020 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from sklearn.preprocessing import OneHotEncoder


class OneHotEncoderWrapper(object):
    """Module for OneHotEncoding features.

    Columns whose label contains any of the substrings given in
    ``feature_labels_tofit`` are one-hot-encoded with scikit-learn's
    ``OneHotEncoder``. All remaining columns are passed through untouched
    and are placed *in front of* the encoded columns in the output.
    """

    def __init__(self, feature_labels_tofit, handle_unknown='ignore',
                 verbose=False):
        """Init preprocessor of features.

        Parameters
        ----------
        feature_labels_tofit: list of str, mandatory
            Substrings to look for in the feature labels. A feature is
            one-hot-encoded when its label contains at least one of them.
        handle_unknown: str, default 'ignore'
            Forwarded as-is to ``OneHotEncoder(handle_unknown=...)``.
        verbose: bool, default False
            When True, progress information is printed during fitting.
        """
        # Initiate variables
        self.handle_unknown = handle_unknown
        self.verbose = verbose
        self.feature_labels_tofit = feature_labels_tofit

    def fit(self, X, feature_labels, y=None):
        """Fit OneHotEncoder for labels in feature_labels.

        Parameters
        ----------
        X: numpy array, mandatory
            Two-dimensional feature array; columns are indexed with
            ``X[:, columns]``, in the same order as ``feature_labels``.
        feature_labels: list of str, mandatory
            Label of each column of X.
        y: ignored
            Not used by this method.

        After fitting, the following attributes are set:
        ``selectcolumns`` / ``selectlabels`` (indices and labels of the
        encoded columns), ``skipcolumns`` (indices of the pass-through
        columns), ``encoder`` (the fitted ``OneHotEncoder`` or None when
        nothing was selected) and ``encoded_feature_labels`` (labels of the
        columns as returned by ``transform``).
        """
        self.selectcolumns = list()
        self.selectlabels = list()
        self.skipcolumns = list()
        for num, label in enumerate(feature_labels):
            # Substring match: e.g. '0' selects every label containing a '0'
            if any(fl in label for fl in self.feature_labels_tofit):
                # This feature needs to be one hot encoded
                self.selectcolumns.append(num)
                self.selectlabels.append(label)
            else:
                # This feature needs to be skipped from onehotencoding
                self.skipcolumns.append(num)

        if self.verbose:
            print(f'\t Fitting one-hot-encoder for features {self.selectlabels}.')

        if not self.selectcolumns:
            if self.verbose:
                print('\t No features selected, skip one-hot-encoding')
            self.encoder = None
            # transform() is a pass-through in this case, so the output
            # labels equal the input labels. Setting them here also avoids
            # an AttributeError (or stale labels from a previous fit) when
            # callers read encoded_feature_labels afterwards.
            self.encoded_feature_labels = list(feature_labels)
            return

        # Gather skipped feature values and labels and selected ones
        all_labels = np.asarray(feature_labels)
        skipped_feature_labels = list(all_labels[self.skipcolumns])
        select_feature_labels = list(all_labels[self.selectcolumns])
        select_feature_values = X[:, self.selectcolumns]

        # Apply the onehotencoding
        self.encoder = OneHotEncoder(handle_unknown=self.handle_unknown)
        self.encoder.fit(select_feature_values)

        # Adjust feature labels: skipped labels first, then one label per
        # category of each encoded feature, suffixed with the category index
        # (not the category value).
        categories = self.encoder.categories_
        self.encoded_feature_labels = skipped_feature_labels
        for fl, cat in zip(select_feature_labels, categories):
            self.encoded_feature_labels.extend(
                fl + f'_{c}' for c in range(cat.shape[0])
            )

        if self.verbose:
            print(f'\t Encoded feature labels: {self.encoded_feature_labels}.')

    def transform(self, inputarray):
        """Transform feature array.

        Replace the columns selected during ``fit`` by their one-hot
        encoding and keep the other columns as they are.

        Parameters
        ----------
        inputarray: numpy array, mandatory
            Two-dimensional feature array with the same column layout as
            the array used in ``fit``.

        Returns
        -------
        outputarray: numpy array
            ``inputarray`` itself when no features were selected during
            fitting. Otherwise a new array holding the skipped columns
            followed by the encoded columns, in the order described by
            ``encoded_feature_labels``.
        """
        if self.encoder is None:
            # No features encoded
            return inputarray

        # Gather skipped feature values and selected ones
        skipped_feature_values = inputarray[:, self.skipcolumns]
        select_feature_values = inputarray[:, self.selectcolumns]

        # Transform selected features; toarray() densifies the encoder output
        encoded_feature_values = \
            self.encoder.transform(select_feature_values).toarray()

        # Recombine both.
        # NOTE(review): np.concatenate promotes to a common dtype, so a
        # string-typed input presumably yields a string-typed output.
        outputarray = np.concatenate(
            (skipped_feature_values, encoded_feature_values), axis=1
        )
        return outputarray
def test():
    """Run a small smoke test of the OneHotEncoderWrapper.

    Fits the wrapper on a toy training array, in which the 'Gender' and
    'Numeric0' columns are selected for encoding, transforms both the
    training array and a test array that contains values not present
    during fitting, and prints inputs, outputs and the resulting labels.
    """
    # Toy data: one categorical column and two numeric ones
    labels = ['Gender', 'Numeric0', 'Numeric1']
    train = np.asarray([['Male', 1, 5],
                        ['Female', 3, 6],
                        ['Female', 2, 7]])
    held_out = np.asarray([['Male', 2, 7],
                           ['Unknown', 10, 10]])

    # Fit on the training data only
    wrapper = OneHotEncoderWrapper(feature_labels_tofit=['Gender', '0'],
                                   verbose=True)
    wrapper.fit(train, labels)

    # Transform both sets before reporting anything
    reports = (
        ("X_train:", train, wrapper.transform(train)),
        ("X_test:", held_out, wrapper.transform(held_out)),
    )

    # Print results
    for title, raw, encoded in reports:
        print(title)
        print(f"Input: {raw}.")
        print(f"Output: {encoded}.")

    print("Encoded feature labels:")
    print(wrapper.encoded_feature_labels)
# Run the smoke test only when this file is executed as a script
if __name__ == "__main__":
    test()