#!/usr/bin/env python
# Copyright 2020 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from sklearn.preprocessing import OneHotEncoder
class OneHotEncoderWrapper(object):
    """One-hot encode a label-selected subset of feature columns.

    Columns are selected by substring matching: a column is one-hot encoded
    when any string in ``feature_labels_tofit`` occurs inside its feature
    label. All other columns are passed through untouched.

    Attributes set by ``fit``:
        selectcolumns: indices of the columns that are one-hot encoded.
        selectlabels: feature labels of those columns.
        skipcolumns: indices of the columns that are left as they are.
        encoder: the fitted ``OneHotEncoder``, or ``None`` when no column
            was selected.
        encoded_feature_labels: labels of the output columns, i.e. the
            skipped labels followed by ``<label>_<category index>`` for
            every category of every encoded column.
    """

    def __init__(self, feature_labels_tofit, handle_unknown='ignore',
                 verbose=False):
        """Store the configuration; nothing is fitted here.

        Args:
            feature_labels_tofit: iterable of strings; a feature is encoded
                when one of these strings is contained in its label.
            handle_unknown: forwarded to ``OneHotEncoder(handle_unknown=...)``.
            verbose: when True, print progress information during ``fit``.
        """
        # Initiate variables
        self.handle_unknown = handle_unknown
        self.verbose = verbose
        self.feature_labels_tofit = feature_labels_tofit

    def fit(self, X, feature_labels, y=None):
        """Fit a OneHotEncoder on the columns selected through feature_labels.

        Args:
            X: 2D array indexable as ``X[:, columns]``; one column per entry
                of ``feature_labels``.
            feature_labels: sequence of strings, one label per column of X.
            y: unused; accepted so the signature stays as it was.

        Returns:
            None. The results are stored as attributes on the object.
        """
        self.selectcolumns = list()
        self.selectlabels = list()
        self.skipcolumns = list()
        for num, label in enumerate(feature_labels):
            # Substring match, so e.g. '0' selects a label like 'Numeric0'
            if any(fl in label for fl in self.feature_labels_tofit):
                # This feature needs to be one hot encoded
                self.selectcolumns.append(num)
                self.selectlabels.append(label)
            else:
                # This feature needs to be skipped from onehotencoding
                self.skipcolumns.append(num)

        if self.verbose:
            print(f'\t Fitting one-hot-encoder for features {self.selectlabels}.')

        if not self.selectcolumns:
            if self.verbose:
                print('\t No features selected, skip one-hot-encoding')

            self.encoder = None
            # Nothing gets encoded, so the labels pass through unchanged.
            # Without this, reading encoded_feature_labels after such a fit
            # raised an AttributeError.
            self.encoded_feature_labels = list(feature_labels)
            return

        # Apply the onehotencoding on the selected columns only
        select_feature_values = X[:, self.selectcolumns]
        self.encoder = OneHotEncoder(handle_unknown=self.handle_unknown)
        self.encoder.fit(select_feature_values)

        # Adjust feature labels: skipped labels first, then one label per
        # category (numbered by its index) of each encoded feature
        self.encoded_feature_labels = [feature_labels[num]
                                       for num in self.skipcolumns]
        for fl, cat in zip(self.selectlabels, self.encoder.categories_):
            self.encoded_feature_labels.extend(
                f'{fl}_{c}' for c in range(len(cat)))

        if self.verbose:
            print(f'\t Encoded feature labels: {self.encoded_feature_labels}.')

    def transform(self, X, y=None):
        """Apply the fitted encoding to a feature array.

        The column order of the result matches ``encoded_feature_labels``:
        first the skipped columns, then the one-hot columns.

        Args:
            X: 2D array with the same column layout as the array used in
                ``fit``.
            y: unused.

        Returns:
            ``X`` itself when ``fit`` selected no columns, otherwise a new
            array with the skipped columns followed by the encoded columns.
        """
        if self.encoder is None:
            # fit() found nothing to encode
            return X

        skipped_feature_values = X[:, self.skipcolumns]
        encoded_feature_values = self.encoder.transform(
            X[:, self.selectcolumns])

        # OneHotEncoder may return a sparse matrix; densify before stacking
        if hasattr(encoded_feature_values, 'toarray'):
            encoded_feature_values = encoded_feature_values.toarray()

        return np.concatenate((skipped_feature_values, encoded_feature_values),
                              axis=1)
def test():
    """Run a small demonstration of OneHotEncoderWrapper and print the results.

    The encoder is fitted on a toy training array and then applied to both
    the training array and a second array containing values not seen while
    fitting. The selection strings 'Gender' and '0' are matched as
    substrings of the feature labels.
    """
    # Toy data: one categorical column and two numeric ones
    train_data = np.asarray([['Male', 1, 5], ['Female', 3, 6], ['Female', 2, 7]])
    test_data = np.asarray([['Male', 2, 7], ['Unknown', 10, 10]])
    labels = ['Gender', 'Numeric0', 'Numeric1']
    labels_to_encode = ['Gender', '0']

    # Fit on the training data, then encode both arrays
    wrapper = OneHotEncoderWrapper(feature_labels_tofit=labels_to_encode,
                                   verbose=True)
    wrapper.fit(train_data, labels)
    datasets = (train_data, test_data)
    encoded = [wrapper.transform(data) for data in datasets]

    # Report input and output per dataset
    for title, before, after in zip(("X_train:", "X_test:"), datasets, encoded):
        print(title)
        print(f"Input: {before}.")
        print(f"Output: {after}.")

    print("Encoded feature labels:")
    print(wrapper.encoded_feature_labels)
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test()