Source code for WORC.validators.preflightcheck

#!/usr/bin/env python

# Copyright 2016-2023 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod

from WORC.processing.label_processing import load_labels
import WORC.addexceptions as ae
import os

# Global variables
min_subjects = 10
recommended_subjects = 50


class AbstractValidator(ABC):
    # noinspection PyBroadException
    def do_validation(self, *args, **kwargs):
        # try:
        result = self._validate(*args, **kwargs)
        if result is None:
            result = True
        # except:
        #     result = False
        msg = self._generate_detector_message(result)
        if msg:
            print(msg)
        return result

    def _generate_detector_message(self, validated_value):
        return f"{self.__class__.__name__[0:-8]} validated: {validated_value}."

    @abstractmethod
    def _validate(self, *args, **kwargs):
        pass

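
# Illustrative sketch (not part of the WORC source): a minimal subclass showing
# how the do_validation()/_validate() template above is meant to be used. The
# class name and the attribute it checks are hypothetical.
#
# class ExperimentNameValidator(AbstractValidator):
#     def _validate(self, simpleworc, *args, **kwargs):
#         # Returning None (i.e. falling through) counts as success in
#         # do_validation(); failures are signalled by raising a WORCValueError.
#         if not getattr(simpleworc, 'name', None):
#             raise ae.WORCValueError('No experiment name set.')
#
# # Callers invoke do_validation(), which runs _validate() and prints a short
# # confirmation message derived from the class name:
# # ExperimentNameValidator().do_validation(experiment)
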
class SimpleValidator(AbstractValidator):
    def _validate(self, simpleworc, *args, **kwargs):
        # if not simpleworc._labels_file_train:
        #     if hasattr(simpleworc, 'labels_file_train'):
        #         if not simpleworc.labels_file_train:
        #             raise ae.WORCValueError(f'No labels, use SimpleWorc().labels_from_this_file(**) to add labels.')
        #     else:
        #         raise ae.WORCValueError(f'No labels, use SimpleWorc().labels_from_this_file(**) to add labels.')

        if not simpleworc._label_names:
            if not simpleworc.label_names:
                raise ae.WORCValueError(f'No label(s) to predict selected. Use SimpleWorc().predict_labels(**) to select labels.')

        if not simpleworc._method:
            raise ae.WORCValueError(f'No method selected. Call function binary_classification(**) or regression(**) or survival(**) on SimpleWorc().')

        if simpleworc._images_train:
            for num, (ims, segs) in enumerate(zip(simpleworc._images_train, simpleworc._segmentations_train)):
                if ims.keys() != segs.keys():
                    raise ae.WORCValueError(f'Subjects in images_train and segmentations_train are not the same for modality {num}.')

        if hasattr(simpleworc, 'images_train'):
            if simpleworc.images_train:
                for num, (ims, segs) in enumerate(zip(simpleworc.images_train, simpleworc.segmentations_train)):
                    if ims.keys() != segs.keys():
                        raise ae.WORCValueError(f'Subjects in images_train and segmentations_train are not the same for modality {num}.')

        if simpleworc._worc.images_train:
            if len(simpleworc._worc.images_train) != len(simpleworc._image_types):
                raise ae.WORCValueError(f'Number of image types you said you would provide (image_types: {len(simpleworc._image_types)}) is not the same as the actual number of image types provided (images_train: {len(simpleworc._worc.images_train)}).')

class MinSubjectsValidator(AbstractValidator):
    def _validate(self, simpleworc, *args, **kwargs):
        if simpleworc._num_subjects < min_subjects:
            raise ae.WORCValueError(f'Less than {min_subjects} subjects (you have {simpleworc._num_subjects}) will probably make WORC crash due to a split in the test/validation set having only one subject. Use at least {min_subjects} subjects or more.')

# class EvaluateValidator(AbstractValidator):
#     def _validate(self, simpleworc, *args, **kwargs):
#         if simpleworc._add_evaluation:
#             if not simpleworc._images_train:
#                 if hasattr(simpleworc, 'images_train'):
#                     if not simpleworc.images_train:
#                         raise ae.WORCValueError(f'You have added the evaluation pipeline, but have not provided images, which is currently required. We will work on this option in a future release.')
#                 else:
#                     raise ae.WORCValueError(f'You have added the evaluation pipeline, but have not provided images, which is currently required. We will work on this option in a future release.')

class SamplesWarning(AbstractValidator):
    # Not really a validator, but more a good-practice check. Hence this won't
    # throw an exception but prints a warning instead.
    def _validate(self, simpleworc, *args, **kwargs):
        if simpleworc._method == 'classification':
            # At least recommended_subjects subjects per label are recommended
            if simpleworc._num_subjects < len(simpleworc._label_names) * recommended_subjects:
                print(f'Warning: at least {len(simpleworc._label_names) * recommended_subjects} subjects are recommended when predicting {len(simpleworc._label_names)} labels. Current subject count is: {simpleworc._num_subjects}')
        elif simpleworc._method == 'regression':
            # TODO @martijn not sure how to tackle this, what would be a
            # reasonable amount of subjects for regression?
            pass

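
# Example of the threshold above: with recommended_subjects = 50 and two label
# names to predict, fewer than 2 * 50 = 100 subjects triggers the warning.
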
class InvalidLabelsValidator(AbstractValidator):
    def _validate(self, simpleworc):
        labels_file_train = None
        labels_file_test = None

        if simpleworc._labels_file_train:
            labels_file_train = simpleworc._labels_file_train
        elif simpleworc.labels_file_train:
            labels_file_train = simpleworc.labels_file_train
        elif simpleworc.trained_model is not None or simpleworc._trained_model is not None:
            # Inference, only testing objects
            if simpleworc._labels_file_test:
                labels_file_test = simpleworc._labels_file_test
            elif simpleworc.labels_file_test:
                labels_file_test = simpleworc.labels_file_test
            else:
                raise ae.WORCValueError(f'No test labels, use SimpleWorc().labels_from_this_file(**) to add labels.')
        else:
            raise ae.WORCValueError(f'No training labels, use SimpleWorc().labels_from_this_file(**) to add labels.')

        if labels_file_train is not None:
            self._validate_labels_file(labels_file_train)

        if labels_file_test is not None:
            self._validate_labels_file(labels_file_test)

    def _validate_labels_file(self, labels_file):
        errstr = ''

        if not os.path.exists(labels_file):
            raise ae.WORCValueError(f'Given label file {labels_file} does not exist.')

        try:
            label_data = load_labels(labels_file)
        except ae.WORCAssertionError as wae:
            if 'First column should be patient ID' in str(wae):
                # TODO: print wrong column name and file so that it is clear
                # what needs to be replaced in which file
                raise ae.WORCValueError(f'First column in the file given to SimpleWORC().labels_from_this_file(**) needs to be named Patient.')
            # Re-raise unexpected assertion errors instead of silently swallowing them
            raise

        # Check labels for substrings of each other
        labels = label_data['label_name']
        subjects = label_data['patient_IDs']

        labels_matches = self._get_all_substrings_for_array(labels)
        if labels_matches:  # if not empty we have a problem
            errstr += "Found label(s) that are a substring of other label(s). This is currently not allowed in WORC. Rename the following label(s):\n"
            for label, matches in labels_matches.items():
                for match in matches:
                    errstr += f"{label} is a substring of {match}\n"

        # Check subject names for substrings of each other
        subjects_matches = self._get_all_substrings_for_array(subjects)
        if subjects_matches:  # if not empty we have a problem
            errstr += "Found subject(s) that are a substring of other subject(s). This is currently not allowed in WORC. Rename the following subject(s):\n"
            for subject, matches in subjects_matches.items():
                for match in matches:
                    errstr += f"{subject} is a substring of {match}\n"

        if errstr:
            raise ae.WORCValueError(errstr)

    def _get_all_substrings_for_array(self, arr):
        # Generate a dict mapping each element to the other elements it is a substring of
        all_matches = {}
        for strcmp in arr:
            matches = [s for s in arr if s != strcmp and strcmp in s]
            if matches:
                all_matches[strcmp] = matches

        return all_matches

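
# Illustrative sketch (not part of the WORC source): the substring check above
# flags identifiers that are contained in other identifiers, which WORC
# currently does not allow. The values below are hypothetical.
#
# v = InvalidLabelsValidator()
# v._get_all_substrings_for_array(['pat_1', 'pat_11', 'pat_2'])
# # -> {'pat_1': ['pat_11']}, so 'pat_1' would need renaming (e.g. to 'pat_01')
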
class ValidatorsFactory:
    @staticmethod
    def factor_validators():
        return [
            SimpleValidator(),
            MinSubjectsValidator(),
            SamplesWarning(),
            InvalidLabelsValidator()
        ]

__all__ = ['ValidatorsFactory']
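
# Illustrative sketch (not part of the WORC source): how the preflight
# validators are typically run before an experiment starts. 'experiment' stands
# in for a configured SimpleWORC-like object and is hypothetical here.
#
# for validator in ValidatorsFactory.factor_validators():
#     validator.do_validation(experiment)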