# Source code for WORC.validators.preflightcheck

from abc import ABC, abstractmethod

from WORC.processing.label_processing import load_label_csv
import WORC.addexceptions as ae

# Global variables
# Hard lower bound on the subject count: MinSubjectsValidator raises an error
# when simpleworc._num_subjects is below this value.
min_subjects = 10
# Advised number of subjects per predicted label: SamplesWarning multiplies
# this by the number of label names and only prints a warning below the result.
recommended_subjects = 50


class AbstractValidator(ABC):
    """Base class for preflight validators.

    Subclasses implement :meth:`_validate`; callers use :meth:`do_validation`,
    which runs the check and prints a one-line report of the outcome.
    """

    def do_validation(self, *args, **kwargs):
        """Run the validation and print its outcome.

        All positional and keyword arguments are forwarded unchanged to
        :meth:`_validate`. Exceptions raised by ``_validate`` are not caught
        here and propagate to the caller.

        Returns:
            The value returned by ``_validate``, or ``True`` when it
            returned ``None`` (i.e. the check finished without complaint).
        """
        result = self._validate(*args, **kwargs)
        if result is None:
            result = True

        msg = self._generate_detector_message(result)
        if msg:
            print(msg)
        return result

    def _generate_detector_message(self, validated_value):
        """Build the report line printed by :meth:`do_validation`.

        The class name is shown without its ``Validator`` suffix. Class names
        that do not end in ``Validator`` (or consist of nothing else) are
        shown in full rather than being truncated blindly.
        """
        suffix = 'Validator'
        name = self.__class__.__name__
        if name.endswith(suffix) and len(name) > len(suffix):
            name = name[:-len(suffix)]
        return f"{name} validated: {validated_value}."

    @abstractmethod
    def _validate(self, *args, **kwargs):
        """Perform the actual check.

        Implementations raise on failure; returning ``None`` is treated as
        success by :meth:`do_validation`.
        """


class SimpleValidator(AbstractValidator):
    """Check that the minimal experiment configuration is present.

    Settings are looked up under their private name (e.g.
    ``_labels_file_train``) and, as a fallback, under the public name
    (e.g. ``labels_file_train``). The public attributes are treated as
    optional: an object that lacks them is handled as "not set".
    """

    def _validate(self, simpleworc, *args, **kwargs):
        """Validate labels, label names, method and image/segmentation pairs.

        Args:
            simpleworc: object carrying the experiment configuration.
            *args, **kwargs: accepted for interface compatibility; unused.

        Raises:
            WORCValueError: when no label file, no label names or no method
                is configured, or when the subjects of the training images
                and segmentations differ for a modality.
        """
        if not (simpleworc._labels_file_train
                or getattr(simpleworc, 'labels_file_train', None)):
            raise ae.WORCValueError('No labels, use SimpleWorc().labels_from_this_file(**) to add labels.')

        # ``label_names`` may not exist on the object; a missing attribute
        # must yield the helpful error below, not an AttributeError.
        if not (simpleworc._label_names
                or getattr(simpleworc, 'label_names', None)):
            raise ae.WORCValueError('No label(s) to predict selected. Use SimpleWorc().predict_labels(**) to select labels.')

        if not simpleworc._method:
            raise ae.WORCValueError('No method selected. Call function binary_classification(**) or regression(**) or survival(**) on SimpleWorc().')

        if simpleworc._images_train:
            self._check_matching_subjects(simpleworc._images_train,
                                          simpleworc._segmentations_train)

        if getattr(simpleworc, 'images_train', None):
            self._check_matching_subjects(simpleworc.images_train,
                                          simpleworc.segmentations_train)

    @staticmethod
    def _check_matching_subjects(images, segmentations):
        """Raise if any modality has different keys for images and segmentations."""
        for num, (ims, segs) in enumerate(zip(images, segmentations)):
            if ims.keys() != segs.keys():
                raise ae.WORCValueError(f'Subjects in images_train and segmentations_train are not the same for modality {num}.')


class MinSubjectsValidator(AbstractValidator):
    """Reject experiments with fewer subjects than ``min_subjects``."""

    def _validate(self, simpleworc, *args, **kwargs):
        """Raise ``WORCValueError`` when ``simpleworc._num_subjects`` is below ``min_subjects``."""
        too_few_subjects = simpleworc._num_subjects < min_subjects
        if not too_few_subjects:
            return None

        raise ae.WORCValueError(f'Less than {min_subjects} subjects (you have {simpleworc._num_subjects}) will probably make WORC crash due to a split in the test/validation set having only one subject. Use at least {min_subjects} subjects or more.')


class EvaluateValidator(AbstractValidator):
    """Ensure images are available when the evaluation pipeline is enabled."""

    def _validate(self, simpleworc, *args, **kwargs):
        """Raise ``WORCValueError`` if evaluation is requested without training images.

        Images count as provided when either ``_images_train`` or the
        optional public ``images_train`` attribute is non-empty.
        """
        # Nothing to check unless the evaluation pipeline was added.
        if not simpleworc._add_evaluation:
            return None

        if simpleworc._images_train:
            return None

        # The public attribute is optional; a missing one counts as "no images".
        if getattr(simpleworc, 'images_train', None):
            return None

        raise ae.WORCValueError(f'You have added the evaluation pipeline, but have not provided images, which is currently required. We will work on this option in a future release.')


class SamplesWarning(AbstractValidator):
    """Good-practice check on the sample size.

    Not a real validator: it never raises, it only prints a warning.
    """

    def _validate(self, simpleworc, *args, **kwargs):
        """Print a warning when a classification experiment has few subjects."""
        if simpleworc._method != 'classification':
            # TODO @martijn not sure how to tackle this, what would be a
            # reasonable amount of subjects for regression?
            return None

        # ``recommended_subjects`` subjects are advised for every label.
        label_count = len(simpleworc._label_names)
        advised_total = label_count * recommended_subjects
        if simpleworc._num_subjects < advised_total:
            print(f'Warning: at least {advised_total} subjects is recommended when predicting {label_count} labels. Current subject count is: {simpleworc._num_subjects}')


class InvalidLabelsValidator(AbstractValidator):
    """Check the training label file for a valid header and unambiguous names.

    Neither a label name nor a subject name may be a substring of another
    one; every such clash is reported in a single error.
    """

    def _validate(self, simpleworc, *args, **kwargs):
        """Load the label file and check label and subject names.

        Args:
            simpleworc: object carrying the experiment configuration. The
                label file is taken from ``_labels_file_train`` or, if that
                is empty, from the optional ``labels_file_train`` attribute.
            *args, **kwargs: accepted for interface compatibility; unused.

        Raises:
            WORCValueError: when no label file is configured, when the first
                column of the file is not the patient ID column, or when a
                label/subject name is a substring of another one.
            WORCAssertionError: any other assertion failure raised by
                ``load_label_csv`` is passed on unchanged.
        """
        labels_file = (simpleworc._labels_file_train
                       or getattr(simpleworc, 'labels_file_train', None))
        if not labels_file:
            raise ae.WORCValueError('No labels, use SimpleWorc().labels_from_this_file(**) to add labels.')

        try:
            labels, subjects, _ = load_label_csv(labels_file)
        except ae.WORCAssertionError as wae:
            if 'First column should be patient ID' in str(wae):
                # TODO: print wrong column name and file so that it is clear what needs to be replaced in which file
                raise ae.WORCValueError('First column in the file given to SimpleWORC().labels_from_this_file(**) needs to be named Patient.') from wae
            # Any other loading failure must surface as-is: swallowing it
            # would only fail later on the never-assigned labels/subjects.
            raise

        # Collect both reports so a subject clash does not hide a label clash.
        errstr = (
            self._format_matches('label', self._get_all_substrings_for_array(labels))
            + self._format_matches('subject', self._get_all_substrings_for_array(subjects))
        )
        if errstr:
            raise ae.WORCValueError(errstr)

    @staticmethod
    def _format_matches(kind, all_matches):
        """Return the error text for ``all_matches``, or '' when it is empty."""
        if not all_matches:
            return ''

        lines = [f"Found {kind}(s) that are a substring of other {kind}(s). This is currently not allowed in WORC. Rename the following {kind}(s):\n"]
        for name, matches in all_matches.items():
            lines.extend(f"{name} is a substring of {match}\n" for match in matches)
        return ''.join(lines)

    def _get_all_substrings_for_array(self, arr):
        """Map each element of ``arr`` to the other elements that contain it.

        Elements equal to the one being tested are ignored; elements without
        any match are left out of the returned dict.
        """
        all_matches = {}
        for strcmp in arr:
            matches = [s for s in arr if s != strcmp and strcmp in s]
            if matches:
                all_matches[strcmp] = matches

        return all_matches


class ValidatorsFactory:
    """Factory for the preflight validators."""

    @staticmethod
    def factor_validators():
        """Return a new list holding one fresh instance of every validator.

        The list order is the order in which the checks are listed here:
        basic configuration, minimum subject count, sample-size warning,
        evaluation prerequisites and finally the label-file checks.
        """
        return [
            SimpleValidator(),
            MinSubjectsValidator(),
            SamplesWarning(),
            EvaluateValidator(),
            InvalidLabelsValidator(),
        ]


# ``__all__`` entries must be strings naming module attributes; listing the
# class object itself makes ``from ... import *`` fail with a TypeError.
__all__ = ['ValidatorsFactory']