Skip to content

Type Validation

TypeCheck

Validate the data type of the column(s).

If `column` and `column_type` are not provided, the `frame_schema_definition` dictionary is required in order to validate multiple columns.

Operator can use the generic column data type provided by Validoopsie (e.g. IntegerType) or more specific type provided by Narwhals (e.g. narwhals.Int64).

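For a full list of types refer to:

* Validoopsie Generic Types: https://akmalsoliev.github.io/Validoopsie/typing.html#typing.FloatType
* Narwhals Specific Types: https://narwhals-dev.github.io/narwhals/api-reference/dtypes/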

Example of the `frame_schema_definition`:

from validoopsie.types import IntegerType
import narwhals

{
    "column1": IntegerType,
    "column2": narwhals.Int64,
}

Parameters:

Name Type Description Default
column str | None

The column to validate.

None
column_type type | None

The type of validation to perform.

None
frame_schema_definition dict[str, ValidoopsieType] | None

A dictionary of column names and their respective validation types.

None
threshold float

Threshold for validation. Defaults to 0.0.

required
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

required
kwargs

KwargsType (dict): Additional keyword arguments.

{}
Source code in validoopsie/validation_catalogue/TypeValidation/type_check.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
@base_validation_wrapper
class TypeCheck(BaseValidationParameters):
    """Validate the data type of the column(s).

    Either `column` together with `column_type` must be provided (single column
    check), or `frame_schema_definition` must be provided (multiple column check).

    Operator can use the generic column data type provided by Validoopsie
    (e.g. `IntegerType`) or more specific type provided by Narwhals
    (e.g. `narwhals.Int64`).

    For a full list of types refer to:

    * [Validoopsie Generic Types](https://akmalsoliev.github.io/Validoopsie/typing.html#typing.FloatType)
    * [Narwhals Specific Types](https://narwhals-dev.github.io/narwhals/api-reference/dtypes/)

    Example of the `frame_schema_definition`:

    ```python
    from validoopsie.types import IntegerType
    import narwhals

    {
        "column1": IntegerType,
        "column2": narwhals.Int64,
    }
    ```

    Parameters:
        column (str | None): The column to validate.
        column_type (type | None): The expected data type of `column`.
        frame_schema_definition (dict[str, ValidoopsieType] | None): A dictionary of
            column names and their respective validation types.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".
        kwargs: KwargsType (dict): Additional keyword arguments.

    Raises:
        ValueError: If neither a `column`/`column_type` pair nor a
            `frame_schema_definition` is provided.
        TypeError: If any provided type is not a subclass of `DType`.

    """

    def __init__(
        self,
        column: str | None = None,
        column_type: type | None = None,
        frame_schema_definition: dict[str, type] | None = None,
        *args,
        **kwargs,
    ) -> None:
        # Single validation check.
        # NOTE: when `column` and `column_type` are both given, any
        # `frame_schema_definition` passed alongside them is not used.
        if column and column_type:
            self.__check_validation_parameter__(column, column_type, DType)
            self.column_type = column_type
            self.frame_schema_definition = {column: column_type}

        # Multiple validation checks
        elif not column and not column_type and frame_schema_definition:
            # Check that every type inside of the dictionary is actually valid.
            for name, vtype in frame_schema_definition.items():
                self.__check_validation_parameter__(name, vtype, DType)

            # Placeholder column name used for the multi-column report.
            column = "DataTypeColumnValidation"
            self.frame_schema_definition = frame_schema_definition
        else:
            # Build a plain string: a trailing comma inside the parentheses
            # would turn the message into a one-element tuple.
            error_message = (
                "Either `column` and `column_type` should be provided or "
                "`frame_schema_definition` should be provided."
            )
            raise ValueError(error_message)

        super().__init__(column, *args, **kwargs)

    def __check_validation_parameter__(
        self,
        column: str,
        column_type: type,
        expected_type: type,
    ) -> None:
        """Check if the validation parameter is correct.

        Parameters:
            column (str): Name of the column the type belongs to (used in the
                error message only).
            column_type (type): The type supplied by the operator.
            expected_type (type): The base class `column_type` must derive from.

        Raises:
            TypeError: If `column_type` is not a class or is not a subclass of
                `expected_type`.

        """
        # `issubclass` itself raises an unhelpful TypeError for non-class values
        # (e.g. an instance instead of a class), so guard with `isinstance` first
        # to always report the offending column.
        is_valid = isinstance(column_type, type) and issubclass(
            column_type,
            expected_type,
        )
        if not is_valid:
            type_name = getattr(column_type, "__name__", repr(column_type))
            error_message = (
                f"Validation type must be a subclass of DType, column: {column}, "
                f"type: {type_name}."
            )
            raise TypeError(error_message)

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        # The placeholder column name marks the multi-column (schema) mode.
        if self.column == "DataTypeColumnValidation":
            return (
                "The data type of the column(s) is not correct. "
                "Please check `column_type_definitions`."
            )

        return (
            f"The column '{self.column}' has failed the Validation, "
            f"expected type: {self.column_type}."
        )

    def __call__(self, frame: Frame) -> Frame:
        """Validate the data type of the column(s).

        Every column listed in `frame_schema_definition` is compared against the
        schema of `frame`. A column fails when it is missing from the schema or
        when the defined type is not a subclass of the class of the column's
        actual dtype.

        Parameters:
            frame (Frame): The frame whose schema is validated.

        Returns:
            Frame: A frame in the same native namespace as `frame`, with the
                names of the failed columns under `self.column` and a
                `"<self.column>-count"` column holding the literal 1.

        """
        schema = frame.schema
        # Introduction of a new structure where the schema length is used as the
        # frame length. NOTE(review): attribute spelling kept as-is; presumably
        # it is read outside of this class - confirm before renaming.
        self.schema_lenght = schema.len()
        failed_columns = []
        for column_name, defined_type in self.frame_schema_definition.items():
            # Open question from the original author: should a missing column
            # raise instead of being reported as failed?
            if column_name not in schema:
                failed_columns.append(column_name)
                continue

            column_type = schema[column_name]

            if not issubclass(defined_type, column_type.__class__):
                failed_columns.append(column_name)

        native_namespace = nw.get_native_namespace(frame)
        return nw.from_dict(
            {self.column: failed_columns},
            native_namespace=native_namespace,
        ).with_columns(
            nw.lit(1).alias(f"{self.column}-count"),
        )