Skip to content

Type Validation

TypeCheck

Validate the data type of the column(s).

Parameters:

Name Type Description Default
column str | None

The column to validate.

None
column_type type | None

The expected data type of the column; must be a subclass of `DType`.

None
frame_schema_definition dict[str, type] | None

A dictionary mapping column names to their expected data types (subclasses of `DType`).

None
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>> from narwhals.dtypes import IntegerType, FloatType, String
>>>
>>> # Validate column types
>>> df = pd.DataFrame({
...     "id": [1001, 1002, 1003],
...     "name": ["Alice", "Bob", "Charlie"],
...     "balance": [100.50, 250.75, 0.00]
... })
>>>
>>> vd = (
...     Validate(df)
...     .TypeValidation.TypeCheck(
...         frame_schema_definition={
...             "id": IntegerType,
...             "name": String,
...             "balance": FloatType
...         }
...     )
... )
>>>
>>> key = "TypeCheck_DataTypeColumnValidation"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/TypeValidation/type_check.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
class TypeCheck(BaseValidation):
    """Validate the data type of the column(s).

    The check runs in one of two modes:

    * single column: pass both `column` and `column_type`;
    * whole schema: pass only `frame_schema_definition`.

    Args:
        column (str | None): The column to validate.
        column_type (type | None): The expected data type of the column;
            must be a subclass of `DType`.
        frame_schema_definition (dict[str, type] | None): A dictionary
            mapping column names to their expected data types (subclasses
            of `DType`).
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".

    Raises:
        ValueError: If neither (`column` and `column_type`) nor
            `frame_schema_definition` alone is provided.
        TypeError: If any expected type is not a subclass of `DType`.

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>> from narwhals.dtypes import IntegerType, FloatType, String
        >>>
        >>> # Validate column types
        >>> df = pd.DataFrame({
        ...     "id": [1001, 1002, 1003],
        ...     "name": ["Alice", "Bob", "Charlie"],
        ...     "balance": [100.50, 250.75, 0.00]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .TypeValidation.TypeCheck(
        ...         frame_schema_definition={
        ...             "id": IntegerType,
        ...             "name": String,
        ...             "balance": FloatType
        ...         }
        ...     )
        ... )
        >>>
        >>> key = "TypeCheck_DataTypeColumnValidation"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str | None = None,
        column_type: type | None = None,
        frame_schema_definition: dict[str, type] | None = None,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        # Single validation check
        if column and column_type:
            self.__check_validation_parameter__(column, column_type, DType)
            self.column_type = column_type
            # Normalise to the dictionary form so `__call__` has one code path.
            self.frame_schema_definition = {column: column_type}

        # Multiple validation checks
        elif not column and not column_type and frame_schema_definition:
            # Check that every type inside of the dictionary is actually correct
            for vcolumn, vtype in frame_schema_definition.items():
                self.__check_validation_parameter__(vcolumn, vtype, DType)

            # Synthetic column name used to label the schema-wide check.
            column = "DataTypeColumnValidation"
            self.frame_schema_definition = frame_schema_definition
        else:
            # NOTE: this must be a plain string; a trailing comma inside the
            # parentheses would silently turn it into a 1-tuple.
            error_message = (
                "Either `column` and `column_type` should be provided or "
                "`frame_schema_definition` should be provided."
            )
            raise ValueError(error_message)

        super().__init__(column, impact, threshold, **kwargs)

    def __check_validation_parameter__(
        self,
        column: str,
        column_type: type,
        expected_type: type,
    ) -> None:
        """Check if the validation parameter is correct.

        Args:
            column (str): Column name, used only for the error message.
            column_type (type): The user-supplied expected type.
            expected_type (type): The base class `column_type` must derive from.

        Raises:
            TypeError: If `column_type` is not a class or is not a subclass
                of `expected_type`.

        """
        # Guard with `isinstance(..., type)` first: `issubclass` itself raises
        # an unhelpful TypeError when handed a non-class (e.g. an instance).
        is_valid = isinstance(column_type, type) and issubclass(
            column_type,
            expected_type,
        )
        if not is_valid:
            # Non-class values may not have `__name__`; fall back to repr.
            type_name = getattr(column_type, "__name__", repr(column_type))
            error_message = (
                f"Validation type must be a subclass of {expected_type.__name__}, "
                f"column: {column}, type: {type_name}."
            )
            raise TypeError(error_message)

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        if self.column == "DataTypeColumnValidation":
            return (
                "The data type of the column(s) is not correct. "
                "Please check `frame_schema_definition`."
            )

        return (
            f"The column '{self.column}' has failed the Validation, "
            f"expected type: {self.column_type}."
        )

    def __call__(self, frame: Frame) -> Frame:
        """Validate the data type of the column(s).

        Args:
            frame (Frame): The frame whose schema is checked.

        Returns:
            Frame: A frame with one row per failing column: the column named
            `self.column` holds the failing column names and
            `"{self.column}-count"` holds the literal 1 for each row.

        """
        schema = frame.schema
        # The schema length is stored so it can be used as the frame length
        # for this validation (columns are checked, not rows).
        self.schema_length = schema.len()

        # A column fails when it is missing from the frame's schema or when
        # its dtype is not an instance of the expected type.
        failed_columns = [
            column_name
            for column_name, defined_type in self.frame_schema_definition.items()
            if column_name not in schema
            or not isinstance(schema[column_name], defined_type)
        ]

        return nw.from_native(pa.table({self.column: failed_columns})).with_columns(
            nw.lit(1).alias(f"{self.column}-count"),
        )