Validate the data type of the column(s).
If column and column_type are not provided, then the frame_schema_definition
dictionary is required in order to validate multiple columns.
The operator can use a generic column data type provided by Validoopsie
(e.g. IntegerType) or a more specific type provided by Narwhals
(e.g. narwhals.Int64).
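For a full list of types refer to:
* Validoopsie Generic Types: https://akmalsoliev.github.io/Validoopsie/typing.html#typing.FloatType
* Narwhals Specific Types: https://narwhals-dev.github.io/narwhals/api-reference/dtypes/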
Example of the frame_schema_definition:
from validoopsie.types import IntegerType
import narwhals
{
"column1": IntegerType,
"column2": narwhals.Int64,
}
Parameters:
Name |
Type |
Description |
Default |
column
|
str | None
|
The column to validate.
|
None
|
column_type
|
type | None
|
The type of validation to perform.
|
None
|
frame_schema_definition
|
dict[str, ValidoopsieType] | None
|
A dictionary of
column names and their respective validation types.
|
None
|
threshold
|
float
|
Threshold for validation. Defaults to 0.0.
|
0.0
|
impact
|
Literal['low', 'medium', 'high']
|
Impact level of validation.
Defaults to "low".
|
'low'
|
kwargs
|
KwargsType (dict)
|
Additional keyword arguments.
|
{}
|
Source code in validoopsie/validation_catalogue/TypeValidation/type_check.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136 | @base_validation_wrapper
class TypeCheck(BaseValidationParameters):
    """Validate the data type of the column(s).

    Two mutually exclusive ways of configuring the check are supported:

    * a single `column` together with its `column_type`, or
    * a `frame_schema_definition` dictionary mapping several column names to
      their expected types (in which case `column` and `column_type` must be
      left unset).

    Operator can use the generic column data type provided by Validoopsie
    (e.g. `IntegerType`) or more specific type provided by Narwhals
    (e.g. `narwhals.Int64`).

    For a full list of types refer to:

    * [Validoopsie Generic Types](https://akmalsoliev.github.io/Validoopsie/typing.html#typing.FloatType)
    * [Narwhals Specific Types](https://narwhals-dev.github.io/narwhals/api-reference/dtypes/)

    Example of the `frame_schema_definition`:

    ```python
    from validoopsie.types import IntegerType
    import narwhals

    {
        "column1": IntegerType,
        "column2": narwhals.Int64,
    }
    ```

    Parameters:
        column (str | None): The column to validate.
        column_type (type | None): The type of validation to perform.
        frame_schema_definition (dict[str, ValidoopsieType] | None): A dictionary of
            column names and their respective validation types.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".
        kwargs: KwargsType (dict): Additional keyword arguments.

    Raises:
        ValueError: If neither a (`column`, `column_type`) pair nor a
            `frame_schema_definition` is provided, or if they are mixed.
        TypeError: If any supplied type is not a subclass of `DType`.

    """

    def __init__(
        self,
        column: str | None = None,
        column_type: type | None = None,
        frame_schema_definition: dict[str, type] | None = None,
        *args,
        **kwargs,
    ) -> None:
        # Single validation check
        if column and column_type:
            self.__check_validation_parameter__(column, column_type, DType)
            self.column_type = column_type
            # Normalise to the dictionary form so `__call__` has one code path.
            self.frame_schema_definition = {column: column_type}
        # Multiple validation checks
        elif not column and not column_type and frame_schema_definition:
            # Check that every type inside of the dictionary is actually correct.
            for name, vtype in frame_schema_definition.items():
                self.__check_validation_parameter__(name, vtype, DType)
            # Sentinel column name; `fail_message` and `__call__` key off it.
            column = "DataTypeColumnValidation"
            self.frame_schema_definition = frame_schema_definition
        else:
            # Plain string (no trailing comma) so the exception carries a
            # readable message rather than a one-element tuple.
            error_message = (
                "Either `column` and `column_type` should be provided or "
                "`frame_schema_definition` should be provided."
            )
            raise ValueError(error_message)

        # Remaining positional/keyword arguments are forwarded unchanged to
        # the base class.
        super().__init__(column, *args, **kwargs)

    def __check_validation_parameter__(
        self,
        column: str,
        column_type: type,
        expected_type: type,
    ) -> None:
        """Check if the validation parameter is correct.

        Parameters:
            column (str): Column name, used only for the error message.
            column_type (type): The type supplied by the caller.
            expected_type (type): The class `column_type` must derive from.

        Raises:
            TypeError: If `column_type` is not a class, or is a class that is
                not a subclass of `expected_type`.

        """
        # `issubclass` itself raises an opaque TypeError for non-class
        # arguments (e.g. a dtype instance), so guard with `isinstance` first
        # and report both cases with the same descriptive message.
        is_valid = isinstance(column_type, type) and issubclass(
            column_type,
            expected_type,
        )
        if not is_valid:
            type_name = getattr(column_type, "__name__", repr(column_type))
            error_message = (
                f"Validation type must be a subclass of {expected_type.__name__}, "
                f"column: {column}, "
                f"type: {type_name}."
            )
            raise TypeError(error_message)

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        if self.column == "DataTypeColumnValidation":
            # NOTE(review): this text still refers to `column_type_definitions`
            # while the parameter is named `frame_schema_definition`; left
            # unchanged because the message is emitted into reports.
            return (
                "The data type of the column(s) is not correct. "
                "Please check `column_type_definitions`."
            )
        return (
            f"The column '{self.column}' has failed the Validation, "
            f"expected type: {self.column_type}."
        )

    def __call__(self, frame: Frame) -> Frame:
        """Validate the data type of the column(s).

        Parameters:
            frame (Frame): The frame whose schema is checked.

        Returns:
            Frame: A frame built in the native namespace of `frame` with one
                row per failed column name (under `self.column`) and a
                `"<self.column>-count"` column holding the literal 1.

        """
        schema = frame.schema
        # Introduction of a new structure where the schema len will be used a frame length
        # (attribute name kept as-is; presumably read outside this class — verify).
        self.schema_lenght = schema.len()

        failed_columns = []
        for column_name, defined_type in self.frame_schema_definition.items():
            # Should this be raised or not?
            # A column missing from the schema is currently reported as failed.
            if column_name not in schema:
                failed_columns.append(column_name)
                continue

            column_type = schema[column_name]
            # Passes when the defined type is the class of the schema entry
            # or a subclass of it.
            if not issubclass(defined_type, column_type.__class__):
                failed_columns.append(column_name)

        native_namespace = nw.get_native_namespace(frame)
        return nw.from_dict(
            {self.column: failed_columns},
            native_namespace=native_namespace,
        ).with_columns(
            nw.lit(1).alias(f"{self.column}-count"),
        )
|