Skip to content

String Validation

LengthToBeBetween

Check if the string lengths are between the specified range.

If only one of min_value or max_value is provided, then the other one will be used as the threshold.

If neither min_value nor max_value is provided, then the validation will result in failure.

Parameters:

Name Type Description Default
column str

Column to validate.

required
min_value int | None

Minimum value for a column entry length.

None
max_value int | None

Maximum value for a column entry length.

None
threshold float

Threshold for validation. Defaults to 0.0.

required
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

required
kwargs

KwargsType (dict): Additional keyword arguments.

{}
Source code in validoopsie/validation_catalogue/StringValidation/length_to_be_between.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
@base_validation_wrapper
class LengthToBeBetween(BaseValidationParameters):
    """Check if the string lengths are between the specified range.

    If only one of `min_value` or `max_value` is provided, then the other one will
    be used as the threshold.

    If neither `min_value` nor `max_value` is provided, then the validation will
    result in failure.

    Parameters:
        column (str): Column to validate.
        min_value (int | None): Minimum value for a column entry length.
        max_value (int | None): Maximum value for a column entry length.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".
        kwargs: KwargsType (dict): Additional keyword arguments.

    """

    def __init__(
        self,
        column: str,
        min_value: int | None = None,
        max_value: int | None = None,
        *args,
        **kwargs,
    ) -> None:
        # Check the bounds before any state is set up on the instance.
        min_max_arg_check(min_value, max_value)

        super().__init__(column, *args, **kwargs)
        self.min_value = min_value
        self.max_value = max_value

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        # The first fragment ends with a space so the range is not glued to "range".
        return (
            f"The column '{self.column}' has string lengths outside the range "
            f"[{self.min_value}, {self.max_value}]."
        )

    def __call__(self, frame: FrameT) -> FrameT | ValueError:
        """Check if the string lengths are between the specified range.

        A helper column holding the character length of every entry is added, the
        frame is passed through `min_max_filter` on that helper column, and the
        remaining rows are counted per distinct value of `column` in a
        `<column>-count` column.
        """
        # Name of the temporary column that carries each entry's length.
        length_column = f"{self.column}-length"

        transformed_frame = frame.with_columns(
            nw.col(self.column).str.len_chars().alias(length_column),
        )

        # NOTE(review): presumably `min_max_filter` keeps the rows whose length
        # falls outside [min_value, max_value] - confirm against the helper.
        return (
            min_max_filter(
                transformed_frame,
                length_column,
                self.min_value,
                self.max_value,
            )
            .group_by(self.column)
            .agg(nw.col(self.column).count().alias(f"{self.column}-count"))
        )

LengthToBeEqualTo

Expect the column entries to be strings with length equal to value.

Parameters:

Name Type Description Default
column str

Column to validate.

required
value int

The expected value for a column entry length.

required
threshold float

Threshold for validation. Defaults to 0.0.

required
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

required
kwargs

KwargsType (dict): Additional keyword arguments.

{}
Source code in validoopsie/validation_catalogue/StringValidation/length_to_be_equal_to.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
@base_validation_wrapper
class LengthToBeEqualTo(BaseValidationParameters):
    """Validate that every entry of a string column has exactly `value` characters.

    Parameters:
        column (str): Column to validate.
        value (int): The expected value for a column entry length.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".
        kwargs: KwargsType (dict): Additional keyword arguments.

    """

    def __init__(self, column: str, value: int, *args, **kwargs) -> None:
        super().__init__(column, *args, **kwargs)
        self.value = value

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        subject = f"The column '{self.column}' has entries with length not "
        expectation = f"equal to {self.value}."
        return subject + expectation

    def __call__(self, frame: FrameT) -> FrameT:
        """Return the entries whose length differs from `value`, with their counts.

        The result has one row per distinct offending value of `column` and a
        `<column>-count` column with the number of occurrences.
        """
        # Boolean expression selecting rows whose character count is not `value`.
        has_wrong_length = nw.col(self.column).str.len_chars() != self.value
        # Aggregation that counts the occurrences of each offending value.
        occurrences = nw.col(self.column).count().alias(f"{self.column}-count")

        offending_rows = frame.filter(has_wrong_length)
        return offending_rows.group_by(self.column).agg(occurrences)

NotPatternMatch

Expect the column entries to be strings that do not match the pattern.

Parameters:

Name Type Description Default
column str

The column name.

required
pattern str

The pattern expression the column should not match.

required
threshold float

Threshold for validation. Defaults to 0.0.

required
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

required
kwargs

KwargsType (dict): Additional keyword arguments.

{}
Source code in validoopsie/validation_catalogue/StringValidation/not_pattern_match.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
@base_validation_wrapper
class NotPatternMatch(BaseValidationParameters):
    """Expect the column entries to be strings that do not match the pattern.

    Parameters:
        column (str): The column name.
        pattern (str): The pattern expression the column should not match.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".
        kwargs: KwargsType (dict): Additional keyword arguments.

    """

    def __init__(
        self,
        column: str,
        pattern: str,
        *args,
        **kwargs,
    ) -> None:
        super().__init__(column, *args, **kwargs)
        self.pattern = pattern

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        # This validation fails on entries that DO match the pattern, so the
        # message must say so (it previously claimed the opposite).
        return (
            f"The column '{self.column}' has entries that match "
            f"the pattern '{self.pattern}'."
        )

    def __call__(self, frame: FrameT) -> FrameT:
        """Return the entries that match the pattern, with their counts.

        The result has one row per distinct offending value of `column` and a
        `<column>-count` column with the number of occurrences.
        """
        return (
            frame.filter(
                # Expression-level comparison, not a Python truth test; kept
                # explicit so the filtering semantics stay unchanged.
                nw.col(self.column).str.contains(self.pattern) == True,  # noqa: E712
            )
            .group_by(self.column)
            .agg(nw.col(self.column).count().alias(f"{self.column}-count"))
        )

PatternMatch

Expect the column entries to be strings that match the pattern.

Parameters:

Name Type Description Default
column str

The column name.

required
pattern str

The pattern expression the column should match.

required
threshold float

Threshold for validation. Defaults to 0.0.

required
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

required
kwargs

KwargsType (dict): Additional keyword arguments.

{}
Source code in validoopsie/validation_catalogue/StringValidation/pattern_match.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
@base_validation_wrapper
class PatternMatch(BaseValidationParameters):
    """Validate that every entry of a string column matches a pattern.

    Parameters:
        column (str): The column name.
        pattern (str): The pattern expression the column should match.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".
        kwargs: KwargsType (dict): Additional keyword arguments.

    """

    def __init__(self, column: str, pattern: str, *args, **kwargs) -> None:
        super().__init__(column, *args, **kwargs)
        self.pattern = pattern

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        subject = f"The column '{self.column}' has entries that do not match "
        detail = f"the pattern '{self.pattern}'."
        return subject + detail

    def __call__(self, frame: FrameT) -> FrameT:
        """Return the entries that fail to match the pattern, with their counts.

        The result has one row per distinct offending value of `column` and a
        `<column>-count` column with the number of occurrences.
        """
        # Expression-level comparison selecting rows where the match is False.
        is_mismatch = nw.col(self.column).str.contains(self.pattern) == False
        # Aggregation that counts the occurrences of each offending value.
        occurrences = nw.col(self.column).count().alias(f"{self.column}-count")

        mismatching_rows = frame.filter(is_mismatch)
        return mismatching_rows.group_by(self.column).agg(occurrences)