Skip to content

String Validation

LengthToBeBetween

Check if the string lengths are between the specified range.

If either min_value or max_value is not provided, the other is used as the only bound.

If neither min_value nor max_value is provided, then the validation will result in failure.

Parameters:

Name Type Description Default
column str

Column to validate.

required
min_value int | None

Minimum value for a column entry length.

None
max_value int | None

Maximum value for a column entry length.

None
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate string length
>>> df = pd.DataFrame({
...     "username": ["user1", "user2", "user3"],
...     "password": ["pass123", "password", "p@ssw0rd"]
... })
>>>
>>> vd = (
...     Validate(df)
...     .StringValidation.LengthToBeBetween(
...         column="password",
...         min_value=6,
...         max_value=10
...     )
... )
>>> key = "LengthToBeBetween_password"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/StringValidation/length_to_be_between.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
class LengthToBeBetween(BaseValidation):
    """Check if the string lengths are between the specified range.

    If either `min_value` or `max_value` is not provided, the other is used as the
    only bound.

    If neither `min_value` nor `max_value` is provided, then the validation will result
    in failure (the arguments are checked by `min_max_arg_check` on construction).

    Args:
        column (str): Column to validate.
        min_value (int | None): Minimum value for a column entry length.
        max_value (int | None): Maximum value for a column entry length.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate string length
        >>> df = pd.DataFrame({
        ...     "username": ["user1", "user2", "user3"],
        ...     "password": ["pass123", "password", "p@ssw0rd"]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .StringValidation.LengthToBeBetween(
        ...         column="password",
        ...         min_value=6,
        ...         max_value=10
        ...     )
        ... )
        >>> key = "LengthToBeBetween_password"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        min_value: int | None = None,
        max_value: int | None = None,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        # Validate the bounds before any base-class setup runs.
        min_max_arg_check(min_value, max_value)

        super().__init__(column, impact, threshold, **kwargs)
        self.min_value = min_value
        self.max_value = max_value

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        # The trailing space after "range" keeps the two adjacent literals from
        # running together ("...the range [min, max].").
        return (
            f"The column '{self.column}' has string lengths outside the range "
            f"[{self.min_value}, {self.max_value}]."
        )

    def __call__(self, frame: Frame) -> Frame:
        """Check if the string lengths are between the specified range.

        Adds a helper column `<column>-length` holding the character count of each
        entry, passes it through `min_max_filter` with the configured bounds, and
        returns the remaining rows grouped by the original column value with their
        occurrence count in `<column>-count`.
        """
        length_column = f"{self.column}-length"
        transformed_frame = frame.with_columns(
            nw.col(self.column).str.len_chars().alias(length_column),
        )

        # NOTE(review): presumably `min_max_filter` keeps only the rows whose
        # length falls outside the bounds - confirm against its definition.
        return (
            min_max_filter(
                transformed_frame,
                length_column,
                self.min_value,
                self.max_value,
            )
            .group_by(self.column)
            .agg(nw.col(self.column).count().alias(f"{self.column}-count"))
        )

LengthToBeEqualTo

Expect the column entries to be strings with length equal to value.

Parameters:

Name Type Description Default
column str

Column to validate.

required
value int

The expected value for a column entry length.

required
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate fixed-length codes
>>> df = pd.DataFrame({
...     "country_code": ["US", "UK", "FR"]
... })
>>>
>>> vd = (
...     Validate(df)
...     .StringValidation.LengthToBeEqualTo(
...         column="country_code",
...         value=2
...     )
... )
>>> key = "LengthToBeEqualTo_country_code"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/StringValidation/length_to_be_equal_to.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
class LengthToBeEqualTo(BaseValidation):
    """Expect every entry of the column to be a string of exactly `value` characters.

    Args:
        column (str): Column to validate.
        value (int): The expected value for a column entry length.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate fixed-length codes
        >>> df = pd.DataFrame({
        ...     "country_code": ["US", "UK", "FR"]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .StringValidation.LengthToBeEqualTo(
        ...         column="country_code",
        ...         value=2
        ...     )
        ... )
        >>> key = "LengthToBeEqualTo_country_code"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        value: int,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        super().__init__(column, impact, threshold, **kwargs)
        # Expected character count for every entry in the column.
        self.value = value

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        subject = f"The column '{self.column}' has entries with length not "
        expectation = f"equal to {self.value}."
        return subject + expectation

    def __call__(self, frame: Frame) -> Frame:
        """Return the entries whose length differs from `value`, with their counts.

        Rows are kept when their character count is not equal to `value`; the
        result is grouped by the column value and carries the number of
        occurrences in `<column>-count`.
        """
        target = nw.col(self.column)
        has_wrong_length = target.str.len_chars() != self.value
        occurrences = target.count().alias(f"{self.column}-count")

        offending_rows = frame.filter(has_wrong_length)
        return offending_rows.group_by(self.column).agg(occurrences)

NotPatternMatch

Expect the column entries to be strings that do not match the pattern.

Parameters:

Name Type Description Default
column str

The column name.

required
pattern str

The pattern expression the column should not match.

required
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate text doesn't contain pattern
>>> df = pd.DataFrame({
...     "comment": ["Great product!", "Normal comment", "Just okay"]
... })
>>>
>>> vd = (
...     Validate(df)
...     .StringValidation.NotPatternMatch(
...         column="comment",
...         pattern=r"password"
...     )
... )
>>> key = "NotPatternMatch_comment"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/StringValidation/not_pattern_match.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
class NotPatternMatch(BaseValidation):
    """Expect the column entries to be strings that do not match the pattern.

    Args:
        column (str): The column name.
        pattern (str): The pattern expression the column should not match.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of
            validation. Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate text doesn't contain pattern
        >>> df = pd.DataFrame({
        ...     "comment": ["Great product!", "Normal comment", "Just okay"]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .StringValidation.NotPatternMatch(
        ...         column="comment",
        ...         pattern=r"password"
        ...     )
        ... )
        >>> key = "NotPatternMatch_comment"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        pattern: str,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        super().__init__(column, impact, threshold, **kwargs)
        self.pattern = pattern

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        # This validation fails when entries DO match the forbidden pattern, so
        # the message must say "match", not "do not match".
        return (
            f"The column '{self.column}' has entries that match "
            f"the pattern '{self.pattern}'."
        )

    def __call__(self, frame: Frame) -> Frame:
        """Return the entries that match the forbidden pattern, with their counts.

        The column is cast to string, rows where the pattern is found are kept,
        and the result is grouped by the column value with the number of
        occurrences in `<column>-count`.
        """
        # NOTE(review): the explicit `== True` is kept on purpose; presumably it
        # normalises null match results into a plain boolean mask - confirm
        # before simplifying.
        return (
            frame.filter(
                nw.col(self.column).cast(nw.String).str.contains(self.pattern) == True,
            )
            .group_by(self.column)
            .agg(nw.col(self.column).count().alias(f"{self.column}-count"))
        )

PatternMatch

Expect the column entries to be strings that match the pattern.

Parameters:

Name Type Description Default
column str

The column name.

required
pattern str

The pattern expression the column should match.

required
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate email format
>>> df = pd.DataFrame({
...     "email": ["user1@example.com", "user2@example.com"]
... })
>>>
>>> vd = (
...     Validate(df)
...     .StringValidation.PatternMatch(
...         column="email",
...         pattern=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
...     )
... )
>>> key = "PatternMatch_email"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/StringValidation/pattern_match.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
class PatternMatch(BaseValidation):
    r"""Expect the column entries to be strings that match the pattern.

    Args:
        column (str): The column name.
        pattern (str): The pattern expression the column should match.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate email format
        >>> df = pd.DataFrame({
        ...     "email": ["user1@example.com", "user2@example.com"]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .StringValidation.PatternMatch(
        ...         column="email",
        ...         pattern=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
        ...     )
        ... )
        >>> key = "PatternMatch_email"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        pattern: str,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        super().__init__(column, impact, threshold, **kwargs)
        # Pattern expression each entry is expected to match.
        self.pattern = pattern

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        subject = f"The column '{self.column}' has entries that do not match "
        detail = f"the pattern '{self.pattern}'."
        return subject + detail

    def __call__(self, frame: Frame) -> Frame:
        """Return the entries that do not match the pattern, with their counts.

        The column is cast to string, rows where the pattern is not found are
        kept, and the result is grouped by the column value with the number of
        occurrences in `<column>-count`.
        """
        target = nw.col(self.column)
        pattern_found = target.cast(nw.String).str.contains(self.pattern)
        occurrences = target.count().alias(f"{self.column}-count")

        # The explicit comparison against False is retained exactly as written.
        non_matching_rows = frame.filter(pattern_found == False)
        return non_matching_rows.group_by(self.column).agg(occurrences)