String Validation

LengthToBeBetween

Check if the string lengths are between the specified range.

If either min_value or max_value is not provided, then the other will be used as the threshold.

If neither min_value nor max_value is provided, then the validation will result in failure.

Parameters:

Name	Type	Description	Default
`column`	`str`	Column to validate.	required
`min_value`	`int \| None`	Minimum value for a column entry length.	`None`
`max_value`	`int \| None`	Maximum value for a column entry length.	`None`
`threshold`	`float`	Threshold for validation. Defaults to 0.0.	`0.0`
`impact`	`Literal['low', 'medium', 'high']`	Impact level of validation. Defaults to "low".	`'low'`

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate string length
>>> df = pd.DataFrame({
...     "username": ["user1", "user2", "user3"],
...     "password": ["pass123", "password", "p@ssw0rd"]
... })
>>>
>>> vd = (
...     Validate(df)
...     .StringValidation.LengthToBeBetween(
...         column="password",
...         min_value=6,
...         max_value=10
...     )
... )
>>> key = "LengthToBeBetween_password"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()

Source code in validoopsie/validation_catalogue/StringValidation/length_to_be_between.py

class LengthToBeBetween(BaseValidation):
    """Check if the string lengths are between the specified range.

    If either `min_value` or `max_value` is not provided, then the other will be
    used as the threshold.

    If neither `min_value` nor `max_value` is provided, then the validation will
    result in failure.

    Args:
        column (str): Column to validate.
        min_value (int | None): Minimum value for a column entry length.
        max_value (int | None): Maximum value for a column entry length.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate string length
        >>> df = pd.DataFrame({
        ...     "username": ["user1", "user2", "user3"],
        ...     "password": ["pass123", "password", "p@ssw0rd"]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .StringValidation.LengthToBeBetween(
        ...         column="password",
        ...         min_value=6,
        ...         max_value=10
        ...     )
        ... )
        >>> key = "LengthToBeBetween_password"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        min_value: int | None = None,
        max_value: int | None = None,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        # Check the bound combination first, before the base class is initialised.
        min_max_arg_check(min_value, max_value)

        super().__init__(column, impact, threshold, **kwargs)
        self.min_value = min_value
        self.max_value = max_value

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        # The trailing space after "range" keeps the two f-string fragments from
        # running together ("range[6, 10]") once they are concatenated.
        return (
            f"The column '{self.column}' has string lengths outside the range "
            f"[{self.min_value}, {self.max_value}]."
        )

    def __call__(self, frame: Frame) -> Frame:
        """Check if the string lengths are between the specified range.

        Args:
            frame (Frame): Frame holding the column to validate.

        Returns:
            Frame: The rows kept by `min_max_filter` (expected to be the entries
                whose length is out of range -- confirm against that helper),
                grouped by the validated column with a `<column>-count` column.

        """
        # Helper column carrying the character length of every entry.
        length_column = f"{self.column}-length"
        transformed_frame = frame.with_columns(
            nw.col(self.column).str.len_chars().alias(length_column),
        )

        return (
            min_max_filter(
                transformed_frame,
                length_column,
                self.min_value,
                self.max_value,
            )
            # One output row per distinct value, with its number of occurrences.
            .group_by(self.column)
            .agg(nw.col(self.column).count().alias(f"{self.column}-count"))
        )

LengthToBeEqualTo

Expect the column entries to be strings with length equal to value.

Parameters:

Name	Type	Description	Default
`column`	`str`	Column to validate.	required
`value`	`int`	The expected value for a column entry length.	required
`threshold`	`float`	Threshold for validation. Defaults to 0.0.	`0.0`
`impact`	`Literal['low', 'medium', 'high']`	Impact level of validation. Defaults to "low".	`'low'`

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate fixed-length codes
>>> df = pd.DataFrame({
...     "country_code": ["US", "UK", "FR"]
... })
>>>
>>> vd = (
...     Validate(df)
...     .StringValidation.LengthToBeEqualTo(
...         column="country_code",
...         value=2
...     )
... )
>>> key = "LengthToBeEqualTo_country_code"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()

Source code in validoopsie/validation_catalogue/StringValidation/length_to_be_equal_to.py

class LengthToBeEqualTo(BaseValidation):
    """Expect every entry of the column to be a string of exactly `value` characters.

    Args:
        column (str): Column to validate.
        value (int): The expected value for a column entry length.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate fixed-length codes
        >>> df = pd.DataFrame({
        ...     "country_code": ["US", "UK", "FR"]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .StringValidation.LengthToBeEqualTo(
        ...         column="country_code",
        ...         value=2
        ...     )
        ... )
        >>> key = "LengthToBeEqualTo_country_code"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        value: int,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        super().__init__(column, impact, threshold, **kwargs)
        # Expected character length of each entry.
        self.value = value

    @property
    def fail_message(self) -> str:
        """Build the message shown in the report when the validation fails."""
        subject = f"The column '{self.column}' has entries with length not "
        expectation = f"equal to {self.value}."
        return subject + expectation

    def __call__(self, frame: Frame) -> Frame:
        """Collect the entries whose length differs from `value`.

        Args:
            frame (Frame): Frame holding the column to validate.

        Returns:
            Frame: The offending entries grouped by the validated column, with a
                `<column>-count` column holding the count per distinct value.

        """
        # Expression that is true for entries of the wrong length.
        wrong_length = nw.col(self.column).str.len_chars() != self.value
        occurrences = nw.col(self.column).count().alias(f"{self.column}-count")

        offending_rows = frame.filter(wrong_length)
        return offending_rows.group_by(self.column).agg(occurrences)

NotPatternMatch

Expect the column entries to be strings that do not match the pattern.

Parameters:

Name	Type	Description	Default
`column`	`str`	The column name.	required
`pattern`	`str`	The pattern expression the column should not match.	required
`threshold`	`float`	Threshold for validation. Defaults to 0.0.	`0.0`
`impact`	`Literal['low', 'medium', 'high']`	Impact level of validation. Defaults to "low".	`'low'`

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate text doesn't contain pattern
>>> df = pd.DataFrame({
...     "comment": ["Great product!", "Normal comment", "Just okay"]
... })
>>>
>>> vd = (
...     Validate(df)
...     .StringValidation.NotPatternMatch(
...         column="comment",
...         pattern=r"password"
...     )
... )
>>> key = "NotPatternMatch_comment"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()

Source code in validoopsie/validation_catalogue/StringValidation/not_pattern_match.py

class NotPatternMatch(BaseValidation):
    """Expect the column entries to be strings that do not match the pattern.

    Args:
        column (str): The column name.
        pattern (str): The pattern expression the column should not match.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of
            validation. Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate text doesn't contain pattern
        >>> df = pd.DataFrame({
        ...     "comment": ["Great product!", "Normal comment", "Just okay"]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .StringValidation.NotPatternMatch(
        ...         column="comment",
        ...         pattern=r"password"
        ...     )
        ... )
        >>> key = "NotPatternMatch_comment"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        pattern: str,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        super().__init__(column, impact, threshold, **kwargs)
        # Pattern that no entry of the column is allowed to contain.
        self.pattern = pattern

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        # This validation fails on entries that DO match the forbidden pattern
        # (see the filter in `__call__`), so the message must say so.
        return (
            f"The column '{self.column}' has entries that match "
            f"the pattern '{self.pattern}'."
        )

    def __call__(self, frame: Frame) -> Frame:
        """Collect the entries that match the forbidden pattern.

        Args:
            frame (Frame): Frame holding the column to validate.

        Returns:
            Frame: The matching (i.e. failing) entries grouped by the validated
                column, with a `<column>-count` column holding the count per
                distinct value.

        """
        return (
            frame.filter(
                # Cast to string so `str.contains` can be applied; the `== True`
                # comparison is applied to the expression, keeping only matches.
                nw.col(self.column).cast(nw.String).str.contains(self.pattern) == True,
            )
            .group_by(self.column)
            .agg(nw.col(self.column).count().alias(f"{self.column}-count"))
        )

PatternMatch

Expect the column entries to be strings that match the pattern.

Parameters:

Name	Type	Description	Default
`column`	`str`	The column name.	required
`pattern`	`str`	The pattern expression the column should match.	required
`threshold`	`float`	Threshold for validation. Defaults to 0.0.	`0.0`
`impact`	`Literal['low', 'medium', 'high']`	Impact level of validation. Defaults to "low".	`'low'`

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate email format
>>> df = pd.DataFrame({
...     "email": ["user1@example.com", "user2@example.com"]
... })
>>>
>>> vd = (
...     Validate(df)
...     .StringValidation.PatternMatch(
...         column="email",
...         pattern=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
...     )
... )
>>> key = "PatternMatch_email"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()

Source code in validoopsie/validation_catalogue/StringValidation/pattern_match.py

class PatternMatch(BaseValidation):
    r"""Expect every entry of the column to be a string that matches the pattern.

    Args:
        column (str): The column name.
        pattern (str): The pattern expression the column should match.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate email format
        >>> df = pd.DataFrame({
        ...     "email": ["user1@example.com", "user2@example.com"]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .StringValidation.PatternMatch(
        ...         column="email",
        ...         pattern=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
        ...     )
        ... )
        >>> key = "PatternMatch_email"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        pattern: str,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        super().__init__(column, impact, threshold, **kwargs)
        # Pattern that every entry of the column is expected to contain.
        self.pattern = pattern

    @property
    def fail_message(self) -> str:
        """Build the message shown in the report when the validation fails."""
        subject = f"The column '{self.column}' has entries that do not match "
        expectation = f"the pattern '{self.pattern}'."
        return subject + expectation

    def __call__(self, frame: Frame) -> Frame:
        """Collect the entries that do not match the pattern.

        Args:
            frame (Frame): Frame holding the column to validate.

        Returns:
            Frame: The non-matching entries grouped by the validated column, with
                a `<column>-count` column holding the count per distinct value.

        """
        # Cast to string so `str.contains` can be applied to the column.
        contains_pattern = nw.col(self.column).cast(nw.String).str.contains(
            self.pattern,
        )
        # The comparison is applied to the expression, keeping only non-matches.
        non_matching = contains_pattern == False
        occurrences = nw.col(self.column).count().alias(f"{self.column}-count")

        offending_rows = frame.filter(non_matching)
        return offending_rows.group_by(self.column).agg(occurrences)