Skip to content

Unique Validation

ColumnUniquePair

Validates the uniqueness of combined values from multiple columns.

This class checks if the combination of values from specified columns creates unique entries in the dataset. For example, if checking columns ['first_name', 'last_name'], the combination of these values should be unique for each row.

Parameters:

Name Type Description Default
column_list list | tuple

List or tuple of column names to check for unique combinations.

required
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate unique pairs
>>> df = pd.DataFrame({
...     "student_id": [101, 102, 103],
...     "course_id": [201, 202, 203],
... })
>>>
>>> vd = (
...     Validate(df)
...     .UniqueValidation.ColumnUniquePair(
...         column_list=["student_id", "course_id"]
...     )
... )
>>> key = "ColumnUniquePair_student_id - course_id"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/UniqueValidation/column_unique_pair.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
class ColumnUniquePair(BaseValidation):
    """Validates the uniqueness of combined values from multiple columns.

    This class checks if the combination of values from specified columns creates unique
    entries in the dataset. For example, if checking columns ['first_name', 'last_name'],
    the combination of these values should be unique for each row.

    Args:
        column_list (list | tuple): List or tuple of column names to check for
            unique combinations. Must contain at least one column name.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of
            validation. Defaults to "low".

    Raises:
        AssertionError: If `column_list` is empty.

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate unique pairs
        >>> df = pd.DataFrame({
        ...     "student_id": [101, 102, 103],
        ...     "course_id": [201, 202, 203],
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .UniqueValidation.ColumnUniquePair(
        ...         column_list=["student_id", "course_id"]
        ...     )
        ... )
        >>> key = "ColumnUniquePair_student_id - course_id"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column_list: list[str] | tuple[str, ...],
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped under `python -O`. AssertionError is kept so callers that
        # already catch it keep working. The message now matches the check:
        # the guard only rejects an empty list.
        if len(column_list) == 0:
            msg = "At least one column is required."
            raise AssertionError(msg)

        self.column_list = column_list
        # The joined name doubles as the reported column name and as the name
        # of the temporary concatenated column built in `__call__`.
        column = " - ".join(column_list)
        super().__init__(column, impact, threshold, **kwargs)

    @property
    def fail_message(self) -> str:
        """Return a descriptive message when the validation fails."""
        return (
            f"Duplicate entries found: The combination of columns [{self.column}] "
            "contains non-unique values."
        )

    def __call__(self, frame: Frame) -> Frame:
        """Return the column combinations that occur more than once.

        The values of every column in `column_list` are concatenated into a single
        string column named `self.column`, the frame is grouped by that column, and
        only groups with more than one row are kept.

        Args:
            frame (Frame): The frame to validate.

        Returns:
            Frame: One row per duplicated combination, with the combined value in
            `self.column` and its number of occurrences in `"<self.column>-count"`.

        """
        # NOTE(review): distinct value tuples can collide once joined into one
        # string (e.g. values that themselves contain " - "), and null handling
        # depends on the backend's `concat_str` semantics -- confirm if relevant.
        return (
            frame.with_columns(
                nw.concat_str(
                    [nw.col(col) for col in self.column_list],
                    separator=" - ",
                ).alias(self.column),
            )
            .group_by(self.column)
            .agg(nw.len().alias(f"{self.column}-count"))
            .filter(nw.col(f"{self.column}-count") > 1)
        )

ColumnUniqueValueCountToBeBetween

Check the number of unique values in a column to be between min and max.

If either min_value or max_value is not provided, then the other will be used as the threshold.

If neither min_value nor max_value is provided, then the validation will result in failure.

Parameters:

Name Type Description Default
column str

The column to validate.

required
min_value int or None

The minimum number of unique values allowed.

None
max_value int or None

The maximum number of unique values allowed.

None
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate number of unique values
>>> df = pd.DataFrame({
...     "category": ["A", "B", "C", "A", "B"]
... })
>>>
>>> vd = (
...     Validate(df)
...     .UniqueValidation.ColumnUniqueValueCountToBeBetween(
...         column="category",
...         min_value=1,
...         max_value=5
...     )
... )
>>> key = "ColumnUniqueValueCountToBeBetween_category"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/UniqueValidation/column_unique_value_count_to_be_between.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
class ColumnUniqueValueCountToBeBetween(BaseValidation):
    """Check the number of unique values in a column to be between min and max.

    When only one of `min_value` / `max_value` is supplied, the supplied one is
    used as the threshold.

    Supplying neither `min_value` nor `max_value` makes the validation result in
    failure.

    Args:
        column (str): The column to validate.
        min_value (int or None): The minimum number of unique values allowed.
        max_value (int or None): The maximum number of unique values allowed.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate number of unique values
        >>> df = pd.DataFrame({
        ...     "category": ["A", "B", "C", "A", "B"]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .UniqueValidation.ColumnUniqueValueCountToBeBetween(
        ...         column="category",
        ...         min_value=1,
        ...         max_value=5
        ...     )
        ... )
        >>> key = "ColumnUniqueValueCountToBeBetween_category"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        min_value: int | None = None,
        max_value: int | None = None,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        # The bounds are checked before the base class is initialised.
        min_max_arg_check(min_value, max_value)

        super().__init__(column, impact, threshold, **kwargs)
        self.min_value, self.max_value = min_value, max_value

    @property
    def fail_message(self) -> str:
        """Build the failure text shown in the report."""
        lower, upper = self.min_value, self.max_value
        return (
            f"The column '{self.column}' has a number of unique values that "
            f"is not between {lower} and {upper}."
        )

    def __call__(self, frame: Frame) -> Frame:
        """Count the entries per distinct value and apply the min/max bounds.

        The frame is grouped by `self.column`, each group's entries are counted
        into `"<self.column>-count"`, and the grouped frame is handed to
        `min_max_filter` together with the configured bounds.

        Args:
            frame (Frame): The frame to validate.

        Returns:
            Frame: Whatever `min_max_filter` returns for the grouped counts
            (presumably the groups whose count is outside the bounds -- confirm
            against the helper).

        """
        count_column = f"{self.column}-count"
        per_value_counts = frame.group_by(self.column).agg(
            nw.col(self.column).count().alias(count_column),
        )

        return min_max_filter(
            per_value_counts,
            count_column,
            self.min_value,
            self.max_value,
        )

ColumnUniqueValuesToBeInList

Check if the unique values are in the list.

Parameters:

Name Type Description Default
column str

Column to validate.

required
values list[Union[str, float, int, None]]

List of values to check.

required
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate values in allowed list
>>> df = pd.DataFrame({
...     "status": ["active", "inactive", "pending"]
... })
>>>
>>> vd = (
...     Validate(df)
...     .UniqueValidation.ColumnUniqueValuesToBeInList(
...         column="status",
...         values=["active", "inactive", "pending"]
...     )
... )
>>> key = "ColumnUniqueValuesToBeInList_status"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/UniqueValidation/column_unique_values_to_be_in_list.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
class ColumnUniqueValuesToBeInList(BaseValidation):
    """Check if the unique values are in the list.

    Args:
        column (str): Column to validate.
        values (list[Union[str, float, int, None]]): List of values to check.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of
            validation. Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate values in allowed list
        >>> df = pd.DataFrame({
        ...     "status": ["active", "inactive", "pending"]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .UniqueValidation.ColumnUniqueValuesToBeInList(
        ...         column="status",
        ...         values=["active", "inactive", "pending"]
        ...     )
        ... )
        >>> key = "ColumnUniqueValuesToBeInList_status"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        values: list[str | int | float | None],
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        super().__init__(column, impact, threshold, **kwargs)
        self.values = values

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        return f"The column '{self.column}' has unique values that are not in the list."

    def __call__(self, frame: Frame) -> Frame:
        """Return the distinct values of the column that are not in `values`.

        The frame is grouped by `self.column`, each group's entries are counted
        into `"<self.column>-count"`, and only the groups whose value is not a
        member of `self.values` are kept.

        Args:
            frame (Frame): The frame to validate.

        Returns:
            Frame: One row per distinct value that is not in `self.values`,
            together with its count.

        """
        return (
            frame.group_by(self.column)
            .agg(nw.col(self.column).count().alias(f"{self.column}-count"))
            # `~` is the boolean negation of the expression; it replaces the
            # non-idiomatic `== False` comparison.
            .filter(~nw.col(self.column).is_in(self.values))
        )