Skip to content

Value Validation

ColumnValuesToBeBetween

Check if the values in a column are between a range.

If either min_value or max_value is not provided, then the other one will be used as the only bound.

If neither min_value nor max_value is provided, then the validation will result in failure.

Parameters:

Name Type Description Default
column str

Column to validate.

required
min_value float | None

Minimum value for a column entry.

None
max_value float | None

Maximum value for a column entry.

None
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate numeric range
>>> df = pd.DataFrame({
...     "age": [25, 30, 42, 18, 65]
... })
>>>
>>> vd = (
...     Validate(df)
...     .ValuesValidation.ColumnValuesToBeBetween(
...         column="age",
...         min_value=18,
...         max_value=65
...     )
... )
>>> key = "ColumnValuesToBeBetween_age"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/ValuesValidation/column_values_to_be_between.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
class ColumnValuesToBeBetween(BaseValidation):
    """Validate that the values of a column fall within a numeric range.

    The range is described by `min_value` and `max_value`. If either of them is not
    provided, then the other one is used as the only bound.

    If neither `min_value` nor `max_value` is provided, then the validation will result
    in failure.

    Args:
        column (str): Column to validate.
        min_value (float | None): Minimum value for a column entry.
        max_value (float | None): Maximum value for a column entry.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        **kwargs (KwargsParams): Extra keyword arguments, forwarded unchanged to
            `BaseValidation`.

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate numeric range
        >>> df = pd.DataFrame({
        ...     "age": [25, 30, 42, 18, 65]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .ValuesValidation.ColumnValuesToBeBetween(
        ...         column="age",
        ...         min_value=18,
        ...         max_value=65
        ...     )
        ... )
        >>> key = "ColumnValuesToBeBetween_age"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        min_value: float | None = None,
        max_value: float | None = None,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        # The bounds are checked before the base class is initialised.
        min_max_arg_check(min_value, max_value)

        super().__init__(column, impact, threshold, **kwargs)
        self.min_value, self.max_value = min_value, max_value

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        message = (
            f"The column '{self.column}' has values that are not "
            f"between {self.min_value} and {self.max_value}."
        )
        return message

    def __call__(self, frame: Frame) -> Frame:
        """Return the rows selected by `min_max_filter`, counted per distinct value.

        The result has one row per distinct value of the column, together with a
        `<column>-count` column holding the number of occurrences.
        """
        # NOTE(review): presumably `min_max_filter` keeps the out-of-range rows --
        # confirm against its definition.
        selected = min_max_filter(
            frame,
            f"{self.column}",
            self.min_value,
            self.max_value,
        )
        grouped = selected.group_by(self.column)
        occurrences = nw.col(self.column).count().alias(f"{self.column}-count")
        return grouped.agg(occurrences)

ColumnsSumToBeBetween

Check if the sum of columns is between min_sum_value and max_sum_value.

If either min_sum_value or max_sum_value is not provided, then the other one will be used as the only bound.

If neither min_sum_value nor max_sum_value is provided, then the validation will result in failure.

Parameters:

Name Type Description Default
columns_list list[str]

List of columns to sum.

required
min_sum_value float | None

Minimum sum value that columns should be greater than or equal to.

None
max_sum_value float | None

Maximum sum value that columns should be less than or equal to.

None
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate macronutrient sum in range
>>> df = pd.DataFrame({
...     "protein": [26],
...     "fat": [19],
...     "carbs": [0]
... })
>>>
>>> vd = (
...     Validate(df)
...     .ValuesValidation.ColumnsSumToBeBetween(
...         columns_list=["protein", "fat", "carbs"],
...         min_sum_value=30,
...         max_sum_value=50
...     )
... )
>>> key = "ColumnsSumToBeBetween_protein-fat-carbs-combined"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/ValuesValidation/columns_sum_to_be_between.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
class ColumnsSumToBeBetween(BaseValidation):
    """Check if the row-wise sum of columns is between two values.

    The range is described by `min_sum_value` and `max_sum_value`. If either of them
    is not provided, then the other one is used as the only bound.

    If neither `min_sum_value` nor `max_sum_value` is provided, then the validation
    will result in failure.

    Args:
        columns_list (list[str]): List of columns to sum.
        min_sum_value (float | None): Minimum sum value that columns should be greater
            than or equal to.
        max_sum_value (float | None): Maximum sum value that columns should be less than
            or equal to.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        **kwargs (KwargsParams): Extra keyword arguments, forwarded unchanged to
            `BaseValidation`.

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate macronutrient sum in range
        >>> df = pd.DataFrame({
        ...     "protein": [26],
        ...     "fat": [19],
        ...     "carbs": [0]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .ValuesValidation.ColumnsSumToBeBetween(
        ...         columns_list=["protein", "fat", "carbs"],
        ...         min_sum_value=30,
        ...         max_sum_value=50
        ...     )
        ... )
        >>> key = "ColumnsSumToBeBetween_protein-fat-carbs-combined"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        columns_list: list[str],
        min_sum_value: float | None = None,
        max_sum_value: float | None = None,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        # The bounds are checked before any state is stored on the instance.
        min_max_arg_check(min_sum_value, max_sum_value)

        self.columns_list = columns_list
        self.min_sum_value, self.max_sum_value = min_sum_value, max_sum_value
        # Name under which the combination of columns is reported, e.g. "a-b-combined".
        self.column = "-".join(self.columns_list) + "-combined"
        super().__init__(self.column, impact, threshold, **kwargs)

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        message = (
            f"The columns {self.columns_list} are not between {self.min_sum_value} and "
            f"{self.max_sum_value}."
        )
        return message

    def __call__(self, frame: Frame) -> Frame:
        """Return the rows selected by `min_max_filter` on the row-wise sum, with counts.

        Each remaining row is labelled with its input values joined by " - ". The
        result holds one row per distinct label plus a `<column>-count` column with the
        number of occurrences.
        """
        # The helper column name is derived from the input names so that it does not
        # collide with an input column that happens to be called e.g. "sum".
        total_col = "-".join(self.columns_list) + "-sum"
        subset = frame.select(self.columns_list)
        with_totals = subset.with_columns(
            nw.sum_horizontal(self.columns_list).alias(total_col),
        )

        # NOTE(review): presumably `min_max_filter` keeps the rows whose sum is out of
        # range -- confirm against its definition.
        selected = min_max_filter(
            with_totals,
            total_col,
            self.min_sum_value,
            self.max_sum_value,
        )

        row_label = nw.concat_str(
            [nw.col(name) for name in self.columns_list],
            separator=" - ",
        ).alias(self.column)
        grouped = selected.with_columns(row_label).group_by(self.column)
        occurrences = nw.col(self.column).count().alias(f"{self.column}-count")
        return grouped.agg(occurrences)

ColumnsSumToBeEqualTo

Check if the sum of the columns is equal to a specific value.

Parameters:

Name Type Description Default
columns_list list[str]

List of columns to sum.

required
sum_value float

Value that the columns should sum to.

required
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate component sum equals total
>>> df = pd.DataFrame({
...     "hardware": [5000],
...     "software": [3000],
...     "personnel": [12000],
...     "total": [20000]
... })
>>>
>>> vd = (
...     Validate(df)
...     .ValuesValidation.ColumnsSumToBeEqualTo(
...         columns_list=["hardware", "software", "personnel"],
...         sum_value=20000
...     )
... )
>>> key = "ColumnsSumToBeEqualTo_hardware-software-personnel-combined"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/ValuesValidation/columns_sum_to_be_equal_to.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
class ColumnsSumToBeEqualTo(BaseValidation):
    """Check if the row-wise sum of the columns is equal to a specific value.

    Args:
        columns_list (list[str]): List of columns to sum.
        sum_value (float): Value that the columns should sum to.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        **kwargs (KwargsParams): Extra keyword arguments, forwarded unchanged to
            `BaseValidation`.

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate component sum equals total
        >>> df = pd.DataFrame({
        ...     "hardware": [5000],
        ...     "software": [3000],
        ...     "personnel": [12000],
        ...     "total": [20000]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .ValuesValidation.ColumnsSumToBeEqualTo(
        ...         columns_list=["hardware", "software", "personnel"],
        ...         sum_value=20000
        ...     )
        ... )
        >>> key = "ColumnsSumToBeEqualTo_hardware-software-personnel-combined"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()
    """

    def __init__(
        self,
        columns_list: list[str],
        sum_value: float,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        self.columns_list, self.sum_value = columns_list, sum_value
        # Name under which the combination of columns is reported, e.g. "a-b-combined".
        self.column = "-".join(self.columns_list) + "-combined"
        super().__init__(self.column, impact, threshold, **kwargs)

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        message = f"The columns {self.columns_list} do not sum to {self.sum_value}."
        return message

    def __call__(self, frame: Frame) -> Frame:
        """Return the rows whose sum differs from `sum_value`, counted per combination.

        Each mismatching row is labelled with its input values joined by " - ". The
        result holds one row per distinct label plus a `<column>-count` column with the
        number of occurrences.
        """
        # The helper column name is derived from the input names so that it does not
        # collide with an input column that happens to be called e.g. "sum".
        total_col = "-".join(self.columns_list) + "-sum"
        subset = frame.select(self.columns_list)
        with_totals = subset.with_columns(
            nw.sum_horizontal(self.columns_list).alias(total_col),
        )

        # Keep only the rows that violate the expectation.
        mismatching = with_totals.filter(
            nw.col(total_col) != self.sum_value,
        )

        row_label = nw.concat_str(
            [nw.col(name) for name in self.columns_list],
            separator=" - ",
        ).alias(self.column)
        grouped = mismatching.with_columns(row_label).group_by(self.column)
        occurrences = nw.col(self.column).count().alias(f"{self.column}-count")
        return grouped.agg(occurrences)