Skip to content

Date Validation

ColumnMatchDateFormat

Check if the values in a column match the date format.

Parameters:

Name Type Description Default
column str

Column to validate.

required
date_format str

Date format to check.

required
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> from validoopsie import Validate
>>>
>>> # Validate dates match format
>>> df = pd.DataFrame({
...     "dates_iso": ["2023-01-01", "2023-02-15", "2023-03-30"],
...     "dates_mixed": ["2023-01-01", "02/15/2023", "2023-03-30"]
... })
>>>
>>> vd = (
...     Validate(df)
...     .DateValidation.ColumnMatchDateFormat(
...         column="dates_iso",
...         date_format="YYYY-mm-dd"
...     )
... )
>>> key = "ColumnMatchDateFormat_dates_iso"
>>> vd.results[key]["result"]["status"]
'Success'
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
>>>
>>> # With threshold allowing some failures
>>> vd2 = (
...     Validate(df)
...     .DateValidation.ColumnMatchDateFormat(
...         column="dates_mixed",
...         date_format="YYYY-mm-dd",
...         threshold=0.4  # Allow 40% failure rate
...     )
... )
>>> key2 = "ColumnMatchDateFormat_dates_mixed"
>>> vd2.results[key2]["result"]["status"]
'Success'
Source code in validoopsie/validation_catalogue/DateValidation/column_match_date_format.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
class ColumnMatchDateFormat(BaseValidation):
    """Check if the values in a column match the date format.

    The format is read as alternating runs of the placeholder characters ``Y``,
    ``m`` and ``d`` and literal separators. Each placeholder run has to be
    matched by exactly as many digits as the run is long, and every other
    character has to appear verbatim. Only the shape of the value is checked,
    not whether it is a real calendar date.

    Args:
        column (str): Column to validate.
        date_format (str): Date format to check.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> from validoopsie import Validate
        >>>
        >>> # Validate dates match format
        >>> df = pd.DataFrame({
        ...     "dates_iso": ["2023-01-01", "2023-02-15", "2023-03-30"],
        ...     "dates_mixed": ["2023-01-01", "02/15/2023", "2023-03-30"]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .DateValidation.ColumnMatchDateFormat(
        ...         column="dates_iso",
        ...         date_format="YYYY-mm-dd"
        ...     )
        ... )
        >>> key = "ColumnMatchDateFormat_dates_iso"
        >>> vd.results[key]["result"]["status"]
        'Success'

        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()
        >>>
        >>> # With threshold allowing some failures
        >>> vd2 = (
        ...     Validate(df)
        ...     .DateValidation.ColumnMatchDateFormat(
        ...         column="dates_mixed",
        ...         date_format="YYYY-mm-dd",
        ...         threshold=0.4  # Allow 40% failure rate
        ...     )
        ... )
        >>> key2 = "ColumnMatchDateFormat_dates_mixed"
        >>> vd2.results[key2]["result"]["status"]
        'Success'

    """

    def __init__(
        self,
        column: str,
        date_format: str,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        # Stored before delegating to the base initializer, as in the original
        # ordering of this constructor.
        self.date_format = date_format
        super().__init__(column, impact, threshold, **kwargs)

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        return (
            f"The column '{self.column}' has values that do not match "
            f"the date format '{self.date_format}'."
        )

    @staticmethod
    def _build_pattern(date_format: str) -> str:
        """Translate a ``Y``/``m``/``d`` date format into an anchored regex."""
        # Tokenize in order of appearance so that a format which starts with a
        # separator still yields its pieces in the right sequence.
        tokens = re.findall(r"[Ymd]+|[^Ymd]+", date_format)
        parts = [
            rf"\d{{{len(token)}}}" if token[0] in "Ymd" else re.escape(token)
            for token in tokens
        ]
        return "^" + "".join(parts) + "$"

    def __call__(self, frame: Frame) -> Frame:
        """Return the non-matching values of the column with their counts.

        Args:
            frame (Frame): Frame holding the column to validate.

        Returns:
            Frame: One row per distinct non-matching value, with the number of
                occurrences in the ``"<column>-count"`` column.

        """
        pattern = self._build_pattern(self.date_format)
        matches = nw.col(self.column).cast(nw.String).str.contains(pattern)
        # Filter on the expression itself instead of a temporary "contains"
        # column, which could overwrite a user column with the same name.
        return (
            frame.filter(matches == False)
            .select(nw.col(self.column))
            .group_by(self.column)
            .agg(nw.col(self.column).count().alias(f"{self.column}-count"))
        )

DateToBeBetween

Check if the column date is between min-max range.

If only one of min_date or max_date is provided, that one is used as the sole bound.

If neither min_date nor max_date is provided, then the validation will result in failure.

Parameters:

Name Type Description Default
column str

Column to validate.

required
min_date date | datetime | None

Minimum allowed date for a column entry.

None
max_date date | datetime | None

Maximum allowed date for a column entry.

None
threshold float

Threshold for validation. Defaults to 0.0.

0.0
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

'low'

Examples:

>>> import pandas as pd
>>> import narwhals as nw
>>> from validoopsie import Validate
>>> from datetime import datetime
>>>
>>> # Validate dates are within range
>>> df = pd.DataFrame({
...     "order_date": [
...         datetime(2023, 1, 15),
...         datetime(2023, 2, 20),
...         datetime(2023, 3, 25)
...     ]
... })
>>>
>>> vd = (
...     Validate(df)
...     .DateValidation.DateToBeBetween(
...         column="order_date",
...         min_date=datetime(2023, 1, 1),
...         max_date=datetime(2023, 12, 31)
...     )
... )
>>> key = "DateToBeBetween_order_date"
>>> vd.results[key]["result"]["status"]
'Success'
>>>
>>> # When calling validate on successful validation there is no error.
>>> vd.validate()
Source code in validoopsie/validation_catalogue/DateValidation/date_to_be_between.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
class DateToBeBetween(BaseValidation):
    """Check if the column date is between min-max range.

    When only one of `min_date` / `max_date` is given, that one is used as the
    sole bound.

    If neither `min_date` nor `max_date` is provided, then the validation will result
    in failure.

    Args:
        column (str): Column to validate.
        min_date (date | datetime | None): Minimum allowed date for a column entry.
        max_date (date | datetime | None): Maximum allowed date for a column entry.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> from validoopsie import Validate
        >>> from datetime import datetime
        >>>
        >>> # Validate dates are within range
        >>> df = pd.DataFrame({
        ...     "order_date": [
        ...         datetime(2023, 1, 15),
        ...         datetime(2023, 2, 20),
        ...         datetime(2023, 3, 25)
        ...     ]
        ... })
        >>>
        >>> vd = (
        ...     Validate(df)
        ...     .DateValidation.DateToBeBetween(
        ...         column="order_date",
        ...         min_date=datetime(2023, 1, 1),
        ...         max_date=datetime(2023, 12, 31)
        ...     )
        ... )
        >>> key = "DateToBeBetween_order_date"
        >>> vd.results[key]["result"]["status"]
        'Success'
        >>>
        >>> # When calling validate on successful validation there is no error.
        >>> vd.validate()

    """

    def __init__(
        self,
        column: str,
        min_date: date | datetime | None = None,
        max_date: date | datetime | None = None,
        impact: Literal["low", "medium", "high"] = "low",
        threshold: float = 0.00,
        **kwargs: KwargsParams,
    ) -> None:
        # The bounds are checked first, before the base class is initialized.
        min_max_arg_check(min_date, max_date)
        super().__init__(column, impact, threshold, **kwargs)

        self.min_date, self.max_date = min_date, max_date

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        lower, upper = self.min_date, self.max_date
        return f"The column '{self.column}' has date range outside [{lower}, {upper}]."

    def __call__(self, frame: Frame) -> Frame:
        """Apply the min/max date filter and count the remaining rows per value.

        Args:
            frame (Frame): Frame holding the column to validate.

        Returns:
            Frame: The rows kept by `min_max_filter`, grouped by the column value,
                with the number of occurrences in the ``"<column>-count"`` column.

        """
        filtered = min_max_filter(
            frame,
            f"{self.column}",
            self.min_date,
            self.max_date,
        )
        occurrences = nw.col(self.column).count().alias(f"{self.column}-count")
        return filtered.group_by(self.column).agg(occurrences)