Skip to content

Date Validation

ColumnMatchDateFormat

Check if the values in a column match the date format.

Parameters:

Name Type Description Default
column str

Column to validate.

required
date_format str

Date format to check.

required
threshold float

Threshold for validation. Defaults to 0.0.

required
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

required
kwargs KwargsType

KwargsType (dict): Additional keyword arguments.

{}
Source code in validoopsie/validation_catalogue/DateValidation/column_match_date_format.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
@base_validation_wrapper
class ColumnMatchDateFormat(BaseValidationParameters):
    """Check if the values in a column match the date format.

    The format is a layout string, not a ``strptime`` pattern: every run of
    the characters ``Y``, ``m`` and ``d`` stands for exactly that many digits,
    and every other character is matched literally. For example
    ``"YYYY-mm-dd"`` accepts four digits, a dash, two digits, a dash and two
    digits. Only the shape of the value is checked, not whether it is a
    valid calendar date.

    Parameters:
        column (str): Column to validate.
        date_format (str): Date format to check.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".
        kwargs: KwargsType (dict): Additional keyword arguments.

    """

    def __init__(
        self,
        column: str,
        date_format: str,
        *args,
        **kwargs: KwargsType,
    ) -> None:
        # Extra positional/keyword arguments are forwarded untouched to the
        # base class.
        self.date_format = date_format
        super().__init__(column, *args, **kwargs)

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        return (
            f"The column '{self.column}' has values that do not match "
            f"the date format '{self.date_format}'."
        )

    def __call__(self, frame: FrameT) -> FrameT:
        """Return the non-matching values of the column with their counts.

        The result has one row per distinct value that does not match the
        format, with the number of occurrences in ``"<column>-count"``.
        """
        # Tokenize in a single pass so date parts and separators keep the
        # order they have in the format, including a leading separator.
        tokens = re.findall(r"[Ymd]+|[^Ymd]+", self.date_format)
        pattern_parts = [
            rf"\d{{{len(token)}}}" if token[0] in "Ymd" else re.escape(token)
            for token in tokens
        ]

        # Anchor both ends so the whole value has to match the layout.
        pattern = "^" + "".join(pattern_parts) + "$"
        exp = nw.col(self.column).str.contains(pattern).alias("contains")
        return (
            frame.with_columns(exp)
            # Explicit comparison kept on purpose: negating with `~` may treat
            # null results differently depending on the backend.
            .filter(nw.col("contains") == False)  # noqa: E712
            .select(nw.col(self.column))
            .group_by(self.column)
            .agg(nw.col(self.column).count().alias(f"{self.column}-count"))
        )

DateToBeBetween

Check if the column date is between min-max range.

If only one of min_date or max_date is provided, then the other will be used as the threshold.

If neither min_date nor max_date is provided, then the validation will result in failure.

Parameters:

Name Type Description Default
column str

Column to validate.

required
min_date date | datetime | None

Minimum date allowed for a column entry.

None
max_date date | datetime | None

Maximum date allowed for a column entry.

None
threshold float

Threshold for validation. Defaults to 0.0.

required
impact Literal['low', 'medium', 'high']

Impact level of validation. Defaults to "low".

required
kwargs

KwargsType (dict): Additional keyword arguments.

{}
Source code in validoopsie/validation_catalogue/DateValidation/date_to_be_between.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
@base_validation_wrapper
class DateToBeBetween(BaseValidationParameters):
    """Check if the column date is between min-max range.

    If only one of `min_date` or `max_date` is provided, then the other will
    be used as the threshold.

    If neither `min_date` nor `max_date` is provided, then the validation will result
    in failure.

    Parameters:
        column (str): Column to validate.
        min_date (date | datetime | None): Minimum date allowed for a column entry.
        max_date (date | datetime | None): Maximum date allowed for a column entry.
        threshold (float, optional): Threshold for validation. Defaults to 0.0.
        impact (Literal["low", "medium", "high"], optional): Impact level of validation.
            Defaults to "low".
        kwargs: KwargsType (dict): Additional keyword arguments.

    """

    def __init__(
        self,
        column: str,
        min_date: date | datetime | None = None,
        max_date: date | datetime | None = None,
        *args,
        **kwargs,
    ) -> None:
        # Check the bounds before the base class is initialised, so the
        # check runs first if it rejects them.
        min_max_arg_check(min_date, max_date)

        super().__init__(column, *args, **kwargs)
        self.min_date = min_date
        self.max_date = max_date

    @property
    def fail_message(self) -> str:
        """Return the fail message, that will be used in the report."""
        return (
            f"The column '{self.column}' has date range outside "
            f"[{self.min_date}, {self.max_date}]."
        )

    def __call__(self, frame: FrameT) -> FrameT | ValueError:
        """Check if the dates in the column are within the specified range.

        The frame is passed through ``min_max_filter`` with the configured
        bounds, and the remaining rows are grouped by column value with the
        number of occurrences in ``"<column>-count"``.
        """
        return (
            min_max_filter(
                frame,
                self.column,
                self.min_date,
                self.max_date,
            )
            .group_by(self.column)
            .agg(nw.col(self.column).count().alias(f"{self.column}-count"))
        )