# Source code for sw_metadata_bot.check_parsing

"""Shared parsing helpers for RSMetacheck check identifiers.

RSMetacheck evaluates each repository against a catalog of checks for metadata
quality. Checks are identified by a code: P#### for Pitfalls (high-priority issues
that indicate missing or invalid metadata) or W#### for Warnings (informational
checks or best-practice recommendations). The #### segment is a 3-4 digit code
within each category.

Example check codes:
- P001: Repository lacks codemeta.json file
- W001: Incomplete metadata field descriptions
- P042: Missing license information

See constants.py for related definitions (CHECK_TYPE_*, CHECK_CODE_REGEX_PATTERN).
"""

import re

from . import constants

# Compiled, case-insensitive matcher for check codes, built from
# ``constants.CHECK_CODE_REGEX_PATTERN``. Code in this module reads
# ``group(1)`` of a match, so the pattern is assumed to define at least one
# capturing group holding the short code -- confirm against constants.py.
CHECK_CODE_PATTERN = re.compile(constants.CHECK_CODE_REGEX_PATTERN, re.IGNORECASE)



[docs]
def get_check_catalog_id(check: dict) -> str:
    """Return full RSMetacheck catalog ID URL for a check when available.

    Preferred source is the new schema key ``assessesIndicator.@id`` when it
    points to the RSMetacheck catalog, i.e. it contains the substring
    ``"catalog"`` and matches ``CHECK_CODE_PATTERN``. For backward
    compatibility, this falls back to the legacy ``pitfall`` key.

    Args:
        check: A single check entry from metacheck output.

    Returns:
        The catalog identifier as a string, or an empty string when neither
        key yields a value. A malformed ``assessesIndicator`` (``None`` or any
        non-dict value) is ignored rather than raising, and a null ``pitfall``
        yields ``""`` instead of the literal string ``"None"``.
    """
    indicator = check.get("assessesIndicator")
    # ``dict.get`` only applies its default when the key is absent; a key that
    # is present with a null or scalar value must be guarded explicitly before
    # it is treated as a mapping.
    raw_id = indicator.get("@id") if isinstance(indicator, dict) else None
    indicator_id = "" if raw_id is None else str(raw_id)

    if "catalog" in indicator_id and CHECK_CODE_PATTERN.search(indicator_id):
        return indicator_id

    legacy_id = check.get("pitfall")
    return "" if legacy_id is None else str(legacy_id)




[docs]
def get_short_check_code(check: dict) -> str:
    """Extract the short check code (for example ``P001`` or ``W004``).

    The catalog identifier of ``check`` is resolved through
    ``get_check_catalog_id`` and searched with ``CHECK_CODE_PATTERN``; the
    first capture group of the match is returned in upper case.

    Args:
        check: A single check entry from metacheck output.

    Returns:
        The upper-cased short code, or an empty string when the entry has no
        catalog identifier or the identifier does not contain a check code.
    """
    catalog_id = get_check_catalog_id(check)
    found = CHECK_CODE_PATTERN.search(catalog_id) if catalog_id else None
    return found.group(1).upper() if found is not None else ""




[docs]
def is_check_reported(check: dict) -> bool:
    """Tell whether metacheck explicitly flagged this check as a finding.

    The decision is based solely on the entry's ``output`` key: its string
    form, compared case-insensitively, must equal ``"true"``. A missing key
    or any other value means the check is not reported.

    Args:
        check: A single check entry from metacheck output.

    Returns:
        ``True`` when ``output`` is a true-like value (``True``, ``"true"``,
        ``"TRUE"``, ...), otherwise ``False``.
    """
    marker = check.get("output")
    if marker is None:
        # Absent (or null) output can never stringify to "true".
        return False
    return str(marker).lower() == "true"




[docs]
def extract_check_ids(checks: list[dict]) -> tuple[list[str], list[str]]:
    """Collect the distinct pitfall and warning codes of reported checks.

    Entries that are not reported (see ``is_check_reported``) or that yield
    no short code (see ``get_short_check_code``) are skipped. Codes starting
    with ``P`` are classified as pitfalls and codes starting with ``W`` as
    warnings; any other code is ignored.

    Args:
        checks: Check entries from metacheck output.

    Returns:
        A ``(pitfall_ids, warning_ids)`` tuple. Each list holds every code
        once, in order of first appearance.
    """
    # Dicts keep insertion order, so their keys double as ordered unique sets.
    pitfalls: dict[str, None] = {}
    warnings: dict[str, None] = {}

    for entry in checks:
        if is_check_reported(entry):
            short_code = get_short_check_code(entry)
            if short_code:
                if short_code.startswith("P"):
                    pitfalls.setdefault(short_code)
                elif short_code.startswith("W"):
                    warnings.setdefault(short_code)

    return list(pitfalls), list(warnings)