Source code for ntia_conformance_checker.base_checker

# SPDX-FileCopyrightText: 2024-2025 SPDX contributors
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

"""Base checking functionality."""

from __future__ import annotations

import json
import logging
import os
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, cast

from spdx_python_model import v3_0_1 as spdx3  # type: ignore # import-untyped
from spdx_tools.spdx.model.relationship import RelationshipType
from spdx_tools.spdx.model.spdx_no_assertion import SpdxNoAssertion
from spdx_tools.spdx.parser import parse_anything
from spdx_tools.spdx.parser.error import SPDXParsingError
from spdx_tools.spdx.validation.document_validator import validate_full_spdx_document

from .constants import DEFAULT_SBOM_SPEC
from .report import (
    ReportContext,
    get_validation_messages_json,
    report_html,
    report_text,
)
from .spdx3_utils import (
    get_boms_from_spdx_document,
    get_packages_from_bom,
    iter_objects_with_property,
    iter_relationships_by_type,
    validate_spdx3_data,
)

if TYPE_CHECKING:
    from spdx_tools.spdx.model.document import Document
    from spdx_tools.spdx.validation.validation_message import ValidationMessage


# pylint: disable=too-many-instance-attributes
class BaseChecker(ABC):
    """Base class for all compliance/conformance checkers.

    This base class contains methods for common tasks like file parsing and
    information extraction from the SBOM.

    Subclasses must implement the abstract method `check_compliance`.
    Reporting methods such as `output_json` have a default implementation in
    this class and may be overridden.
    """

    # Minimum elements/baseline attributes required by a compliance standard.
    # Entries are matched against the keys of _COMPONENTS_WITHOUT_INFO.
    MIN_ELEMENTS: list[str] = []

    # Mapping of components without information
    # SBOM component name: (name of the instance attribute holding the list of
    # components missing the info, human-readable label)
    _COMPONENTS_WITHOUT_INFO = {
        "name": ("components_without_names", "Components missing a name"),
        "version": ("components_without_versions", "Components missing a version"),
        "identifier": (
            "components_without_identifiers",
            "Components missing an identifier",
        ),
        "supplier": ("components_without_suppliers", "Components missing a supplier"),
        "concluded_license": (
            "components_without_concluded_licenses",
            "Components missing a concluded license",
        ),
        "copyright_text": (
            "components_without_copyright_texts",
            "Components missing a copyright text",
        ),
    }

    compliance_standard: str = ""  # fsct3-min, ntia
    sbom_spec: str = ""  # spdx2, spdx3
    # These are detectable by spdx-tools, so not needed for now.
    # file_format: str = ""  # json, rdf-xml, tag-value, yaml, xml
    file: str = ""

    # For SPDX 3, we have to use SHACLObjectSet instead of SpdxDocument,
    # because we need access to relationships and other elements that are not
    # accessible from SpdxDocument.
    doc: Document | spdx3.SHACLObjectSet | None = None
    # The double leading underscore triggers Python name mangling
    # (_BaseChecker__spdx3_doc), so this cache is private to this class.
    __spdx3_doc: spdx3.SpdxDocument | None = None  # cached SPDX 3 document

    # NOTE: the list defaults below are class-level objects shared by all
    # instances until an instance rebinds them. __init__ rebinds
    # parsing_error and validation_messages for every instance.
    parsing_error: list[str] = []
    validation_messages: list[ValidationMessage] = []
    sbom_name: str = ""

    # Lists of components missing required information.
    # Each item is a tuple of (component name, component SPDX ID).
    components_without_names: list[tuple[str, str]] = []
    components_without_versions: list[tuple[str, str]] = []
    components_without_suppliers: list[tuple[str, str]] = []
    components_without_identifiers: list[tuple[str, str]] = []
    components_without_concluded_licenses: list[tuple[str, str]] = []
    components_without_copyright_texts: list[tuple[str, str]] = []

    doc_version: bool = False  # Has SPDX document version?
    doc_author: bool = False  # Has SPDX document author?
    doc_timestamp: bool = False  # Has SPDX document creation timestamp?
    # Result of check_dependency_relationships().
    dependency_relationships: bool = False  # Has DESCRIBES relationship?
    compliant: bool = False  # Is SBOM compliant with the chosen standard?
    # An alias of "compliant", for backward compatibility.
    # NOTE: this copies the value of `compliant` once, when the class body is
    # executed (False). It does not follow later assignments to `compliant`;
    # code that updates one must update the other explicitly.
    ntia_minimum_elements_compliant: bool = compliant
@abstractmethod
def check_compliance(self) -> bool:
    """Decide whether the SBOM satisfies the checker's compliance standard.

    Concrete checkers must override this method.

    Returns:
        bool: True if the SBOM is compliant/conformant, False otherwise.

    Raises:
        NotImplementedError: Always, in this abstract base implementation.
    """
    raise NotImplementedError
def __init__( self, file: str, validate: bool = True, compliance: str = "", sbom_spec: str = DEFAULT_SBOM_SPEC, ) -> None: """ Initialize the BaseChecker. Args: file (str): The name of the file to be checked. validate (bool): Whether to validate the file. compliance (str): The compliance standard to be used. sbom_spec (str): The SBOM specification to be used. """ self.compliance_standard = compliance self.sbom_spec = sbom_spec # self.file_format = "" self.file = file # Make sure the logs are instance variables and not class variables # to avoid shared state between instances. self.parsing_error = [] self.validation_messages = [] match sbom_spec: case "spdx2": self.doc = self.parse_file() case "spdx3": object_set = self.parse_spdx3_file() if not object_set: logging.error("Failed to parse the SPDX 3 file.") else: self.doc = object_set _doc, _validation_messages = validate_spdx3_data(object_set) if not _doc or _validation_messages: logging.error("SpdxDocument not found or invalid.") self.__spdx3_doc = _doc # cache the extracted SpdxDocument self.validation_messages.extend(_validation_messages) case _: # We can add a heuristic to detect the spec from the file content here, # in case sbom_spec is not provided or invalid. 
raise ValueError(f"Unsupported SBOM specification: {sbom_spec}") if self.doc: if validate: if sbom_spec == "spdx2": self.doc = cast("Document", self.doc) self.validation_messages = validate_full_spdx_document(self.doc) else: pass self.sbom_name = self.get_sbom_name() self.doc_version = self.check_doc_version() self.doc_author = self.check_author() self.doc_timestamp = self.check_timestamp() self.dependency_relationships = self.check_dependency_relationships() self.components_without_names = self.get_components_without_names() self.components_without_versions = self.get_components_without_versions() self.components_without_suppliers = self.get_components_without_suppliers() self.components_without_identifiers = ( self.get_components_without_identifiers() ) self.components_without_concluded_licenses = ( self.get_components_without_concluded_licenses() ) self.components_without_copyright_texts = ( self.get_components_without_copyright_texts() ) # List of (info_name, components) tuples, # where components is a list of (component_name, spdx_id) tuples self.all_components_without_info: list[ tuple[str, list[tuple[str, str]]] ] = self._get_all_components_without_info() self.table_elements: list[tuple[str, bool]] = []
def check_doc_version(self) -> bool:
    """Report whether the document declares a specification version.

    Returns:
        bool: True if `get_doc_spec_version` yields a non-empty value.
    """
    return bool(self.get_doc_spec_version())
def check_author(self) -> bool:
    """Report whether the SBOM names at least one author/creator.

    Returns:
        bool: True if the creation information lists a creator
        (`creators` for SPDX 2, `createdBy` for SPDX 3), False otherwise
        or when no document is loaded.
    """
    if not self.doc:
        return False

    # SPDX 2
    if self.sbom_spec == "spdx2":
        # Note that the spdx-tools's parser will raise an SPDXParsingError
        # anyway, if the document does not contain a creator.
        # So in practice, this section should always return True
        document = cast("Document", self.doc)
        creation_info = getattr(document, "creation_info", None)
        return bool(creation_info and getattr(creation_info, "creators", []))

    # SPDX 3
    if self.sbom_spec == "spdx3" and self.__spdx3_doc is not None:
        creation_info = getattr(self.__spdx3_doc, "creationInfo", None)
        return bool(creation_info and getattr(creation_info, "createdBy", []))

    return False
def check_dependency_relationships(self) -> bool:
    """Report whether the SBOM describes at least one package.

    For SPDX 2 this means a DESCRIBES relationship whose target is one of
    the document's packages. For SPDX 3 it means at least one BOM found in
    the SpdxDocument lists a package.

    Returns:
        bool: True if such a relationship exists, False otherwise or when
        no document is loaded.
    """
    if not self.doc:
        return False

    # SPDX 2
    if self.sbom_spec == "spdx2":
        document = cast("Document", self.doc)
        if not document.relationships:
            return False

        # A set of all package spdx_ids for quick lookup
        package_ids = {pkg.spdx_id for pkg in document.packages}

        # True as soon as one "DESCRIBES" relationship points at a Package
        return any(
            rel.relationship_type == RelationshipType.DESCRIBES
            and rel.related_spdx_element_id in package_ids
            for rel in document.relationships
        )

    # SPDX 3
    if self.sbom_spec == "spdx3":
        # If a BOM/an SBOM's rootElement is a /Software/Package (or its
        # subclass), it is considered to have a dependency relationship.
        #
        # Note that if there is neither /Software/Package(s) nor /Core/Bom,
        # a DESCRIBES relationship is not needed; however, this method may
        # still return False, since it is factually considered as
        # "no relationship".

        # Check if there is at least one package listed in any BOM/SBOM
        boms = get_boms_from_spdx_document(self.__spdx3_doc)
        return any(get_packages_from_bom(bom) for bom in boms or ())

    return False
def check_timestamp(self) -> bool:
    """Report whether the SBOM records its creation timestamp.

    Returns:
        bool: True if the creation information has a non-empty `created`
        value, False otherwise or when no document is loaded.
    """
    if not self.doc:
        return False

    if self.sbom_spec == "spdx2":
        # SPDX 2
        # Note that the spdx-tools's parser will raise an SPDXParsingError,
        # if the document does not contain a timestamp.
        # So in practice, this section should always return True.
        holder = cast("Document", self.doc)
        info_attr = "creation_info"
    elif self.sbom_spec == "spdx3" and self.__spdx3_doc is not None:
        # SPDX 3
        holder = self.__spdx3_doc
        info_attr = "creationInfo"
    else:
        return False

    creation_info = getattr(holder, info_attr, None)
    if not creation_info:
        return False
    return bool(getattr(creation_info, "created", None))
def get_doc_spec_version(self) -> str | None:
    """Return the specification version declared by the document.

    Returns:
        str | None: `creation_info.spdx_version` for SPDX 2,
        `creationInfo.specVersion` for SPDX 3, or None when no document is
        loaded or the value is unavailable.
    """
    if not self.doc:
        return None

    # SPDX 2
    if self.sbom_spec == "spdx2":
        document = cast("Document", self.doc)
        creation_info = getattr(document, "creation_info", None)
        if not creation_info:
            return None
        return getattr(creation_info, "spdx_version", None)

    # SPDX 3
    if self.sbom_spec == "spdx3" and isinstance(
        self.__spdx3_doc, spdx3.SpdxDocument
    ):
        creation_info = getattr(self.__spdx3_doc, "creationInfo", None)
        if not creation_info:
            return None
        return getattr(creation_info, "specVersion", None)

    return None
def get_sbom_name(self) -> str:
    """Retrieve the name of the SBOM.

    Returns:
        str: The document name (`creation_info.name` for SPDX 2, the
        SpdxDocument's `name` for SPDX 3). An empty string is returned when
        no document is loaded or the name is missing or unset.
    """
    if not self.doc:
        return ""

    name: str | None = ""

    # SPDX 2
    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)
        doc_creation_info = getattr(self.doc, "creation_info", None)
        if doc_creation_info:
            name = getattr(doc_creation_info, "name", "")
    # SPDX 3
    elif self.sbom_spec == "spdx3" and isinstance(
        self.__spdx3_doc, spdx3.SpdxDocument
    ):
        name = getattr(self.__spdx3_doc, "name", "")

    # The getattr default only covers a *missing* attribute. An attribute
    # that exists but is None (e.g. an unset optional name) must still be
    # reported as "", to honour the declared `str` return type.
    return name or ""
def get_components_without_concluded_licenses(self) -> list[tuple[str, str]]:
    """
    Collect the components that have no usable concluded license.

    A concluded license counts as missing when it is absent, blank, or a
    "no assertion" value.

    Returns:
        list[tuple[str, str]]: A list of tuples of the form
        (component_name, spdx_id). Consumers should extract the preferred
        value (name or SPDX ID) as needed.
    """
    # Note: concluded license is mandatory in SPDX-2.2 and SPDX-2.3
    if not self.doc:
        return []

    # SPDX 2
    if self.sbom_spec == "spdx2":
        document = cast("Document", self.doc)
        missing: list[tuple[str, str]] = []
        for pkg in getattr(document, "packages", []):
            concluded = pkg.license_concluded
            is_blank_text = isinstance(concluded, str) and concluded.strip() == ""
            if (
                concluded is None
                or isinstance(concluded, SpdxNoAssertion)
                or is_blank_text
            ):
                missing.append((pkg.name or "", pkg.spdx_id or ""))
        return missing

    # SPDX 3
    if self.sbom_spec == "spdx3":
        object_set = cast("spdx3.SHACLObjectSet", self.doc)
        individuals = spdx3.expandedlicensing_IndividualLicensingInfo.NAMED_INDIVIDUALS
        no_assertion_iri = individuals["NoAssertionLicense"]

        # IDs of elements whose concluded license is a real assertion
        licensed_ids: set[str] = set()
        for from_id, to_id in iter_relationships_by_type(
            object_set, "hasConcludedLicense"
        ):
            if to_id.strip() != no_assertion_iri:
                licensed_ids.add(from_id)

        packages = iter_objects_with_property(
            object_set,
            spdx3.software_Package,
            "spdxId",
        )
        return [
            (name or "", spdx_id or "")
            for name, spdx_id, _ in packages
            if spdx_id not in licensed_ids
        ]

    return []
def get_components_without_identifiers(self) -> list[tuple[str, str]]:
    """
    Collect the components whose unique identifier (SPDX ID) is absent or blank.

    Note that SPDX 3 requires identifiers for all elements, so this should
    not happen in a valid SPDX 3 document. The spdx-python-model JSON
    deserializer will raise a ValueError if any element is missing an
    identifier.

    Returns:
        list[tuple[str, str]]: A list of tuples of the form
        (component_name, spdx_id). Consumers should extract the preferred
        value (name or SPDX ID) as needed.
    """
    if not self.doc:
        return []

    # SPDX 2
    if self.sbom_spec == "spdx2":
        document = cast("Document", self.doc)
        missing: list[tuple[str, str]] = []
        for pkg in getattr(document, "packages", []):
            identifier = pkg.spdx_id
            blank = isinstance(identifier, str) and identifier.strip() == ""
            if identifier is None or blank:
                missing.append((pkg.name or "", identifier or ""))
        return missing

    # SPDX 3
    if self.sbom_spec == "spdx3":
        object_set = cast("spdx3.SHACLObjectSet", self.doc)
        elements = iter_objects_with_property(object_set, spdx3.Element, "spdxId")
        return [
            (name or "", spdx_id or "")
            for name, _, spdx_id in elements
            if not (spdx_id and spdx_id.strip())
        ]

    return []
def get_components_without_names(self) -> list[tuple[str, str]]:
    """
    Collect the components whose name is absent or blank.

    Returns:
        list[tuple[str, str]]: A list of tuples of the form
        (component_name, spdx_id). Consumers should extract the preferred
        value (name or SPDX ID) as needed.
    """
    if not self.doc:
        return []

    # SPDX 2
    if self.sbom_spec == "spdx2":
        document = cast("Document", self.doc)
        missing: list[tuple[str, str]] = []
        for pkg in getattr(document, "packages", []):
            pkg_name = pkg.name
            blank = isinstance(pkg_name, str) and pkg_name.strip() == ""
            if pkg_name is None or blank:
                missing.append((pkg_name or "", pkg.spdx_id or ""))
        return missing

    # SPDX 3
    if self.sbom_spec == "spdx3":
        object_set = cast("spdx3.SHACLObjectSet", self.doc)
        packages = iter_objects_with_property(
            object_set, spdx3.software_Package, "name"
        )
        return [
            (name or "", spdx_id or "")
            for _, spdx_id, name in packages
            if not (name and name.strip())
        ]

    return []
def get_components_without_suppliers(self) -> list[tuple[str, str]]:
    """
    Collect the components that have no usable supplier information.

    A supplier counts as missing when it is absent, blank, or a
    "no assertion" value (SPDX 2), or when the supplying agent is absent or
    has no non-blank name (SPDX 3).

    Returns:
        list[tuple[str, str]]: A list of tuples of the form
        (component_name, spdx_id). Consumers should extract the preferred
        value (name or SPDX ID) as needed.
    """
    if not self.doc:
        return []

    # SPDX 2
    if self.sbom_spec == "spdx2":
        document = cast("Document", self.doc)
        missing: list[tuple[str, str]] = []
        for pkg in getattr(document, "packages", []):
            supplier = pkg.supplier
            if (
                supplier is None
                or isinstance(supplier, SpdxNoAssertion)
                or (isinstance(supplier, str) and supplier.strip() == "")
            ):
                missing.append((pkg.name or "", pkg.spdx_id or ""))
        return missing

    # SPDX 3
    if self.sbom_spec == "spdx3":
        object_set = cast("spdx3.SHACLObjectSet", self.doc)
        packages = iter_objects_with_property(
            object_set, spdx3.software_Package, "suppliedBy"
        )
        return [
            (name or "", spdx_id or "")
            for name, spdx_id, supplier in packages
            if not (supplier and supplier.name and supplier.name.strip())
        ]

    return []
def get_components_without_versions(self) -> list[tuple[str, str]]:
    """
    Collect the components that have no usable version information.

    A version counts as missing when it is absent, blank, or (SPDX 2 only)
    a "no assertion" value.

    Returns:
        list[tuple[str, str]]: A list of tuples of the form
        (component_name, spdx_id). Consumers should extract the preferred
        value (name or SPDX ID) as needed.
    """
    if not self.doc:
        return []

    # SPDX 2
    if self.sbom_spec == "spdx2":
        document = cast("Document", self.doc)
        missing: list[tuple[str, str]] = []
        for pkg in getattr(document, "packages", []):
            version = pkg.version
            if (
                version is None
                or isinstance(version, SpdxNoAssertion)
                or (isinstance(version, str) and version.strip() == "")
            ):
                missing.append((pkg.name or "", pkg.spdx_id or ""))
        return missing

    # SPDX 3
    if self.sbom_spec == "spdx3":
        object_set = cast("spdx3.SHACLObjectSet", self.doc)
        packages = iter_objects_with_property(
            object_set, spdx3.software_Package, "software_packageVersion"
        )
        return [
            (name or "", spdx_id or "")
            for name, spdx_id, package_version in packages
            if not (package_version and package_version.strip())
        ]

    return []
def _get_all_components_without_info(
    self,
) -> list[tuple[str, list[tuple[str, str]]]]:
    """Group the components missing information by required element.

    Only elements listed in `MIN_ELEMENTS` that have an entry in
    `_COMPONENTS_WITHOUT_INFO` are considered, and only those with at least
    one offending component are reported.

    Returns:
        list[tuple[str, list[tuple[str, str]]]]: (info_name, components)
        pairs in `MIN_ELEMENTS` order, where components is a list of
        (component_name, spdx_id) tuples. Empty if nothing is missing.
    """
    report: list[tuple[str, list[tuple[str, str]]]] = []
    for info_name in self.MIN_ELEMENTS:
        entry = self._COMPONENTS_WITHOUT_INFO.get(info_name)
        if entry is None:
            continue
        # entry is (attribute name holding the list, human-readable label)
        components = getattr(self, entry[0], [])
        if components:
            report.append((info_name, components))
    return report
def get_total_number_components(self) -> int:
    """
    Retrieve total number of components (packages) in the SBOM.

    Returns:
        int: The number of packages in the document, or 0 when no document
        is loaded or the specification is not recognised.
    """
    if not self.doc:
        return 0

    # SPDX 2
    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)
        if not self.doc.packages:
            return 0
        return len(self.doc.packages)

    # SPDX 3
    if self.sbom_spec == "spdx3":
        self.doc = cast("spdx3.SHACLObjectSet", self.doc)
        # Count packages only. The object set also holds relationships,
        # agents, creation info, the SpdxDocument itself, etc., none of
        # which are components. This enumerates packages the same way
        # get_components_without_concluded_licenses() does, so the total
        # and the "missing" lists refer to the same population.
        return sum(
            1
            for _ in iter_objects_with_property(
                self.doc, spdx3.software_Package, "spdxId"
            )
        )

    return 0
def parse_file(self) -> Document | None:
    """
    Load `self.file` as an SPDX 2 document.

    A missing path or a non-existent file is logged. Parser errors are
    appended to `self.parsing_error` instead of being raised.

    Returns:
        Document | None: The parsed SPDX 2 document, or None if the file
        could not be located or parsed.
    """
    path = self.file
    if not (path and str(path).strip()):
        logging.error("No file path provided.")
        return None

    if not os.path.exists(path):
        logging.error("File not found: %s", path)
        return None

    try:
        parsed = parse_anything.parse_file(path)
    except SPDXParsingError as err:
        self.parsing_error.extend(err.get_messages())
        return None
    else:
        return cast("Document", parsed)
def parse_spdx3_file(self) -> spdx3.SHACLObjectSet | None:
    """
    Parse SPDX 3 SBOM document.

    A missing path or a non-existent file is logged. Read and
    deserialization errors are logged and appended to `self.parsing_error`
    instead of being raised.

    Returns:
        spdx3.SHACLObjectSet | None: An SHACLObjectSet if successful,
        otherwise None.
    """
    if not self.file or str(self.file).strip() == "":
        logging.error("No file path provided.")
        return None

    if not os.path.exists(self.file):
        logging.error("File not found: %s", self.file)
        return None

    object_set: spdx3.SHACLObjectSet = spdx3.SHACLObjectSet()
    try:
        with open(self.file, encoding="utf-8") as f:
            spdx3.JSONLDDeserializer().read(f, object_set)
    # ValueError is caught in addition to JSON syntax errors: the
    # deserializer raises it for structurally invalid content (e.g. an
    # element without an identifier), and it also covers
    # UnicodeDecodeError for files that are not valid UTF-8. Such input
    # must be reported as a parsing error rather than crash the checker.
    except (OSError, json.JSONDecodeError, ValueError) as err:
        logging.warning("SPDX3 deserialization failed: %s", err)
        self.parsing_error.append(str(err))
        return None

    return object_set
def print_components_missing_info(self) -> None:
    """
    Print, per required element, the components that lack it.

    What is considered "missing" is determined by a compliance standard.
    Subclasses may override this method to provide custom behavior.

    Nothing is printed when parsing failed or when no component is missing
    required information.

    Returns:
        None
    """
    # Skip on parsing failure, or when there is nothing to report
    if self.parsing_error or not self.all_components_without_info:
        return

    print("Missing required information in these components:")
    for info_name, components in self.all_components_without_info:
        component_names = ", ".join(name for name, _ in components)
        print(f"{info_name} ({len(components)}): {component_names}")
def print_table_output(self, verbose: bool = False) -> None:
    """
    Print the element-by-element result table as text.

    Attributes that have not been set on the instance fall back to neutral
    defaults (empty string, False or empty list).

    Args:
        verbose (bool): If True, print detailed information.

    Returns:
        None
    """
    spec = getattr(self, "sbom_spec", "")
    standard = getattr(self, "compliance_standard", "")
    is_compliant = getattr(self, "compliant", False)
    requirement_results = getattr(self, "table_elements", [])
    components_without_info = getattr(self, "all_components_without_info", [])
    messages = getattr(self, "validation_messages", [])
    errors = getattr(self, "parsing_error", [])

    context = ReportContext(
        sbom_spec=spec,
        compliance_standard=standard,
        compliant=is_compliant,
        requirement_results=requirement_results,
        components_without_info=components_without_info,
        validation_messages=messages,
        parsing_error=errors,
    )
    print(report_text(context, verbose))
def output_html(self) -> str:
    """
    Render the element-by-element result table as HTML.

    Attributes that have not been set on the instance fall back to neutral
    defaults (empty string, False or empty list). The report is always
    rendered in verbose mode.

    Returns:
        str: The HTML representation of the results.
    """
    context_values = {
        "sbom_spec": getattr(self, "sbom_spec", ""),
        "compliance_standard": getattr(self, "compliance_standard", ""),
        "compliant": getattr(self, "compliant", False),
        "requirement_results": getattr(self, "table_elements", []),
        "components_without_info": getattr(
            self, "all_components_without_info", []
        ),
        "validation_messages": getattr(self, "validation_messages", []),
        "parsing_error": getattr(self, "parsing_error", []),
    }
    return report_html(ReportContext(**context_values), verbose=True)
def output_json(self) -> dict[str, Any]:
    """
    Build a JSON-serializable summary of the check results.

    Subclasses may override to provide custom fields.

    Returns:
        dict[str, Any]: Document-level results followed by one entry per
        component information group, each holding the non-conformant
        components and an `allProvided` flag.
    """
    is_compliant = getattr(self, "compliant", False)
    messages = getattr(self, "validation_messages", [])

    result: dict[str, Any] = {
        "isConformant": is_compliant,
        "isNtiaConformant": is_compliant,  # backward compatibility
        "complianceStandard": getattr(self, "compliance_standard", ""),
        "sbomSpec": getattr(self, "sbom_spec", ""),
        "validationMessages": get_validation_messages_json(messages),
        "parsingError": getattr(self, "parsing_error", []),
        "sbomName": getattr(self, "sbom_name", ""),
        "specVersionProvided": getattr(self, "doc_version", False),
        "authorNameProvided": getattr(self, "doc_author", False),
        "timestampProvided": getattr(self, "doc_timestamp", False),
        "dependencyRelationshipsProvided": getattr(
            self, "dependency_relationships", False
        ),
        "totalNumberComponents": self.get_total_number_components(),
    }

    # (JSON key, instance attribute holding list[tuple[name, spdx_id]])
    groups = (
        ("componentNames", "components_without_names"),
        ("componentVersions", "components_without_versions"),
        ("componentIdentifiers", "components_without_identifiers"),
        ("componentSuppliers", "components_without_suppliers"),
        ("componentConcludedLicenses", "components_without_concluded_licenses"),
        ("componentCopyrightTexts", "components_without_copyright_texts"),
    )
    for json_key, attr_name in groups:
        nonconformant = []
        for name, spdx_id in getattr(self, attr_name, []):
            # Prefer the human-readable name and fall back to SPDX ID.
            has_name = name not in (None, "")
            nonconformant.append(name if has_name else spdx_id)
        result[json_key] = {
            "nonconformantComponents": nonconformant,
            "allProvided": not nonconformant,
        }

    return result