# SPDX-FileCopyrightText: 2024-2025 SPDX contributors
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Base checking functionality."""
from __future__ import annotations
import json
import logging
import os
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, cast
from spdx_python_model import v3_0_1 as spdx3 # type: ignore # import-untyped
from spdx_tools.spdx.model.relationship import RelationshipType
from spdx_tools.spdx.model.spdx_no_assertion import SpdxNoAssertion
from spdx_tools.spdx.parser import parse_anything
from spdx_tools.spdx.parser.error import SPDXParsingError
from spdx_tools.spdx.validation.document_validator import validate_full_spdx_document
from .constants import DEFAULT_SBOM_SPEC
from .report import (
ReportContext,
get_validation_messages_json,
report_html,
report_text,
)
from .spdx3_utils import (
get_boms_from_spdx_document,
get_packages_from_bom,
iter_objects_with_property,
iter_relationships_by_type,
validate_spdx3_data,
)
if TYPE_CHECKING:
from spdx_tools.spdx.model.document import Document
from spdx_tools.spdx.validation.validation_message import ValidationMessage
# pylint: disable=too-many-instance-attributes
[docs]
class BaseChecker(ABC):
    """Base class for all compliance/conformance checkers.

    This base class contains methods for common tasks like file parsing
    and information extraction from the SBOM.

    Any class inheriting from BaseChecker must implement its abstract
    methods, such as `check_compliance`. Methods such as `output_json`
    have a default implementation that subclasses may override.
    """
    # Minimum elements/baseline attributes required by a compliance standard.
    # Empty in the base class; the names listed here are looked up in
    # _COMPONENTS_WITHOUT_INFO when the missing-information report is built.
    MIN_ELEMENTS: list[str] = []
    # Mapping from an element name to a pair:
    #   (name of the instance attribute holding the list of components that
    #    are missing that element, human-readable label)
    _COMPONENTS_WITHOUT_INFO = {
        "name": ("components_without_names", "Components missing a name"),
        "version": ("components_without_versions", "Components missing a version"),
        "identifier": (
            "components_without_identifiers",
            "Components missing an identifier",
        ),
        "supplier": ("components_without_suppliers", "Components missing a supplier"),
        "concluded_license": (
            "components_without_concluded_licenses",
            "Components missing a concluded license",
        ),
        "copyright_text": (
            "components_without_copyright_texts",
            "Components missing a copyright text",
        ),
    }
    compliance_standard: str = ""  # fsct3-min, ntia
    sbom_spec: str = ""  # spdx2, spdx3
    # These are detectable by spdx-tools, so not needed for now.
    # file_format: str = "" # json, rdf-xml, tag-value, yaml, xml
    # Path of the SBOM file to check.
    file: str = ""
    # For SPDX 3, we have to use SHACLObjectSet instead of SpdxDocument,
    # because we need access to relationships and other elements that are not
    # accessible from SpdxDocument.
    doc: Document | spdx3.SHACLObjectSet | None = None
    __spdx3_doc: spdx3.SpdxDocument | None = None  # cached SPDX 3 document
    # Messages collected while parsing / validating the file.
    # These lists are class-level defaults only; __init__ rebinds both on
    # every instance.
    parsing_error: list[str] = []
    validation_messages: list[ValidationMessage] = []
    sbom_name: str = ""
    # Lists of components missing required information.
    # Each item is a tuple of (component name, component SPDX ID).
    # Class-level defaults; __init__ rebinds them on the instance when a
    # document has been parsed.
    components_without_names: list[tuple[str, str]] = []
    components_without_versions: list[tuple[str, str]] = []
    components_without_suppliers: list[tuple[str, str]] = []
    components_without_identifiers: list[tuple[str, str]] = []
    components_without_concluded_licenses: list[tuple[str, str]] = []
    components_without_copyright_texts: list[tuple[str, str]] = []
    doc_version: bool = False  # Has SPDX document version?
    doc_author: bool = False  # Has SPDX document author?
    doc_timestamp: bool = False  # Has SPDX document creation timestamp?
    dependency_relationships: bool = False  # Has DESCRIBES relationship?
    compliant: bool = False  # Is SBOM compliant with the chosen standard?
    # Legacy name for "compliant", kept for backward compatibility.
    # NOTE(review): this copies the default value of `compliant` (False) once,
    # when the class body is executed. It is a separate attribute, not a live
    # alias: assigning to `compliant` later does not update it.
    ntia_minimum_elements_compliant: bool = compliant
[docs]
@abstractmethod
def check_compliance(self) -> bool:
    """Decide whether the SBOM satisfies the selected standard.

    Concrete checkers must override this method; the base implementation
    has no logic of its own.

    Returns:
        bool: The compliance verdict (produced by the overriding method).

    Raises:
        NotImplementedError: Always, when the base implementation is called.
    """
    raise NotImplementedError
def __init__(
self,
file: str,
validate: bool = True,
compliance: str = "",
sbom_spec: str = DEFAULT_SBOM_SPEC,
) -> None:
"""
Initialize the BaseChecker.
Args:
file (str): The name of the file to be checked.
validate (bool): Whether to validate the file.
compliance (str): The compliance standard to be used.
sbom_spec (str): The SBOM specification to be used.
"""
self.compliance_standard = compliance
self.sbom_spec = sbom_spec
# self.file_format = ""
self.file = file
# Make sure the logs are instance variables and not class variables
# to avoid shared state between instances.
self.parsing_error = []
self.validation_messages = []
match sbom_spec:
case "spdx2":
self.doc = self.parse_file()
case "spdx3":
object_set = self.parse_spdx3_file()
if not object_set:
logging.error("Failed to parse the SPDX 3 file.")
else:
self.doc = object_set
_doc, _validation_messages = validate_spdx3_data(object_set)
if not _doc or _validation_messages:
logging.error("SpdxDocument not found or invalid.")
self.__spdx3_doc = _doc # cache the extracted SpdxDocument
self.validation_messages.extend(_validation_messages)
case _:
# We can add a heuristic to detect the spec from the file content here,
# in case sbom_spec is not provided or invalid.
raise ValueError(f"Unsupported SBOM specification: {sbom_spec}")
if self.doc:
if validate:
if sbom_spec == "spdx2":
self.doc = cast("Document", self.doc)
self.validation_messages = validate_full_spdx_document(self.doc)
else:
pass
self.sbom_name = self.get_sbom_name()
self.doc_version = self.check_doc_version()
self.doc_author = self.check_author()
self.doc_timestamp = self.check_timestamp()
self.dependency_relationships = self.check_dependency_relationships()
self.components_without_names = self.get_components_without_names()
self.components_without_versions = self.get_components_without_versions()
self.components_without_suppliers = self.get_components_without_suppliers()
self.components_without_identifiers = (
self.get_components_without_identifiers()
)
self.components_without_concluded_licenses = (
self.get_components_without_concluded_licenses()
)
self.components_without_copyright_texts = (
self.get_components_without_copyright_texts()
)
# List of (info_name, components) tuples,
# where components is a list of (component_name, spdx_id) tuples
self.all_components_without_info: list[
tuple[str, list[tuple[str, str]]]
] = self._get_all_components_without_info()
self.table_elements: list[tuple[str, bool]] = []
[docs]
def check_doc_version(self) -> bool:
    """Tell whether the document declares a specification version.

    Returns:
        bool: True when ``get_doc_spec_version()`` yields a truthy value,
        False otherwise.
    """
    return bool(self.get_doc_spec_version())
[docs]
def check_author(self) -> bool:
    """Tell whether the SBOM names at least one author/creator.

    For SPDX 2 the ``creators`` of the document's ``creation_info`` are
    inspected; for SPDX 3 the ``createdBy`` of the cached SpdxDocument's
    ``creationInfo``.

    Returns:
        bool: True when a non-empty creator collection is found.
    """
    if not self.doc:
        return False

    if self.sbom_spec == "spdx2":
        # Note that the spdx-tools's parser will raise an SPDXParsingError
        # anyway, if the document does not contain a creator.
        # So in practice, this branch should always end up returning True.
        self.doc = cast("Document", self.doc)
        holder = self.doc
        info_attr, creators_attr = "creation_info", "creators"
    elif self.sbom_spec == "spdx3" and self.__spdx3_doc is not None:
        holder = self.__spdx3_doc
        info_attr, creators_attr = "creationInfo", "createdBy"
    else:
        return False

    creation_info = getattr(holder, info_attr, None)
    if not creation_info:
        return False
    return bool(getattr(creation_info, creators_attr, []))
[docs]
def check_dependency_relationships(self) -> bool:
    """Tell whether the SBOM describes at least one package.

    SPDX 2: True when some DESCRIBES relationship points at the SPDX ID of
    one of the document's packages.

    SPDX 3: True when at least one BOM obtained from the cached
    SpdxDocument yields packages.

    Returns:
        bool: True when such a relationship/package is found.
    """
    if not self.doc:
        return False

    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)
        relationships = self.doc.relationships
        if not relationships:
            return False

        # A set of all package spdx_ids for quick lookup
        package_ids = {package.spdx_id for package in self.doc.packages}
        for relationship in relationships:
            if (
                relationship.relationship_type == RelationshipType.DESCRIBES
                and relationship.related_spdx_element_id in package_ids
            ):
                return True
        return False

    if self.sbom_spec == "spdx3":
        # If a BOM/an SBOM's rootElement is a /Software/Package (or its
        # subclass), it is considered to have a dependency relationship.
        #
        # Note that if there is neither /Software/Package(s) nor /Core/Bom,
        # a DESCRIBES relationship is not needed; however, this method may
        # still return False, since it is factually considered as
        # "no relationship".
        boms = get_boms_from_spdx_document(self.__spdx3_doc)
        return any(get_packages_from_bom(bom) for bom in boms or [])

    return False
[docs]
def check_timestamp(self) -> bool:
    """Tell whether the SBOM carries a creation timestamp.

    For SPDX 2 the ``created`` field of the document's ``creation_info`` is
    inspected; for SPDX 3 the ``created`` field of the cached
    SpdxDocument's ``creationInfo``.

    Returns:
        bool: True when a truthy timestamp is found.
    """
    if not self.doc:
        return False

    if self.sbom_spec == "spdx2":
        # Note that the spdx-tools's parser will raise an SPDXParsingError,
        # if the document does not contain a timestamp.
        # So in practice, this branch should always end up returning True.
        self.doc = cast("Document", self.doc)
        creation_info = getattr(self.doc, "creation_info", None)
    elif self.sbom_spec == "spdx3" and self.__spdx3_doc is not None:
        creation_info = getattr(self.__spdx3_doc, "creationInfo", None)
    else:
        return False

    if not creation_info:
        return False
    # Both spec versions expose the timestamp under the same field name.
    return bool(getattr(creation_info, "created", None))
[docs]
def get_doc_spec_version(self) -> str | None:
    """Return the specification version recorded in the document.

    SPDX 2 reads ``creation_info.spdx_version`` from the parsed document;
    SPDX 3 reads ``creationInfo.specVersion`` from the cached SpdxDocument.

    Returns:
        str | None: The version value, or None when there is no document,
        no creation info, or no version field.
    """
    if not self.doc:
        return None

    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)
        creation_info = getattr(self.doc, "creation_info", None)
        if not creation_info:
            return None
        return getattr(creation_info, "spdx_version", None)

    if self.sbom_spec == "spdx3" and isinstance(
        self.__spdx3_doc, spdx3.SpdxDocument
    ):
        creation_info = getattr(self.__spdx3_doc, "creationInfo", None)
        if not creation_info:
            return None
        return getattr(creation_info, "specVersion", None)

    return None
[docs]
def get_sbom_name(self) -> str:
    """Return the name of the SBOM.

    SPDX 2 reads ``creation_info.name`` from the parsed document; SPDX 3
    reads ``name`` from the cached SpdxDocument.

    Returns:
        str: The recorded name, or an empty string when there is no
        document, no creation info (SPDX 2), or no name attribute.
    """
    if not self.doc:
        return ""

    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)
        creation_info = getattr(self.doc, "creation_info", None)
        if not creation_info:
            return ""
        return getattr(creation_info, "name", "")

    if self.sbom_spec == "spdx3" and isinstance(
        self.__spdx3_doc, spdx3.SpdxDocument
    ):
        return getattr(self.__spdx3_doc, "name", "")

    return ""
[docs]
def get_components_without_concluded_licenses(self) -> list[tuple[str, str]]:
    """
    Collect the components that lack a concluded license.

    SPDX 2: a package is reported when its ``license_concluded`` is None,
    an ``SpdxNoAssertion``, or a blank string.

    SPDX 3: a package is reported when its SPDX ID is not the source of any
    "hasConcludedLicense" relationship whose target differs from the
    NoAssertionLicense individual.

    Returns:
        list[tuple[str, str]]: A list of tuples of the form
        (component_name, spdx_id). Consumers should extract the
        preferred value (name or SPDX ID) as needed.
    """
    # Note: concluded license is mandatory in SPDX-2.2 and SPDX-2.3
    if not self.doc:
        return []

    missing: list[tuple[str, str]] = []

    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)
        for package in getattr(self.doc, "packages", []):
            concluded = package.license_concluded
            if concluded is None or isinstance(concluded, SpdxNoAssertion):
                lacks_license = True
            else:
                lacks_license = (
                    isinstance(concluded, str) and concluded.strip() == ""
                )
            if lacks_license:
                missing.append((package.name or "", package.spdx_id or ""))
        return missing

    if self.sbom_spec == "spdx3":
        self.doc = cast("spdx3.SHACLObjectSet", self.doc)

        # IDs of elements that have a real (asserted) concluded license.
        licensed_ids: set[str] = set()
        for from_id, to_id in iter_relationships_by_type(
            self.doc, "hasConcludedLicense"
        ):
            if (
                to_id.strip()
                != spdx3.expandedlicensing_IndividualLicensingInfo.NAMED_INDIVIDUALS[
                    "NoAssertionLicense"
                ]
            ):
                licensed_ids.add(from_id)

        for name, spdx_id, _ in iter_objects_with_property(
            self.doc,
            spdx3.software_Package,
            "spdxId",
        ):
            if spdx_id not in licensed_ids:
                missing.append((name or "", spdx_id or ""))
        return missing

    return missing
[docs]
def get_components_without_copyright_texts(self) -> list[tuple[str, str]]:
    """
    Collect the components that lack a copyright text.

    SPDX 2: a package is reported when its ``copyright_text`` is None, an
    ``SpdxNoAssertion``, or a blank string.

    SPDX 3: a package is reported when its ``software_copyrightText``
    value is falsy or a blank string.

    Returns:
        list[tuple[str, str]]: A list of tuples of the form
        (component_name, spdx_id). Consumers should extract the
        preferred value (name or SPDX ID) as needed.
    """
    if not self.doc:
        return []

    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)

        def _is_missing(text: object) -> bool:
            # None / NOASSERTION / whitespace-only all count as "missing".
            if text is None or isinstance(text, SpdxNoAssertion):
                return True
            return isinstance(text, str) and text.strip() == ""

        return [
            (package.name or "", package.spdx_id or "")
            for package in getattr(self.doc, "packages", [])
            if _is_missing(package.copyright_text)
        ]

    if self.sbom_spec == "spdx3":
        self.doc = cast("spdx3.SHACLObjectSet", self.doc)
        found: list[tuple[str, str]] = []
        for name, spdx_id, copyright_text in iter_objects_with_property(
            self.doc,
            spdx3.software_Package,
            "software_copyrightText",
        ):
            blank = isinstance(copyright_text, str) and copyright_text.strip() == ""
            if not copyright_text or blank:
                found.append((name or "", spdx_id or ""))
        return found

    return []
[docs]
def get_components_without_identifiers(self) -> list[tuple[str, str]]:
    """
    Collect the components that lack a unique identifier (SPDX ID).

    SPDX 2: a package is reported when its ``spdx_id`` is None or a blank
    string.

    SPDX 3: an element is reported when its ``spdxId`` is falsy or blank.
    Note that SPDX 3 requires identifiers for all elements, so this should
    not happen in a valid SPDX 3 document. The spdx-python-model JSON
    deserializer will raise a ValueError if any element is missing an
    identifier.

    Returns:
        list[tuple[str, str]]: A list of tuples of the form
        (component_name, spdx_id). Consumers should extract the
        preferred value (name or SPDX ID) as needed.
    """
    if not self.doc:
        return []

    unidentified: list[tuple[str, str]] = []

    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)
        for package in getattr(self.doc, "packages", []):
            identifier = package.spdx_id
            blank = isinstance(identifier, str) and identifier.strip() == ""
            if identifier is None or blank:
                unidentified.append((package.name or "", identifier or ""))
        return unidentified

    if self.sbom_spec == "spdx3":
        self.doc = cast("spdx3.SHACLObjectSet", self.doc)
        # The requested property is the ID itself, so it arrives in the
        # third slot of each yielded tuple.
        for name, _, spdx_id in iter_objects_with_property(
            self.doc, spdx3.Element, "spdxId"
        ):
            if not spdx_id or spdx_id.strip() == "":
                unidentified.append((name or "", spdx_id or ""))
        return unidentified

    return unidentified
[docs]
def get_components_without_names(self) -> list[tuple[str, str]]:
    """
    Collect the components that lack a name.

    SPDX 2: a package is reported when its ``name`` is None or a blank
    string. SPDX 3: a package is reported when its ``name`` is falsy or
    blank.

    Returns:
        list[tuple[str, str]]: A list of tuples of the form
        (component_name, spdx_id). Consumers should extract the
        preferred value (name or SPDX ID) as needed.
    """
    if not self.doc:
        return []

    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)
        nameless: list[tuple[str, str]] = []
        for package in getattr(self.doc, "packages", []):
            label = package.name
            if label is None or (isinstance(label, str) and label.strip() == ""):
                nameless.append((label or "", package.spdx_id or ""))
        return nameless

    if self.sbom_spec == "spdx3":
        self.doc = cast("spdx3.SHACLObjectSet", self.doc)
        # The requested property is the name itself, so it arrives in the
        # third slot of each yielded tuple.
        rows = iter_objects_with_property(self.doc, spdx3.software_Package, "name")
        return [
            (name or "", spdx_id or "")
            for _, spdx_id, name in rows
            if not name or name.strip() == ""
        ]

    return []
[docs]
def get_components_without_suppliers(self) -> list[tuple[str, str]]:
    """
    Collect the components that lack supplier information.

    SPDX 2: a package is reported when its ``supplier`` is None, an
    ``SpdxNoAssertion``, or a blank string.

    SPDX 3: a package is reported when its ``suppliedBy`` value is falsy,
    or when that value's ``name`` is falsy or blank.

    Returns:
        list[tuple[str, str]]: A list of tuples of the form
        (component_name, spdx_id). Consumers should extract the
        preferred value (name or SPDX ID) as needed.
    """
    if not self.doc:
        return []

    without_supplier: list[tuple[str, str]] = []

    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)
        for package in getattr(self.doc, "packages", []):
            supplier = package.supplier
            if supplier is None or isinstance(supplier, SpdxNoAssertion):
                unknown = True
            else:
                unknown = isinstance(supplier, str) and supplier.strip() == ""
            if unknown:
                without_supplier.append(
                    (package.name or "", package.spdx_id or "")
                )
        return without_supplier

    if self.sbom_spec == "spdx3":
        self.doc = cast("spdx3.SHACLObjectSet", self.doc)
        for name, spdx_id, supplier in iter_objects_with_property(
            self.doc, spdx3.software_Package, "suppliedBy"
        ):
            # A supplier only counts when it carries a non-blank name.
            if not supplier or not supplier.name or supplier.name.strip() == "":
                without_supplier.append((name or "", spdx_id or ""))
        return without_supplier

    return without_supplier
[docs]
def get_components_without_versions(self) -> list[tuple[str, str]]:
    """
    Collect the components that lack version information.

    SPDX 2: a package is reported when its ``version`` is None, an
    ``SpdxNoAssertion``, or a blank string.

    SPDX 3: a package is reported when its ``software_packageVersion``
    value is falsy or blank.

    Returns:
        list[tuple[str, str]]: A list of tuples of the form
        (component_name, spdx_id). Consumers should extract the
        preferred value (name or SPDX ID) as needed.
    """
    if not self.doc:
        return []

    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)

        def _unversioned(version: object) -> bool:
            # None / NOASSERTION / whitespace-only all count as "missing".
            if version is None or isinstance(version, SpdxNoAssertion):
                return True
            return isinstance(version, str) and version.strip() == ""

        return [
            (package.name or "", package.spdx_id or "")
            for package in getattr(self.doc, "packages", [])
            if _unversioned(package.version)
        ]

    if self.sbom_spec == "spdx3":
        self.doc = cast("spdx3.SHACLObjectSet", self.doc)
        versionless: list[tuple[str, str]] = []
        for name, spdx_id, package_version in iter_objects_with_property(
            self.doc, spdx3.software_Package, "software_packageVersion"
        ):
            if not package_version or package_version.strip() == "":
                versionless.append((name or "", spdx_id or ""))
        return versionless

    return []
def _get_all_components_without_info(
    self,
) -> list[tuple[str, list[tuple[str, str]]]]:
    """Gather the non-empty "missing information" lists, one per element.

    Only the elements named in ``MIN_ELEMENTS`` that also have an entry in
    ``_COMPONENTS_WITHOUT_INFO`` are considered, in ``MIN_ELEMENTS`` order.
    Elements whose list of affected components is empty are left out, so
    the result is an empty list when nothing is missing.

    Returns:
        list[tuple[str, list[tuple[str, str]]]]: (info_name, components)
        pairs, where components is a list of (component_name, spdx_id).
    """
    collected: list[tuple[str, list[tuple[str, str]]]] = []
    for info_name in self.MIN_ELEMENTS:
        entry = self._COMPONENTS_WITHOUT_INFO.get(info_name)
        if entry is None:
            # Not a per-component element; nothing to look up.
            continue
        attribute_name = entry[0]
        components = getattr(self, attribute_name, [])
        if components:
            collected.append((info_name, components))
    return collected
[docs]
def get_total_number_components(self) -> int:
    """
    Count the components in the SBOM.

    SPDX 2 counts the document's packages. SPDX 3 counts the entries of
    the object set's ``objects`` collection.

    Returns:
        int: The total number of components; 0 when no document is loaded
        or the specification is not recognised.
    """
    if not self.doc:
        return 0

    if self.sbom_spec == "spdx2":
        self.doc = cast("Document", self.doc)
        packages = self.doc.packages
        return len(packages) if packages else 0

    if self.sbom_spec == "spdx3":
        self.doc = cast("spdx3.SHACLObjectSet", self.doc)
        # NOTE(review): this counts every object in the set, while the
        # per-component checks look at software_Package objects only --
        # confirm that this is the intended meaning of "components".
        all_objects: set[spdx3.SHACLObject] = getattr(self.doc, "objects", set())
        return len(all_objects)

    return 0
[docs]
def parse_file(self) -> Document | None:
    """
    Load ``self.file`` as an SPDX 2 document.

    A missing/blank path or a non-existent file is logged as an error.
    When the parser raises ``SPDXParsingError``, its messages are appended
    to ``self.parsing_error``.

    Returns:
        Document | None: An SPDX 2 SBOM document if successful, otherwise None.
    """
    path = self.file
    if not path or not str(path).strip():
        logging.error("No file path provided.")
        return None

    if not os.path.exists(path):
        logging.error("File not found: %s", path)
        return None

    try:
        parsed = parse_anything.parse_file(path)
    except SPDXParsingError as err:
        # Keep the parser's own messages so they can be reported later.
        self.parsing_error.extend(err.get_messages())
        return None

    return cast("Document", parsed)
[docs]
def parse_spdx3_file(self) -> spdx3.SHACLObjectSet | None:
    """
    Load ``self.file`` as an SPDX 3 JSON-LD document.

    A missing/blank path or a non-existent file is logged as an error.
    Any failure to read or deserialize the file is logged as a warning and
    its message is appended to ``self.parsing_error``; the method then
    returns None instead of letting the exception escape.

    Returns:
        spdx3.SHACLObjectSet | None: An SHACLObjectSet if successful,
        otherwise None.
    """
    if not self.file or str(self.file).strip() == "":
        logging.error("No file path provided.")
        return None

    if not os.path.exists(self.file):
        logging.error("File not found: %s", self.file)
        return None

    object_set: spdx3.SHACLObjectSet = spdx3.SHACLObjectSet()
    try:
        with open(self.file, encoding="utf-8") as f:
            spdx3.JSONLDDeserializer().read(f, object_set)
    except (OSError, ValueError, TypeError, KeyError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError (both
        # are subclasses), as well as the ValueError the deserializer raises
        # for content it rejects, e.g. an element without an identifier.
        # TypeError/KeyError: presumably also raised by the generated model
        # bindings for malformed content -- TODO confirm. Either way, a file
        # that cannot be loaded is a parsing error, not a crash.
        logging.warning("SPDX3 deserialization failed: %s", err)
        self.parsing_error.append(str(err))
        return None

    return object_set
[docs]
def print_components_missing_info(self) -> None:
    """
    Print, per required element, the components that lack it.

    What is considered "missing" is determined by a compliance standard.
    Nothing is printed when parsing failed or when no component is missing
    information. Subclasses may override this method to provide custom
    behavior.

    Returns:
        None
    """
    # Nothing to report after a parsing failure or when all info is there.
    if self.parsing_error or not self.all_components_without_info:
        return

    print("Missing required information in these components:")
    for info_name, components in self.all_components_without_info:
        component_names = ", ".join(name for name, _ in components)
        print(f"{info_name} ({len(components)}): {component_names}")
[docs]
def print_table_output(self, verbose: bool = False) -> None:
    """
    Print the element-by-element result table as text.

    The checker's state is packed into a ``ReportContext``; attributes that
    are not set on the instance fall back to empty/False defaults.

    Args:
        verbose (bool): If True, print detailed information.

    Returns:
        None
    """
    # ReportContext field -> (attribute read from self, fallback value)
    field_sources = {
        "sbom_spec": ("sbom_spec", ""),
        "compliance_standard": ("compliance_standard", ""),
        "compliant": ("compliant", False),
        "requirement_results": ("table_elements", []),
        "components_without_info": ("all_components_without_info", []),
        "validation_messages": ("validation_messages", []),
        "parsing_error": ("parsing_error", []),
    }
    context_kwargs = {
        field: getattr(self, attribute, fallback)
        for field, (attribute, fallback) in field_sources.items()
    }
    print(report_text(ReportContext(**context_kwargs), verbose))
[docs]
def output_html(self) -> str:
    """
    Render the element-by-element result table as HTML.

    The checker's state is packed into a ``ReportContext``; attributes that
    are not set on the instance fall back to empty/False defaults. The
    report is always rendered in verbose mode.

    Returns:
        str: The HTML representation of the results.
    """
    spec = getattr(self, "sbom_spec", "")
    standard = getattr(self, "compliance_standard", "")
    verdict = getattr(self, "compliant", False)
    requirement_rows = getattr(self, "table_elements", [])
    missing_info = getattr(self, "all_components_without_info", [])
    messages = getattr(self, "validation_messages", [])
    errors = getattr(self, "parsing_error", [])

    context = ReportContext(
        sbom_spec=spec,
        compliance_standard=standard,
        compliant=verdict,
        requirement_results=requirement_rows,
        components_without_info=missing_info,
        validation_messages=messages,
        parsing_error=errors,
    )
    return report_html(context, verbose=True)
[docs]
def output_json(self) -> dict[str, Any]:
    """
    Build a JSON-serializable dict describing the check results.

    The dict holds the overall verdict (twice: "isConformant" and the
    backward-compatible "isNtiaConformant"), the document-level findings,
    and one entry per component-level element listing the nonconformant
    components. Attributes that are not set on the instance fall back to
    empty/False defaults.

    Subclasses may override to provide custom fields.

    Returns:
        dict[str, Any]: The result dictionary.
    """
    result: dict[str, Any] = {
        "isConformant": getattr(self, "compliant", False),
        # backward compatibility
        "isNtiaConformant": getattr(self, "compliant", False),
        "complianceStandard": getattr(self, "compliance_standard", ""),
        "sbomSpec": getattr(self, "sbom_spec", ""),
        "validationMessages": get_validation_messages_json(
            getattr(self, "validation_messages", [])
        ),
        "parsingError": getattr(self, "parsing_error", []),
        "sbomName": getattr(self, "sbom_name", ""),
        "specVersionProvided": getattr(self, "doc_version", False),
        "authorNameProvided": getattr(self, "doc_author", False),
        "timestampProvided": getattr(self, "doc_timestamp", False),
        "dependencyRelationshipsProvided": getattr(
            self, "dependency_relationships", False
        ),
        "totalNumberComponents": self.get_total_number_components(),
    }

    # (JSON key, attribute holding the list of (name, spdx_id) tuples)
    component_groups = (
        ("componentNames", "components_without_names"),
        ("componentVersions", "components_without_versions"),
        ("componentIdentifiers", "components_without_identifiers"),
        ("componentSuppliers", "components_without_suppliers"),
        ("componentConcludedLicenses", "components_without_concluded_licenses"),
        ("componentCopyrightTexts", "components_without_copyright_texts"),
    )
    for json_key, attribute in component_groups:
        # Prefer the human-readable name and fall back to the SPDX ID.
        labels = [
            spdx_id if name in (None, "") else name
            for name, spdx_id in getattr(self, attribute, [])
        ]
        result[json_key] = {
            "nonconformantComponents": labels,
            "allProvided": not labels,
        }

    return result