Initialize the Submission data structure

This is the first step towards solving #296. The code added by this commit is currently *not* put in use in the certification pipeline and is a partial duplicate of the "checkprcontent" package. The Submission data structure can collect and run early validation checks on all information concerning a user's submission (i.e. a GitHub Pull Request). In its current state, given an api_url (link to a given PR), it can collect the list of modified files, and extract the category, organization, chart's name and version, as well as running basic checks, such as SemVer compatibility of the provided version. This commit also implements a custom JSON Serializer / Deserializer. The Submission object can therefore easily be dumped to / read from a file. Such file can be an GitHub workflow artifact, populated in the "pre-check" job and read in subsequent jobs. Eventually, this data structure is meant to encapsulate more information concerning the user's submission. This early (and incomplete) version is intended to trigger discussion around the design choices and general direction taken to solve #296. Signed-off-by: Matthias Goerens <mgoerens@redhat.com>
openshift-helm-charts · Mar 15, 2024 · 5a55ce3 · 5a55ce3
1 parent af6cbb5
commit 5a55ce3
Show file tree

Hide file tree

Showing 6 changed files with 524 additions and 0 deletions.
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
@@ -25,6 +25,7 @@ pytest-forked==1.3.0
 pytest-xdist==2.4.0
 PyYAML==6.0.1
 requests==2.26.0
+responses==0.25.0
 retrying==1.3.3
 semantic-version==2.8.5
 semver==2.13.0

diff --git a/scripts/src/precheck/__init__.py b/scripts/src/precheck/__init__.py
diff --git a/scripts/src/precheck/serializer.py b/scripts/src/precheck/serializer.py
@@ -0,0 +1,40 @@
+import copy
+import json
+
+from submission import Submission, Chart, Report, Source, Tarball
+
+
+class SubmissionEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, Submission):
+            obj_dict = copy.deepcopy(obj.__dict__)
+            obj_dict["chart"] = obj_dict["chart"].__dict__
+            obj_dict["report"] = obj_dict["report"].__dict__
+            obj_dict["source"] = obj_dict["source"].__dict__
+            obj_dict["tarball"] = obj_dict["tarball"].__dict__
+            return obj_dict
+
+        return json.JSONEncoder.default(self, obj)
+
+
+class SubmissionDecoder(json.JSONDecoder):
+    def __init__(self, *args, **kwargs):
+        json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs)
+
+    def object_hook(self, dct):
+        if "chart" in dct:
+            chart_obj = Chart(**dct["chart"])
+            report_obj = Report(**dct["report"])
+            source_obj = Source(**dct["source"])
+            tarball_obj = Tarball(**dct["tarball"])
+
+            to_merge_dct = {
+                "chart": chart_obj,
+                "report": report_obj,
+                "source": source_obj,
+                "tarball": tarball_obj,
+            }
+
+            new_dct = dct | to_merge_dct
+            return Submission(**new_dct)
+        return dct
diff --git a/scripts/src/precheck/serializer_test.py b/scripts/src/precheck/serializer_test.py
@@ -0,0 +1,98 @@
+import json
+import sys
+
+sys.path.append(".")
+from serializer import SubmissionEncoder, SubmissionDecoder
+from submission import Submission, Chart, Report, Source, Tarball
+
+expected_category = "partners"
+expected_organization = "acme"
+expected_chart = "awesome"
+expected_version = "1.42.0"
+
+submission_json = """
+{
+    "api_url": "https://api.github.com/repos/openshift-helm-charts/charts/pulls/1",
+    "modified_files": ["charts/partners/acme/awesome/1.42.0/report.yaml"],
+    "chart": {
+        "category": "partners",
+        "organization": "acme",
+        "name": "awesome",
+        "version": "1.42.0"
+    },
+    "report": {
+        "found": true,
+        "signed": false,
+        "path": "charts/partners/acme/awesome/1.42.0/report.yaml"
+    },
+    "source": {
+        "found": false,
+        "path": null
+    },
+    "tarball": {
+        "found": false,
+        "path": null,
+        "provenance": null
+    }
+}
+"""
+
+
+def sanitize_json_string(json_string: str):
+    """Remove the newlines from the JSON string. This is done by
+    loading and dumping the string representation of the JSON object.
+    Goal is to allow comparison with other JSON string.
+    """
+    json_dict = json.loads(json_string)
+    return json.dumps(json_dict)
+
+
+def test_submission_serializer():
+    s = json.loads(submission_json, cls=SubmissionDecoder)
+
+    assert isinstance(s, Submission)
+    assert (
+        s.api_url == "https://api.github.com/repos/openshift-helm-charts/charts/pulls/1"
+    )
+    assert "charts/partners/acme/awesome/1.42.0/report.yaml" in s.modified_files
+    assert s.chart.category == "partners"
+    assert s.chart.organization == "acme"
+    assert s.chart.name == "awesome"
+    assert s.chart.version == "1.42.0"
+    assert s.report.found
+    assert not s.report.signed
+    assert s.report.path == "charts/partners/acme/awesome/1.42.0/report.yaml"
+    assert not s.source.found
+    assert not s.source.path
+    assert not s.tarball.found
+    assert not s.tarball.path
+    assert not s.tarball.provenance
+
+
+def test_submission_deserializer():
+    s = Submission(
+        api_url="https://api.github.com/repos/openshift-helm-charts/charts/pulls/1",
+        modified_files=["charts/partners/acme/awesome/1.42.0/report.yaml"],
+        chart=Chart(
+            category="partners",
+            organization="acme",
+            name="awesome",
+            version="1.42.0",
+        ),
+        report=Report(
+            found=True,
+            signed=False,
+            path="charts/partners/acme/awesome/1.42.0/report.yaml",
+        ),
+        source=Source(
+            found=False,
+            path=None,
+        ),
+        tarball=Tarball(
+            found=False,
+            path=None,
+            provenance=None,
+        ),
+    )
+
+    assert SubmissionEncoder().encode(s) == sanitize_json_string(submission_json)
diff --git a/scripts/src/precheck/submission.py b/scripts/src/precheck/submission.py
@@ -0,0 +1,220 @@
+import os
+import requests
+import semver
+
+from dataclasses import dataclass, field
+
+from checkprcontent import checkpr
+
+xRateLimit = "X-RateLimit-Limit"
+xRateRemain = "X-RateLimit-Remaining"
+
+
+class SubmissionError(Exception):
+    """Root Exception for handling any error with with the submission"""
+
+    pass
+
+
+class DuplicateChartError(SubmissionError):
+    """This Exception is to be raised when the user attempts to submit a PR with more than one chart"""
+
+    pass
+
+
+class VersionError(SubmissionError):
+    """This Exception is to be raised when the version of the chart is not semver compatible"""
+
+    pass
+
+
+class OwnersFileError(SubmissionError):
+    """This Exception is to be raised when the user incorrectly attempts to submit an OWNER file"""
+
+    pass
+
+
+@dataclass
+class Chart:
+    category: str = None
+    organization: str = None
+    name: str = None
+    version: str = None
+
+    def register_chart_info(self, category, organization, name, version):
+        if (
+            (self.category and self.category != category)
+            or (self.organization and self.organization != organization)
+            or (self.name and self.name != name)
+            or (self.version and self.version != version)
+        ):
+            msg = "[ERROR] A PR must contain only one chart. Current PR includes files for multiple charts."
+            print(msg)
+            raise DuplicateChartError(msg)
+
+        if not semver.VersionInfo.isvalid(version):
+            msg = (
+                f"[ERROR] Helm chart version is not a valid semantic version: {version}"
+            )
+            print(msg)
+            raise VersionError(msg)
+
+        self.category = category
+        self.organization = organization
+        self.name = name
+        self.version = version
+
+
+@dataclass
+class Report:
+    found: bool = False
+    signed: bool = False
+    path: str = None
+
+
+@dataclass
+class Source:
+    found: bool = False
+    path: str = None  # Path to the Chart.yaml
+
+
+@dataclass
+class Tarball:
+    found: bool = False
+    path: str = None
+    provenance: str = None
+
+
+@dataclass
+class Submission:
+    api_url: str
+    modified_files: list[str] = None  # field(default_factory=list)
+    chart: Chart = field(default_factory=lambda: Chart())
+    report: Report = field(default_factory=lambda: Report())
+    source: Source = field(default_factory=lambda: Source())
+    tarball: Tarball = field(default_factory=lambda: Tarball())
+
+    def __post_init__(self):
+        """Complete the initialization of the Submission object.
+
+        modified_files should be gatered and validated at the beginning of the pipeline, as part
+        of the "pre-check" job. Hence this isn't done when a Submission object is loaded from a
+        JSON file aka when modified_files is not empty.
+        """
+        if not self.modified_files:
+            self.modified_files = []
+            self._get_modified_files()
+            self._parse_modified_files()
+
+    def _get_modified_files(self):
+        """Query the GitHub API in order to retrieve the list of files that are added / modified by this PR"""
+        page_number = 1
+        max_page_size, page_size = 100, 100
+        headers = {
+            "Accept": "application/vnd.github.v3+json",
+            "Authorization": f'Bearer {os.environ.get("BOT_TOKEN")}',
+        }
+        files_api_url = f"{self.api_url}/files"
+
+        while page_size == max_page_size:
+            files_api_query = f"{files_api_url}?per_page={page_size}&page={page_number}"
+            print(f"[INFO] Query files : {files_api_query}")
+            r = requests.get(files_api_query, headers=headers)
+            files = r.json()
+            page_size = len(files)
+            page_number += 1
+
+            if xRateLimit in r.headers:
+                print(f"[DEBUG] {xRateLimit} : {r.headers[xRateLimit]}")
+            if xRateRemain in r.headers:
+                print(f"[DEBUG] {xRateRemain}  : {r.headers[xRateRemain]}")
+
+            if "message" in files:
+                msg = f'[ERROR] getting pr files: {files["message"]}'
+                print(msg)
+                raise SubmissionError(msg)
+            else:
+                for file in files:
+                    if "filename" in file:
+                        self.modified_files.append(file["filename"])
+
+    def _parse_modified_files(self):
+        """Classify the list of modified files.
+
+        This method will populate the chart, report, source and tarball attributes.
+
+        We expect the user to provide either:
+        * Only a report file
+        * Only a chart - either as source or tarball (potentially accompagnied by provenance file)
+        * Both the report and the chart
+
+        Raises a SubmissionError if:
+        * The Submission concerns more than one chart
+        * The version of the chart is not SemVer compatible
+        * The tarball file is names incorrectly
+        * The PR contains additional files, not related to the Chart being submitted
+        * The user attempts to create the OWNERS file for its project.
+
+        """
+        pattern, reportpattern, tarballpattern = (
+            checkpr.get_file_match_compiled_patterns()
+        )
+        matches_found = 0
+        none_chart_files = {}
+
+        for file_path in self.modified_files:
+            match = pattern.match(file_path)
+            if not match:
+                file_name = os.path.basename(file_path)
+                none_chart_files[file_name] = file_path
+            else:
+                self.chart.register_chart_info(*match.groups())
+
+                matches_found += 1
+                tar_match = tarballpattern.match(file_path)
+                if tar_match:
+                    self.tarball.found = True
+                    self.tarball.path = file_path
+                    print(f"[INFO] tarball found: {file_path}")
+                    _, _, chart_name, chart_version, tar_name = tar_match.groups()
+                    expected_tar_name = f"{chart_name}-{chart_version}.tgz"
+                    if tar_name != expected_tar_name:
+                        msg = f"[ERROR] the tgz file is named incorrectly. Expected: {expected_tar_name}. Got: {tar_name}"
+                        print(msg)
+                        raise SubmissionError(msg)
+                elif reportpattern.match(file_path):
+                    print(f"[INFO] Report found: {file_path}")
+                    self.report.found = True
+                    self.report.path = file_path
+                elif os.path.basename(file_path) == "Chart.yaml":
+                    self.source.found = True
+                    self.source.path = file_path
+
+        if none_chart_files:
+            if len(self.modified_files) > 1 or "OWNERS" not in none_chart_files:
+                # OWNERS not present or preset but not the only file
+                msg = (
+                    "[ERROR] PR includes one or more files not related to charts: "
+                    + ", ".join(none_chart_files)
+                )
+                print(msg)
+                raise SubmissionError(msg)
+
+            if "OWNERS" in none_chart_files:
+                file_path = none_chart_files["OWNERS"]
+                path_parts = file_path.split("/")
+                category = path_parts[1]  # Second after charts
+                if category == "partners":
+                    msg = "[ERROR] OWNERS file should never be set directly by partners. See certification docs."
+                    print(msg)
+                    raise OwnersFileError(msg)
+                elif matches_found > 0:
+                    # There is a mix of chart and non-chart files including OWNERS
+                    msg = "[ERROR] Send OWNERS file by itself in a separate PR."
+                    print(msg)
+                    raise OwnersFileError(msg)
+                elif len(self.modified_files) == 1:
+                    # OWNERS file is the only file in PR
+                    msg = "[INFO] OWNERS file changes require manual review by maintainers."
+                    print(msg)
+                    raise OwnersFileError(msg)