Skip to content

Commit

Permalink
Initialize the Submission data structure
Browse files Browse the repository at this point in the history
This is the first step towards solving #296. The code added by this
commit is currently *not* put in use in the certification pipeline and
is a partial duplicate of the "checkprcontent" package.

The Submission data structure can collect and run early validation
checks on all information concerning a user's submission (i.e. a GitHub
Pull Request). In its current state, given an api_url (link to a given
PR), it can collect the list of modified files, and extract the
category, organization, chart's name and version, as well as running
basic checks, such as SemVer compatibility of the provided version.

This commit also implements a custom JSON Serializer / Deserializer. The
Submission object can therefore easily be dumped to / read from a file.
Such file can be an GitHub workflow artifact, populated in the
"pre-check" job and read in subsequent jobs.

Eventually, this data structure is meant to encapsulate more information
concerning the user's submission. This early (and incomplete) version is
intended to trigger discussion around the design choices and general
direction taken to solve #296.

Signed-off-by: Matthias Goerens <mgoerens@redhat.com>
  • Loading branch information
mgoerens committed Mar 15, 2024
1 parent af6cbb5 commit 5a55ce3
Show file tree
Hide file tree
Showing 6 changed files with 524 additions and 0 deletions.
1 change: 1 addition & 0 deletions scripts/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pytest-forked==1.3.0
pytest-xdist==2.4.0
PyYAML==6.0.1
requests==2.26.0
responses==0.25.0
retrying==1.3.3
semantic-version==2.8.5
semver==2.13.0
Expand Down
Empty file.
40 changes: 40 additions & 0 deletions scripts/src/precheck/serializer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import copy
import json

from submission import Submission, Chart, Report, Source, Tarball


class SubmissionEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, Submission):
obj_dict = copy.deepcopy(obj.__dict__)
obj_dict["chart"] = obj_dict["chart"].__dict__
obj_dict["report"] = obj_dict["report"].__dict__
obj_dict["source"] = obj_dict["source"].__dict__
obj_dict["tarball"] = obj_dict["tarball"].__dict__
return obj_dict

return json.JSONEncoder.default(self, obj)


class SubmissionDecoder(json.JSONDecoder):
def __init__(self, *args, **kwargs):
json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs)

def object_hook(self, dct):
if "chart" in dct:
chart_obj = Chart(**dct["chart"])
report_obj = Report(**dct["report"])
source_obj = Source(**dct["source"])
tarball_obj = Tarball(**dct["tarball"])

to_merge_dct = {
"chart": chart_obj,
"report": report_obj,
"source": source_obj,
"tarball": tarball_obj,
}

new_dct = dct | to_merge_dct
return Submission(**new_dct)
return dct
98 changes: 98 additions & 0 deletions scripts/src/precheck/serializer_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import json
import sys

sys.path.append(".")
from serializer import SubmissionEncoder, SubmissionDecoder
from submission import Submission, Chart, Report, Source, Tarball

expected_category = "partners"
expected_organization = "acme"
expected_chart = "awesome"
expected_version = "1.42.0"

submission_json = """
{
"api_url": "https://api.github.com/repos/openshift-helm-charts/charts/pulls/1",
"modified_files": ["charts/partners/acme/awesome/1.42.0/report.yaml"],
"chart": {
"category": "partners",
"organization": "acme",
"name": "awesome",
"version": "1.42.0"
},
"report": {
"found": true,
"signed": false,
"path": "charts/partners/acme/awesome/1.42.0/report.yaml"
},
"source": {
"found": false,
"path": null
},
"tarball": {
"found": false,
"path": null,
"provenance": null
}
}
"""


def sanitize_json_string(json_string: str):
"""Remove the newlines from the JSON string. This is done by
loading and dumping the string representation of the JSON object.
Goal is to allow comparison with other JSON string.
"""
json_dict = json.loads(json_string)
return json.dumps(json_dict)


def test_submission_serializer():
s = json.loads(submission_json, cls=SubmissionDecoder)

assert isinstance(s, Submission)
assert (
s.api_url == "https://api.github.com/repos/openshift-helm-charts/charts/pulls/1"
)
assert "charts/partners/acme/awesome/1.42.0/report.yaml" in s.modified_files
assert s.chart.category == "partners"
assert s.chart.organization == "acme"
assert s.chart.name == "awesome"
assert s.chart.version == "1.42.0"
assert s.report.found
assert not s.report.signed
assert s.report.path == "charts/partners/acme/awesome/1.42.0/report.yaml"
assert not s.source.found
assert not s.source.path
assert not s.tarball.found
assert not s.tarball.path
assert not s.tarball.provenance


def test_submission_deserializer():
s = Submission(
api_url="https://api.github.com/repos/openshift-helm-charts/charts/pulls/1",
modified_files=["charts/partners/acme/awesome/1.42.0/report.yaml"],
chart=Chart(
category="partners",
organization="acme",
name="awesome",
version="1.42.0",
),
report=Report(
found=True,
signed=False,
path="charts/partners/acme/awesome/1.42.0/report.yaml",
),
source=Source(
found=False,
path=None,
),
tarball=Tarball(
found=False,
path=None,
provenance=None,
),
)

assert SubmissionEncoder().encode(s) == sanitize_json_string(submission_json)
220 changes: 220 additions & 0 deletions scripts/src/precheck/submission.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
import os
import requests
import semver

from dataclasses import dataclass, field

from checkprcontent import checkpr

xRateLimit = "X-RateLimit-Limit"
xRateRemain = "X-RateLimit-Remaining"


class SubmissionError(Exception):
"""Root Exception for handling any error with with the submission"""

pass


class DuplicateChartError(SubmissionError):
"""This Exception is to be raised when the user attempts to submit a PR with more than one chart"""

pass


class VersionError(SubmissionError):
"""This Exception is to be raised when the version of the chart is not semver compatible"""

pass


class OwnersFileError(SubmissionError):
"""This Exception is to be raised when the user incorrectly attempts to submit an OWNER file"""

pass


@dataclass
class Chart:
category: str = None
organization: str = None
name: str = None
version: str = None

def register_chart_info(self, category, organization, name, version):
if (
(self.category and self.category != category)
or (self.organization and self.organization != organization)
or (self.name and self.name != name)
or (self.version and self.version != version)
):
msg = "[ERROR] A PR must contain only one chart. Current PR includes files for multiple charts."
print(msg)
raise DuplicateChartError(msg)

if not semver.VersionInfo.isvalid(version):
msg = (
f"[ERROR] Helm chart version is not a valid semantic version: {version}"
)
print(msg)
raise VersionError(msg)

self.category = category
self.organization = organization
self.name = name
self.version = version


@dataclass
class Report:
found: bool = False
signed: bool = False
path: str = None


@dataclass
class Source:
found: bool = False
path: str = None # Path to the Chart.yaml


@dataclass
class Tarball:
found: bool = False
path: str = None
provenance: str = None


@dataclass
class Submission:
api_url: str
modified_files: list[str] = None # field(default_factory=list)
chart: Chart = field(default_factory=lambda: Chart())
report: Report = field(default_factory=lambda: Report())
source: Source = field(default_factory=lambda: Source())
tarball: Tarball = field(default_factory=lambda: Tarball())

def __post_init__(self):
"""Complete the initialization of the Submission object.
modified_files should be gatered and validated at the beginning of the pipeline, as part
of the "pre-check" job. Hence this isn't done when a Submission object is loaded from a
JSON file aka when modified_files is not empty.
"""
if not self.modified_files:
self.modified_files = []
self._get_modified_files()
self._parse_modified_files()

def _get_modified_files(self):
"""Query the GitHub API in order to retrieve the list of files that are added / modified by this PR"""
page_number = 1
max_page_size, page_size = 100, 100
headers = {
"Accept": "application/vnd.github.v3+json",
"Authorization": f'Bearer {os.environ.get("BOT_TOKEN")}',
}
files_api_url = f"{self.api_url}/files"

while page_size == max_page_size:
files_api_query = f"{files_api_url}?per_page={page_size}&page={page_number}"
print(f"[INFO] Query files : {files_api_query}")
r = requests.get(files_api_query, headers=headers)
files = r.json()
page_size = len(files)
page_number += 1

if xRateLimit in r.headers:
print(f"[DEBUG] {xRateLimit} : {r.headers[xRateLimit]}")
if xRateRemain in r.headers:
print(f"[DEBUG] {xRateRemain} : {r.headers[xRateRemain]}")

if "message" in files:
msg = f'[ERROR] getting pr files: {files["message"]}'
print(msg)
raise SubmissionError(msg)
else:
for file in files:
if "filename" in file:
self.modified_files.append(file["filename"])

def _parse_modified_files(self):
"""Classify the list of modified files.
This method will populate the chart, report, source and tarball attributes.
We expect the user to provide either:
* Only a report file
* Only a chart - either as source or tarball (potentially accompagnied by provenance file)
* Both the report and the chart
Raises a SubmissionError if:
* The Submission concerns more than one chart
* The version of the chart is not SemVer compatible
* The tarball file is names incorrectly
* The PR contains additional files, not related to the Chart being submitted
* The user attempts to create the OWNERS file for its project.
"""
pattern, reportpattern, tarballpattern = (
checkpr.get_file_match_compiled_patterns()
)
matches_found = 0
none_chart_files = {}

for file_path in self.modified_files:
match = pattern.match(file_path)
if not match:
file_name = os.path.basename(file_path)
none_chart_files[file_name] = file_path
else:
self.chart.register_chart_info(*match.groups())

matches_found += 1
tar_match = tarballpattern.match(file_path)
if tar_match:
self.tarball.found = True
self.tarball.path = file_path
print(f"[INFO] tarball found: {file_path}")
_, _, chart_name, chart_version, tar_name = tar_match.groups()
expected_tar_name = f"{chart_name}-{chart_version}.tgz"
if tar_name != expected_tar_name:
msg = f"[ERROR] the tgz file is named incorrectly. Expected: {expected_tar_name}. Got: {tar_name}"
print(msg)
raise SubmissionError(msg)
elif reportpattern.match(file_path):
print(f"[INFO] Report found: {file_path}")
self.report.found = True
self.report.path = file_path
elif os.path.basename(file_path) == "Chart.yaml":
self.source.found = True
self.source.path = file_path

if none_chart_files:
if len(self.modified_files) > 1 or "OWNERS" not in none_chart_files:
# OWNERS not present or preset but not the only file
msg = (
"[ERROR] PR includes one or more files not related to charts: "
+ ", ".join(none_chart_files)
)
print(msg)
raise SubmissionError(msg)

if "OWNERS" in none_chart_files:
file_path = none_chart_files["OWNERS"]
path_parts = file_path.split("/")
category = path_parts[1] # Second after charts
if category == "partners":
msg = "[ERROR] OWNERS file should never be set directly by partners. See certification docs."
print(msg)
raise OwnersFileError(msg)
elif matches_found > 0:
# There is a mix of chart and non-chart files including OWNERS
msg = "[ERROR] Send OWNERS file by itself in a separate PR."
print(msg)
raise OwnersFileError(msg)
elif len(self.modified_files) == 1:
# OWNERS file is the only file in PR
msg = "[INFO] OWNERS file changes require manual review by maintainers."
print(msg)
raise OwnersFileError(msg)
Loading

0 comments on commit 5a55ce3

Please sign in to comment.