From 10cfcb034708c528e3ca387c3e5b75623751a514 Mon Sep 17 00:00:00 2001
From: Gerrod Ubben
Date: Fri, 15 Jul 2022 21:13:19 -0400
Subject: [PATCH] Add sha256 uniqueness to CollectionVersion

This should be ZDU compatible. The transition will be completed with the
next y-release.

fixes: #1052

Co-authored-by: Matthias Dellweg
---
 CHANGES/1052.feature                           |   2 +
 .../datarepair-ansible-collection-sha256.py    |  56 +++++++
 .../0056_collectionversion_sha256.py           |  23 +++
 .../0057_collectionversion_sha256_migrate.py   |  57 +++++++
 pulp_ansible/app/models.py                     |   5 +-
 pulp_ansible/app/serializers.py                |   7 +-
 pulp_ansible/app/tasks/collections.py          | 139 ++++++++++--------
 pulp_ansible/app/tasks/git.py                  |  31 +++-
 pulp_ansible/app/tasks/upload.py               |   1 +
 ..._0057_collection_version_sha256_migrate.py  |  39 +++++
 pulp_ansible/tests/unit/test_search.py         |   5 +-
 pulp_ansible/tests/unit/utils.py               |  38 +++--
 12 files changed, 317 insertions(+), 86 deletions(-)
 create mode 100644 CHANGES/1052.feature
 create mode 100644 pulp_ansible/app/management/commands/datarepair-ansible-collection-sha256.py
 create mode 100644 pulp_ansible/app/migrations/0056_collectionversion_sha256.py
 create mode 100644 pulp_ansible/app/migrations/0057_collectionversion_sha256_migrate.py
 create mode 100644 pulp_ansible/tests/unit/migrations/test_0057_collection_version_sha256_migrate.py

diff --git a/CHANGES/1052.feature b/CHANGES/1052.feature
new file mode 100644
index 000000000..bcab5c4a3
--- /dev/null
+++ b/CHANGES/1052.feature
@@ -0,0 +1,2 @@
+CollectionVersion global uniqueness constraint is now its sha256 digest. Repository level uniqueness
+is still (namespace, name, version).
diff --git a/pulp_ansible/app/management/commands/datarepair-ansible-collection-sha256.py b/pulp_ansible/app/management/commands/datarepair-ansible-collection-sha256.py
new file mode 100644
index 000000000..6ec68208a
--- /dev/null
+++ b/pulp_ansible/app/management/commands/datarepair-ansible-collection-sha256.py
@@ -0,0 +1,56 @@
+from gettext import gettext as _
+
+from django.core.management import BaseCommand
+from django.db import transaction
+
+from pulp_ansible.app.models import CollectionVersion
+
+
+class Command(BaseCommand):
+    """
+    Django management command to repair ansible collection versions without sha256.
+    """
+
+    help = (
+        "This script repairs ansible collection versions without sha256 if artifacts are available."
+    )
+
+    def add_arguments(self, parser):
+        """Set up arguments."""
+        parser.add_argument(
+            "--dry-run",
+            action="store_true",
+            help=_("Don't modify anything, just collect results."),
+        )
+
+    def handle(self, *args, **options):
+        dry_run = options["dry_run"]
+        failed_units = 0
+        repaired_units = 0
+
+        unit_qs = CollectionVersion.objects.filter(sha256__isnull=True)
+        count = unit_qs.count()
+        print(f"CollectionVersions to repair: {count}")
+        if count == 0:
+            return
+
+        for unit in unit_qs.iterator():
+            try:
+                content_artifact = unit.contentartifact_set.get()
+                artifact = content_artifact.artifact
+                unit.sha256 = artifact.sha256
+
+                if not dry_run:
+                    with transaction.atomic():
+                        unit.save(update_fields=["sha256"])
+            except Exception as e:
+                failed_units += 1
+                print(
+                    f"Failed to repair collection version '{unit.namespace}.{unit.name}' "
+                    f"'{unit.version}': {e}"
+                )
+            else:
+                repaired_units += 1
+
+        print(f"Successfully repaired collection versions: {repaired_units}")
+        print(f"Collection versions failed to repair: {failed_units}")
diff --git a/pulp_ansible/app/migrations/0056_collectionversion_sha256.py b/pulp_ansible/app/migrations/0056_collectionversion_sha256.py
new file mode 100644
index 000000000..7bb571b80
--- /dev/null
+++ b/pulp_ansible/app/migrations/0056_collectionversion_sha256.py
@@ -0,0 +1,19 @@
+# Generated by Django 3.2.14 on 2022-07-15 22:51
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('ansible', '0055_alter_collectionversion_version_alter_role_version'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='collectionversion',
+            name='sha256',
+            field=models.CharField(default='', max_length=64, null=True),
+            preserve_default=False,
+        ),
+    ]
diff --git a/pulp_ansible/app/migrations/0057_collectionversion_sha256_migrate.py b/pulp_ansible/app/migrations/0057_collectionversion_sha256_migrate.py
new file mode 100644
index 000000000..f62df47ec
--- /dev/null
+++ b/pulp_ansible/app/migrations/0057_collectionversion_sha256_migrate.py
@@ -0,0 +1,68 @@
+# Generated by Django 3.2.14 on 2022-07-21 22:35
+from django.db import migrations
+
+
+def add_sha256_to_current_models(apps, schema_editor):
+    """Adds the sha256 to current CollectionVersion models."""
+    CollectionVersion = apps.get_model("ansible", "CollectionVersion")
+    collection_versions_to_update = []
+    collection_versions_on_demand = []
+
+    for collection_version in (
+        CollectionVersion.objects.prefetch_related("contentartifact_set__artifact")
+        .filter(sha256="")
+        .only("pk", "sha256")
+        .iterator()
+    ):
+        content_artifact = collection_version.contentartifact_set.get()
+        if content_artifact.artifact:
+            collection_version.sha256 = content_artifact.artifact.sha256
+            collection_versions_to_update.append(collection_version)
+        else:
+            collection_versions_on_demand.append(collection_version)
+        if len(collection_versions_to_update) >= 1024:
+            CollectionVersion.objects.bulk_update(
+                collection_versions_to_update,
+                [
+                    "sha256",
+                ],
+            )
+            collection_versions_to_update.clear()
+
+    # Update remaining collection versions
+    if len(collection_versions_to_update) > 0:
+        CollectionVersion.objects.bulk_update(
+            collection_versions_to_update,
+            [
+                "sha256",
+            ],
+        )
+
+    # If there are on-demand collections, the uniqueness constraint added below will
+    # fail, so error here with a helpful message on how to fix. No work will be
+    # performed by this migration on a second run.
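+    # On-demand collection versions have no downloaded artifact to hash, so their
+    # sha256 cannot be computed here; sync or upload the missing tarballs (or delete
+    # the affected versions) and then run the migration again.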
+    if len(collection_versions_on_demand) > 0:
+        raise Exception(
+            "On-demand collections found. Please remove or upload/sync their data: "
+            f"{[c.pk for c in collection_versions_on_demand]}"
+        )
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("ansible", "0056_collectionversion_sha256"),
+    ]
+
+    operations = [
+        migrations.RunPython(add_sha256_to_current_models, migrations.RunPython.noop),
+        # The uniqueness constraint can only be enforced after the data migration
+        # above has filled in the digests; 0056 backfilled existing rows with "".
+        migrations.AlterUniqueTogether(
+            name="collectionversion",
+            unique_together={("sha256",), ("namespace", "name", "version")},
+        ),
+    ]
diff --git a/pulp_ansible/app/models.py b/pulp_ansible/app/models.py
index 4dcf13aae..582ff244f 100644
--- a/pulp_ansible/app/models.py
+++ b/pulp_ansible/app/models.py
@@ -162,6 +162,7 @@ class CollectionVersion(Content):
     """
 
     TYPE = "collection_version"
+    repo_key_fields = ("name", "namespace", "version")
 
     # Data Fields
     authors = psql_fields.ArrayField(models.CharField(max_length=64), default=list, editable=False)
@@ -179,6 +180,7 @@ class CollectionVersion(Content):
     namespace = models.CharField(max_length=64, editable=False)
     repository = models.CharField(default="", blank=True, max_length=2000, editable=False)
     requires_ansible = models.CharField(null=True, max_length=255)
+    sha256 = models.CharField(max_length=64, null=True, blank=False)
     version = models.CharField(max_length=128, db_collation="pulp_ansible_semver")
     version_major = models.IntegerField()
 
@@ -229,7 +231,7 @@ def __str__(self):
 
     class Meta:
         default_related_name = "%(app_label)s_%(model_name)s"
-        unique_together = ("namespace", "name", "version")
+        unique_together = (("namespace", "name", "version"), ("sha256",))
         constraints = [
             UniqueConstraint(
                 fields=("collection", "is_highest"),
@@ -535,6 +537,7 @@ class Meta:
 
     def finalize_new_version(self, new_version):
         """Finalize repo version."""
+        remove_duplicates(new_version)
         removed_collection_versions = new_version.removed(
             base_version=new_version.base_version
         ).filter(pulp_type=CollectionVersion.get_pulp_type())
diff --git a/pulp_ansible/app/serializers.py b/pulp_ansible/app/serializers.py
index e42e7421e..f0a26157c 100644
--- a/pulp_ansible/app/serializers.py
+++ b/pulp_ansible/app/serializers.py
@@ -504,8 +504,10 @@ def deferred_validate(self, data):
         # Call super to ensure that data contains artifact
         data = super().deferred_validate(data)
         artifact = data.get("artifact")
-        if (sha256 := data.pop("sha256", None)) and sha256 != artifact.sha256:
+        if (sha256 := data.get("sha256")) and sha256 != artifact.sha256:
             raise ValidationError(_("Expected sha256 did not match uploaded artifact's sha256"))
+        else:
+            data["sha256"] = artifact.sha256
 
         collection_info = process_collection_artifact(
             artifact=artifact,
@@ -563,6 +565,11 @@ class Meta:
             "expected_version",
         )
         model = CollectionVersion
+        # There was an autogenerated validator rendering sha256 required.
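+        # (DRF derives a UniqueTogetherValidator from the model's ("sha256",)
+        # unique_together entry, which would force clients to send sha256 up
+        # front; deferred_validate() above fills it in from the artifact instead.)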
+        validators = []
 
 
 class CollectionVersionSerializer(ContentChecksumSerializer, CollectionVersionUploadSerializer):
@@ -687,6 +691,7 @@ def is_valid(self, raise_exception=False):
         write_fields = set(CollectionVersionUploadSerializer.Meta.fields) - {
             "pulp_created",
             "pulp_last_updated",
+            "sha256",
         }
         if hasattr(self, "initial_data"):
             if any((x in self.initial_data for x in self.Meta.read_fields)):
diff --git a/pulp_ansible/app/tasks/collections.py b/pulp_ansible/app/tasks/collections.py
index 61ce7ed96..5bcbdaa03 100644
--- a/pulp_ansible/app/tasks/collections.py
+++ b/pulp_ansible/app/tasks/collections.py
@@ -86,6 +86,20 @@ def __init__(self, expression):
         self.clause = Always()
 
 
+@sync_to_async
+def _save_collection_version(collection_version, artifact, tags):
+    with transaction.atomic():
+        collection_version.save()
+        ContentArtifact.objects.create(
+            artifact=artifact,
+            content=collection_version,
+            relative_path=collection_version.relative_path,
+        )
+        for name in tags:
+            tag, created = Tag.objects.get_or_create(name=name)
+            collection_version.tags.add(tag)
+
+
 async def declarative_content_from_git_repo(remote, url, git_ref=None, metadata_only=False):
     """Returns a DeclarativeContent for the Collection in a Git repository."""
     if git_ref:
@@ -100,29 +114,30 @@ async def declarative_content_from_git_repo(remote, url, git_ref=None, metadata_
     gitrepo = Repo.clone_from(url, uuid4(), depth=1, multi_options=["--recurse-submodules"])
     commit_sha = gitrepo.head.commit.hexsha
     metadata, artifact_path = sync_collection(gitrepo.working_dir, ".")
+    artifact = Artifact.init_and_validate(artifact_path)
     if metadata_only:
         metadata["artifact"] = None
         metadata["artifact_url"] = None
     else:
-        artifact = Artifact.init_and_validate(artifact_path)
-        try:
-            await sync_to_async(artifact.save)()
-        except IntegrityError:
-            artifact = Artifact.objects.get(sha256=artifact.sha256)
+        if existing_artifact := await Artifact.objects.filter(sha256=artifact.sha256).afirst():
+            await existing_artifact.atouch()
+            artifact = existing_artifact
+        else:
+            try:
+                assert artifact._state.adding is True
+                await sync_to_async(artifact.save)()
+                assert artifact._state.adding is False
+            except IntegrityError:
+                artifact = await Artifact.objects.aget(sha256=artifact.sha256)
         metadata["artifact_url"] = reverse("artifacts-detail", args=[artifact.pk])
         metadata["artifact"] = artifact
     metadata["remote_artifact_url"] = "{}/commit/{}".format(url.rstrip("/"), commit_sha)
+    metadata["sha256"] = artifact.sha256
 
     artifact = metadata["artifact"]
     try:
-        collection_version = await sync_to_async(create_collection_from_importer)(
-            metadata, metadata_only=metadata_only
-        )
-        await sync_to_async(ContentArtifact.objects.get_or_create)(
-            artifact=artifact,
-            content=collection_version,
-            relative_path=collection_version.relative_path,
-        )
+        collection_version, tags = await sync_to_async(create_collection_from_importer)(metadata)
+        await _save_collection_version(collection_version, artifact, tags)
     except ValidationError as e:
         if "unique" in str(e):
             namespace = metadata["metadata"]["namespace"]
@@ -144,10 +159,17 @@ async def declarative_content_from_git_repo(remote, url, git_ref=None, metadata_
         )
 
     # TODO: docs blob support??
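+    # For metadata_only git syncs no Artifact row is saved, so hand the locally
+    # built tarball to later pipeline stages through extra_data; _post_save()
+    # reads MANIFEST.json and meta/runtime.yml straight from this file instead.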
+    extra_data = {}
+    if metadata_only:
+        extra_data["d_artifact_files"] = {d_artifact: artifact_path}
     d_content = DeclarativeContent(
         content=collection_version,
         d_artifacts=[d_artifact],
+        extra_data=extra_data,
     )
     return d_content
@@ -275,16 +294,21 @@ def import_collection(
             artifact = Artifact.from_pulp_temporary_file(temp_file)
             temp_file = None
             importer_result["artifact_url"] = reverse("artifacts-detail", args=[artifact.pk])
-            collection_version = create_collection_from_importer(importer_result)
+            importer_result["sha256"] = artifact.sha256
+            collection_version, tags = create_collection_from_importer(importer_result)
             collection_version.manifest = manifest_data
             collection_version.files = files_data
-            with transaction.atomic:
+            with transaction.atomic():
                 collection_version.save()
                 ContentArtifact.objects.create(
                     artifact=artifact,
                     content=collection_version,
                     relative_path=collection_version.relative_path,
                 )
+                for name in tags:
+                    tag, created = Tag.objects.get_or_create(name=name)
+                    collection_version.tags.add(tag)
+
             _update_highest_version(collection_version)
     except ImporterError as exc:
         log.info(f"Collection processing was not successful: {exc}")
@@ -307,53 +331,45 @@ def import_collection(
     CreatedResource.objects.create(content_object=repository)
 
 
-def create_collection_from_importer(importer_result, metadata_only=False):
+def create_collection_from_importer(importer_result):
     """
     Process results from importer.
+
+    Returns an unsaved CollectionVersion and its tag names; the only database
+    write performed here is the get_or_create of the parent Collection.
     """
     collection_info = importer_result["metadata"]
+    tags = collection_info.pop("tags")
 
-    with transaction.atomic():
-        collection, created = Collection.objects.get_or_create(
-            namespace=collection_info["namespace"], name=collection_info["name"]
-        )
-
-        tags = collection_info.pop("tags")
-
-        # Remove fields not used by this model
-        collection_info.pop("license_file")
-        collection_info.pop("readme")
-
-        # the importer returns many None values. We need to let the defaults in the model prevail
-        for key in ["description", "documentation", "homepage", "issues", "repository"]:
-            if collection_info[key] is None:
-                collection_info.pop(key)
-
-        collection_version = CollectionVersion(
-            collection=collection,
-            **collection_info,
-            requires_ansible=importer_result.get("requires_ansible"),
-            contents=importer_result["contents"],
-            docs_blob=importer_result["docs_blob"],
-        )
+    # Remove fields not used by this model
+    collection_info.pop("license_file")
+    collection_info.pop("readme")
 
-        serializer_fields = CollectionVersionSerializer.Meta.fields
-        data = {k: v for k, v in collection_version.__dict__.items() if k in serializer_fields}
-        data["id"] = collection_version.pulp_id
-        if not metadata_only:
-            data["artifact"] = importer_result["artifact_url"]
+    # the importer returns many None values. We need to let the defaults in the model prevail
+    for key in ["description", "documentation", "homepage", "issues", "repository"]:
+        if collection_info[key] is None:
+            collection_info.pop(key)
 
-        serializer = CollectionVersionSerializer(data=data)
+    collection, created = Collection.objects.get_or_create(
+        namespace=collection_info["namespace"], name=collection_info["name"]
+    )
+    collection_version = CollectionVersion(
+        **collection_info,
+        collection=collection,
+        requires_ansible=importer_result.get("requires_ansible"),
+        contents=importer_result["contents"],
+        docs_blob=importer_result["docs_blob"],
+        sha256=importer_result["sha256"],
+    )
 
-        serializer.is_valid(raise_exception=True)
-        collection_version.save()
+    serializer_fields = CollectionVersionSerializer.Meta.fields
+    data = {k: v for k, v in collection_version.__dict__.items() if k in serializer_fields}
+    data["id"] = collection_version.pulp_id
 
-        for name in tags:
-            tag, created = Tag.objects.get_or_create(name=name)
-            collection_version.tags.add(tag)
+    serializer = CollectionVersionSerializer(data=data)
+    serializer.is_valid(raise_exception=True)
 
-        collection_version.save()  # Save the FK updates
-        return collection_version
+    return collection_version, tags
 
 
 def rebuild_repository_collection_versions_metadata(
@@ -607,6 +622,7 @@ async def _add_collection_version(self, api_version, collection_version_url, met
             setattr(collection_version, attr_name, attr_value)
 
         artifact = metadata["artifact"]
+        collection_version.sha256 = artifact["sha256"]
         d_artifact = DeclarativeArtifact(
             artifact=Artifact(sha256=artifact["sha256"], size=artifact["size"]),
             url=url,
@@ -1107,7 +1123,7 @@ class AnsibleContentSaver(ContentSaver):
     """
 
     def __init__(self, repository_version, *args, **kwargs):
-        """Initialize CollectionContentSaver."""
+        """Initialize AnsibleContentSaver."""
         super().__init__(*args, **kwargs)
         self.repository_version = repository_version
@@ -1124,9 +1140,8 @@ def _pre_save(self, batch):
             if d_content is None:
                 continue
             if isinstance(d_content.content, CollectionVersion):
-                info = d_content.content.natural_key_dict()
                 collection, created = Collection.objects.get_or_create(
-                    namespace=info["namespace"], name=info["name"]
+                    namespace=d_content.content.namespace, name=d_content.content.name
                 )
                 d_content.content.collection = collection
@@ -1161,13 +1176,17 @@ def _post_save(self, batch):
             docs_blob = d_content.extra_data.get("docs_blob", {})
             if docs_blob:
                 collection_version.docs_blob = docs_blob
+            d_artifact_files = d_content.extra_data.get("d_artifact_files", {})
 
             for d_artifact in d_content.d_artifacts:
                 artifact = d_artifact.artifact
-                assert not artifact._state.adding
-                with artifact.file.open() as artifact_file, tarfile.open(
-                    fileobj=artifact_file, mode="r"
-                ) as tar:
+                # TODO change logic when implementing normal on-demand syncing
+                # Special Case for Git sync w/ metadata_only=True
+                if artifact_file_name := d_artifact_files.get(d_artifact):
+                    artifact_file = open(artifact_file_name, mode="rb")
+                else:
+                    artifact_file = artifact.file.open()
+                with tarfile.open(fileobj=artifact_file, mode="r") as tar:
                     runtime_metadata = get_file_obj_from_tarball(
                         tar, "meta/runtime.yml", artifact.file.name, raise_exc=False
                     )
@@ -1186,7 +1205,7 @@ def _post_save(self, batch):
                 collection_version.manifest = manifest_data
                 collection_version.files = files_data
                 info = manifest_data["collection_info"]
-
+                artifact_file.close()
                 # Create the tags
                 tags = info.pop("tags")
                 for name in tags:
diff --git a/pulp_ansible/app/tasks/git.py b/pulp_ansible/app/tasks/git.py
index cfb1474da..ea4798c04 100644
--- a/pulp_ansible/app/tasks/git.py
+++ b/pulp_ansible/app/tasks/git.py
@@ -7,9 +7,16 @@
 from pulpcore.plugin.stages import (
     DeclarativeVersion,
     Stage,
+    QueryExistingArtifacts,
+    QueryExistingContents,
+    ArtifactSaver,
+    RemoteArtifactSaver,
 )
 
 from pulp_ansible.app.models import AnsibleRepository, GitRemote
-from pulp_ansible.app.tasks.collections import declarative_content_from_git_repo
+from pulp_ansible.app.tasks.collections import (
+    declarative_content_from_git_repo,
+    AnsibleContentSaver,
+)
 
 log = logging.getLogger(__name__)
 
@@ -39,10 +46,30 @@ def synchronize(remote_pk, repository_pk, mirror=False):
         _("Synchronizing: repository=%(r)s remote=%(p)s"), {"r": repository.name, "p": remote.name}
     )
     first_stage = GitFirstStage(remote)
-    d_version = DeclarativeVersion(first_stage, repository, mirror=mirror)
+    d_version = GitDeclarativeVersion(first_stage, repository, mirror=mirror)
     return d_version.create()
 
 
+class GitDeclarativeVersion(DeclarativeVersion):
+    """
+    Subclassed Declarative version creates a custom pipeline for Git sync.
+    """
+
+    def pipeline_stages(self, new_version):
+        """
+        Build a list of stages feeding into the ContentUnitAssociation stage.
+        """
+        return [
+            self.first_stage,
+            QueryExistingArtifacts(),
+            ArtifactSaver(),
+            QueryExistingContents(),
+            # TODO: Use DocsBlobDownloader stage for Docs Blob support?
+            AnsibleContentSaver(new_version),
+            RemoteArtifactSaver(),
+        ]
+
+
 class GitFirstStage(Stage):
     """
     The first stage of a pulp_ansible sync pipeline for git repositories.
diff --git a/pulp_ansible/app/tasks/upload.py b/pulp_ansible/app/tasks/upload.py
index 69ea3539c..0ce543d51 100644
--- a/pulp_ansible/app/tasks/upload.py
+++ b/pulp_ansible/app/tasks/upload.py
@@ -46,6 +46,7 @@ def process_collection_artifact(artifact, namespace, name, version):
 
     # Set CollectionVersion metadata
     collection_info = importer_result["metadata"]
+    collection_info["sha256"] = artifact.sha256
 
     with transaction.atomic():
         collection, created = Collection.objects.get_or_create(
diff --git a/pulp_ansible/tests/unit/migrations/test_0057_collection_version_sha256_migrate.py b/pulp_ansible/tests/unit/migrations/test_0057_collection_version_sha256_migrate.py
new file mode 100644
index 000000000..c81f89afe
--- /dev/null
+++ b/pulp_ansible/tests/unit/migrations/test_0057_collection_version_sha256_migrate.py
@@ -0,0 +1,39 @@
+from django.core.files.uploadedfile import SimpleUploadedFile
+
+
+def test_collection_version_sha256_migrate(migrate):
+    apps = migrate([("ansible", "0055_alter_collectionversion_version_alter_role_version")])
+    Artifact = apps.get_model("core", "Artifact")
+    Collection = apps.get_model("ansible", "Collection")
+    CollectionVersion = apps.get_model("ansible", "CollectionVersion")
+
+    artifact = Artifact.objects.create(
+        size=8, sha256="SENTINEL", file=SimpleUploadedFile("foo", b"deadbeef")
+    )
+    collection = Collection.objects.create(
+        namespace="snap",
+        name="crackle",
+    )
+    cv = CollectionVersion.objects.create(
+        pulp_type="collection_version",
+        collection=collection,
+        namespace="snap",
+        name="crackle",
+        version="pop",
+        version_minor="1",
+        version_major="2",
+        version_patch="3",
+    )
+    cv.contentartifact_set.create(artifact=artifact)
+
+    apps = migrate([("ansible", "0056_collectionversion_sha256")])
+    CollectionVersion = apps.get_model("ansible", "CollectionVersion")
+
+    cv = CollectionVersion.objects.get(pk=cv.pk)
+    assert cv.sha256 == ""
+
+    apps = migrate([("ansible", "0057_collectionversion_sha256_migrate")])
+    CollectionVersion = apps.get_model("ansible", "CollectionVersion")
+
+    cv = CollectionVersion.objects.get(pk=cv.pk)
+    assert cv.sha256 == "SENTINEL"
diff --git a/pulp_ansible/tests/unit/test_search.py b/pulp_ansible/tests/unit/test_search.py
index 159504334..7b816298c 100644
--- a/pulp_ansible/tests/unit/test_search.py
+++ b/pulp_ansible/tests/unit/test_search.py
@@ -43,8 +43,9 @@ def setUp(self):
             self.collections[spec]["col"] = col
 
             # make the collection version
-            cv = CollectionVersion(collection=col, namespace=spec[0], name=spec[1], version=spec[2])
-            cv.save()
+            cv = CollectionVersion.objects.create(
+                collection=col, sha256=str(ids), namespace=spec[0], name=spec[1], version=spec[2]
+            )
             self.collections[spec]["cv"] = cv
 
             # add tags ...
diff --git a/pulp_ansible/tests/unit/utils.py b/pulp_ansible/tests/unit/utils.py
index e7cce1577..73f50cae8 100644
--- a/pulp_ansible/tests/unit/utils.py
+++ b/pulp_ansible/tests/unit/utils.py
@@ -45,28 +45,26 @@ def build_cvs_from_specs(specs, build_artifacts=True):
     """Make CVs from a list of [namespace, name, version] specs."""
     collection_versions = []
     for spec in specs:
+        with make_cv_tarball(spec[0], spec[1], spec[2]) as tarfn:
+            with open(tarfn, "rb") as fp:
+                rawbin = fp.read()
+            artifact = Artifact.objects.create(
+                size=os.path.getsize(tarfn),
+                file=SimpleUploadedFile(tarfn, rawbin),
+                **{
+                    algorithm: hashlib.new(algorithm, rawbin).hexdigest()
+                    for algorithm in Artifact.DIGEST_FIELDS
+                },
+            )
+
         col, _ = Collection.objects.get_or_create(name=spec[0])
-        col.save()
-        cv = CollectionVersion(collection=col, namespace=spec[0], name=spec[1], version=spec[2])
-        cv.save()
+        cv = CollectionVersion.objects.create(
+            collection=col, sha256=artifact.sha256, namespace=spec[0], name=spec[1], version=spec[2]
+        )
         collection_versions.append(cv)
 
-        if build_artifacts:
-            with make_cv_tarball(spec[0], spec[1], spec[2]) as tarfn:
-                rawbin = open(tarfn, "rb").read()
-                artifact = Artifact.objects.create(
-                    size=os.path.getsize(tarfn),
-                    file=SimpleUploadedFile(tarfn, rawbin),
-                    **{
-                        algorithm: hashlib.new(algorithm, rawbin).hexdigest()
-                        for algorithm in Artifact.DIGEST_FIELDS
-                    },
-                )
-                artifact.save()
-
-            ca = ContentArtifact.objects.create(
-                artifact=artifact, content=cv, relative_path=cv.relative_path
-            )
-            ca.save()
+        ContentArtifact.objects.create(
+            artifact=artifact, content=cv, relative_path=cv.relative_path
+        )
 
     return collection_versions
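
Usage note: assuming pulpcore's standard pulpcore-manager entry point, the repair
command added above (Django names it after its file) can be run on a deployed
system once the sha256 field exists:

    pulpcore-manager datarepair-ansible-collection-sha256 --dry-run
    pulpcore-manager datarepair-ansible-collection-sha256

With --dry-run it only reports what would be repaired; without it, each computed
digest is saved in its own transaction.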