Synchronizing rpm repositories using channel lookup #9052

Closed
Commits (67)
4af1dd0
importing the batches of parsed packages
waterflow80 Jun 17, 2024
bcd6506
update the import_package_batch function
waterflow80 Jun 17, 2024
67a6d2a
setting a todo for checking the existence of the package before parsing
waterflow80 Jun 17, 2024
8d1f676
added a set of ignored attributes that will be handled later
waterflow80 Jun 17, 2024
4f34429
update the attribute mapping
waterflow80 Jun 17, 2024
bb0d1c8
map the dependency attribute name with correct name
waterflow80 Jun 17, 2024
6c145e5
cast some of the data to the correct format
waterflow80 Jun 17, 2024
b6f9bb5
update the parser using rpm_header
waterflow80 Jun 17, 2024
5d9d2ce
fix typo
waterflow80 Jun 17, 2024
2236cd2
temporary changes in importLib classes in order to make the import wo…
waterflow80 Jun 17, 2024
3183789
updated the ignored attribute list
waterflow80 Jun 17, 2024
7f0a84f
fixed xml element's value read problem
waterflow80 Jun 19, 2024
5fae52f
some updates in fake data usage for the parser
waterflow80 Jun 19, 2024
38c98d5
created a filelists parser that parses the content of the filelists.x…
waterflow80 Jun 19, 2024
52f3143
created a metadata_parser that uses both primary_parser and filelists…
waterflow80 Jun 19, 2024
f96f8c4
update the import call arguments
waterflow80 Jul 1, 2024
e207e36
ignoring packages with arch: "aarch64_ilp32". importLib couldn't proc…
waterflow80 Jul 1, 2024
7217738
some comments and logs
waterflow80 Jul 1, 2024
046dc88
check for batch size before import
waterflow80 Jul 1, 2024
e939546
setting some exceptions and todos
waterflow80 Jul 1, 2024
7e17747
update imports
waterflow80 Jul 1, 2024
d779b6f
updated the way we handle complex rpm attributes
waterflow80 Jul 1, 2024
4476ce5
tiny reformat for readability
waterflow80 Jul 1, 2024
39b0884
set the non-empty condition for the packages dict objects before pars…
waterflow80 Jul 1, 2024
4324fc7
update some comments and logging
waterflow80 Jul 1, 2024
81d20a7
updated the way we parse complex attribute elements
waterflow80 Jul 1, 2024
47c0de3
updated the way we set text element in primary parser
waterflow80 Jul 1, 2024
f1b0f0f
tiny reformat of the primary parser
waterflow80 Jul 1, 2024
f4908af
setting fake data for currently unknown attributes
waterflow80 Jul 1, 2024
060c0d6
removed unused exception
waterflow80 Jul 1, 2024
433ecad
Mapping the rpm flag values to the right numbers
waterflow80 Jul 2, 2024
f836d51
added information about the number of failed packages
waterflow80 Jul 2, 2024
bdd9984
separated the creation/formatting of packages from the importing of p…
waterflow80 Jul 2, 2024
37eb735
update exceptions
waterflow80 Jul 2, 2024
71a3d0e
set back the initial __init__DB() parameters (previously we've tempor…
waterflow80 Jul 3, 2024
551a4eb
set back the initial pg db parameters (previously we've temporarily h…
waterflow80 Jul 3, 2024
f92bea5
update logging
waterflow80 Jul 3, 2024
838b53b
update imports
waterflow80 Jul 3, 2024
270da57
currently ignore any batch importing error and just skip to the next
waterflow80 Jul 3, 2024
6f703d8
add the import_signature boolean parameter in PackageImport class con…
waterflow80 Jul 3, 2024
4e4f42c
reformatted the code using pylint
waterflow80 Jul 4, 2024
5ba0ce6
reformatted the packageImport.py code using pylint
waterflow80 Jul 4, 2024
3116c48
updated the exception handling for failing packages - still needs review
waterflow80 Jul 4, 2024
2948e0e
fixed linting format
waterflow80 Jul 4, 2024
8ed594e
removed old tests and planned new tests
waterflow80 Jul 4, 2024
512cdf7
fix linting for test file
waterflow80 Jul 4, 2024
a5458d2
ignoring fullFileList function
waterflow80 Jul 4, 2024
eecd794
reformat logging and fixed linting
waterflow80 Jul 4, 2024
d7f50ec
added an important todo
waterflow80 Jul 4, 2024
28a6643
code optimization
waterflow80 Jul 5, 2024
8ed0baf
update logging
waterflow80 Jul 5, 2024
df85696
added a todo
waterflow80 Jul 5, 2024
a846bc7
update the cache dir path for filelists
waterflow80 Jul 9, 2024
c47c44f
removed unused primary test file
waterflow80 Jul 9, 2024
f1f72e5
added test cases for primary parser, filelist parser, and metadata pa…
waterflow80 Jul 9, 2024
dfe82bb
deleted unnecessary xml test files
waterflow80 Jul 9, 2024
0c71bf5
implemented the 'filter with arch' functionality for rpm packages import
waterflow80 Jul 9, 2024
752ba34
added unit tests for the rpm packages import with arch filter
waterflow80 Jul 9, 2024
d8d3b36
added 'noarch' to be always included in the import
waterflow80 Jul 9, 2024
5e52527
completed parsing the location/href => remote_path attribute from pri…
waterflow80 Jul 10, 2024
40895cd
Corrected an error log message
waterflow80 Jul 10, 2024
e436da1
updated the given url path, and updated the remote_url value with the…
waterflow80 Jul 11, 2024
12452a9
some url formatting
waterflow80 Jul 14, 2024
13a4a7e
explicitly specify argument names when calling a method
waterflow80 Jul 14, 2024
b29ef17
synchronizing repositories of a given channel given the channel id
waterflow80 Jul 14, 2024
49aee7c
extracted the import into a reusable function
waterflow80 Jul 15, 2024
209696e
change the arch filter to use the channel_arch value instead
waterflow80 Jul 15, 2024
2 changes: 1 addition & 1 deletion python/lzreposync/pyproject.toml
@@ -1,6 +1,6 @@
[build-system]
requires = ["setuptools", "setuptools-scm"]
-bild-backend = "setuptools.build_meta"
+build-backend = "setuptools.build_meta"

[project]
name = "lzreposync"
72 changes: 65 additions & 7 deletions python/lzreposync/src/lzreposync/__init__.py
@@ -1,14 +1,18 @@
 # pylint: disable=missing-module-docstring

 import argparse
 import logging
 from itertools import islice

+from lzreposync import db_utils
+from lzreposync.import_utils import import_package_batch
 from lzreposync.rpm_repo import RPMRepo


 # TODO: put this function in a better location
 def batched(iterable, n):
     if n < 1:
-        raise ValueError('n must be at least one')
+        raise ValueError("n must be at least one")
     iterator = iter(iterable)
     while batch := tuple(islice(iterator, n)):
         yield batch
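For readers unfamiliar with the pattern, the `batched` helper added here slices any iterable into fixed-size tuples so packages can be imported in chunks. A standalone sketch of the same logic:

```python
from itertools import islice


def batched(iterable, n):
    """Yield successive n-sized tuples from any iterable (the last may be shorter)."""
    if n < 1:
        raise ValueError("n must be at least one")
    iterator = iter(iterable)
    # The walrus expression stops cleanly once islice() yields an empty tuple
    while batch := tuple(islice(iterator, n)):
        yield batch


packages = (f"pkg-{i}" for i in range(7))  # stand-in for the package generator
print([len(b) for b in batched(packages, 3)])  # → [3, 3, 1]
```

Because the input is consumed lazily, this works on the package-metadata generator without loading the whole repository into memory.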
@@ -25,9 +29,10 @@ def main():
         "--url",
         "-u",
         help="The target url of the remote repository of which we'll "
         "parse the metadata",
         dest="url",
         type=str,
+        default=None,
     )

parser.add_argument(
@@ -66,11 +71,64 @@ def main():
         type=int,
     )

+    parser.add_argument(
+        "-a",
+        "--arch",
+        help="A filter for package architecture. Can be a regex, for example: 'x86_64', '(x86_64|arch_64)'",
+        default=".*",
+        dest="arch",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--channel",
+        help="The channel id of which you want to synchronize repositories",
[Review comment, Contributor]: Not the channel id, but the channel label

+        dest="channel",
+        type=int,
+        default=None,
+    )
+
+    # TODO encapsulate everything in a class LzRepoSync
+
     args = parser.parse_args()
+    arch = args.arch
+    if arch != ".*":
+        # pylint: disable-next=consider-using-f-string
+        arch = "(noarch|{})".format(args.arch)

     logging.getLogger().setLevel(args.loglevel)
-    rpm_repository = RPMRepo(args.name, args.cache, args.url)  # TODO args.url should be args.repo, no ?
-    packages = rpm_repository.get_packages_metadata()  # packages is a generator
-    for batch in batched(packages, args.batch_size):
-        print(f"Importing a batch of {len(batch)} packages...")
-        # TODO: complete the import
+    if args.url:
+        rpm_repository = RPMRepo(args.name, args.cache, args.url, arch)
+        packages = rpm_repository.get_packages_metadata()  # packages is a generator
+        failed = 0
+        for i, batch in enumerate(batched(packages, args.batch_size)):
+            failed += import_package_batch(batch, i)
+        logging.debug("Completed import with %d failed packages", failed)
[Review comment, Contributor]: You can probably move this into a function to share it with the channel case.
+    else:
+        # No url specified
+        if args.channel:
+            channel_id = args.channel
+            target_repos = db_utils.get_repositories_by_channel_id(channel_id)
[Review comment, Contributor]: No one knows the channel ID, you cannot ask that as a command line parameter. What users will pass here is the channel label.
+            for repo in target_repos:
+                if repo.repo_type == "yum":
+                    rpm_repository = RPMRepo(
+                        repo.repo_label, args.cache, repo.source_url, arch
[Review comment, Contributor]: Use the channel arch instead of arch here: the architecture for a channel is provided by the database.
+                    )
+                    logging.debug("Importing package for repo %s", repo.repo_label)
+                    failed = 0
+                    packages = rpm_repository.get_packages_metadata()
+                    for i, batch in enumerate(batched(packages, args.batch_size)):
+                        failed += import_package_batch(batch, i)
+                    logging.debug(
+                        "Completed import for repo %s with %d failed packages",
+                        repo.repo_label,
+                        failed,
+                    )
+
+                else:
+                    # TODO: handle repositories other than rpm
+                    logging.debug("Not supported repo type: %s", repo.repo_type)
+                    continue
+        else:
+            logging.error("Either --url or --channel must be specified")
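The `--arch` handling above wraps any non-default pattern so that 'noarch' packages always pass the filter. A minimal sketch of that behavior (the `build_arch_filter` name is illustrative, not from the PR):

```python
import re


def build_arch_filter(arch=".*"):
    """Mirror the CLI logic: wrap a non-default pattern so 'noarch' always matches."""
    if arch != ".*":
        # pylint: disable-next=consider-using-f-string
        arch = "(noarch|{})".format(arch)
    return arch


pattern = build_arch_filter("x86_64")
print(pattern)                                 # → (noarch|x86_64)
print(bool(re.fullmatch(pattern, "noarch")))   # → True
print(bool(re.fullmatch(pattern, "aarch64")))  # → False
```

Note that `re.fullmatch` (used later in the filelists parser) anchors the whole string, so "aarch64" is rejected even though it contains "x86_64"'s alternation siblings nowhere; a bare `re.search` would behave differently.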
16 changes: 16 additions & 0 deletions python/lzreposync/src/lzreposync/channel_dto.py
@@ -0,0 +1,16 @@
# pylint: disable=missing-module-docstring

from typing import List

from lzreposync.repo_dto import RepoDTO


class ChannelDTO:
    """
    A temporary data structure to hold some minor channel information
    """

    def __init__(self, label, repositories: List[RepoDTO], channel_arch=None):
        self.label = label
        self.repositories = repositories
        self.channel_arch = channel_arch
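A hypothetical illustration of how the two DTOs compose — the `RepoDTO` stand-in and the channel label below are made up for the example, since only `ChannelDTO` is shown in this file:

```python
from typing import List


class RepoDTO:  # simplified stand-in for lzreposync.repo_dto.RepoDTO
    def __init__(self, repo_label, source_url):
        self.repo_label = repo_label
        self.source_url = source_url


class ChannelDTO:
    """A temporary data structure to hold some minor channel information"""

    def __init__(self, label, repositories: List[RepoDTO], channel_arch=None):
        self.label = label
        self.repositories = repositories
        self.channel_arch = channel_arch


# A channel aggregates its repositories plus the arch used for filtering
channel = ChannelDTO(
    "example-channel-x86_64",
    [RepoDTO("repo-1", "https://example.com/repo1")],
    channel_arch="x86_64",
)
print(len(channel.repositories))  # → 1
```

Carrying `channel_arch` on the DTO is what lets the sync code filter packages by the channel's architecture rather than a user-supplied regex, as the review comments suggest.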
37 changes: 37 additions & 0 deletions python/lzreposync/src/lzreposync/db_utils.py
@@ -0,0 +1,37 @@
# pylint: disable=missing-module-docstring

from lzreposync.repo_dto import RepoDTO
from spacewalk.server import rhnSQL


def get_repositories_by_channel_id(channel_id):
    """
    Fetch repositories information from the database, and return a list of
    RepoDTO objects
    """
    rhnSQL.initDB()
    h = rhnSQL.prepare(
        """
        select s.id, s.source_url, s.metadata_signed, s.label as repo_label, cst.label as repo_type_label
        from rhnContentSource s,
             rhnChannelContentSource cs,
             rhnContentSourceType cst
        where s.id = cs.source_id
          and cst.id = s.type_id
          and cs.channel_id = :channel_id"""
[Review comment, Contributor]: You could join with the rhnChannel table and get the repositories by channel label. In which case you would need to copy the channel architecture in each RepoDTO.
[Reply, Collaborator (author)]: done
    )
    h.execute(channel_id=int(channel_id))
    sources = h.fetchall_dict()
    repositories = map(
        lambda source: RepoDTO(
            repo_id=source["id"],
            repo_label=source["repo_label"],
            repo_type=source["repo_type_label"],
            source_url=source["source_url"],
            metadata_signed=source["metadata_signed"],
        ),
        sources,
    )
    rhnSQL.closeDB()

    return list(repositories)
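The row-to-DTO mapping can be exercised without a database by feeding `fetchall_dict()`-shaped rows through the same `map`/`lambda` pattern. The `RepoDTO` stand-in and sample row below are illustrative, not the project's real class:

```python
class RepoDTO:  # minimal stand-in mirroring the fields used in db_utils
    def __init__(self, repo_id, repo_label, repo_type, source_url, metadata_signed):
        self.repo_id = repo_id
        self.repo_label = repo_label
        self.repo_type = repo_type
        self.source_url = source_url
        self.metadata_signed = metadata_signed


# Rows shaped like rhnSQL's fetchall_dict() output for the query above
sources = [
    {"id": 1, "repo_label": "repo-a", "repo_type_label": "yum",
     "source_url": "https://example.com/a", "metadata_signed": "Y"},
]

repositories = list(
    map(
        lambda source: RepoDTO(
            repo_id=source["id"],
            repo_label=source["repo_label"],
            repo_type=source["repo_type_label"],
            source_url=source["source_url"],
            metadata_signed=source["metadata_signed"],
        ),
        sources,
    )
)
print(repositories[0].repo_type)  # → yum
```

Materializing the `map` with `list(...)` before `rhnSQL.closeDB()` matters: a lazy map over cursor rows evaluated after the connection closes would fail.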
120 changes: 120 additions & 0 deletions python/lzreposync/src/lzreposync/filelists_parser.py
@@ -0,0 +1,120 @@
# pylint: disable=missing-module-docstring

import gzip
import logging
import os
import re
import shutil
import xml.etree.ElementTree as ET
from xml.dom import pulldom


def map_attribute(attr):
    attr_map = {"ver": "version", "rel": "release"}
    return attr_map.get(attr, attr)


def cache_xml_node(node, cache_dir):
    """
    Save the content of the given xml node into an xml file in the given cache directory
    node: of type xml.dom.minidom.Element
    """
    pkgid = node.getAttributeNode("pkgid").value

    xml_content = node.toxml()
    cache_file = os.path.join(cache_dir, pkgid)

    if not os.path.exists(cache_dir):
        logging.debug("Creating cache directory: %s", cache_dir)
        os.makedirs(cache_dir)

    with open(cache_file, "w", encoding="utf-8") as pkg_files:
        logging.debug("Caching file %s", cache_file)
        pkg_files.write(xml_content)

# pylint: disable-next=missing-class-docstring
class FilelistsParser:
    def __init__(self, filelists_file, cache_dir="./.cache", arch_filter=".*"):
        """
        filelists_file: In gzip format
        """
        self.filelists_file = filelists_file
        self.cache_dir = cache_dir
        self.arch_filter = arch_filter
        self.num_packages = -1  # The number of packages in the given filelists file
        self.num_parsed_packages = 0  # The number of packages parsed
        self.parsed = False  # Tells whether the filelists file has been parsed or not

    def parse_filelists(self):
        """
        Parse the given filelists.xml file (in gzip format) and save the filelist information
        of each package in a separate file, where the name of the file is the 'pkgid' with no
        extension, e.g.: 1c51349b5b35baa58f4941528d25a1306e84b71109051705138dc3577a38bad4
        """

        with gzip.open(self.filelists_file) as gz_filelists:
            doc = pulldom.parse(gz_filelists)
            for event, node in doc:
                if event == pulldom.START_ELEMENT and node.tagName == "filelists":
                    # Save the number of packages contained in the filelists file
                    num_packages = node.getAttributeNode("packages").value
                    self.num_packages = num_packages
                elif event == pulldom.START_ELEMENT and node.tagName == "package":
                    doc.expandNode(node)
                    pkg_arch = node.getAttributeNode("arch").value
                    if re.fullmatch(self.arch_filter, pkg_arch):  # Filter by arch
                        # Save the package's filelist info in the cache directory
                        cache_xml_node(node, self.cache_dir)
                        self.num_parsed_packages += 1

        self.parsed = True

    def get_package_filelist(self, pkgid):
        """
        Read the filelist information for the package with the given pkgid,
        parse the information and return a dict containing the filelist info
        """

        filelist_path = os.path.join(self.cache_dir, pkgid)

        # Read the cached filelist file
        if not os.path.exists(filelist_path):
            logging.debug("No filelist file found for package %s", pkgid)
            if not self.parsed:
                logging.debug("Parsing filelists file...")
                self.parse_filelists()
                self.parsed = True
            else:
                logging.error("Couldn't find filelist file for package %s", pkgid)
                return None

        with open(filelist_path, "r", encoding="utf-8") as filelist_xml:
            tree = ET.parse(filelist_xml)
            root = tree.getroot()

        filelist = {}
        filelist["pkgid"] = pkgid
        filelist["files"] = []
        # Set version information (normally the same as in the primary.xml file for the same package)
        for attr in ("ver", "epoch", "rel"):
            try:
                filelist[map_attribute(attr)] = root[0].attrib[attr]
            except KeyError as key:
                logging.debug("missing %s information for package %s", key, pkgid)

        for file in root[1:]:
            filelist["files"].append(file.text)

        return filelist

    def clear_cache(self):
        """
        Remove the cached filelist files from the cache directory, including the cache directory itself
        """
        if os.path.exists(self.cache_dir):
            logging.debug("Removing %s directory and its content", self.cache_dir)
            shutil.rmtree(self.cache_dir)
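The streaming approach used by `FilelistsParser` can be demonstrated end to end on a tiny synthetic filelists.xml: this sketch gzips an in-memory document to a temp file and runs the same pulldom event loop with the arch filter. The file contents and pkgids are made up, and the real filelists.xml carries an xmlns that is omitted here for brevity:

```python
import gzip
import re
import tempfile
from xml.dom import pulldom

XML = """<filelists packages="2">
  <package pkgid="abc" name="foo" arch="x86_64">
    <version epoch="0" ver="1.0" rel="1"/>
    <file>/usr/bin/foo</file>
  </package>
  <package pkgid="def" name="bar" arch="s390x">
    <version epoch="0" ver="2.0" rel="1"/>
    <file>/usr/bin/bar</file>
  </package>
</filelists>"""

# Write a gzipped filelists file, as the parser expects
with tempfile.NamedTemporaryFile(suffix=".xml.gz", delete=False) as tmp:
    tmp.write(gzip.compress(XML.encode("utf-8")))
    path = tmp.name

matched = []
with gzip.open(path) as gz:
    doc = pulldom.parse(gz)
    for event, node in doc:
        if event == pulldom.START_ELEMENT and node.tagName == "package":
            doc.expandNode(node)  # pull the whole <package> subtree into memory
            if re.fullmatch("(noarch|x86_64)", node.getAttributeNode("arch").value):
                matched.append(node.getAttributeNode("pkgid").value)

print(matched)  # → ['abc']
```

Only one `<package>` subtree is expanded at a time, which is what keeps memory bounded on multi-hundred-megabyte filelists files; the s390x package is skipped by the same `re.fullmatch` arch filter the parser uses.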
97 changes: 0 additions & 97 deletions python/lzreposync/src/lzreposync/importUtils.py

This file was deleted.
