-
Notifications
You must be signed in to change notification settings - Fork 180
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Synchronizing rpm repositories using channel lookup #9052
Changes from 65 commits
4af1dd0
bcd6506
67a6d2a
8d1f676
4f34429
bb0d1c8
6c145e5
b6f9bb5
5d9d2ce
2236cd2
3183789
7f0a84f
5fae52f
38c98d5
52f3143
f96f8c4
e207e36
7217738
046dc88
e939546
7e17747
d779b6f
4476ce5
39b0884
4324fc7
81d20a7
47c0de3
f1b0f0f
f4908af
060c0d6
433ecad
f836d51
bdd9984
37eb735
71a3d0e
551a4eb
f92bea5
838b53b
270da57
6f703d8
4e4f42c
5ba0ce6
3116c48
2948e0e
8ed594e
512cdf7
a5458d2
eecd794
d7f50ec
28a6643
8ed0baf
df85696
a846bc7
c47c44f
f1f72e5
dfe82bb
0c71bf5
752ba34
d8d3b36
5e52527
40895cd
e436da1
12452a9
13a4a7e
b29ef17
49aee7c
209696e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,18 @@ | ||
# pylint: disable=missing-module-docstring | ||
|
||
import argparse | ||
import logging | ||
from itertools import islice | ||
|
||
from lzreposync import db_utils | ||
from lzreposync.import_utils import import_package_batch | ||
from lzreposync.rpm_repo import RPMRepo | ||
|
||
|
||
# TODO: put this function in a better location | ||
def batched(iterable, n):
    """
    Yield successive tuples of at most ``n`` items taken from ``iterable``.

    The last tuple may hold fewer than ``n`` items, and nothing is yielded for
    an empty iterable. The input is consumed lazily, one batch at a time.

    Raises ValueError when ``n`` is smaller than one. As this is a generator,
    the error surfaces when iteration starts, not when the function is called.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    iterator = iter(iterable)
    # islice() returns an empty tuple once the iterator is exhausted, which
    # ends the loop.
    while batch := tuple(islice(iterator, n)):
        yield batch
|
@@ -25,9 +29,10 @@ def main(): | |
"--url", | ||
"-u", | ||
help="The target url of the remote repository of which we'll " | ||
"parse the metadata", | ||
"parse the metadata", | ||
dest="url", | ||
type=str, | ||
default=None, | ||
) | ||
|
||
parser.add_argument( | ||
|
@@ -66,11 +71,64 @@ def main(): | |
type=int, | ||
) | ||
|
||
parser.add_argument( | ||
"-a", | ||
"--arch", | ||
help="A filter for package architecture. Can be a regex, for example: 'x86_64', '(x86_64|arch_64)'", | ||
default=".*", | ||
dest="arch", | ||
type=str, | ||
) | ||
|
||
parser.add_argument( | ||
"--channel", | ||
help="The channel id of which you want to synchronize repositories", | ||
dest="channel", | ||
type=int, | ||
default=None, | ||
) | ||
|
||
# TODO encapsulate everything in a class LzRepoSync | ||
|
||
args = parser.parse_args() | ||
arch = args.arch | ||
if arch != ".*": | ||
# pylint: disable-next=consider-using-f-string | ||
arch = "(noarch|{})".format(args.arch) | ||
|
||
logging.getLogger().setLevel(args.loglevel) | ||
rpm_repository = RPMRepo(args.name, args.cache, args.url) # TODO args.url should be args.repo, no ? | ||
packages = rpm_repository.get_packages_metadata() # packages is a generator | ||
for batch in batched(packages, args.batch_size): | ||
print(f"Importing a batch of {len(batch)} packages...") | ||
# TODO: complete the import | ||
if args.url: | ||
rpm_repository = RPMRepo(args.name, args.cache, args.url, arch) | ||
packages = rpm_repository.get_packages_metadata() # packages is a generator | ||
failed = 0 | ||
for i, batch in enumerate(batched(packages, args.batch_size)): | ||
failed += import_package_batch(batch, i) | ||
logging.debug("Completed import with %d failed packages", failed) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can probably move this into a function to share it with the channel case. |
||
|
||
else: | ||
# No url specified | ||
if args.channel: | ||
channel_id = args.channel | ||
target_repos = db_utils.get_repositories_by_channel_id(channel_id) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No one knows the channel ID, so you cannot ask for it as a command-line parameter. What users will pass here is the channel label. |
||
for repo in target_repos: | ||
if repo.repo_type == "yum": | ||
rpm_repository = RPMRepo( | ||
repo.repo_label, args.cache, repo.source_url, arch | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use the channel arch instead of |
||
) | ||
logging.debug("Importing package for repo %s", repo.repo_label) | ||
failed = 0 | ||
packages = rpm_repository.get_packages_metadata() | ||
for i, batch in enumerate(batched(packages, args.batch_size)): | ||
failed += import_package_batch(batch, i) | ||
logging.debug( | ||
"Completed import for repo %s with %d failed packages", | ||
repo.repo_label, | ||
failed, | ||
) | ||
|
||
else: | ||
# TODO: handle repositories other than rpm | ||
logging.debug("Not supported repo type: %s", repo.repo_type) | ||
continue | ||
else: | ||
logging.error("Either --url or --channel must be specified") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# pylint: disable=missing-module-docstring | ||
|
||
from typing import List | ||
|
||
from lzreposync.repo_dto import RepoDTO | ||
|
||
|
||
class ChannelDTO:
    """
    Lightweight, temporary holder for a few pieces of channel information:
    its label, its repositories and, optionally, its architecture
    """

    def __init__(self, label, repositories: List[RepoDTO], channel_arch=None):
        # Values are stored as given, without validation or copying
        self.label, self.repositories, self.channel_arch = (
            label,
            repositories,
            channel_arch,
        )
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# pylint: disable=missing-module-docstring | ||
|
||
from lzreposync.repo_dto import RepoDTO | ||
from spacewalk.server import rhnSQL | ||
|
||
|
||
def get_repositories_by_channel_id(channel_id):
    """
    Fetch repositories information from the database, and return a list of
    RepoDTO objects

    channel_id: id of the channel whose content sources are looked up; it is
    converted with int() before being bound to the query
    Return one RepoDTO per content source linked to the channel (an empty list
    when there is none)
    """
    rhnSQL.initDB()
    try:
        h = rhnSQL.prepare(
            """
            select s.id, s.source_url, s.metadata_signed, s.label as repo_label, cst.label as repo_type_label
            from rhnContentSource s,
            rhnChannelContentSource cs,
            rhnContentSourceType cst
            where s.id = cs.source_id
            and cst.id = s.type_id
            and cs.channel_id = :channel_id"""
        )
        h.execute(channel_id=int(channel_id))
        # NOTE(review): fetchall_dict() may return None instead of an empty
        # list when no row matches - guarded here, confirm against rhnSQL.
        sources = h.fetchall_dict() or []
    finally:
        # Always release the connection, even when the query fails
        rhnSQL.closeDB()

    return [
        RepoDTO(
            repo_id=source["id"],
            repo_label=source["repo_label"],
            repo_type=source["repo_type_label"],
            source_url=source["source_url"],
            metadata_signed=source["metadata_signed"],
        )
        for source in sources
    ]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
# pylint: disable=missing-module-docstring | ||
|
||
import gzip | ||
import logging | ||
import os | ||
import re | ||
import shutil | ||
import xml.etree.ElementTree as ET | ||
from xml.dom import pulldom | ||
|
||
|
||
def map_attribute(attr):
    """
    Translate the short attribute names 'ver' and 'rel' into 'version' and
    'release'; any other name is returned unchanged
    """
    long_names = dict(ver="version", rel="release")
    try:
        return long_names[attr]
    except KeyError:
        return attr
|
||
|
||
def cache_xml_node(node, cache_dir):
    """
    Saving the content of the given xml node into a xml file in the given cache directory

    node: of type xml.dom.minidom.Element; its 'pkgid' attribute is used as the
    name of the cache file
    cache_dir: directory receiving the file, created when it does not exist

    Raises ValueError when the 'pkgid' attribute is missing, empty, or is not a
    plain file name.
    """
    pkgid = node.getAttribute("pkgid")
    # pkgid is read from repository metadata and used as a file name: refuse
    # anything that could resolve to a location outside of the cache directory.
    if pkgid in ("", ".", "..") or os.path.basename(pkgid) != pkgid:
        raise ValueError(f"Invalid pkgid for a cache file name: {pkgid!r}")

    xml_content = node.toxml()
    cache_file = os.path.join(cache_dir, pkgid)

    if not os.path.exists(cache_dir):
        logging.debug("Creating cache directory: %s", cache_dir)
    # exist_ok avoids a failure if the directory appears between the check
    # above and its creation.
    os.makedirs(cache_dir, exist_ok=True)

    with open(cache_file, "w", encoding="utf-8") as pkg_files:
        logging.debug("Caching file %s", cache_file)
        pkg_files.write(xml_content)
|
||
|
||
# pylint: disable-next=missing-class-docstring | ||
class FilelistsParser:
    """
    Parser for a repository filelists.xml file (in gzip format): the filelist
    information of each package is cached in its own file, named after the
    package 'pkgid', and read back on demand
    """

    def __init__(self, filelists_file, cache_dir="./.cache", arch_filter=".*"):
        """
        filelists_file: In gzip format
        cache_dir: directory holding one cached file per package
        arch_filter: regex that must fully match the arch of a package for the
        package to be cached
        """
        self.filelists_file = filelists_file
        self.cache_dir = cache_dir
        self.arch_filter = arch_filter
        # The number of packages in the given filelist file. After parsing it
        # holds the raw value of the 'packages' attribute (a string).
        self.num_packages = -1
        self.num_parsed_packages = 0  # The number packages parsed
        self.parsed = False  # Tell whether the filelists file has been parsed or not

    def parse_filelists(self):
        """
        Parse the given filelists.xml file (in gzip format) and save the filelist information
        of each package in a separate file, where the name of the file is the 'pkgid' with no extension,
        for eg the file name should be like: 1c51349b5b35baa58f4941528d25a1306e84b71109051705138dc3577a38bad4
        """
        # Compiled once, as the same filter is applied to every package
        arch_pattern = re.compile(self.arch_filter)

        with gzip.open(self.filelists_file) as gz_filelists:
            doc = pulldom.parse(gz_filelists)
            for event, node in doc:
                if event != pulldom.START_ELEMENT:
                    continue

                if node.tagName == "filelists":
                    # saving the num of packages contained in the filelists file
                    self.num_packages = node.getAttributeNode("packages").value

                elif node.tagName == "package":
                    # Load the whole package subtree so it can be serialized
                    doc.expandNode(node)
                    pkg_arch = node.getAttributeNode("arch").value
                    if arch_pattern.fullmatch(pkg_arch):  # Filter by arch
                        # Save the content of the package's filelist info in cache directory
                        cache_xml_node(node, self.cache_dir)
                        self.num_parsed_packages += 1

        self.parsed = True

    def get_package_filelist(self, pkgid):
        """
        Read the filelist information for the package with the given pkgid,
        parse the information and return a dict containing the filelist info

        The filelists file is parsed first when it has not been yet and no
        cached file exists for the package.
        Return a dict with the keys 'pkgid', 'files' and, when available,
        'version', 'epoch' and 'release'; return None when no filelist
        information can be found for the package.
        """
        filelist_path = os.path.join(self.cache_dir, pkgid)

        if not os.path.exists(filelist_path):
            logging.debug("No filelist file found for package %s", pkgid)
            if not self.parsed:
                logging.debug("Parsing filelists file...")
                self.parse_filelists()  # sets self.parsed
            # Parsing does not guarantee a cached file: the package may be
            # absent from the filelists file or excluded by the arch filter.
            if not os.path.exists(filelist_path):
                logging.error("Couldn't find filelist file for package %s", pkgid)
                return None

        # Read the cached filelist file
        with open(filelist_path, "r", encoding="utf-8") as filelist_xml:
            root = ET.parse(filelist_xml).getroot()

        filelist = {"pkgid": pkgid, "files": []}
        # Setting version information (normally it is the same as the one in primary.xml file for the same package)
        # The first child is read as the version element, every following
        # child as a file entry.
        version_attrs = root[0].attrib if len(root) else {}
        for attr in ("ver", "epoch", "rel"):
            if attr in version_attrs:
                filelist[map_attribute(attr)] = version_attrs[attr]
            else:
                logging.debug("missing %s information for package %s", attr, pkgid)

        filelist["files"].extend(file.text for file in root[1:])

        return filelist

    def clear_cache(self):
        """
        Remove the cached filelist files from the cache directory, including the cache directory
        """
        if os.path.exists(self.cache_dir):
            logging.debug("Removing %s directory and its content", self.cache_dir)
            shutil.rmtree(self.cache_dir)
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not the channel id, but the channel label