From 7e8e1f7dfe081d204cfeebea756bd43933ef38a4 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 20 Sep 2023 12:56:34 -0400 Subject: [PATCH] Refactor for better control and to fix unit tests. --- dcicutils/license_utils.py | 111 ++++++++++++++++++++----------------- test/test_license_utils.py | 90 +++++++++++++++++++++++++++--- 2 files changed, 140 insertions(+), 61 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index 14dd74ca1..d37e9a3f9 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -33,6 +33,7 @@ from dcicutils.lang_utils import there_are from dcicutils.misc_utils import ( PRINT, get_error_message, ignorable, ignored, json_file_contents, local_attrs, environ_bool, + remove_suffix, ) T = TypeVar("T") @@ -65,13 +66,16 @@ class LicenseStatus: UNEXPECTED_MISSING = "UNEXPECTED_MISSING" -LICENSE_UTILS_VERBOSE = environ_bool("LICENSE_UTILS_VERBOSE", default=False) +class LicenseOptions: + # General verbosity, such as progress information + VERBOSE = environ_bool("LICENSE_UTILS_VERBOSE", default=True) + # Specific additional debugging output + DEBUG = environ_bool("LICENSE_UTILS_DEBUG", default=False) class LicenseFramework: NAME = None - VERBOSE = LICENSE_UTILS_VERBOSE @classmethod def get_dependencies(cls): @@ -135,29 +139,6 @@ def all_frameworks(cls): return sorted(cls.LICENSE_FRAMEWORKS.values(), key=lambda x: x.NAME) -def extract_boolean_terms(boolean_expression: str, for_package_name: str) -> List[str]: - # We only care which licenses were mentioned, not what algebra is used on them. - # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) - # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". - # If all of those licenses match, all is good. That _does_ mean some things like (MIT OR GPL-3.0) will - # have trouble passing unless both MIT and GPL-3.0 are allowed. - terms = sorted(map(lambda x: x.strip(), - (boolean_expression - .replace('(', '') - .replace(')', '') - .replace(' AND ', ',') - .replace(' and ', ',') - .replace(' & ', ',') - .replace(' OR ', ',') - .replace(' or ', ',') - .replace('|', ',') - .replace(';', ',') - .replace(' + ', ',') - .replace('file ', f'Custom: {for_package_name} file ') - ).split(','))) - return terms - - # This is intended to match ' (= 3)', ' (>= 3)', ' (version 3)', ' (version 3 or greater)' # It will incidentally and harmlessly also take ' (>version 3)' or '(>= 3 or greater)'. # It will also correctly handle the unlikely case of ' (= 3 or greater)' @@ -168,7 +149,7 @@ def extract_boolean_terms(boolean_expression: str, for_package_name: str) -> Lis _GPL_VERSION_CHOICE = re.compile('^GPL-v?([0-9.+]) (?:OR|[|]) GPL-v?([0-9.+])$') -def simplify_license_versions(licenses_spec: str, *, for_package_name, verbose: bool = False) -> str: +def simplify_license_versions(licenses_spec: str, *, for_package_name) -> str: m = _GPL_VERSION_CHOICE.match(licenses_spec) if m: version_a, version_b = m.groups() @@ -207,18 +188,58 @@ def simplify_license_versions(licenses_spec: str, *, for_package_name, verbose: break matched = m.group(1) licenses_spec = licenses_spec.replace(matched, '+') - if verbose and licenses_spec != original_licenses_spec: - print(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") + if LicenseOptions.DEBUG and licenses_spec != original_licenses_spec: + PRINT(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") return licenses_spec +def extract_boolean_terms(boolean_expression: str, for_package_name: str) -> List[str]: + # We only care which licenses were mentioned, not what algebra is used on them. + # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) + # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". + # If all of those licenses match, all is good. That _does_ mean some things like (MIT OR GPL-3.0) will + # have trouble passing unless both MIT and GPL-3.0 are allowed. + revised_boolean_expression = ( + boolean_expression + .replace('(', '') + .replace(')', '') + .replace(' AND ', ',') + .replace(' and ', ',') + .replace(' & ', ',') + .replace(' OR ', ',') + .replace(' or ', ',') + .replace('|', ',') + .replace(';', ',') + .replace(' + ', ',') + .replace('file ', f'Custom: {for_package_name} file ') + ) + terms = [x for x in sorted(map(lambda x: x.strip(), revised_boolean_expression.split(','))) if x] + if LicenseOptions.DEBUG and revised_boolean_expression != boolean_expression: + PRINT(f"Rewriting {boolean_expression!r} as {terms!r}.") + return terms + + @LicenseFrameworkRegistry.register_framework(name='javascript') class JavascriptLicenseFramework(LicenseFramework): @classmethod def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: ignored(package_name) - return extract_boolean_terms(licenses_spec, for_package_name=package_name) + licenses_spec = simplify_license_versions(licenses_spec, for_package_name=package_name) + licenses = extract_boolean_terms(licenses_spec, for_package_name=package_name) + return licenses + + VERSION_PATTERN = re.compile('^.+?([@][0-9.][^@]*|)$') + + @classmethod + def strip_version(cls, raw_name): + name = raw_name + m = cls.VERSION_PATTERN.match(raw_name) # e.g., @foo/bar@3.7 + if m: + suffix = m.group(1) + if suffix: + name = remove_suffix(m.group(1), name) + return name @classmethod def get_dependencies(cls): @@ -231,18 +252,13 @@ def get_dependencies(cls): # e.g., this happens if there's no javascript in the repo raise Exception("No javascript license data was found.") result = [] - for name, record in records.items(): - licenses_spec = record.get(_LICENSES) - if '(' in licenses_spec: - licenses = cls.implicated_licenses(package_name=name, licenses_spec=licenses_spec) - # print(f"Rewriting {licenses_spec!r} as {licenses!r}") - elif licenses_spec: - licenses = [licenses_spec] - else: - licenses = [] + for raw_name, record in records.items(): + name = cls.strip_version(raw_name) + raw_licenses_spec = record.get(_LICENSES) + licenses = cls.implicated_licenses(licenses_spec=raw_licenses_spec, package_name=name) entry = { - _NAME: name.lstrip('@').split('@')[0], # e.g., @foo/bar@3.7 - _LICENSES: licenses, # TODO: could parse this better. + _NAME: name, + _LICENSES: licenses, _FRAMEWORK: 'javascript' } result.append(entry) @@ -322,11 +338,6 @@ def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: return [cls.R_LANGUAGE_LICENSE_NAME] licenses_spec = simplify_license_versions(licenses_spec, for_package_name=package_name) licenses = extract_boolean_terms(licenses_spec, for_package_name=package_name) - # licenses = sorted(map(lambda x: x.strip(), - # (licenses_spec - # .replace('|', ',') - # .replace('file ', f'Custom: {package_name} file ') - # ).split(','))) return licenses @classmethod @@ -367,7 +378,7 @@ def get_dependencies(cls): result.append(entry) except Exception as e: found_problems += 1 - if cls.VERBOSE: + if LicenseOptions.VERBOSE: PRINT(get_error_message(e)) if found_problems > 0: warnings.warn(there_are(found_problems, kind="problem", show=False, punctuate=True, tense='past')) @@ -376,8 +387,6 @@ def get_dependencies(cls): class LicenseFileParser: - VERBOSE = LICENSE_UTILS_VERBOSE - SEPARATORS = '-.,' SEPARATORS_AND_WHITESPACE = SEPARATORS + ' \t' COPYRIGHT_SYMBOL = '\u00a9' @@ -414,7 +423,7 @@ def parse_simple_license_file(cls, *, filename): lines = [] for i, line in enumerate(fp): line = line.strip(' \t\n\r') - if cls.VERBOSE: # pragma: no cover - this is just for debugging + if LicenseOptions.DEBUG: # pragma: no cover - this is just for debugging PRINT(str(i).rjust(3), line) m = cls.COPYRIGHT_LINE.match(line) if line[:1].isupper() else None if not m: @@ -506,8 +515,6 @@ class LicenseChecker: # Set this to True in subclasses if you want your organization's policy to be that you see # some visible proof of which licenses were checked. - VERBOSE = True - LICENSE_TITLE = None COPYRIGHT_OWNER = None LICENSE_FRAMEWORKS = None @@ -630,7 +637,7 @@ def analyze_license_dependencies_for_framework(cls, *, _LICENSES: license_names, _STATUS: status }) - if cls.VERBOSE: # pragma: no cover - this is just for debugging + if LicenseOptions.VERBOSE: # pragma: no cover - this is just for debugging PRINT(f"Checked {framework.NAME} {name}:" f" {'; '.join(license_names) if license_names else '---'} ({status})") diff --git a/test/test_license_utils.py b/test/test_license_utils.py index 01d898eff..bb87ce8d8 100644 --- a/test/test_license_utils.py +++ b/test/test_license_utils.py @@ -8,11 +8,12 @@ from collections import defaultdict from dcicutils.license_utils import ( - LicenseFrameworkRegistry, LicenseFramework, + LicenseOptions, LicenseFrameworkRegistry, LicenseFramework, PythonLicenseFramework, JavascriptLicenseFramework, CondaLicenseFramework, RLicenseFramework, LicenseAnalysis, LicenseChecker, LicenseStatus, LicenseFileParser, LicenseCheckFailure, LicenseOwnershipCheckFailure, LicenseAcceptabilityCheckFailure, warnings as license_utils_warnings_module, + extract_boolean_terms, simplify_license_versions, ) from dcicutils.misc_utils import ignored, file_contents, local_attrs from dcicutils.qa_utils import printed_output, MockFileSystem @@ -187,11 +188,82 @@ class DummyLicenseFramework1(LicenseFramework): LicenseFrameworkRegistry.find_framework(1) # noQA - arg is intentionally of wrong type for testing -def test_javascript_license_framework_implicated_licenses(): +def test_javascript_license_framework_strip_version(): + + print() # start on fresh line + + strip_version = JavascriptLicenseFramework.strip_version + + assert strip_version('') == '' + + assert strip_version('foo') == 'foo' + assert strip_version('foo@bar') == 'foo@bar' + + assert strip_version('foo@3') == 'foo' + assert strip_version('foo@3.1') == 'foo' + assert strip_version('foo@3.1.0') == 'foo' + assert strip_version('foo@3.1.0b3') == 'foo' + assert strip_version('foo@3.1-beta') == 'foo' + + assert strip_version("@foo-3.1-beta") == '@foo-3.1-beta' # we don't treat leading '@' as a version marker + assert strip_version('foo@.9') == 'foo' # we tolerate a leading dot even though it's probably bad form + assert strip_version('foo@beta-3.9') == 'foo@beta-3.9' # treating suffix as version here is farther than we'll go + + +@pytest.mark.parametrize('debug', [False, True]) +def test_simplify_license_versions(debug): + + def test_it(spec, expected): + with local_attrs(LicenseOptions, DEBUG=True): + with printed_output() as printed: + assert simplify_license_versions(spec, for_package_name='ignored') == expected + assert printed.last == f"Rewriting {spec!r} as {expected!r}." + + test_it('GPL (version 2)', 'GPL-2') + test_it('GPL (version 2.0)', 'GPL-2.0') + test_it('GPL (= 2.0)', 'GPL-2.0') + test_it('GPL (= 2.1)', 'GPL-2.1') + + test_it('GPL (>= 2)', 'GPL-2+') + test_it('GPL (>= 2.0)', 'GPL-2.0+') + test_it('GPL (version 2 or greater)', 'GPL-2+') + test_it('GPL (version 2 or later)', 'GPL-2+') + + +@pytest.mark.parametrize('debug', [False, True]) +def test_extract_boolean_terms(debug): + + print() # start on a blank line def check_implications(spec, implications): - assert JavascriptLicenseFramework.implicated_licenses(package_name='ignored', - licenses_spec=spec) == implications + with local_attrs(LicenseOptions, DEBUG=debug): + with printed_output() as printed: + assert extract_boolean_terms(spec, for_package_name='ignored') == implications + assert printed.lines == ([f"Rewriting {spec!r} as {implications!r}."] if debug else []) + + check_implications(spec='(MIT AND BSD-3-Clause)', implications=['BSD-3-Clause', 'MIT']) + check_implications(spec='(CC-BY-4.0 AND OFL-1.1 AND MIT)', implications=['CC-BY-4.0', 'MIT', 'OFL-1.1']) + + check_implications(spec='(MIT OR Apache-2.0)', implications=['Apache-2.0', 'MIT']) + + check_implications(spec='(FOO OR (BAR AND BAZ))', implications=['BAR', 'BAZ', 'FOO']) + + sample_package = 'some-package' + assert extract_boolean_terms('MIT or file FOO', for_package_name=sample_package) == [ + f'Custom: {sample_package} file FOO', + 'MIT', + ] + + +@pytest.mark.parametrize('debug', [False, True]) +def test_javascript_license_framework_implicated_licenses(debug): + + def check_implications(spec, implications): + with local_attrs(LicenseOptions, DEBUG=debug): + with printed_output() as printed: + assert JavascriptLicenseFramework.implicated_licenses(package_name='ignored', + licenses_spec=spec) == implications + assert printed.lines == ([f"Rewriting {spec!r} as {implications!r}."] if debug else []) check_implications(spec='(MIT AND BSD-3-Clause)', implications=['BSD-3-Clause', 'MIT']) check_implications(spec='(CC-BY-4.0 AND OFL-1.1 AND MIT)', implications=['CC-BY-4.0', 'MIT', 'OFL-1.1']) @@ -201,10 +273,10 @@ def check_implications(spec, implications): check_implications(spec='(FOO OR (BAR AND BAZ))', implications=['BAR', 'BAZ', 'FOO']) -@pytest.mark.parametrize('verbose', [False, True]) -def test_javascript_license_framework_get_licenses(verbose): +@pytest.mark.parametrize('debug', [False, True]) +def test_javascript_license_framework_get_licenses(debug): - with local_attrs(LicenseFramework, REWRITE_VERBOSE=verbose): + with local_attrs(LicenseOptions, DEBUG=debug): print() # start on a fresh line packages = {} for i, license in enumerate(['Apache-2.0', 'MIT', '(MIT OR Apache-2.0)', ''], start=1): @@ -227,8 +299,8 @@ def test_javascript_license_framework_get_licenses(verbose): {'framework': 'javascript', 'licenses': ['Apache-2.0', 'MIT'], 'name': 'package3'}, {'framework': 'javascript', 'licenses': [], 'name': 'package4'}, ] - expected_rewrite_description = "Rewriting '(MIT OR Apache-2.0)' as ['Apache-2.0', 'MIT']" - assert printed.lines == ([expected_rewrite_description] if verbose else []) + expected_rewrite_description = "Rewriting '(MIT OR Apache-2.0)' as ['Apache-2.0', 'MIT']." + assert printed.lines == ([expected_rewrite_description] if debug else []) # A special case for missing data... mock_check_output.return_value = "{}\n\n"