Skip to content

Commit

Permalink
Refactor for better control and to fix unit tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
netsettler committed Sep 20, 2023
1 parent 521fc83 commit 7e8e1f7
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 61 deletions.
111 changes: 59 additions & 52 deletions dcicutils/license_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from dcicutils.lang_utils import there_are
from dcicutils.misc_utils import (
PRINT, get_error_message, ignorable, ignored, json_file_contents, local_attrs, environ_bool,
remove_suffix,
)

T = TypeVar("T")
Expand Down Expand Up @@ -65,13 +66,16 @@ class LicenseStatus:
UNEXPECTED_MISSING = "UNEXPECTED_MISSING"


LICENSE_UTILS_VERBOSE = environ_bool("LICENSE_UTILS_VERBOSE", default=False)
class LicenseOptions:
# General verbosity, such as progress information
VERBOSE = environ_bool("LICENSE_UTILS_VERBOSE", default=True)
# Specific additional debugging output
DEBUG = environ_bool("LICENSE_UTILS_DEBUG", default=False)


class LicenseFramework:

NAME = None
VERBOSE = LICENSE_UTILS_VERBOSE

@classmethod
def get_dependencies(cls):
Expand Down Expand Up @@ -135,29 +139,6 @@ def all_frameworks(cls):
return sorted(cls.LICENSE_FRAMEWORKS.values(), key=lambda x: x.NAME)


def extract_boolean_terms(boolean_expression: str, for_package_name: str) -> List[str]:
# We only care which licenses were mentioned, not what algebra is used on them.
# (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.)
# So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR".
# If all of those licenses match, all is good. That _does_ mean some things like (MIT OR GPL-3.0) will
# have trouble passing unless both MIT and GPL-3.0 are allowed.
terms = sorted(map(lambda x: x.strip(),
(boolean_expression
.replace('(', '')
.replace(')', '')
.replace(' AND ', ',')
.replace(' and ', ',')
.replace(' & ', ',')
.replace(' OR ', ',')
.replace(' or ', ',')
.replace('|', ',')
.replace(';', ',')
.replace(' + ', ',')
.replace('file ', f'Custom: {for_package_name} file ')
).split(',')))
return terms


# This is intended to match ' (= 3)', ' (>= 3)', ' (version 3)', ' (version 3 or greater)'
# It will incidentally and harmlessly also take ' (>version 3)' or '(>= 3 or greater)'.
# It will also correctly handle the unlikely case of ' (= 3 or greater)'
Expand All @@ -168,7 +149,7 @@ def extract_boolean_terms(boolean_expression: str, for_package_name: str) -> Lis
_GPL_VERSION_CHOICE = re.compile('^GPL-v?([0-9.+]) (?:OR|[|]) GPL-v?([0-9.+])$')


def simplify_license_versions(licenses_spec: str, *, for_package_name, verbose: bool = False) -> str:
def simplify_license_versions(licenses_spec: str, *, for_package_name) -> str:
m = _GPL_VERSION_CHOICE.match(licenses_spec)
if m:
version_a, version_b = m.groups()
Expand Down Expand Up @@ -207,18 +188,58 @@ def simplify_license_versions(licenses_spec: str, *, for_package_name, verbose:
break
matched = m.group(1)
licenses_spec = licenses_spec.replace(matched, '+')
if verbose and licenses_spec != original_licenses_spec:
print(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.")
if LicenseOptions.DEBUG and licenses_spec != original_licenses_spec:
PRINT(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.")
return licenses_spec


def extract_boolean_terms(boolean_expression: str, for_package_name: str) -> List[str]:
# We only care which licenses were mentioned, not what algebra is used on them.
# (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.)
# So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR".
# If all of those licenses match, all is good. That _does_ mean some things like (MIT OR GPL-3.0) will
# have trouble passing unless both MIT and GPL-3.0 are allowed.
revised_boolean_expression = (
boolean_expression
.replace('(', '')
.replace(')', '')
.replace(' AND ', ',')
.replace(' and ', ',')
.replace(' & ', ',')
.replace(' OR ', ',')
.replace(' or ', ',')
.replace('|', ',')
.replace(';', ',')
.replace(' + ', ',')
.replace('file ', f'Custom: {for_package_name} file ')
)
terms = [x for x in sorted(map(lambda x: x.strip(), revised_boolean_expression.split(','))) if x]
if LicenseOptions.DEBUG and revised_boolean_expression != boolean_expression:
PRINT(f"Rewriting {boolean_expression!r} as {terms!r}.")
return terms


@LicenseFrameworkRegistry.register_framework(name='javascript')
class JavascriptLicenseFramework(LicenseFramework):

@classmethod
def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]:
ignored(package_name)
return extract_boolean_terms(licenses_spec, for_package_name=package_name)
licenses_spec = simplify_license_versions(licenses_spec, for_package_name=package_name)
licenses = extract_boolean_terms(licenses_spec, for_package_name=package_name)
return licenses

VERSION_PATTERN = re.compile('^.+?([@][0-9.][^@]*|)$')

@classmethod
def strip_version(cls, raw_name):
name = raw_name
m = cls.VERSION_PATTERN.match(raw_name) # e.g., @foo/bar@3.7
if m:
suffix = m.group(1)
if suffix:
name = remove_suffix(m.group(1), name)
return name

@classmethod
def get_dependencies(cls):
Expand All @@ -231,18 +252,13 @@ def get_dependencies(cls):
# e.g., this happens if there's no javascript in the repo
raise Exception("No javascript license data was found.")
result = []
for name, record in records.items():
licenses_spec = record.get(_LICENSES)
if '(' in licenses_spec:
licenses = cls.implicated_licenses(package_name=name, licenses_spec=licenses_spec)
# print(f"Rewriting {licenses_spec!r} as {licenses!r}")
elif licenses_spec:
licenses = [licenses_spec]
else:
licenses = []
for raw_name, record in records.items():
name = cls.strip_version(raw_name)
raw_licenses_spec = record.get(_LICENSES)
licenses = cls.implicated_licenses(licenses_spec=raw_licenses_spec, package_name=name)
entry = {
_NAME: name.lstrip('@').split('@')[0], # e.g., @foo/bar@3.7
_LICENSES: licenses, # TODO: could parse this better.
_NAME: name,
_LICENSES: licenses,
_FRAMEWORK: 'javascript'
}
result.append(entry)
Expand Down Expand Up @@ -322,11 +338,6 @@ def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]:
return [cls.R_LANGUAGE_LICENSE_NAME]
licenses_spec = simplify_license_versions(licenses_spec, for_package_name=package_name)
licenses = extract_boolean_terms(licenses_spec, for_package_name=package_name)
# licenses = sorted(map(lambda x: x.strip(),
# (licenses_spec
# .replace('|', ',')
# .replace('file ', f'Custom: {package_name} file ')
# ).split(',')))
return licenses

@classmethod
Expand Down Expand Up @@ -367,7 +378,7 @@ def get_dependencies(cls):
result.append(entry)
except Exception as e:
found_problems += 1
if cls.VERBOSE:
if LicenseOptions.VERBOSE:
PRINT(get_error_message(e))
if found_problems > 0:
warnings.warn(there_are(found_problems, kind="problem", show=False, punctuate=True, tense='past'))
Expand All @@ -376,8 +387,6 @@ def get_dependencies(cls):

class LicenseFileParser:

VERBOSE = LICENSE_UTILS_VERBOSE

SEPARATORS = '-.,'
SEPARATORS_AND_WHITESPACE = SEPARATORS + ' \t'
COPYRIGHT_SYMBOL = '\u00a9'
Expand Down Expand Up @@ -414,7 +423,7 @@ def parse_simple_license_file(cls, *, filename):
lines = []
for i, line in enumerate(fp):
line = line.strip(' \t\n\r')
if cls.VERBOSE: # pragma: no cover - this is just for debugging
if LicenseOptions.DEBUG: # pragma: no cover - this is just for debugging
PRINT(str(i).rjust(3), line)
m = cls.COPYRIGHT_LINE.match(line) if line[:1].isupper() else None
if not m:
Expand Down Expand Up @@ -506,8 +515,6 @@ class LicenseChecker:

# Set this to True in subclasses if you want your organization's policy to be that you see
# some visible proof of which licenses were checked.
VERBOSE = True

LICENSE_TITLE = None
COPYRIGHT_OWNER = None
LICENSE_FRAMEWORKS = None
Expand Down Expand Up @@ -630,7 +637,7 @@ def analyze_license_dependencies_for_framework(cls, *,
_LICENSES: license_names,
_STATUS: status
})
if cls.VERBOSE: # pragma: no cover - this is just for debugging
if LicenseOptions.VERBOSE: # pragma: no cover - this is just for debugging
PRINT(f"Checked {framework.NAME} {name}:"
f" {'; '.join(license_names) if license_names else '---'} ({status})")

Expand Down
90 changes: 81 additions & 9 deletions test/test_license_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@

from collections import defaultdict
from dcicutils.license_utils import (
LicenseFrameworkRegistry, LicenseFramework,
LicenseOptions, LicenseFrameworkRegistry, LicenseFramework,
PythonLicenseFramework, JavascriptLicenseFramework, CondaLicenseFramework, RLicenseFramework,
LicenseAnalysis, LicenseChecker, LicenseStatus, LicenseFileParser,
LicenseCheckFailure, LicenseOwnershipCheckFailure, LicenseAcceptabilityCheckFailure,
warnings as license_utils_warnings_module,
extract_boolean_terms, simplify_license_versions,
)
from dcicutils.misc_utils import ignored, file_contents, local_attrs
from dcicutils.qa_utils import printed_output, MockFileSystem
Expand Down Expand Up @@ -187,11 +188,82 @@ class DummyLicenseFramework1(LicenseFramework):
LicenseFrameworkRegistry.find_framework(1) # noQA - arg is intentionally of wrong type for testing


def test_javascript_license_framework_implicated_licenses():
def test_javascript_license_framework_strip_version():

print() # start on fresh line

strip_version = JavascriptLicenseFramework.strip_version

assert strip_version('') == ''

assert strip_version('foo') == 'foo'
assert strip_version('foo@bar') == 'foo@bar'

assert strip_version('foo@3') == 'foo'
assert strip_version('foo@3.1') == 'foo'
assert strip_version('foo@3.1.0') == 'foo'
assert strip_version('foo@3.1.0b3') == 'foo'
assert strip_version('foo@3.1-beta') == 'foo'

assert strip_version("@foo-3.1-beta") == '@foo-3.1-beta' # we don't treat leading '@' as a version marker
assert strip_version('foo@.9') == 'foo' # we tolerate a leading dot even though it's probably bad form
assert strip_version('foo@beta-3.9') == 'foo@beta-3.9' # treating suffix as version here is farther than we'll go


@pytest.mark.parametrize('debug', [False, True])
def test_simplify_license_versions(debug):

def test_it(spec, expected):
with local_attrs(LicenseOptions, DEBUG=True):
with printed_output() as printed:
assert simplify_license_versions(spec, for_package_name='ignored') == expected
assert printed.last == f"Rewriting {spec!r} as {expected!r}."

test_it('GPL (version 2)', 'GPL-2')
test_it('GPL (version 2.0)', 'GPL-2.0')
test_it('GPL (= 2.0)', 'GPL-2.0')
test_it('GPL (= 2.1)', 'GPL-2.1')

test_it('GPL (>= 2)', 'GPL-2+')
test_it('GPL (>= 2.0)', 'GPL-2.0+')
test_it('GPL (version 2 or greater)', 'GPL-2+')
test_it('GPL (version 2 or later)', 'GPL-2+')


@pytest.mark.parametrize('debug', [False, True])
def test_extract_boolean_terms(debug):

print() # start on a blank line

def check_implications(spec, implications):
assert JavascriptLicenseFramework.implicated_licenses(package_name='ignored',
licenses_spec=spec) == implications
with local_attrs(LicenseOptions, DEBUG=debug):
with printed_output() as printed:
assert extract_boolean_terms(spec, for_package_name='ignored') == implications
assert printed.lines == ([f"Rewriting {spec!r} as {implications!r}."] if debug else [])

check_implications(spec='(MIT AND BSD-3-Clause)', implications=['BSD-3-Clause', 'MIT'])
check_implications(spec='(CC-BY-4.0 AND OFL-1.1 AND MIT)', implications=['CC-BY-4.0', 'MIT', 'OFL-1.1'])

check_implications(spec='(MIT OR Apache-2.0)', implications=['Apache-2.0', 'MIT'])

check_implications(spec='(FOO OR (BAR AND BAZ))', implications=['BAR', 'BAZ', 'FOO'])

sample_package = 'some-package'
assert extract_boolean_terms('MIT or file FOO', for_package_name=sample_package) == [
f'Custom: {sample_package} file FOO',
'MIT',
]


@pytest.mark.parametrize('debug', [False, True])
def test_javascript_license_framework_implicated_licenses(debug):

def check_implications(spec, implications):
with local_attrs(LicenseOptions, DEBUG=debug):
with printed_output() as printed:
assert JavascriptLicenseFramework.implicated_licenses(package_name='ignored',
licenses_spec=spec) == implications
assert printed.lines == ([f"Rewriting {spec!r} as {implications!r}."] if debug else [])

check_implications(spec='(MIT AND BSD-3-Clause)', implications=['BSD-3-Clause', 'MIT'])
check_implications(spec='(CC-BY-4.0 AND OFL-1.1 AND MIT)', implications=['CC-BY-4.0', 'MIT', 'OFL-1.1'])
Expand All @@ -201,10 +273,10 @@ def check_implications(spec, implications):
check_implications(spec='(FOO OR (BAR AND BAZ))', implications=['BAR', 'BAZ', 'FOO'])


@pytest.mark.parametrize('verbose', [False, True])
def test_javascript_license_framework_get_licenses(verbose):
@pytest.mark.parametrize('debug', [False, True])
def test_javascript_license_framework_get_licenses(debug):

with local_attrs(LicenseFramework, REWRITE_VERBOSE=verbose):
with local_attrs(LicenseOptions, DEBUG=debug):
print() # start on a fresh line
packages = {}
for i, license in enumerate(['Apache-2.0', 'MIT', '(MIT OR Apache-2.0)', ''], start=1):
Expand All @@ -227,8 +299,8 @@ def test_javascript_license_framework_get_licenses(verbose):
{'framework': 'javascript', 'licenses': ['Apache-2.0', 'MIT'], 'name': 'package3'},
{'framework': 'javascript', 'licenses': [], 'name': 'package4'},
]
expected_rewrite_description = "Rewriting '(MIT OR Apache-2.0)' as ['Apache-2.0', 'MIT']"
assert printed.lines == ([expected_rewrite_description] if verbose else [])
expected_rewrite_description = "Rewriting '(MIT OR Apache-2.0)' as ['Apache-2.0', 'MIT']."
assert printed.lines == ([expected_rewrite_description] if debug else [])

# A special case for missing data...
mock_check_output.return_value = "{}\n\n"
Expand Down

0 comments on commit 7e8e1f7

Please sign in to comment.