diff --git a/dcicutils/common.py b/dcicutils/common.py index b4f487cf3..13d518455 100644 --- a/dcicutils/common.py +++ b/dcicutils/common.py @@ -1,4 +1,5 @@ import os +import re from typing import Dict, Union, Tuple, List, Any from typing_extensions import Literal @@ -36,6 +37,8 @@ LIBRARY_DIR = os.path.dirname(__file__) +Regexp = type(re.compile("sample")) + # ===== Auth Data ===== AuthStr = str diff --git a/dcicutils/license_policies/c4-infrastructure.jsonc b/dcicutils/license_policies/c4-infrastructure.jsonc new file mode 100644 index 000000000..7a77448f6 --- /dev/null +++ b/dcicutils/license_policies/c4-infrastructure.jsonc @@ -0,0 +1,8 @@ +{ + "class_key": "c4-infrastructure", + "class_name": "C4InfrastructureLicenseChecker", + "inherits_from": ["park-lab-common-server"], + "description": "Checker for C4 infrastructure (Fourfront, CGAP, SMaHT) from Park Lab.", + + "LICENSE_TITLE": "(The )?MIT License" +} diff --git a/dcicutils/license_policies/c4-python-infrastructure.jsonc b/dcicutils/license_policies/c4-python-infrastructure.jsonc new file mode 100644 index 000000000..12a4afcf2 --- /dev/null +++ b/dcicutils/license_policies/c4-python-infrastructure.jsonc @@ -0,0 +1,8 @@ +{ + "class_key": "c4-python-infrastructure", + "class_name": "C4PythonInfrastructureLicenseChecker", + "inherits_from": ["c4-infrastructure"], + "description": "Checker for C4 python library infrastructure (Fourfront, CGAP, SMaHT) from Park Lab.", + + "LICENSE_FRAMEWORKS": ["python"] +} diff --git a/dcicutils/license_policies/park-lab-common-server.jsonc b/dcicutils/license_policies/park-lab-common-server.jsonc new file mode 100644 index 000000000..72c1af930 --- /dev/null +++ b/dcicutils/license_policies/park-lab-common-server.jsonc @@ -0,0 +1,104 @@ +{ + "class_key": "park-lab-common-server", + "inherits_from": ["park-lab-common"], + "description": "Minimal/generic checker for servers from Park Lab.", + + "LICENSE_FRAMEWORKS": ["python", "javascript"], + + "EXCEPTIONS": { + "BSD*": [ + // 
Although modified to insert the author name into the license text itself, + // the licenses for these libraries are essentially BSD-3-Clause. + "formatio", + "samsam", + + // There are some slightly different versions of what appear to be BSD licenses here, + // but clearly the license is permissive. + // Ref: https://www.npmjs.com/package/mutation-observer?activeTab=readme + "mutation-observer" + ], + "Custom: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global": [ + // The use of this URL appears to be a syntax error in the definition of entries-ponyfill + // In fact this seems to be covered by a CC0-1.0 license. + // Ref: https://unpkg.com/browse/object.entries-ponyfill@1.0.1/LICENSE + "object.entries-ponyfill" + ], + "Custom: https://github.com/saikocat/colorbrewer.": [ + // The use of this URL appears to be a syntax error in the definition of cartocolor + // In fact, this seems to be covered by a CC-BY-3.0 license. + // Ref: https://www.npmjs.com/package/cartocolor?activeTab=readme + "cartocolor" + ], + "Custom: https://travis-ci.org/component/emitter.png": [ + // The use of this png appears to be a syntax error in the definition of emitter-component. + // In fact, emitter-component uses an MIT License + // Ref: https://www.npmjs.com/package/emitter-component + // Ref: https://github.com/component/emitter/blob/master/LICENSE + "emitter-component" + ], + "Custom: https://travis-ci.org/DenisCarriere/turf-jsts.svg": [ + // The 'turf-jsts' repository (https://github.com/DenisCarriere/turf-jsts/blob/master/README.md) + // seems to lack a license, but appears to be forked from the jsts library that uses + // the Eclipse Public License 1.0 and Eclipse Distribution License 1.0, so probably a permissive + // license is intended. 
+ "turf-jsts" + ], + "GNU General Public License (GPL)": [ + "docutils" // Used only privately as a separate documentation-generation task for ReadTheDocs + ], + "GNU Library or Lesser General Public License (LGPL)": [ + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + // "GNU Lesser General Public License v3 or later (LGPLv3+)", + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "psycopg2", // Used at runtime during server operation, but not modified or distributed + "psycopg2-binary", // Used at runtime during server operation, but not modified or distributed + "chardet", // Potentially used downstream in loadxl to detect charset for text files + "pyzmq" // Used in post-deploy-perf-tests, not distributed, and not modified or distributed + ], + "GPL-2.0": [ + // The license file for the node-forge javascript library says: + // + // "You may use the Forge project under the terms of either the BSD License or the + // GNU General Public License (GPL) Version 2." + // + // (We choose to use it under the BSD license.) + // Ref: https://www.npmjs.com/package/node-forge?activeTab=code + "node-forge" + ], + "MIT*": [ + // This library uses a mix of licenses, but they (MIT, CC0) generally seem permissive. + // (It also mentions that some tools for building/testing use other libraries.) + // Ref: https://github.com/requirejs/domReady/blob/master/LICENSE + "domready", + + // This library is under "COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1" + // Ref: https://github.com/javaee/jsonp/blob/master/LICENSE.txt + // About CDDL ... + // Linking = Permissive, Private Use = ? + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "jsonp", + + // This library says pretty clearly it intends MIT license. 
+ // Ref: https://www.npmjs.com/package/component-indexof + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "component-indexof", + + // These look like a pretty straight MIT license. + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "mixin", // LICENSE file at https://www.npmjs.com/package/mixin?activeTab=code + "stack-trace", // https://github.com/stacktracejs/stacktrace.js/blob/master/LICENSE + "typed-function" // LICENSE at https://www.npmjs.com/package/typed-function?activeTab=code + ], + "UNLICENSED": [ + // The udn-browser library is our own and has been observed to sometimes show up in some contexts + // as UNLICENSED, when really it is MIT. + // Ref: https://github.com/dbmi-bgm/udn-browser/blob/main/LICENSE + "udn-browser" + ] + } +} diff --git a/dcicutils/license_policies/park-lab-common.jsonc b/dcicutils/license_policies/park-lab-common.jsonc new file mode 100644 index 000000000..e59d67aee --- /dev/null +++ b/dcicutils/license_policies/park-lab-common.jsonc @@ -0,0 +1,407 @@ +{ + "class_key": "park-lab-common", + "class_name": "ParkLabCommonLicenseChecker", + "inherits_from": [], + "description": "Minimal/generic checker common to all tech from Park Lab.", + + "COPYRIGHT_OWNER": "President and Fellows of Harvard College", + + "LICENSE_FRAMEWORKS": "ALL", + + "ALLOWED": [ + + // <> + // Ref: https://opensource.org/license/0bsd/ + "0BSD", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Academic Free License (AFL)", + "AFL-2.1", + + // Linking = Permissive, Private Use = Yes + // Apache licenses before version 2.0 are controversial, but we here construe an unmarked naming to imply + // any version, and hence v2. 
+ // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Apache Software License", + "Apache-Style", + {"pattern": "Apache([- ]2([.]0)?)?([- ]Licen[cs]e)?([- ]with[- ]LLVM[- ]exception)?"}, + // "Apache-2.0", + + // Artistic License 1.0 was confusing to people, so its status as permissive is in general uncertain, + // however the issue seems to revolve around point 8 (relating to whether or not perl is deliberately + // exposed). That isn't in play for our uses, so we don't flag it here. + // Artistic license 2.0 is a permissive license. + // Ref: https://en.wikipedia.org/wiki/Artistic_License + "Artistic-1.0-Perl", + {"pattern": "Artistic[- ]2([.]0)?"}, + + // According to Wikipedia, the Boost is considered permissive and BSD-like. + // Refs: + // * + // * https://en.wikipedia.org/wiki/Boost_(C%2B%2B_libraries)#License + {"pattern": "(BSL|Boost(([- ]Software)?[- ]License)?)([- ]1([.]0)?)?"}, + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + {"pattern": "((modified[- ])?[234][- ]Clause[- ])?BSD([- ][234][- ]Clause)?( Licen[cs]e)?"}, + // "BSD License", + // "BSD-2-Clause", + // "BSD-3-Clause", + // "BSD 3-Clause", + + // BZIP2 is a permissive license + // Ref: https://github.com/asimonov-im/bzip2/blob/master/LICENSE + {"pattern": "bzip2(-1[.0-9]*)"}, + + // Linking = Public Domain, Private Use = Public Domain + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "CC0", + "CC0-1.0", + + // Linking = Permissive, Private Use = Permissive + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "CC-BY", + "CC-BY-3.0", + "CC-BY-4.0", + + // The curl license is a permissive license. + // Ref: https://curl.se/docs/copyright.html + "curl", + + // Linking = Permissive, Private Use = ? 
+ // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "CDDL", + + // The original Eclipse Distribution License 1.0 is essentially a BSD-3-Clause license. + // Ref: https://www.eclipse.org/org/documents/edl-v10.php + "Eclipse Distribution License", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Eclipse Public License", + "EPL-2.0", + + // The FSF Unlimited License (FSFUL) seems to be a completely permissive license. + // Refs: + // * https://spdx.org/licenses/FSFUL.html + // * https://fedoraproject.org/wiki/Licensing/FSF_Unlimited_License + "FSF Unlimited License", + "FSFUL", + + // The FreeType license is a permissive license. + // Ref: LicenseRef-FreeType + {"pattern": "(Licen[cs]eRef-)?(FTL|FreeType( Licen[cs]e)?)"}, + + // Linking = Yes, Cat = Permissive Software Licenses + // Ref: https://en.wikipedia.org/wiki/Historical_Permission_Notice_and_Disclaimer + "Historical Permission Notice and Disclaimer (HPND)", + "HPND", + {"pattern": "(Licen[cs]eRef-)?PIL"}, + // The Pillow or Python Image Library is an HPND license, which is a simple permissive license: + // Refs: + // * https://github.com/python-pillow/Pillow/blob/main/LICENSE + // * https://www.fsf.org/blogs/licensing/historical-permission-notice-and-disclaimer-added-to-license-list + + // The IJG license, used by Independent JPEG Group (IJG) is a custom permissive license. 
+ // Refs: + // * https://en.wikipedia.org/wiki/Libjpeg + // * https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/LICENSE.md + "IJG", + + // Linking = Permissive, Private Use = Permissive + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "ISC License (ISCL)", + "ISC", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "MIT License", + "MIT", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Mozilla Public License 2.0 (MPL 2.0)", + "MPL-1.1", + "MPL-2.0", + + // The SIL Open Font License appears to be a copyleft-style license that applies narrowly + // to icons and not to the entire codebase. It is advertised as OK for use even in commercial + // applications. + // Ref: https://fontawesome.com/license/free + "OFL-1.1", + + // Ref: https://en.wikipedia.org/wiki/Public_domain + {"pattern": "(Licen[cs]eRef-)?Public[- ]Domain([- ]dedic[t]?ation)?"}, // "dedictation" is a typo in docutils + + // Linking = Permissive, Private Use = Permissive + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + {"pattern": "(Licen[cs]eRef-)?PSF-2([.][.0-9]*)"}, + "Python Software Foundation License", + "Python-2.0", + + // License = BSD-like + // Ref: https://en.wikipedia.org/wiki/Pylons_project + "Repoze Public License", + + // The TCL or Tcl/Tk licenses are permissive licenses. + // Ref: https://www.tcl.tk/software/tcltk/license.html + // The one used by the tktable library has a "bourbon" clause that doesn't add compliance requirements + // Ref: https://github.com/wjoye/tktable/blob/master/license.txt + {"pattern": "Tcl([/]tk)?"}, + + // The Ubuntu Font Licence is mostly permissive. It contains some restrictions if you are going to modify the + // fonts that require you to change the name to avoid confusion. 
But for our purposes, we're assuming that's + // not done, and so we're not flagging it. + {"pattern": "Ubuntu Font Licen[cs]e Version( 1([.]0)?)?"}, + + // Linking = Permissive/Public domain, Private Use = Permissive/Public domain + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "The Unlicense (Unlicense)", + "Unlicense", + + // Various licenses seem to call themselves or be summed up as unlimited. + // So far we know of none that are not highly permissive. + // * boot and KernSmooth are reported by R as being "Unlimited" + // Refs: + // * https://cran.r-project.org/web/packages/KernSmooth/index.html + // (https://github.com/cran/KernSmooth/blob/master/LICENCE.note) + // * https://cran.r-project.org/package=boot + // (https://github.com/cran/boot/blob/master/DESCRIPTION) + "Unlimited", + + // Linking = Permissive, Private Use = ? + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "W3C License", + "W3C-20150513", + + // Linking = Permissive/Public Domain, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "WTFPL", + + // Copyleft = No + // Ref: https://en.wikipedia.org/wiki/Zlib_License + // Linking = Permissive, Private Use = ? (for zlib/libpng license) + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Zlib", + + // Copyleft = No, FSF/OSI-approved: Yes + // Ref: https://en.wikipedia.org/wiki/Zope_Public_License + "Zope Public License" + ], + + "EXCEPTIONS": { + + // The Bioconductor zlibbioc license is a permissive license. 
+ // Ref: https://github.com/Bioconductor/zlibbioc/blob/devel/LICENSE + "Custom: bioconductor-zlibbioc file LICENSE": [ + "bioconductor-zlibbioc" + ], + + // The Bioconductor rsamtools license is an MIT license + // Ref: https://bioconductor.org/packages/release/bioc/licenses/Rsamtools/LICENSE + "Custom: bioconductor-rsamtools file LICENSE": [ + "bioconductor-rsamtools" + ], + + // DFSG = Debian Free Software Guidelines + // Ref: https://en.wikipedia.org/wiki/Debian_Free_Software_Guidelines + // Used as an apparent modifier to other licenses, to say they are approved per Debian. + // For example in this case, pytest-timeout has license: DFSG approved, MIT License, + // but is really just an MIT License that someone has checked is DFSG approved. + "DFSG approved": [ + "pytest-timeout" // MIT Licensed + ], + + "FOSS": [ + // The r-stringi library is a conda library that implements a stringi (pronounced "stringy") library for R. + // The Conda source feed is: https://github.com/conda-forge/r-stringi-feedstock + // This page explains that the home source is https://stringi.gagolewski.com/ but that's a doc page. + // The doc page says: + // > stringi’s source code is hosted on GitHub. + // > It is distributed under the open source BSD-3-clause license. + // The source code has a license that begins with a BSD-3-clause license and includes numerous others, + // but they all appear to be permissive. 
+ // Ref: https://github.com/gagolews/stringi/blob/master/LICENSE + "stringi", + "r-stringi" + ], + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "GNU Lesser General Public License v2 or later (LGPLv2+)": [ + "chardet" // used at runtime during server operation (ingestion), but not modified or distributed + ], + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "GNU Lesser General Public License v3 or later (LGPLv3+)": [ + // used only privately in testing, not used in server code, not modified, not distributed + "pytest-redis", + // required by pytest-redis (used only where it's used) + "mirakuru" + ], + + "GNU General Public License (GPL)": [ + "docutils" // Used only privately as a separate documentation-generation task for ReadTheDocs + ], + + "MIT/X11 Derivative": [ + // The license used by libxkbcommon is complicated and involves numerous included licenses, + // but all are permissive. + // Ref: https://github.com/xkbcommon/libxkbcommon/blob/master/LICENSE + "libxkbcommon" + ], + + "None": [ + // It's not obvious why Conda shows this license as "None". + // In fact, though, BSD 3-Clause "New" or "Revised" License + // Ref: https://github.com/AnacondaRecipes/_libgcc_mutex-feedstock/blob/master/LICENSE.txt + "_libgcc_mutex" + ], + + "PostgreSQL": [ + // The libpq library is actually licensed with a permissive BSD 3-Clause "New" or "Revised" License + // Ref: https://github.com/lpsmith/postgresql-libpq/blob/master/LICENSE + "libpq" + ], + + "UCSD": [ + // It isn't obvious why these show up with a UCSD license in Conda. 
+ // The actual sources say it should be a 2-clause BSD license: + // Refs: + // * https://github.com/AlexandrovLab/SigProfilerMatrixGenerator/blob/master/LICENSE + // * https://github.com/AlexandrovLab/SigProfilerPlotting/blob/master/LICENSE + "sigprofilermatrixgenerator", + "sigprofilerplotting" + ], + + "X11": [ + // The ncurses library has a VERY complicated history, BUT seems consistently permissive + // and the most recent version seems to be essentially the MIT license. + // Refs: + // * https://en.wikipedia.org/wiki/Ncurses#License + // * https://invisible-island.net/ncurses/ncurses-license.html + "ncurses" + ], + + "zlib-acknowledgement": [ + // It isn't clear why libpng shows up with this license name, but the license for libpng + // is a permissive license. + // Ref: https://github.com/glennrp/libpng/blob/libpng16/LICENSE + "libpng" + ] + }, + + "EXPECTED_MISSING_LICENSES": [ + + // This is a name we use for our C4 portals. And it isn't published. + // We inherited the name from the Stanford ENCODE group, which had an MIT-licensed repo we forked + "encoded", // cgap-portal, fourfront, and smaht-portal all call themselves this + + // We believe that since these next here are part of the Pylons project, they're covered under + // the same license as the other Pylons projects. We're seeking clarification. + "pyramid-translogger", + "subprocess-middleware", + + // This appears to be a BSD 2-Clause "Simplified" License, according to GitHub. + // PyPi also says it's a BSD license. + // Ref: https://github.com/paulc/dnslib/blob/master/LICENSE + "dnslib", + + // This says it wants an ISC License, which we already have approval for but just isn't showing up. + // Ref: https://github.com/rthalley/dnspython/blob/master/LICENSE + "dnspython", + + // This appears to be a mostly-MIT-style license. + // There are references to parts being in the public domain, though it's not obvious if that's meaningful. 
+ // It's probably sufficient for our purposes to treat this as a permissive license. + // Ref: https://github.com/tlsfuzzer/python-ecdsa/blob/master/LICENSE + "ecdsa", + + // This has an MIT license in its source repository + // Ref: https://github.com/xlwings/jsondiff/blob/master/LICENSE + "jsondiff", + + // This has an MIT license in its source repository + // Ref: https://github.com/pkerpedjiev/negspy/blob/master/LICENSE + "negspy", + + // This license statement is complicated, but seems adequately permissive. + // Ref: https://foss.heptapod.net/python-libs/passlib/-/blob/branch/stable/LICENSE + "passlib", + + // This seems to be a BSD-3-Clause license. + // Ref: https://github.com/protocolbuffers/protobuf/blob/main/LICENSE + // pypi agrees in the Meta section of protobuf's page, where it says "3-Clause BSD License" + // Ref: https://pypi.org/project/protobuf/ + "protobuf", + + // The WTFPL license is permissive. + // Ref: https://github.com/mk-fg/pretty-yaml/blob/master/COPYING + "pyaml", + + // This uses a BSD license + // Ref: https://github.com/eliben/pycparser/blob/master/LICENSE + "pycparser", + + // The source repo for pyDes says this is under an MIT license + // Ref: https://github.com/twhiteman/pyDes/blob/master/LICENSE.txt + // pypi, probably wrongly, thinks this is in the public domain (as of 2023-07-21) + // Ref: https://pypi.org/project/pyDes/ + "pyDes", + + // This uses an MIT license + // Ref: https://github.com/pysam-developers/pysam/blob/master/COPYING + "pysam", + + // The version of python-lambda that we forked calls itself this (and publishes at pypi under this name) + "python-lambda-4dn", + + // This is MIT-licensed: + // Ref: https://github.com/themiurgo/ratelim/blob/master/LICENSE + // pypi agrees + // Ref: https://pypi.org/project/ratelim/ + "ratelim", + + // This is a BSD-3-Clause-Modification license + // Ref: https://github.com/repoze/repoze.debug/blob/master/LICENSE.txt + "repoze.debug", + + // This is an Apache-2.0 license + // Ref: 
https://github.com/getsentry/responses/blob/master/LICENSE + "responses", + + // This seems to get flagged sometimes, but is not the pypi snovault library, it's what our dcicsnovault + // calls itself internally. In any case, it's under MIT license and OK. + // Ref: https://github.com/4dn-dcic/snovault/blob/master/LICENSE.txt + "snovault", + + // PyPi identifies the supervisor library license as "BSD-derived (http://www.repoze.org/LICENSE.txt)" + // Ref: https://pypi.org/project/supervisor/ + // In fact, though, the license is a bit more complicated, though apparently still permissive. + // Ref: https://github.com/Supervisor/supervisor/blob/main/LICENSES.txt + "supervisor", + + // This seems to be a BSD-3-Clause-Modification license. + // Ref: https://github.com/Pylons/translationstring/blob/master/LICENSE.txt + "translationstring", + + // This seems to be a BSD-3-Clause-Modification license. + // Ref: https://github.com/Pylons/venusian/blob/master/LICENSE.txt + "venusian", + + // PyPi identifies zope.deprecation as using the "Zope Public License (ZPL 2.1)" license. + // Ref: https://github.com/zopefoundation/Zope/blob/master/LICENSE.txt + "zope.deprecation" + + // Below are licenses last known to have licenses missing in pip-licenses and need to be investigated further. + // Note well that just because pip-licenses doesn't know the license doesn't mean the software has + // no license. It may just mean the library is poorly registered in pypi. Some licenses have to be + // found by looking at the library's documentation or source files. 
+ + // (all of these have been classified at this point) + ] +} diff --git a/dcicutils/license_policies/park-lab-gpl-pipeline.jsonc b/dcicutils/license_policies/park-lab-gpl-pipeline.jsonc new file mode 100644 index 000000000..1ff0b2723 --- /dev/null +++ b/dcicutils/license_policies/park-lab-gpl-pipeline.jsonc @@ -0,0 +1,62 @@ +{ + "class_key": "park-lab-gpl-pipeline", + "class_name": "ParkLabGplPipelineLicenseChecker", + "inherits_from": ["park-lab-pipeline"], + "description": "Minimal/generic checker for GPL-approved pipelines from Park Lab.", + + "ALLOWED": [ + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + // The "exceptions", if present, indicate waivers to source delivery requirements. + // Ref: https://spdx.org/licenses/LGPL-3.0-linking-exception.html + {"pattern": "GNU Lesser General Public License v2( or later)?( [(]LGPL[v]?[23][+]?[)])?"}, + // "GNU Lesser General Public License v2 or later (LGPLv2+)", + // "GNU Lesser General Public License v3 or later (LGPLv3+)", + // "LGPLv2", "LGPL-v2", "LGPL-v2.0", "LGPL-2", "LGPL-2.0", + // "LGPLv2+", "LGPL-v2+", "LGPL-v2.0+", "LGPL-2+", "LGPL-2.0+", + // "LGPLv3", "LGPL-v3", "LGPL-v3.0", "LGPL-3", "LGPL-3.0", + // "LGPLv3+", "LGPL-v3+", "LGPL-v3.0+", "LGPL-3+", "LGPL-3.0+", + {"pattern": "LGPL[v-]?[.0-9]*([+]|-only)?([- ]with[- ]exceptions)?"}, + + // Uncertain whether this is LGPL 2 or 3, but in any case we think weak copyleft should be OK + // for pipeline or server use as long as we're not distributing sources. 
+ "LGPL", + "GNU Library or Lesser General Public License (LGPL)", + + // GPL + // * library exception operates like LGPL + // * classpath exception is a linking exception related to Oracle + // Refs: + // * https://www.gnu.org/licenses/old-licenses/gpl-1.0.en.html + // * https://spdx.org/licenses/GPL-2.0-with-GCC-exception.html + // * https://spdx.org/licenses/GPL-3.0-with-GCC-exception.html + { + "pattern": [ + "(GNU General Public License|GPL)[ ]?[v-]?[123]([.]0)?([+]|[- ]only)?", + "([- ]with[- ]GCC(([- ]runtime)?[- ]library)?[- ]exception([- ][.0-9]*)?)?", + "([- ]with[- ]Classpath[- ]exception([- ][.0-9]+)?)?" + ] + }, + + // Linking = "GPLv3 compatible only", Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "GPL-2-or-3", // we sometimes generate this token + // "GPLv2+", "GPL-v2+", "GPL-v2.0+", "GPL-2+", "GPL-2.0+", + // "GPLv3", "GPL-v3", "GPL-v3.0", "GPL-3", "GPL-3.0", + // "GPLv3+", "GPL-v3+", "GPL-v3.0+", "GPL-3+", "GPL-3.0+", + // "GPLv3-only", "GPL-3-only", "GPL-v3-only", "GPL-3.0-only", "GPL-v3.0-only", + + // Uncertain whether this is GPL 2 or 3, but we'll assume that means we can use either. + // And version 3 is our preferred interpretation. 
+ "GNU General Public License", + "GPL", + + // This is an arbitrary catch-all name we made up because the R language some things identify themselves + // as a specific part of the R language + // Ref: https://cran.r-project.org/doc/FAQ/R-FAQ.html#Legalese + // An important clarification to this is here: + // Ref: https://cran.r-project.org/doc/FAQ/R-FAQ.html#Can-I-use-R-for-commercial-purposes_003f + "R-language-license" + ] +} diff --git a/dcicutils/license_policies/park-lab-pipeline.jsonc b/dcicutils/license_policies/park-lab-pipeline.jsonc new file mode 100644 index 000000000..5fbcc6616 --- /dev/null +++ b/dcicutils/license_policies/park-lab-pipeline.jsonc @@ -0,0 +1,12 @@ +{ + "class_key": "park-lab-pipeline", + "class_name": "ParkLabPipelineLicenseChecker", + "inherits_from": ["park-lab-common"], + "description": "Minimal/generic checker for non-GPL-approved pipelines from Park Lab.", + + "LICENSE_FRAMEWORKS": ["python", "conda", "r"] +} + + + + diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index f0717b50e..8d6553b0a 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -4,7 +4,6 @@ import glob import io import json -# import logging import os import re import subprocess @@ -25,15 +24,17 @@ # import piplicenses from collections import defaultdict +from jsonc_parser.parser import JsoncParser from typing import Any, Dict, DefaultDict, List, Optional, Type, TypeVar, Union # For obscure reasons related to how this file is used for early prototyping, these must use absolute references # to modules, not relative references. Later when things are better installed, we can make refs relative again. 
+from dcicutils.common import Regexp, AnyJsonData from dcicutils.exceptions import InvalidParameterError -from dcicutils.lang_utils import there_are +from dcicutils.lang_utils import there_are, conjoined_list from dcicutils.misc_utils import ( PRINT, get_error_message, ignorable, ignored, json_file_contents, local_attrs, environ_bool, - remove_suffix, + remove_suffix, to_camel_case ) T = TypeVar("T") @@ -49,6 +50,10 @@ _NAME = 'name' _STATUS = 'status' +_INHERITS_FROM = 'inherits_from' +_ALLOWED = 'allowed' +_EXCEPT = 'except' + def pattern(x): return re.compile(x, re.IGNORECASE) @@ -72,14 +77,15 @@ class LicenseOptions: # Specific additional debugging output DEBUG = environ_bool("LICENSE_UTILS_DEBUG", default=False) CONDA_PREFIX = os.environ.get("CONDA_LICENSE_CHECKER_PREFIX", os.environ.get("CONDA_PREFIX", "")) + POLICY_DIR = os.environ.get("LICENSE_UTILS_POLICY_DIR") @classmethod @contextlib.contextmanager - def selected_options(cls, verbose=VERBOSE, debug=DEBUG, conda_prefix=CONDA_PREFIX): + def selected_options(cls, verbose=VERBOSE, debug=DEBUG, conda_prefix=CONDA_PREFIX, policy_dir=POLICY_DIR): """ Allows a script, for example, to specify overrides for these options dynamically. """ - with local_attrs(cls, VERBOSE=verbose, DEBUG=debug, CONDA_PREFIX=conda_prefix): + with local_attrs(cls, VERBOSE=verbose, DEBUG=debug, CONDA_PREFIX=conda_prefix, POLICY_DIR=policy_dir): yield @@ -148,6 +154,10 @@ def find_framework(cls, framework_spec: FrameworkSpec): def all_frameworks(cls): return sorted(cls.LICENSE_FRAMEWORKS.values(), key=lambda x: x.NAME) + @classmethod + def all_framework_names(cls): + return sorted(cls.LICENSE_FRAMEWORKS.keys()) + # This is intended to match ' (= 3)', ' (>= 3)', ' (version 3)', ' (version 3 or greater)' # It will incidentally and harmlessly also take ' (>version 3)' or '(>= 3 or greater)'. 
@@ -315,13 +325,10 @@ def get_dependencies(cls): package_name = data['name'] package_license = data.get('license') or "MISSING" if package_license: - # print(f"package_license={package_license}") simplified_package_license_spec = simplify_license_versions(package_license, for_package_name=package_name) - # print(f" =simplified_package_license_spec => {simplified_package_license_spec}") package_licenses = extract_boolean_terms(simplified_package_license_spec, for_package_name=package_name) - # print(f"=> {package_licenses}") else: package_licenses = [] entry = { @@ -331,8 +338,6 @@ def get_dependencies(cls): } result.append(entry) result.sort(key=lambda x: x['name']) - # print(f"conda get_dependencies result={json.dumps(result, indent=2)}") - # print("conda deps = ", json.dumps(result, indent=2)) return result @@ -492,22 +497,31 @@ def report(message): class LicenseChecker: """ - There are three important class variables to specify: + License checkers are defined as .jsonc. The JSONC file format is JSON with Comments. + (The comments are Javascript syntax, either '//' or '/* ... */'.) + + There are these important class variables to specify: LICENSE_TITLE is a string naming the license to be expected in LICENSE.txt COPYRIGHT_OWNER is the name of the copyright owner. - FRAMEWORKS will default to all defined frameworks (presently ['python', 'javascript'], but can be limited to - just ['python'] for example. It doesn't make a lot of sense to limit it to ['javascript'], though you could, - since you are using a Python library to do this, and it probably needs to have its dependencies checked. + LICENSE_FRAMEWORKS will default to all defined frameworks (presently ['python', 'javascript'], + but can be limited to just ['python'] for example. It doesn't make a lot of sense to limit it to + ['javascript'], though you could, since you are using a Python library to do this, and it probably + needs to have its dependencies checked. 
- ALLOWED is a list of license names as returned by the pip-licenses library. + ALLOWED is a list of license names as returned by the various license frameworks. Because they rely on different + underlying tools, the exact format of the names that result might vary. For this reason, there is a regular + expression capability for this particular attribute. In addition to just a string, you can also use + {"pattern": "<regexp>"}. For very long regular expressions, {"pattern": ["<regexp-part>", ...]} will + concatenate all the parts into a single regexp, so they can be gracefully broken over lines in the .jsonc + source file. If regexp flags are required, use {"pattern": "<regexp>", "flags": ["flag1", ...]}. - EXPECTED_MISSING is a list of libraries that are expected to have no license information. This is so you don't - have to get warning fatigue by seeing a warning over and over for things you know about. If a new library - with no license info shows up that you don't expect, you should investigate it, make sure it's OK, - and then add it to this list. + EXPECTED_MISSING_LICENSES is a list of libraries that are expected to have no license information. + This is so you don't have to get warning fatigue by seeing a warning over and over for things you know about. + If a new library with no license info shows up that you don't expect, you should investigate it, + make sure it's OK, and then add it to this list. EXCEPTIONS is a table (a dict) keyed on license names with entries that are lists of library names that are allowed to use the indicated license even though the license might not be generally allowed. This should be @@ -673,6 +687,7 @@ def analyze_license_dependencies_by_framework(cls, *, @classmethod def show_unacceptable_licenses(cls, *, analysis: LicenseAnalysis) -> LicenseAnalysis: if analysis.unacceptable: + # This is part of the essential output, so is not conditional on switches.
+ PRINT(there_are(analysis.unacceptable, kind="unacceptable license", show=False, punctuation_mark=':')) for license, names in sorted(analysis.unacceptable.items()): PRINT(f" {license}: {', '.join(names)}") @@ -726,10 +741,21 @@ def _register(license_checker_class: Type[LicenseChecker]): return _register @classmethod - def lookup_checker(cls, name: str) -> Type[LicenseChecker]: - result: Optional[Type[LicenseChecker]] = cls.REGISTRY.get(name) + def find_checker(cls, checker_name: str) -> Optional[Type[LicenseChecker]]: + return cls.REGISTRY.get(checker_name, None) + + @classmethod + def lookup_checker(cls, checker_name: str, autoload: bool = False) -> Type[LicenseChecker]: + result: Optional[Type[LicenseChecker]] = cls.find_checker(checker_name) if result is None: - raise InvalidParameterError(parameter='checker_name', value=name, + if autoload: + policy_dir = LicenseOptions.POLICY_DIR or POLICY_DIR + PRINT(f"Looking for custom policy {checker_name} in {policy_dir} ...") + result = find_or_create_license_class(policy_name=checker_name, + policy_dir=policy_dir) + if result: + return result + raise InvalidParameterError(parameter='checker_name', value=checker_name, options=cls.all_checker_names()) return result @@ -762,648 +788,243 @@ def __init__(self, message=None, unacceptable_licenses=None): super().__init__(message=message) -@LicenseCheckerRegistry.register_checker('park-lab-common') -class ParkLabCommonLicenseChecker(LicenseChecker): +def literal_string_or_regexp_from_dict(item): """ - Minimal checker common to all tech from Park Lab. + Expects either a string (which will be matched using ordinary equality) or a regular expression, + expressed as a dictionary of the form {"pattern": <regexp>, "flags": [<flag>, ...]}. + The pattern is required. The flags may be omitted if null. + A pattern is either a string or a list of strings. If it is a list of strings, it will be concatenated + into a single string, which can be useful for breaking long strings over lines.
+ Flags are string names of re.WHATEVER flags that would be given to Python's re.compile. + UNICODE and IGNORECASE are on by default. """ + if isinstance(item, str): + return item + elif not isinstance(item, dict): + raise ValueError(f'Expected a string or a dictionary describing a regular expression.') + pattern = item.get('pattern') + # The pattern is permitted to be a string or list of strings, since in a JSON-style file we can't + # do the thing we do in python where we just juxtapose several strings, separated by whitespace + # and/or newlines, in order to have them taken as a single literal string. -kmp 29-Sep-2023 + if isinstance(pattern, str): + pass + elif isinstance(pattern, list): + pattern = ''.join(pattern) + else: + raise ValueError(f"Invalid pattern expression: {item!r}") + flags = item.get('flags') or [] + compilation_flags = re.IGNORECASE # UNICODE will default, but IGNORECASE we have to set up manually + for flag in flags: + if isinstance(flag, str) and flag.isupper(): + if hasattr(re, flag): + compilation_flags |= getattr(re, flag) + else: + raise ValueError(f"No such flag re.{flag}") + else: + raise ValueError(f"Flags must be strigs: {flag!r}") + regexp = re.compile(pattern, compilation_flags) + return regexp - COPYRIGHT_OWNER = "President and Fellows of Harvard College" - - ALLOWED = [ - - # <> - # Ref: https://opensource.org/license/0bsd/ - '0BSD', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Academic Free License (AFL)', - 'AFL-2.1', - - # Linking = Permissive, Private Use = Yes - # Apache licenses before version 2.0 are controversial, but we here construe an unmarked naming to imply - # any version, and hence v2. 
- # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Apache Software License', - 'Apache-Style', - pattern("Apache([- ]2([.]0)?)?([- ]Licen[cs]e)?([- ]with[- ]LLVM[- ]exception)?"), - # 'Apache-2.0', - - # Artistic License 1.0 was confusing to people, so its status as permissive is in general uncertain, - # however the issue seems to revolve around point 8 (relating to whether or not perl is deliberately - # exposed). That isn't in play for our uses, so we don't flag it here. - # Artistic license 2.0 is a permissive license. - # Ref: https://en.wikipedia.org/wiki/Artistic_License - 'Artistic-1.0-Perl', - pattern('Artistic[- ]2([.]0)?'), - - # According to Wikipedia, the Boost is considered permissive and BSD-like. - # Refs: - # * - # * https://en.wikipedia.org/wiki/Boost_(C%2B%2B_libraries)#License - pattern('(BSL|Boost(([- ]Software)?[- ]License)?)([- ]1([.]0)?)?'), - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - pattern('((modified[- ])?[234][- ]Clause[- ])?BSD([- ][234][- ]Clause)?( Licen[cs]e)?'), - # 'BSD License', - # 'BSD-2-Clause', - # 'BSD-3-Clause', - # 'BSD 3-Clause', - - # BZIP2 is a permissive license - # Ref: https://github.com/asimonov-im/bzip2/blob/master/LICENSE - pattern('bzip2(-1[.0-9]*)'), - - # Linking = Public Domain, Private Use = Public Domain - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'CC0', - 'CC0-1.0', - - # Linking = Permissive, Private Use = Permissive - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'CC-BY', - 'CC-BY-3.0', - 'CC-BY-4.0', - - # The curl license is a permissive license. - # Ref: https://curl.se/docs/copyright.html - 'curl', - - # Linking = Permissive, Private Use = ? 
- # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'CDDL', - - # The original Eclipse Distribution License 1.0 is essentially a BSD-3-Clause license. - # Ref: https://www.eclipse.org/org/documents/edl-v10.php - 'Eclipse Distribution License', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Eclipse Public License', - 'EPL-2.0', - - # The FSF Unlimited License (FSFUL) seems to be a completely permissive license. - # Refs: - # * https://spdx.org/licenses/FSFUL.html - # * https://fedoraproject.org/wiki/Licensing/FSF_Unlimited_License - 'FSF Unlimited License', - 'FSFUL', - - # The FreeType license is a permissive license. - # Ref: LicenseRef-FreeType - pattern('(Licen[cs]eRef-)?(FTL|FreeType( Licen[cs]e)?)'), - - # Linking = Yes, Cat = Permissive Software Licenses - # Ref: https://en.wikipedia.org/wiki/Historical_Permission_Notice_and_Disclaimer - 'Historical Permission Notice and Disclaimer (HPND)', - 'HPND', - pattern('(Licen[cs]eRef-)?PIL'), - # The Pillow or Python Image Library is an HPND license, which is a simple permissive license: - # Refs: - # * https://github.com/python-pillow/Pillow/blob/main/LICENSE - # * https://www.fsf.org/blogs/licensing/historical-permission-notice-and-disclaimer-added-to-license-list - - # The IJG license, used by Independent JPEG Group (IJG) is a custom permissive license. 
- # Refs: - # * https://en.wikipedia.org/wiki/Libjpeg - # * https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/LICENSE.md - 'IJG', - - # Linking = Permissive, Private Use = Permissive - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'ISC License (ISCL)', - 'ISC', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'MIT License', - 'MIT', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Mozilla Public License 2.0 (MPL 2.0)', - 'MPL-1.1', - 'MPL-2.0', - - # The SIL Open Font License appears to be a copyleft-style license that applies narrowly - # to icons and not to the entire codebase. It is advertised as OK for use even in commercial - # applications. - # Ref: https://fontawesome.com/license/free - 'OFL-1.1', - - # Ref: https://en.wikipedia.org/wiki/Public_domain - pattern('(Licen[cs]eRef-)?Public[- ]Domain([- ]dedic[t]?ation)?'), # "dedictation" is a typo in docutils - - # Linking = Permissive, Private Use = Permissive - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - pattern('(Licen[cs]eRef-)?PSF-2([.][.0-9]*)'), - 'Python Software Foundation License', - 'Python-2.0', - - # License = BSD-like - # Ref: https://en.wikipedia.org/wiki/Pylons_project - 'Repoze Public License', - - # The TCL or Tcl/Tk licenses are permissive licenses. - # Ref: https://www.tcl.tk/software/tcltk/license.html - # The one used by the tktable library has a 'bourbon' clause that doesn't add compliance requirements - # Ref: https://github.com/wjoye/tktable/blob/master/license.txt - pattern('Tcl([/]tk)?'), - - # The Ubuntu Font Licence is mostly permissive. It contains some restrictions if you are going to modify the - # fonts that require you to change the name to avoid confusion. 
But for our purposes, we're assuming that's - # not done, and so we're not flagging it. - pattern('Ubuntu Font Licen[cs]e Version( 1([.]0)?)?'), - - # Linking = Permissive/Public domain, Private Use = Permissive/Public domain - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'The Unlicense (Unlicense)', - 'Unlicense', - - # Various licenses seem to call themselves or be summed up as unlimited. - # So far we know of none that are not highly permissive. - # * boot and KernSmooth are reported by R as being 'Unlimited' - # Refs: - # * https://cran.r-project.org/web/packages/KernSmooth/index.html - # (https://github.com/cran/KernSmooth/blob/master/LICENCE.note) - # * https://cran.r-project.org/package=boot - # (https://github.com/cran/boot/blob/master/DESCRIPTION) - 'Unlimited', - - # Linking = Permissive, Private Use = ? - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'W3C License', - 'W3C-20150513', - - # Linking = Permissive/Public Domain, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'WTFPL', - - # Copyleft = No - # Ref: https://en.wikipedia.org/wiki/Zlib_License - # Linking = Permissive, Private Use = ? (for zlib/libpng license) - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Zlib', - - # Copyleft = No, FSF/OSI-approved: Yes - # Ref: https://en.wikipedia.org/wiki/Zope_Public_License - 'Zope Public License', - ] - - EXCEPTIONS = { - - # The Bioconductor zlibbioc license is a permissive license. 
- # Ref: https://github.com/Bioconductor/zlibbioc/blob/devel/LICENSE - 'Custom: bioconductor-zlibbioc file LICENSE': [ - 'bioconductor-zlibbioc' - ], - - # The Bioconductor rsamtools license is an MIT license - # Ref: https://bioconductor.org/packages/release/bioc/licenses/Rsamtools/LICENSE - 'Custom: bioconductor-rsamtools file LICENSE': [ - 'bioconductor-rsamtools' - ], - - # DFSG = Debian Free Software Guidelines - # Ref: https://en.wikipedia.org/wiki/Debian_Free_Software_Guidelines - # Used as an apparent modifier to other licenses, to say they are approved per Debian. - # For example in this case, pytest-timeout has license: DFSG approved, MIT License, - # but is really just an MIT License that someone has checked is DFSG approved. - 'DFSG approved': [ - 'pytest-timeout', # MIT Licensed - ], - - 'FOSS': [ - # The r-stringi library is a conda library that implements a stringi (pronounced "stringy") library for R. - # The COnda source feed is: https://github.com/conda-forge/r-stringi-feedstock - # This page explains that the home source is https://stringi.gagolewski.com/ but that's a doc page. - # The doc page says: - # > stringi’s source code is hosted on GitHub. - # > It is distributed under the open source BSD-3-clause license. - # The source code has a license that begins with a BSD-3-clause license and includes numerous others, - # but they all appear to be permissive. 
- # Ref: https://github.com/gagolews/stringi/blob/master/LICENSE - 'stringi', 'r-stringi', - ], - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Lesser General Public License v2 or later (LGPLv2+)': [ - 'chardet' # used at runtime during server operation (ingestion), but not modified or distributed - ], - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Lesser General Public License v3 or later (LGPLv3+)': [ - # used only privately in testing, not used in server code, not modified, not distributed - 'pytest-redis', - # required by pytest-redis (used only where it's used) - 'mirakuru', - ], - - 'GNU General Public License (GPL)': [ - 'docutils', # Used only privately as a separate documentation-generation task for ReadTheDocs - ], - - 'MIT/X11 Derivative': [ - # The license used by libxkbcommon is complicated and involves numerous included licenses, - # but all are permissive. - # Ref: https://github.com/xkbcommon/libxkbcommon/blob/master/LICENSE - 'libxkbcommon', - ], - - 'None': [ - # It's not obvious why Conda shows this license as 'None'. - # In fact, though, BSD 3-Clause "New" or "Revised" License - # Ref: https://github.com/AnacondaRecipes/_libgcc_mutex-feedstock/blob/master/LICENSE.txt - '_libgcc_mutex', - ], - - 'PostgreSQL': [ - # The libpq library is actually licensed with a permissive BSD 3-Clause "New" or "Revised" License - # Ref: https://github.com/lpsmith/postgresql-libpq/blob/master/LICENSE - 'libpq', - ], - - 'UCSD': [ - # It isn't obvious why these show up with a UCSD license in Conda. 
- # The actual sources say it should be a 2-clause BSD license: - # Refs: - # * https://github.com/AlexandrovLab/SigProfilerMatrixGenerator/blob/master/LICENSE - # * https://github.com/AlexandrovLab/SigProfilerPlotting/blob/master/LICENSE - 'sigprofilermatrixgenerator', - 'sigprofilerplotting', - ], - - 'X11': [ - # The ncurses library has a VERY complicated history, BUT seems consistently permissive - # and the most recent version seems to be essentially the MIT license. - # Refs: - # * https://en.wikipedia.org/wiki/Ncurses#License - # * https://invisible-island.net/ncurses/ncurses-license.html - 'ncurses' - ], - - 'zlib-acknowledgement': [ - # It isn't clear whey libpng shows up with this license name, but the license for libpng - # is a permissive license. - # Ref: https://github.com/glennrp/libpng/blob/libpng16/LICENSE - 'libpng', - ], - - } - - EXPECTED_MISSING_LICENSES = [ - - # This is a name we use for our C4 portals. And it isn't published. - # We inherited the name from the Stanford ENCODE group, which had an MIT-licensed repo we forked - 'encoded', # cgap-portal, fourfront, and smaht-portal all call themselves this - - # We believe that since these next here are part of the Pylons project, they're covered under - # the same license as the other Pylons projects. We're seeking clarification. - 'pyramid-translogger', - 'subprocess-middleware', - - # This appears to be a BSD 2-Clause "Simplified" License, according to GitHub. - # PyPi also says it's a BSD license. - # Ref: https://github.com/paulc/dnslib/blob/master/LICENSE - 'dnslib', - - # This says it wants an ISC License, which we already have approval for but just isn't showing up. - # Ref: https://github.com/rthalley/dnspython/blob/master/LICENSE - 'dnspython', - - # This appears to be a mostly-MIT-style license. - # There are references to parts being in the public domain, though it's not obvious if that's meaningful. - # It's probably sufficient for our purposes to treat this as a permissive license. 
- # Ref: https://github.com/tlsfuzzer/python-ecdsa/blob/master/LICENSE - 'ecdsa', - - # This has an MIT license in its source repository - # Ref: https://github.com/xlwings/jsondiff/blob/master/LICENSE - 'jsondiff', - - # This has an MIT license in its source repository - # Ref: https://github.com/pkerpedjiev/negspy/blob/master/LICENSE - 'negspy', - - # This license statement is complicated, but seems adequately permissive. - # Ref: https://foss.heptapod.net/python-libs/passlib/-/blob/branch/stable/LICENSE - 'passlib', - - # This seems to be a BSD-3-Clause license. - # Ref: https://github.com/protocolbuffers/protobuf/blob/main/LICENSE - # pypi agrees in the Meta section of protobuf's page, where it says "3-Clause BSD License" - # Ref: https://pypi.org/project/protobuf/ - 'protobuf', - - # The WTFPL license is permissive. - # Ref: https://github.com/mk-fg/pretty-yaml/blob/master/COPYING - 'pyaml', - - # This uses a BSD license - # Ref: https://github.com/eliben/pycparser/blob/master/LICENSE - 'pycparser', - - # The source repo for pyDes says this is under an MIT license - # Ref: https://github.com/twhiteman/pyDes/blob/master/LICENSE.txt - # pypi, probably wrongly, thinks this is in the public domain (as of 2023-07-21) - # Ref: https://pypi.org/project/pyDes/ - 'pyDes', - - # This uses an MIT license - # Ref: https://github.com/pysam-developers/pysam/blob/master/COPYING - 'pysam', - - # The version of python-lambda that we forked calls itself this (and publishes at pypi under this name) - "python-lambda-4dn", - - # This is MIT-licensed: - # Ref: https://github.com/themiurgo/ratelim/blob/master/LICENSE - # pypi agrees - # Ref: https://pypi.org/project/ratelim/ - 'ratelim', - - # This is a BSD-3-Clause-Modification license - # Ref: https://github.com/repoze/repoze.debug/blob/master/LICENSE.txt - 'repoze.debug', - - # This is an Apache-2.0 license - # Ref: https://github.com/getsentry/responses/blob/master/LICENSE - 'responses', - - # This seems to get flagged 
sometimes, but is not the pypi snovault library, it's what our dcicsnovault - # calls itself internally. In any case, it's under MIT license and OK. - # Ref: https://github.com/4dn-dcic/snovault/blob/master/LICENSE.txt - 'snovault', - - # PyPi identifies the supervisor library license as "BSD-derived (http://www.repoze.org/LICENSE.txt)" - # Ref: https://pypi.org/project/supervisor/ - # In fact, though, the license is a bit more complicated, though apparently still permissive. - # Ref: https://github.com/Supervisor/supervisor/blob/main/LICENSES.txt - 'supervisor', - - # This seems to be a BSD-3-Clause-Modification license. - # Ref: https://github.com/Pylons/translationstring/blob/master/LICENSE.txt - 'translationstring', - - # This seems to be a BSD-3-Clause-Modification license. - # Ref: https://github.com/Pylons/venusian/blob/master/LICENSE.txt - 'venusian', - # PyPi identifies zope.deprecation as using the "Zope Public License (ZPL 2.1)" license. - # Ref: https://github.com/zopefoundation/Zope/blob/master/LICENSE.txt - 'zope.deprecation', - - # Below are licenses last known to have licenses missing in pip-licenses and need to be investigated further. - # Note well that just because pip-licenses doesn't know the license doesn't mean the software has - # no license. It may just mean the library is poorly registered in pypi. Some licenses have to be - # found by looking at the library's documentation or source files. - - # (all of these have been classified at this point) - - ] - - -@LicenseCheckerRegistry.register_checker('park-lab-pipeline') -class ParkLabPipelineLicenseChecker(ParkLabCommonLicenseChecker): +def read_license_policy_file(file): """ - Minimal checker common to pipelines from Park Lab. + Reads a license policy file, which is a JSONC file (can contain JSON with Javascript-style comments) + The policy is a dictionary, but the ALLOWED option is a list that can contain special syntax allowing + a regular expression to be inferred. 
See documentation of `literal_string_or_regexp_from_dict` for details. """ + data = JsoncParser.parse_file(file) + allowed = data.get('ALLOWED') + if isinstance(allowed, list): + # The "ALLOWED" option is specially permitted to contain regular expressions. + data['ALLOWED'] = [literal_string_or_regexp_from_dict(allowance) for allowance in allowed] + return data - LICENSE_FRAMEWORKS = ['python', 'conda', 'r'] +_MY_DIR = os.path.dirname(__file__) -@LicenseCheckerRegistry.register_checker('park-lab-gpl-pipeline') -class ParkLabGplPipelineLicenseChecker(ParkLabCommonLicenseChecker): - """ - Minimal checker common to GPL pipelines from Park Lab. - """ +POLICY_DIR = os.path.join(_MY_DIR, "license_policies") - ALLOWED = ParkLabPipelineLicenseChecker.ALLOWED + [ - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - # The "exceptions", if present, indicate waivers to source delivery requirements. - # Ref: https://spdx.org/licenses/LGPL-3.0-linking-exception.html - pattern('GNU Lesser General Public License v2( or later)?( [(]LGPL[v]?[23][+]?[)])?'), - # 'GNU Lesser General Public License v2 or later (LGPLv2+)', - # 'GNU Lesser General Public License v3 or later (LGPLv3+)', - # 'LGPLv2', 'LGPL-v2', 'LGPL-v2.0', 'LGPL-2', 'LGPL-2.0', - # 'LGPLv2+', 'LGPL-v2+', 'LGPL-v2.0+', 'LGPL-2+', 'LGPL-2.0+', - # 'LGPLv3', 'LGPL-v3', 'LGPL-v3.0', 'LGPL-3', 'LGPL-3.0', - # 'LGPLv3+', 'LGPL-v3+', 'LGPL-v3.0+', 'LGPL-3+', 'LGPL-3.0+', - pattern('LGPL[v-]?[.0-9]*([+]|-only)?([- ]with[- ]exceptions)?'), - - # Uncertain whether this is LGPL 2 or 3, but in any case we think weak copyleft should be OK - # for pipeline or server use as long as we're not distributing sources.
- 'LGPL', - 'GNU Library or Lesser General Public License (LGPL)', - - # GPL - # * library exception operates like LGPL - # * classpath exception is a linking exception related to Oracle - # Refs: - # * https://www.gnu.org/licenses/old-licenses/gpl-1.0.en.html - # * https://spdx.org/licenses/GPL-2.0-with-GCC-exception.html - # * https://spdx.org/licenses/GPL-3.0-with-GCC-exception.html - pattern('(GNU General Public License|GPL)[ ]?[v-]?[123]([.]0)?([+]|[- ]only)?' - '([- ]with[- ]GCC(([- ]runtime)?[- ]library)?[- ]exception([- ][.0-9]*)?)?' - '([- ]with[- ]Classpath[- ]exception([- ][.0-9]+)?)?'), - - # Linking = "GPLv3 compatible only", Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GPL-2-or-3', # we sometimes generate this token - # 'GPLv2+', 'GPL-v2+', 'GPL-v2.0+', 'GPL-2+', 'GPL-2.0+', - # 'GPLv3', 'GPL-v3', 'GPL-v3.0', 'GPL-3', 'GPL-3.0', - # 'GPLv3+', 'GPL-v3+', 'GPL-v3.0+', 'GPL-3+', 'GPL-3.0+', - # 'GPLv3-only', 'GPL-3-only', 'GPL-v3-only', 'GPL-3.0-only', 'GPL-v3.0-only', - - # Uncertain whether this is GPL 2 or 3, but we'll assume that means we can use either. - # And version 3 is our preferred interpretation. - 'GNU General Public License', - 'GPL', - - RLicenseFramework.R_LANGUAGE_LICENSE_NAME - - ] - - -@LicenseCheckerRegistry.register_checker('park-lab-common-server') -class ParkLabCommonServerLicenseChecker(ParkLabCommonLicenseChecker): - """ - Checker for servers from Park Lab. +POLICY_DATA_CACHE = {} - If you're at some other organization, we recommend you make a class that has values - suitable to your own organizational needs. - """ - LICENSE_FRAMEWORKS = ['python', 'javascript'] - - EXCEPTIONS = augment( - ParkLabCommonLicenseChecker.EXCEPTIONS, - by={ - 'BSD*': [ - # Although modified to insert the author name into the license text itself, - # the license for these libraries are essentially BSD-3-Clause. 
- 'formatio', - 'samsam', - - # There are some slightly different versions of what appear to be BSD licenses here, - # but clearly the license is permissive. - # Ref: https://www.npmjs.com/package/mutation-observer?activeTab=readme - 'mutation-observer', - ], - - 'Custom: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global': [ - # The use of this URL appears to be a syntax error in the definition of entries-ponyfill - # In fact this seems to be covered by a CC0-1.0 license. - # Ref: https://unpkg.com/browse/object.entries-ponyfill@1.0.1/LICENSE - 'object.entries-ponyfill', - ], - - 'Custom: https://github.com/saikocat/colorbrewer.': [ - # The use of this URL appears to be a syntax error in the definition of cartocolor - # In fact, this seems to be covered by a CC-BY-3.0 license. - # Ref: https://www.npmjs.com/package/cartocolor?activeTab=readme - 'cartocolor', - ], - - 'Custom: https://travis-ci.org/component/emitter.png': [ - # The use of this png appears to be a syntax error in the definition of emitter-component. - # In fact, emitter-component uses an MIT License - # Ref: https://www.npmjs.com/package/emitter-component - # Ref: https://github.com/component/emitter/blob/master/LICENSE - 'emitter-component', - ], - - # The 'turfs-jsts' repository (https://github.com/DenisCarriere/turf-jsts/blob/master/README.md) - # seems to lack a license, but appears to be forked from the jsts library that uses - # the Eclipse Public License 1.0 and Eclipse Distribution License 1.0, so probably a permissive - # license is intended. 
- 'Custom: https://travis-ci.org/DenisCarriere/turf-jsts.svg': [ - 'turf-jsts' - ], - - 'GNU General Public License (GPL)': [ - 'docutils', # Used only privately as a separate documentation-generation task for ReadTheDocs - ], - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - # 'GNU Lesser General Public License v3 or later (LGPLv3+)', - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Library or Lesser General Public License (LGPL)': [ - 'psycopg2', # Used at runtime during server operation, but not modified or distributed - 'psycopg2-binary', # Used at runtime during server operation, but not modified or distributed - 'chardet', # Potentially used downstream in loadxl to detect charset for text files - 'pyzmq', # Used in post-deploy-perf-tests, not distributed, and not modified or distributed - ], - - 'GPL-2.0': [ - # The license file for the node-forge javascript library says: - # - # "You may use the Forge project under the terms of either the BSD License or the - # GNU General Public License (GPL) Version 2." - # - # (We choose to use it under the BSD license.) - # Ref: https://www.npmjs.com/package/node-forge?activeTab=code - 'node-forge', - ], - - 'MIT*': [ - - # This library uses a mix of licenses, but they (MIT, CC0) generally seem permissive. - # (It also mentions that some tools for building/testing use other libraries.) - # Ref: https://github.com/requirejs/domReady/blob/master/LICENSE - 'domready', - - # This library is under 'COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1' - # Ref: https://github.com/javaee/jsonp/blob/master/LICENSE.txt - # About CDDL ... - # Linking = Permissive, Private Use = ? 
- # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'jsonp', - - # This library says pretty clearly it intends MIT license. - # Ref: https://www.npmjs.com/package/component-indexof - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'component-indexof', - - # These look like a pretty straight MIT license. - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'mixin', # LICENSE file at https://www.npmjs.com/package/mixin?activeTab=code - 'stack-trace', # https://github.com/stacktracejs/stacktrace.js/blob/master/LICENSE - 'typed-function', # LICENSE at https://www.npmjs.com/package/typed-function?activeTab=code - - ], - - 'UNLICENSED': [ - # The udn-browser library is our own and has been observed to sometimes show up in some contexts - # as UNLICENSED, when really it's MIT. - # Ref: https://github.com/dbmi-bgm/udn-browser/blob/main/LICENSE - 'udn-browser', - ], - }) - - -@LicenseCheckerRegistry.register_checker('c4-infrastructure') -class C4InfrastructureLicenseChecker(ParkLabCommonServerLicenseChecker): - """ - Checker for C4 infrastructure (Fourfront, CGAP, SMaHT) from Park Lab. 
- """ +def built_in_policy_names(): + return [ + os.path.splitext(os.path.basename(license_policy_path))[0] + for license_policy_path in glob.glob(os.path.join(POLICY_DIR, "*.jsonc"))] - LICENSE_TITLE = "(The )?MIT License" + +def find_policy_data(policy_name: str, policy_dir: Optional[str] = None, + use_cache: bool = True, error_if_missing: bool = True): + policy_dir = POLICY_DIR if policy_dir is None else policy_dir + existing_data = POLICY_DATA_CACHE.get(policy_name) if use_cache else None + if existing_data: + return existing_data + else: + filename = os.path.join(policy_dir, policy_name + ".jsonc") + if not os.path.exists(filename): + if error_if_missing: + raise ValueError(f"No such policy: {policy_name!r}") + else: + return None + data = read_license_policy_file(filename) + POLICY_DATA_CACHE[policy_name] = data + return data -@LicenseCheckerRegistry.register_checker('c4-python-infrastructure') -class C4PythonInfrastructureLicenseChecker(C4InfrastructureLicenseChecker): +def find_or_create_license_class(*, policy_name: str, policy_dir: str, + # This next argument should never be passed explicitly by callers other than + # recursive calls to this function. -kmp 28-Sep-2023 + _creation_attmpts_in_progress=None): """ - Checker for C4 python library infrastructure (Fourfront, CGAP, SMaHT) from Park Lab. + Define a policy class given a policy name (like 'c4-infrastructure'). 
""" - LICENSE_FRAMEWORKS = ['python'] - - -@LicenseCheckerRegistry.register_checker('scan2-pipeline') -class Scan2PipelineLicenseChecker(ParkLabGplPipelineLicenseChecker): + _creation_attmpts_in_progress = _creation_attmpts_in_progress or [] + existing_checker = LicenseCheckerRegistry.find_checker(checker_name=policy_name) + if existing_checker: + return existing_checker + elif policy_name in _creation_attmpts_in_progress: + raise ValueError(f"Circular reference to {policy_name} detected" + f" while creating {conjoined_list(_creation_attmpts_in_progress)}.") + _creation_attmpts_in_progress.append(policy_name) + license_checker_class_name = to_camel_case(policy_name) + "LicenseChecker" + policy_data = find_policy_data(policy_name, policy_dir=policy_dir) + inherits_from = policy_data.get('inherits_from') + if not isinstance(inherits_from, list): + raise ValueError(f'Policy {policy_name!r} needs "inherits_from": [...parent names...],' + f' which may be empty but must be specified.') + license_frameworks = policy_data.get('LICENSE_FRAMEWORKS') + if license_frameworks == "ALL": + policy_data['LICENSE_FRAMEWORKS'] = LicenseFrameworkRegistry.all_framework_names() + parent_classes = [find_or_create_license_class(policy_name=parent_name, policy_dir=policy_dir, + _creation_attmpts_in_progress=_creation_attmpts_in_progress) + for parent_name in inherits_from] + defaulted_policy_data = default_policy_data(policy_name=policy_name, policy_data=policy_data, + parent_classes=parent_classes) + new_class = type(license_checker_class_name, + (*parent_classes, LicenseChecker), + {'_policy_data': policy_data, **defaulted_policy_data}) + new_class.__doc__ = policy_data.get("description") or f'License policy {policy_name} needs a "description".' + # Sigh. PyCharm can't figure this out type fact out, even with a type hint on the above assignment to new_class, + # such as 'new_class: Type[LicenseChecker] = ...'. That should have worked. 
Putting in an assert was the only way + # I could find to convince PyCharm of the truth. I don't expect this assertion to ever fail. It's just an artifact + # to prevent ugly browser highlighting. I'll try to arrange a bug report for them. -kmp 29-Sep-2023 + assert isinstance(new_class, type) and issubclass(new_class, LicenseChecker) + license_policy_class: Type[LicenseChecker] = new_class + decorator = LicenseCheckerRegistry.register_checker(name=policy_name) + registered_class = decorator(license_policy_class) + if LicenseOptions.DEBUG: # pragma: no cover - this doesn't have to work for production + found_class = LicenseCheckerRegistry.lookup_checker(policy_name) + PRINT(f"Registered checker class {policy_name!r}" + f" with license_frameworks {conjoined_list(found_class.LICENSE_FRAMEWORKS)}.") + _creation_attmpts_in_progress.remove(policy_name) + return registered_class + + +def use_policy_literal(*, policy_name, policy_datum, other_policy_data): + """This is used for datum that requires no merging. The policy_datum is returned. Other arguments are ignored.""" + ignored(policy_name, other_policy_data) + return policy_datum + + +def str_or_regexp_sort_key(datum: Union[str, Regexp]): """ - Checker for SCAN2 library from Park Lab. + Returns a key for a datum that is an element of a list of elements that are strings or compiled regular expressions. + Regular expressions will sort where their pattern would be in the series of strings. """ + # Rationale: We want something like this just to make testing predictable.
+ if isinstance(datum, str): + return datum + else: + return datum.pattern - EXCEPTIONS = augment( - ParkLabGplPipelineLicenseChecker.EXCEPTIONS, - by={ - 'Custom: Matrix file LICENCE': [ - # The custom information in https://cran.r-project.org/web/packages/Matrix/LICENCE - # says there are potential extra restrictions beyond a simple GPL license - # if SparseSuite is used, but it is not requested explicitly by Scan2, and we're - # trusting that any other libraries used by Scan2 would have investigated this. - # So, effectively, we think the Matrix library for this situation operates the - # same as if it were just GPL-3 licensed, and we are fine with that. - 'Matrix' - ], - - "MISSING": [ - # mysql-common and mysql-libs are GPL, but since they are delivered by conda - # and not distributed as part of the Scan2 distribution, they should be OK. - # Ref: https://redresscompliance.com/mysql-license-a-complete-guide-to-licensing/#:~:text=commercial%20use # noQA - 'mysql-common', - 'mysql-libs', - - # This is our own library - 'r-scan2', 'scan2', - ] - } - ) - - EXPECTED_MISSING_LICENSES = ParkLabGplPipelineLicenseChecker.EXPECTED_MISSING_LICENSES + [ - ] +def merge_policy_lists(*, policy_name, policy_datum, other_policy_data, sort_key=None): + """ + Merges a set of policy lists by appending them and de-duplicating. + By default, the result list is assumed to be homogeneous in type and suitable for sorting. + If the list is of heterogeneous type, a sort_key must be supplied to allow a total ordering. + """ + ignored(policy_name) + result = policy_datum + for other_datum in other_policy_data: + result += other_datum + # de-duplicate and apply a deterministic ordering to make testing easier.
+ return sorted(set(result), key=sort_key) + + +def merge_policy_strings_or_regexps(*, policy_name, policy_datum, other_policy_data): + return merge_policy_lists(policy_name=policy_name, policy_datum=policy_datum, other_policy_data=other_policy_data, + sort_key=str_or_regexp_sort_key) + + +def merge_policy_dicts(*, policy_name, policy_datum, other_policy_data): + ignored(policy_name) + merged = defaultdict(lambda: []) + + def add_to_merged(d): + for k, values in d.items(): + for value in values: + merged[k].append(value) + + add_to_merged(policy_datum) + for other_datum in other_policy_data: + add_to_merged(other_datum) + + return {k: sorted(set(v)) for k, v in sorted(merged.items())} + + +POLICY_ATTRS: callable = { + 'class_key': use_policy_literal, + 'class_name': use_policy_literal, + 'inherits_from': use_policy_literal, + 'description': use_policy_literal, + 'LICENSE_TITLE': use_policy_literal, + 'COPYRIGHT_OWNER': use_policy_literal, + 'LICENSE_FRAMEWORKS': use_policy_literal, + 'ALLOWED': merge_policy_strings_or_regexps, + 'EXPECTED_MISSING_LICENSES': merge_policy_lists, + 'EXCEPTIONS': merge_policy_dicts, +} + +POLICY_MERGE_LISTS = {'ALLOWED', 'EXPECTED_MISSING_LICENSES'} +POLICY_MERGE_DICTS = {'EXCEPTIONS'} + + +def get_attrs_for_classes(attr: str, class_data: List[Type]): + result = [] + for class_datum in class_data: + attr_val = getattr(class_datum, attr, None) # Intentionally treats explicit None the same as missing + if attr_val is not None: + result.append(attr_val) + return result + + +def default_policy_data(*, policy_name: str, policy_data: AnyJsonData, parent_classes: List[Type]): + result = {} + for key_to_default, val_to_be_defaulted in policy_data.items(): + attr_handler: Optional[callable] = POLICY_ATTRS.get(key_to_default) + if attr_handler is None: + raise ValueError(f"Bad policy attribute: {key_to_default}") + result[key_to_default] = attr_handler(policy_name=policy_name, policy_datum=val_to_be_defaulted, + 
other_policy_data=get_attrs_for_classes(key_to_default, parent_classes)) + return result + + +def load_license_policies(policy_dir=None): + for policy_name in built_in_policy_names(): + find_or_create_license_class(policy_name=policy_name, policy_dir=policy_dir) + + +# This will cause the definitions of classes in the predefined set to be exported by this library +# in case they need to be imported elsewhere, for example to use in unit-testing. Those are things like +# * ParkLabCommonLicenseChecker, etc. +# * C4InfrastructureLicenseChecker, etc. +# See license_policies/*.jsonc for a full list. +load_license_policies() + +ParkLabCommonLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-common') +ParkLabCommonServerLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-common-server') +ParkLabPipelineLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-pipeline') +ParkLabGplPipelineLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-gpl-pipeline') +C4InfrastructureLicenseChecker = LicenseCheckerRegistry.lookup_checker('c4-infrastructure') +C4PythonInfrastructureLicenseChecker = LicenseCheckerRegistry.lookup_checker('c4-python-infrastructure') diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 115fd00ff..66ef4a371 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1344,7 +1344,11 @@ def to_camel_case(s): """ Converts a string that might be in snake_case or CamelCase into CamelCase.
""" - if s[:1].isupper() and '_' not in s: + hyphen_found = False + if '-' in s: + hyphen_found = True + s = s.replace('-', '_') + if not hyphen_found and s[:1].isupper() and '_' not in s: return s else: return snake_case_to_camel_case(s) diff --git a/dcicutils/scripts/run_license_checker.py b/dcicutils/scripts/run_license_checker.py index f2324c0cf..c07c06529 100644 --- a/dcicutils/scripts/run_license_checker.py +++ b/dcicutils/scripts/run_license_checker.py @@ -10,7 +10,8 @@ EPILOG = __doc__ -ALL_CHECKER_NAMES = LicenseCheckerRegistry.all_checker_names() +ALL_CHECKER_NAMES = sorted(LicenseCheckerRegistry.all_checker_names(), + key=lambda x: 'aaaaa-' + x if x.startswith('park-lab-') else x) NEWLINE = '\n' @@ -31,11 +32,14 @@ def main(): help="Requests additional debugging output.") parser.add_argument("--conda-prefix", "--conda_prefix", "--cp", default=LicenseOptions.CONDA_PREFIX, help=(f"Overrides the CONDA_PREFIX (default {LicenseOptions.CONDA_PREFIX!r}).")) + parser.add_argument("--policy-dir", "--policy_dir", "--pd", default=LicenseOptions.POLICY_DIR, + help=(f"Specifies a custom policy directory (default {LicenseOptions.POLICY_DIR!r}).")) args = parser.parse_args() with script_catch_errors(): - run_license_checker(name=args.name, verbose=not args.brief, debug=args.debug, conda_prefix=args.conda_prefix) + run_license_checker(name=args.name, verbose=not args.brief, debug=args.debug, conda_prefix=args.conda_prefix, + policy_dir=args.policy_dir) def show_help_for_choosing_license_checker(): @@ -62,16 +66,18 @@ def show_help_for_choosing_license_checker(): def run_license_checker(name: Optional[str], verbose=LicenseOptions.VERBOSE, debug=LicenseOptions.DEBUG, - conda_prefix=LicenseOptions.CONDA_PREFIX): + conda_prefix=LicenseOptions.CONDA_PREFIX, + policy_dir=LicenseOptions.POLICY_DIR): if name is None: show_help_for_choosing_license_checker() else: - try: - checker_class: Type[LicenseChecker] = LicenseCheckerRegistry.lookup_checker(name) - except Exception as e: 
- raise ScriptFailure(str(e)) - try: - with LicenseOptions.selected_options(verbose=verbose, debug=debug, conda_prefix=conda_prefix): + with LicenseOptions.selected_options(verbose=verbose, debug=debug, conda_prefix=conda_prefix, + policy_dir=policy_dir): + try: + checker_class: Type[LicenseChecker] = LicenseCheckerRegistry.lookup_checker(name, autoload=True) + except Exception as e: + raise ScriptFailure(str(e)) + try: checker_class.validate() - except LicenseCheckFailure as e: - raise ScriptFailure(get_error_message(e)) + except LicenseCheckFailure as e: + raise ScriptFailure(get_error_message(e)) diff --git a/poetry.lock b/poetry.lock index d7e77523c..c59c8f953 100644 --- a/poetry.lock +++ b/poetry.lock @@ -884,6 +884,18 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "jsonc-parser" +version = "1.1.5" +description = "A lightweight, native tool for parsing .jsonc files" +category = "main" +optional = false +python-versions = ">=3.5" +files = [ + {file = "jsonc-parser-1.1.5.tar.gz", hash = "sha256:7126d17725b0413cd40af4297d9f6412c4181a62135e4c41cdf8f6a82c5936e6"}, + {file = "jsonc_parser-1.1.5-py3-none-any.whl", hash = "sha256:abd1db76a4c6d1733ec7bb5340a89c49cbc878a181a1e7947ee6719eedf2c6cc"}, +] + [[package]] name = "mccabe" version = "0.7.0" @@ -1594,4 +1606,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.7,<3.10" -content-hash = "b8d6612bb28cfb9da79306a82b2ac35a20678e1f62ef86c93b8af3c3d1ed798e" +content-hash = "ca11caee3bf14b381e0aaec68ca6bca23f89064db9d90a61e9500e23eab8106f" diff --git a/pyproject.toml b/pyproject.toml index 1fdb3578b..f919add77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ elasticsearch = "7.13.4" aws-requests-auth = ">=0.4.2,<1" docker = "^4.4.4" gitpython = "^3.1.2" +jsonc-parser = "^1.1.5" pytz = ">=2020.4" PyYAML = 
">=5.1,<5.5" requests = "^2.21.0" diff --git a/test/test_license_utils.py b/test/test_license_utils.py index bb87ce8d8..e0d0fdc25 100644 --- a/test/test_license_utils.py +++ b/test/test_license_utils.py @@ -1,19 +1,26 @@ import copy import datetime +import glob import io import json import os import pytest +import re import subprocess as subprocess_module from collections import defaultdict +from dcicutils import license_utils as license_utils_module +from dcicutils.common import Regexp from dcicutils.license_utils import ( - LicenseOptions, LicenseFrameworkRegistry, LicenseFramework, + POLICY_DIR, + LicenseOptions, LicenseFrameworkRegistry, LicenseFramework, LicenseCheckerRegistry, PythonLicenseFramework, JavascriptLicenseFramework, CondaLicenseFramework, RLicenseFramework, LicenseAnalysis, LicenseChecker, LicenseStatus, LicenseFileParser, LicenseCheckFailure, LicenseOwnershipCheckFailure, LicenseAcceptabilityCheckFailure, warnings as license_utils_warnings_module, - extract_boolean_terms, simplify_license_versions, + extract_boolean_terms, simplify_license_versions, load_license_policies, literal_string_or_regexp_from_dict, + default_policy_data, str_or_regexp_sort_key, get_attrs_for_classes, find_or_create_license_class, + use_policy_literal, merge_policy_lists, merge_policy_strings_or_regexps, merge_policy_dicts, built_in_policy_names, ) from dcicutils.misc_utils import ignored, file_contents, local_attrs from dcicutils.qa_utils import printed_output, MockFileSystem @@ -760,3 +767,221 @@ def mocked_license_logger(message): LicenseFileParser.validate_simple_license_file(filename='LICENSE.txt', analysis=analysis) assert analysis.miscellaneous == ["The copyright year, '2020', should have '2023' at the end."] assert license_warnings == [] + + +def test_default_policy_data(): + + class MyCondaClass(LicenseChecker): + LICENSE_FRAMEWORKS = ['conda'] + EXCEPTIONS = { + 'something': ['some-lib'] + } + + def check_it(input, expected, *, parents=None): + parents = parents 
or [] + assert default_policy_data(policy_name='some-policy', policy_data=input, parent_classes=parents) == expected + + check_it({'LICENSE_FRAMEWORKS': ['a', 'b']}, {'LICENSE_FRAMEWORKS': ['a', 'b']}) + check_it({'LICENSE_FRAMEWORKS': ['a', 'b']}, {'LICENSE_FRAMEWORKS': ['a', 'b']}, parents=[MyCondaClass]) + check_it({}, {}, parents=[MyCondaClass]) + + check_it( + { + 'EXCEPTIONS': { + 'something': ['some-random-lib'], + 'something-else': ['some-other-lib'] + } + }, + { + 'EXCEPTIONS': { + 'something': ['some-lib', 'some-random-lib'], + 'something-else': ['some-other-lib'] + } + }, + parents=[MyCondaClass]) + + +def test_use_policy_literal(): + + class MyIgnoredLicenseChecker(LicenseChecker): + pass + + assert use_policy_literal(policy_name='ignored', policy_datum='anything', + other_policy_data=[MyIgnoredLicenseChecker]) == 'anything' + + +def test_str_or_regexp_sort_key(): + + assert str_or_regexp_sort_key('foo') == 'foo' + assert str_or_regexp_sort_key(re.compile('foo')) == 'foo' + + +def test_merge_policy_lists(): + + list1 = ['a', 'c', 'b'] + list2 = ['f', 'a'] + list3 = ['g', 'a'] + + actual = merge_policy_lists(policy_name='ignored', policy_datum=list1, other_policy_data=[]) + expected = ['a', 'b', 'c'] + assert actual == expected + + actual = merge_policy_lists(policy_name='ignored', policy_datum=list1, other_policy_data=[list2]) + expected = ['a', 'b', 'c', 'f'] + assert actual == expected + + actual = merge_policy_lists(policy_name='ignored', policy_datum=list1, other_policy_data=[list2, list3]) + expected = ['a', 'b', 'c', 'f', 'g'] + assert actual == expected + + with pytest.raises(Exception): + merge_policy_lists(policy_name='ignored', policy_datum=['a', re.compile('foo')], other_policy_data=[]) + + +def test_merge_policy_strings_or_regexps(): + + regexp_foo = re.compile('foo') + regexp_bar = re.compile('bar') + + list1 = ['a', regexp_foo, 'c', 'b'] + list2 = ['f', regexp_bar, 'a'] + list3 = [regexp_foo, 'g', 'a'] + + actual = 
merge_policy_strings_or_regexps(policy_name='ignored', policy_datum=list1, other_policy_data=[]) + expected = ['a', 'b', 'c', regexp_foo] + assert actual == expected + + actual = merge_policy_strings_or_regexps(policy_name='ignored', policy_datum=list1, other_policy_data=[list2]) + expected = ['a', 'b', regexp_bar, 'c', 'f', regexp_foo] + assert actual == expected + + actual = merge_policy_strings_or_regexps(policy_name='ignored', policy_datum=list1, + other_policy_data=[list2, list3]) + expected = ['a', 'b', regexp_bar, 'c', 'f', regexp_foo, 'g'] + assert actual == expected + + +def test_merge_policy_dicts(): + + dict1 = {'foo': ['a', 'b'], 'bar': ['x', 'z']} + dict2 = {'alpha': ['p', 'q']} + dict3 = {'foo': ['a', 'c'], 'baz': ['z', 'w']} + + actual = merge_policy_dicts(policy_name='ignored', policy_datum=dict1, other_policy_data=[]) + expected = {'bar': ['x', 'z'], 'foo': ['a', 'b']} + assert actual == expected + + actual = merge_policy_dicts(policy_name='ignored', policy_datum=dict1, other_policy_data=[dict2]) + expected = {'alpha': ['p', 'q'], 'bar': ['x', 'z'], 'foo': ['a', 'b']} + assert actual == expected + + actual = merge_policy_dicts(policy_name='ignored', policy_datum=dict1, other_policy_data=[dict2, dict3]) + expected = {'alpha': ['p', 'q'], 'bar': ['x', 'z'], 'baz': ['w', 'z'], 'foo': ['a', 'b', 'c']} + assert actual == expected + + +def test_get_attrs_for_classes(): + + class ClassA: + PROP1 = 'val1A' + PROP2 = 'val2A' + + class ClassB: + PROP2 = 'val2B' + PROP3 = 'val3B' + + class ClassC: + PROP1 = 'val1C' + + class ClassAB(ClassA): + PROP1 = 'val1AB' + PROP2 = None + + # Note that the order of the results is the order of the classes in which the value occurs, NOT alphabetical. 
+ + assert get_attrs_for_classes('PROP1', [ClassA]) == ['val1A'] + assert get_attrs_for_classes('PROP1', [ClassA, ClassB]) == ['val1A'] + assert get_attrs_for_classes('PROP1', [ClassA, ClassB, ClassC]) == ['val1A', 'val1C'] + assert get_attrs_for_classes('PROP1', [ClassA, ClassB, ClassC, ClassAB]) == ['val1A', 'val1C', 'val1AB'] + assert get_attrs_for_classes('PROP1', [ClassAB, ClassA, ClassB, ClassC]) == ['val1AB', 'val1A', 'val1C'] + + assert get_attrs_for_classes('PROP2', [ClassA]) == ['val2A'] + assert get_attrs_for_classes('PROP2', [ClassA, ClassB]) == ['val2A', 'val2B'] + assert get_attrs_for_classes('PROP2', [ClassA, ClassB, ClassC]) == ['val2A', 'val2B'] + assert get_attrs_for_classes('PROP2', [ClassA, ClassB, ClassC, ClassAB]) == ['val2A', 'val2B'] # None is ignored + assert get_attrs_for_classes('PROP2', [ClassAB, ClassA, ClassB, ClassC]) == ['val2A', 'val2B'] # ditto + + assert get_attrs_for_classes('PROP3', [ClassA]) == [] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB]) == ['val3B'] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB, ClassC]) == ['val3B'] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB, ClassC, ClassAB]) == ['val3B'] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB, ClassC, ClassAB]) == ['val3B'] + + +def test_literal_string_or_regexp_from_dict(): + + print() # start on a fresh line + + sample_string = "foo" + assert literal_string_or_regexp_from_dict(sample_string) == sample_string + + sample_regexp_pattern_1 = "foo.*" + sample_regexp_pattern_2 = "(bar)" + sample_regexp_pattern_3 = sample_regexp_pattern_1 + sample_regexp_pattern_2 + + default_flags = re.UNICODE | re.IGNORECASE + + result = literal_string_or_regexp_from_dict({"pattern": sample_regexp_pattern_1}) + assert isinstance(result, Regexp) + assert result.pattern == sample_regexp_pattern_1 + assert result.flags == default_flags + + result = literal_string_or_regexp_from_dict({"pattern": [sample_regexp_pattern_1, sample_regexp_pattern_2]}) + assert 
isinstance(result, Regexp) + assert result.pattern == sample_regexp_pattern_3 + + result = literal_string_or_regexp_from_dict({"pattern": sample_regexp_pattern_1, "flags": ["VERBOSE"]}) + assert isinstance(result, Regexp) + assert result.pattern == sample_regexp_pattern_1 + assert result.flags == default_flags | re.VERBOSE + + +def test_find_or_create_license_class(): + test_registry = {} + policy_data_cache = {} + + class TestChecker(LicenseChecker): + pass + + with mock.patch.object(license_utils_module, "find_policy_data") as mock_find_policy_data: + with mock.patch.object(LicenseCheckerRegistry, "REGISTRY", test_registry): + with mock.patch.object(license_utils_module, "POLICY_DATA_CACHE", policy_data_cache): + + # This tests the find part + test_registry['test'] = TestChecker + assert find_or_create_license_class(policy_name='test', policy_dir='ignored') == TestChecker + mock_find_policy_data.assert_not_called() + + mock_find_policy_data.return_value = {"inherits_from": []} + policy_class = find_or_create_license_class(policy_name='something', policy_dir='/my/policy/dir') + assert issubclass(policy_class, LicenseChecker) + + +def test_load_license_policies(): + test_policy_names = ['my_project', 'your_project'] + policy_dir_for_testing = 'some/dir/' + with mock.patch.object(license_utils_module, "find_or_create_license_class") as mock_find_or_create_license_class: + with mock.patch.object(license_utils_module, "built_in_policy_names") as mock_built_in_policy_names: + mock_built_in_policy_names.return_value = test_policy_names + load_license_policies(policy_dir=policy_dir_for_testing) + mock_find_or_create_license_class.assert_has_calls([ + mock.call(policy_name=policy_name, policy_dir=policy_dir_for_testing) + for policy_name in test_policy_names + ]) + + +def test_built_in_policy_names(): + test_project_names = ['my_project', 'your_project'] + with mock.patch.object(glob, "glob") as mock_glob_glob: + mock_glob_glob.return_value = [os.path.join(POLICY_DIR, 
f"{name}.jsonc") for name in test_project_names] + assert built_in_policy_names() == test_project_names diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index 778aabac3..5b80a8ae7 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -2000,8 +2000,9 @@ def test_snake_case_to_camel_case_hyphenated(token, expected): ('x_m_l_container', 'XMLContainer'), ('X_M_L_Container', 'XMLContainer'), ]) -def test_to_camel_case_hyphenated(token, expected): +def test_to_camel_case(token, expected): assert to_camel_case(token) == expected + assert to_camel_case(token.replace('_', '-')) == expected assert to_camel_case(expected) == expected # make sure it's stable