This repository has been archived by the owner on Aug 8, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgenerateLicenses.py
568 lines (452 loc) · 22.3 KB
/
generateLicenses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
from subprocess import call
import git
from os import path
import os
import re
import requests
import subprocess
import json
import time
import sys
import base64
import shutil
import argparse
# Filenames for input/output files
workspaceFolder = "workspace_licenses"
reposFilename = "repos.txt"
combinedFilename = 'combinedThirdParty.txt'
dependencySumFilePath = 'summary.tsv'
missingSumFilePath = 'missingSummary.tsv'
cdapBuildRepo = 'cdapio/cdap-build'
cdapRepo = 'cdapio/cdap'

# Regex needed for matching and cleaning urls
# Report line that carries a URL; groups: (license name, artifact, url)
licenseRegex = r'\((.*)\) .* \((.*) - (http.*)\)'
# Report line without a URL; groups: (license name, artifact)
licenseNoURLRegex = r'\((.*)\) .* \((.*) - no url defined\)'
# Any github.com URL; groups: (org, repo). The substitution drops everything after the repo
githubURLRegex = r'https?:\/\/github\.com\/([^\/]*)\/([^\/]*)\/?.*'
githubURLSub = r'https://github.com/\1/\2'
# http[s]://<subdomain>.<domain>.<tld>/... is rewritten to https://github.com/<domain>/<subdomain>
nonGithubURLRegex = r'https?:\/\/([^\.]*)\.([^\.^\/]*)\.[a-zA-Z]\/?.*'
nonGithubURLSub = r'https://github.com/\2/\1'
# github.com file URL -> raw.githubusercontent.com URL; groups: (org, repo, path).
# The optional '/blob/' segment is matched as a literal, non-capturing group. A character
# class ('[\/blob\/]*') would also swallow leading 'b', 'l', 'o' and '/' characters of the
# branch name, e.g. turning '/blob/branch-1/LICENSE' into 'ranch-1/LICENSE'.
githubLicenseRegex = r'https?:\/\/github\.com\/([^\/]*)\/([^\/]*)(?:\/blob\/|\/)?(.*)'
githubLicenseSub = r'https://raw.githubusercontent.com/\1/\2/\3'
# pom.xml line holding the node version; group: the version without its leading 'v'
uiPomVersionRegex = r'<nodeVersion>v(.*)</nodeVersion>$'
# Read repos contents: one repo per line, surrounding whitespace stripped, blank lines dropped
reposFilePath = path.join(os.getcwd(), reposFilename)
# The context manager closes the file even if reading it raises
with open(reposFilePath) as reposFile:
    repos = [line.strip("\n\t ") for line in reposFile.read().split("\n")]
repos = [r for r in repos if len(r) > 0]

# Mutable module state shared between the functions below
localArtifactUrlMap = {}  # artifact (without version) -> url, filled in by createCDAPLicenses()
releaseBranchMap = {}
githubToken = ''  # cached by getGithubAuthHeader() after the first successful read

# Constant paths that should NOT be changed
githubTokenPath = path.expanduser('~/.config/gh/hosts.yml')
cdapCopyrightPath = path.join(os.getcwd(), workspaceFolder, 'cdap/cdap-distributions/src/COPYRIGHT')
def printHeader(message):
    """ Print a blank line, a row of '=' as wide as the message, then the message itself """
    divider = '=' * len(message)
    print('\n' + divider, message, sep='\n')
def convertToGithubURLs(data):
    """
    Rewrites the URL of every (artifact, url, license) triple into a best-guess Github repo URL.
    URLs of the form
        http[s]://<subdomain>.<domain>.xxx/../../
    become
        https://github.com/<domain>/<subdomain>
    and anything after the org and repo of a github.com URL is dropped.
    A URL that matches neither pattern is kept unchanged. Returns a new list of triples.
    """
    def toRepoURL(rawURL):
        # e.g. http://ranger.apache.org/ becomes https://github.com/apache/ranger
        candidate = re.sub(nonGithubURLRegex, nonGithubURLSub, rawURL)
        if re.match(githubURLRegex, candidate) is None:
            return candidate
        # Keep only https://github.com/<org>/<repo>
        return re.sub(githubURLRegex, githubURLSub, candidate)

    return [(artifact, toRepoURL(url), lic) for artifact, url, lic in data]
def getGithubAuthHeader():
    """
    Returns the header required by Github for authentication.
    The token is read once from the config file generated by the Github CLI (gh) and then
    cached in the module-level 'githubToken'.
    If the config file is missing, or no non-empty 'oauth_token:' entry can be read from it,
    an error is written to stderr and the script exits with status 1.
    """
    global githubToken
    if githubToken == '':
        # If the config file is not there
        if not path.exists(githubTokenPath):
            sys.stderr.write(
                "ERROR: Github oauth token is missing! Please run './gh repo view' and follow the instructions to authenticate with Github then retry this script.\n")
            sys.exit(1)
        # Read the config file; the context manager closes it even if the read fails
        with open(githubTokenPath) as githubFile:
            configLines = githubFile.read().split('\n')
        tokenLines = [line for line in configLines if 'oauth_token:' in line]
        # Split on the first ':' only so a value containing ':' is not truncated
        token = tokenLines[0].split(":", 1)[1].strip() if tokenLines else ''
        # Make sure a token was actually extracted (an empty value is as useless as a missing one)
        if not token:
            sys.stderr.write("ERROR: Failed to read Github oauth token from config file:\n%s\n" % '\n'.join(configLines))
            sys.exit(1)
        githubToken = token
    return {'Authorization': 'token %s' % githubToken}
def getLicenseFromGithub(url, redirectURL=None):
    """
    Gets the base64 encoded license contents from the given Github URL.
    If the url is a direct link to a file it is first downloaded from the
    'raw.githubusercontent' domain and base64 encoded locally. If that does not work
    the Github license API is queried (at 'redirectURL' when one is given).
    Returns None when url is None, when the url is not a Github URL, or when every attempt fails.
    """
    if url is None:
        return None
    # First attempt to fetch the license directly from this URL
    # Regex is used to convert the URL in the 'raw.githubusercontent' domain
    # so we can download the raw file contents instead of HTML
    directURL = re.sub(githubLicenseRegex, githubLicenseSub, url)
    if directURL != url:
        resp = requests.get(directURL)
        # If this works then return the string in base64 (to save space)
        if resp.status_code == 200:
            return base64.encodebytes(resp.text.encode('utf-8')).decode('utf-8')  # base64 encoded
    # If this URL is not a redirect URL and it is not a Github URL then quit
    if redirectURL is None and re.match(githubURLRegex, url) is None:
        return None
    # Convert URL into api endpoint
    if redirectURL is not None:
        githubUrl = redirectURL
    else:
        apiRequestRegexSub = r'https://api.github.com/repos/\1/\2/license'
        githubUrl = re.sub(githubURLRegex, apiRequestRegexSub, url)
    # Resolved outside the try block: a missing token must abort the script
    # instead of being reported as an "unexpected error"
    authHeader = getGithubAuthHeader()
    try:
        resp = requests.get(githubUrl, headers=authHeader)
        # This should not happen if the user is authenticated unless we are processing >4000 licenses
        if resp.status_code == 403:
            print("WARN: Hit GitHub rate limit. Sleeping for 5 minutes...")
            time.sleep(5*60)
            # Retry the same endpoint, including any redirect target we were following
            return getLicenseFromGithub(url, redirectURL=redirectURL)
        # If the request fails then quit
        if resp.status_code != 200:
            return None
        # Extract the base64 encoded license from the response
        respJson = resp.json()
        if 'content' in respJson:
            return respJson['content']  # base64 encoded
        # If Github responds with a redirect url (read from the parsed body; the
        # Response object itself is not subscriptable)
        if 'message' in respJson and respJson['message'] == 'Moved Permanently':
            return getLicenseFromGithub(url, redirectURL=respJson['url'])
    except Exception:
        # 'Exception' rather than a bare except so SystemExit/KeyboardInterrupt still propagate
        print("unexpected error")
        return None
    # Should never get here
    return None
def getLicenseFromUrl(url):
    """
    Fetches the license behind the given URL, trying Github first.
    If getLicenseFromGithub() yields nothing, the URL is downloaded as-is and accepted only
    when the server answers 200 with a 'text/plain' body. In that case the text is returned
    as downloaded (not base64 encoded, unlike the Github result).
    Returns None when url is None or when nothing could be fetched.
    """
    if url is None:
        return None
    licenseContents = getLicenseFromGithub(url)
    if licenseContents is not None:
        return licenseContents
    # attempt a text download
    try:
        resp = requests.get(url)
        # Compare the media type only: servers commonly send 'text/plain; charset=utf-8'
        mediaType = resp.headers.get('content-type', '').split(';', 1)[0].strip().lower()
        if resp.status_code == 200 and mediaType == "text/plain":
            return resp.text
        return None
    except Exception:
        # Best effort: any download problem simply means "no license found here"
        return None
def getUrlFromLocalMap(artifact):
    """ Look up the URL for an artifact in the local map, ignoring its trailing ':<version>' part. Returns None if absent """
    # Everything before the last ':' (an empty string when the artifact contains no ':')
    versionlessArtifact = artifact.rpartition(':')[0]
    return localArtifactUrlMap.get(versionlessArtifact)
def createArtifactLicenseMap(data):
    """
    Builds a map of artifact -> license contents for a list of (artifact, url, license) triples.
    Each license is first requested from the URL in the triple. If that fails the URL
    from the local artifact map is tried instead. If that also fails a warning is
    printed and the artifact is left out of the returned map.
    """
    licensesByArtifact = {}  # Result map
    licensesByUrl = {}  # Licenses already downloaded, keyed by the URL in the triple
    for artifact, url, _ in data:
        # Several artifacts can share one Github repo, so only download a URL we have not seen yet
        if url not in licensesByUrl:
            print("DEBUG: %s - Downloading license from mvn generated url %s" % (artifact, url))
            contents = getLicenseFromUrl(url)
            if contents is None:
                # Second chance: the url recorded for this artifact in the local map
                fallbackUrl = getUrlFromLocalMap(artifact)
                print("DEBUG: %s - Unable to fetch license from mvn generated url, falling back to local map." % artifact)
                print("DEBUG: %s - Downloading license from local map url %s" % (artifact, fallbackUrl))
                contents = getLicenseFromUrl(fallbackUrl)
            if contents is None:
                # Both sources failed, skip this artifact
                print("WARN: %s - Failed to fetch license" % artifact)
                continue
            licensesByUrl[url] = contents
            print("DEBUG: %s - Successfully downloaded license" % artifact)
        licensesByArtifact[artifact] = licensesByUrl[url]
    return licensesByArtifact
def createCDAPLicenses(version, existingLicensesMap):
    """
    Fetches and populates license files for all CDAP third party dependencies.

    version: release version string (currently not used by this function).
    existingLicensesMap: map of license folder name -> bool. Artifacts found in it are not
        downloaded again; their entry is set to True to mark the folder as still in use.

    Writes one COPYRIGHT file per new artifact under 'cdapCopyrightPath' and (re)creates the
    dependency and missing-license summary files.
    Exits the script with status 1 if the third party data cannot be generated.
    Returns a tuple (number of licenses added or already present, number of missing licenses).
    """
    global localArtifactUrlMap
    # Load the local map that will be used a backup incase the mvn-generated url do not work
    with open('artifactToRepoMap.csv') as mapFile:
        for line in mapFile:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline at the end of the file)
            artifact, url = line.strip('\n\t').split(',')
            localArtifactUrlMap[artifact] = url

    printHeader("Generating licenses for CDAP")

    # Run commands to generate urls for licenses.
    # Every step is chained with '&&' so that a failing step (most importantly the mvn
    # build) is reflected in the exit code instead of being masked by the last command.
    repoPath = git.getRepoPath(cdapBuildRepo)
    commands = [
        'cd "%s"' % repoPath,
        'git submodule update --init --recursive',  # Make sure all sudmoules are loaded
        'mvn clean install license:add-third-party -DskipTests -Ddocker.skip',  # Generate the data files
        'rm -f ../%s' % combinedFilename,  # '-f': the file does not exist on the first run
        'find . | grep THIRD-PARTY.txt | xargs cat >> ../%s' % combinedFilename,  # Combine all of the files into one file for processing
    ]
    code = call(" && ".join(commands), shell=True)
    if code != 0:
        sys.stderr.write("ERROR: Failed to generate data for third party dependencies in CDAP." +
                         " Please manually resolve the issue and run the script again.\n")
        sys.exit(1)

    # Read the file with the combined data from all repos
    with open(path.join(workspaceFolder, combinedFilename)) as combinedFile:
        allLicenses = combinedFile.read()

    parsedData = []  # Stores the (artifact, url, license) triples
    artifactURLMap = {}  # Used for deduplication

    # Extracting artifacts with URLs
    for match in re.finditer(licenseRegex, allLicenses, re.MULTILINE):
        lic, artifact, url = match.groups()
        # If we've already processed data for this artifact (the output file contains several duplicates for each artifact)
        if artifact in artifactURLMap:
            continue
        artifactURLMap[artifact] = url
        parsedData.append((artifact, url, lic))

    # Extracting artifacts without URLs
    missingLicenses = []
    for match in re.finditer(licenseNoURLRegex, allLicenses, re.MULTILINE):
        lic, artifact = match.groups()
        # If we've already processed data for this artifact (the output file contains several duplicates for each artifact)
        if artifact in artifactURLMap:
            continue
        # Use the url from the local map if there is one, otherwise report the artifact to the user later
        localURL = getUrlFromLocalMap(artifact)
        artifactURLMap[artifact] = localURL
        if localURL is not None:
            parsedData.append((artifact, localURL, lic))
        else:
            missingLicenses.append('%s\t%s\t%s' % (artifact, 'no url defined', lic))

    # Split the data: licenses that already exist on disk do not need to be downloaded
    dataToBeFetched = []
    alreadyExistingData = []
    for triple in parsedData:
        if triple[0] in existingLicensesMap:
            alreadyExistingData.append(triple)
        else:
            dataToBeFetched.append(triple)

    # Clean the urls and fetch the license contents, only for the artifacts that need it
    fetchedData = convertToGithubURLs(dataToBeFetched)
    artifactLicenseMap = createArtifactLicenseMap(fetchedData)
    parsedData = fetchedData + alreadyExistingData

    added = 0
    summaryFileLines = []
    for artifact, url, lic in parsedData:
        summaryFileLines.append('%s\t%s\t%s' % (artifact, url, lic))
        if artifact in existingLicensesMap:
            # Mark the existing license folder as used so that it is kept
            existingLicensesMap[artifact] = True
            added += 1
            continue
        if artifact not in artifactLicenseMap:
            print('WARN: %s - Could not find license' % artifact)
            missingLicenses.append('%s\t%s\t%s' % (artifact, url, lic))
            continue
        licenseContents = artifactLicenseMap[artifact]
        try:
            licenseContents = base64.decodebytes(licenseContents.encode('utf-8')).decode('utf-8')
        except Exception:
            # Contents that are not valid base64/utf-8 are written exactly as they were downloaded
            print('WARN: %s - Failed to decode license. The license will be used as-is.' % artifact)
        print("DEBUG: %s - Writing copyright" % artifact)
        # Generate filepath and ensure the directories are created for that path
        copyrightFilePath = path.join(cdapCopyrightPath, artifact, 'COPYRIGHT')
        os.makedirs(path.dirname(copyrightFilePath), exist_ok=True)
        # Write the license (explicit utf-8, matching the decoding above)
        with open(copyrightFilePath, 'w', encoding='utf-8') as copyrightFile:
            copyrightFile.write(licenseContents)
        added += 1

    # Write the output summary files. Every row ends with a newline so that rows
    # appended to these files later start on a line of their own.
    with open(dependencySumFilePath, 'w') as summaryFile:
        summaryFile.write(''.join(row + '\n' for row in summaryFileLines))
    with open(missingSumFilePath, 'w') as missingFile:
        missingFile.write(''.join(row + '\n' for row in missingLicenses))

    # return #SuccessfullyAdded, #Failed
    return added, len(missingLicenses)
def createUILicenses(version, existingLicensesMap):
    """
    Fetches and populates license files for all UI third party dependencies.

    version: release version string (currently not used by this function).
    existingLicensesMap: map of license folder name -> bool. Artifacts found in it are not
        copied again; their entry is set to True to mark the folder as still in use.

    Installs nvm/node when 'nvm --version' fails, installs the UI dependencies, runs
    license-checker and copies each reported license file under 'cdapCopyrightPath'.
    Results are appended to the dependency and missing-license summary files.
    Returns a tuple (number of licenses added or already present, number of missing licenses).
    """
    printHeader("Generating licenses for UI")

    uiRepoPath = path.join(git.getRepoPath(cdapBuildRepo), 'cdap', 'cdap-ui')
    changeDir = 'cd "%s"' % uiRepoPath
    # Shell lines that set up the environment for nvm commands. The backslash is escaped
    # explicitly: '\.' on its own is an invalid escape sequence in a Python string literal.
    nvmSetup = [
        'export NVM_DIR="$HOME/.nvm"',
        '[ -s "$NVM_DIR/nvm.sh" ] && \\. "$NVM_DIR/nvm.sh"',
    ]

    # Run a test command to see if nvm is installed
    code = call(" && ".join([changeDir] + nvmSetup + ['nvm --version']), shell=True)

    # If nvm is not installed then install it
    if code != 0:
        print("WARN: NVM is not installed. Installing it now...")
        # Extract the node version from the pom.xml file, this version is needed to configure nvm in the next step
        grepVersion = 'cat pom.xml | grep -E "%s"' % uiPomVersionRegex  # grep using regex to only return the 'nodeVersion' lines
        nodeVersions = subprocess.check_output(" && ".join([changeDir, grepVersion]), shell=True).decode('utf-8')
        nodeVersion = re.search(uiPomVersionRegex, nodeVersions).groups()[0]  # Use the regex again to extract the version number out of the line
        print("Got node version from pom.xml, using node v%s" % nodeVersion)
        # Install Node
        installCommands = [changeDir]
        installCommands.append('curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.35.3/install.sh | bash')  # download nvm installer
        installCommands.extend(nvmSetup)
        installCommands.append('nvm install %s' % nodeVersion)  # install node
        call(" && ".join(installCommands), shell=True)

    # Prepare to run license-checker module
    prepareCommands = [changeDir] + nvmSetup + [
        'nvm use node',
        'sudo npm install -g license-checker ',  # Install the license-checker module
        'sudo npm install -g pkg-config ',  # Install the pkg-config module
        'sudo npm install -g bower ',  # Install the bower module
        'sudo npm install -g yarn ',  # Install the yarn module
        'sudo npm install -g gh ',  # Install the gh module
        'yarn install --production',  # Install UI dependancies
        'bower install',  # Install UI dependancies
    ]
    subprocess.call(" && ".join(prepareCommands), shell=True)

    # Run license-checker module and parse json output
    checkerCommands = [changeDir, 'license-checker --json --production']
    uiLicenses = json.loads(subprocess.check_output(" && ".join(checkerCommands), shell=True).decode('utf-8'))

    missingLicenses = []
    summaryFileLines = []
    added = 0
    for artifact, licObj in uiLicenses.items():
        # Replace '/' in the artifact name sine it will be used as part of the path later
        artifact = artifact.replace('/', '-')
        # Extract required values from the json object, with placeholders for absent keys
        url = licObj.get('repository', 'Unkown')  # github url for dependency
        lic = licObj.get('licenses', 'Not Found')  # license name
        srcFilePath = licObj.get('licenseFile', 'NOTFOUND')  # path of the local copy of the license
        summaryFileLines.append('%s\t%s\t%s' % (artifact, url, lic))
        if artifact in existingLicensesMap:
            # Mark the existing license folder as used so that it is kept
            existingLicensesMap[artifact] = True
            added += 1
            continue
        # Make sure the local copy of the license exists, if it doesnt then mark this license as missing
        if not path.exists(srcFilePath):
            print('WARN: Could not find license for %s' % artifact)
            missingLicenses.append('%s\t%s\t%s' % (artifact, url, lic))
            continue
        # Create the path and copy the license file to the correct location
        print("DEBUG: Writing copyright for %s" % artifact)
        copyrightFilePath = path.join(cdapCopyrightPath, artifact, 'COPYRIGHT')
        os.makedirs(path.dirname(copyrightFilePath), exist_ok=True)
        shutil.copyfile(srcFilePath, copyrightFilePath)
        added += 1

    # Append results to the summary files already started by createCDAPLicenses()
    with open(dependencySumFilePath, 'a') as summaryFile:
        summaryFile.write('\n'.join(summaryFileLines))
    with open(missingSumFilePath, 'a') as missingFile:
        missingFile.write('\n'.join(missingLicenses))

    # return #SuccessfullyAdded, #Failed
    return added, len(missingLicenses)
def parseArgs(argv=None):
    """
    Parse command line arguments.

    argv: optional list of argument strings to parse instead of the process arguments
        (sys.argv[1:] is used when it is None).
    Returns the argparse namespace with 'version' (required) and 'output_path' (None when omitted).
    """
    parser = argparse.ArgumentParser(
        description='Script for automatically fetching copyright licenses for all third-party dependencies and placing them in the CDAP repo.')
    parser.add_argument('version',
                        type=str,
                        help='Version string to generate licenses for. Ex. 6.1.4')
    parser.add_argument('--output-path',
                        type=str,
                        help='Path to place the summary files for the missing and successfully added licenses')
    return parser.parse_args(argv)
def main():
    """
    Main function that does all the work.

    Clones the cdap-build and cdap repos on the release branch derived from the version,
    regenerates the third party license files on a new branch of the cdap repo, removes
    license folders that are no longer referenced, then commits and opens a PR.
    Exits with status 1 if the version does not contain a major and a minor component.
    """
    global dependencySumFilePath, missingSumFilePath
    startTime = time.time()

    # Parse command args and setup constants
    args = parseArgs()
    version = args.version

    # Fail early with a clear message: the release branch needs <major>.<minor>
    versionParts = version.split(".")
    if len(versionParts) < 2 or not versionParts[0] or not versionParts[1]:
        sys.stderr.write("ERROR: Invalid version '%s'. Expected <major>.<minor>[.<patch>], Ex. 6.1.4\n" % version)
        sys.exit(1)

    # Update paths if the flag was passed
    if args.output_path:
        outputDir = path.expanduser(args.output_path)
        # Create the folder up front so writing the summaries cannot fail after the long build
        os.makedirs(outputDir, exist_ok=True)
        dependencySumFilePath = path.join(outputDir, dependencySumFilePath)
        missingSumFilePath = path.join(outputDir, missingSumFilePath)

    # Configure git helper library
    git.setWorkspaceFolder(workspaceFolder)
    git.setRepos(repos)

    # Generate release branch name based on version
    releaseBranch = "release/%s.%s" % (versionParts[0], versionParts[1])

    # Init cdap-build repo
    git.cloneRepo(cdapBuildRepo)
    git.checkoutBranch(cdapBuildRepo, releaseBranch)

    # Init cdap repo
    git.deleteLocalRepo(cdapRepo)
    git.cloneRepo(cdapRepo)
    git.checkoutBranch(cdapRepo, releaseBranch)

    # Create a new branch in cdap repo for changes and record the license folders that already exist
    changeBranch = "release-update-license-%s" % version.replace('.', '')
    git.checkoutBranch(cdapRepo, changeBranch, createBranch=True)
    alreadyExistingLicenses = os.listdir(cdapCopyrightPath)
    # This map will be used to skip licenses that are already present;
    # a value flips to True once the license is seen among the current dependencies
    existingLicensesUsedMap = {l: False for l in alreadyExistingLicenses}

    # Populate licenses for CDAP and UI
    cdapAdded, cdapFailed = createCDAPLicenses(version, existingLicensesUsedMap)
    uiAdded, uiFailed = createUILicenses(version, existingLicensesUsedMap)

    # Delete licenses that are no longer needed
    for licenseFolder, used in existingLicensesUsedMap.items():
        if used:
            continue
        print("DEBUG: %s - Deleted unused license file" % licenseFolder)
        shutil.rmtree(path.join(cdapCopyrightPath, licenseFolder))

    # Collect final numbers
    added = cdapAdded+uiAdded
    failed = cdapFailed+uiFailed

    # Create PR
    git.addAndCommit(cdapRepo, "-A", "Updated dependancy copyright license files.")
    PRLink = git.pushAndCreatePR(cdapRepo, "[RELEASE-%s] Update Licenses" % version,
                                 "This is an automated PR to update the copyright licenses for all dependancies of CDAP.\n\n" +
                                 "%d licenses were generated automatically, %d require manual intervention" % (added, failed), changeBranch, releaseBranch, outputURLToFile=False)
    timeDiff = time.time()-startTime

    # Print summary data for operation
    printHeader("Done, ran in %dm %dsec" % (timeDiff//60, timeDiff % 60))
    print("Success! Created %d copyright files. Could not find licenses for %d artifacts, a summary of the failures can be found in %s." %
          (added, failed, missingSumFilePath))
    if failed > 0:
        print("Please manually address the failures and add the changes to this PR: %s" % PRLink)
    else:
        print("PR for approval: %s" % PRLink)
# Script entry point: run main() and hand its return value to sys.exit() as the exit status
if __name__ == '__main__':
    sys.exit(main())