mention-bot.js
/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*
* @flow
*/
'use strict';
var githubAuthCookies = require('./githubAuthCookies');
var fs = require('fs');
var minimatch = require('minimatch');
async function downloadFileAsync(url: string, cookies: ?string): Promise<string> {
return new Promise(function(resolve, reject) {
var args = ['--silent', '-L', url];
if (cookies) {
args.push('-H', `Cookie: ${cookies}`);
}
require('child_process')
.execFile('curl', args, {encoding: 'utf8', maxBuffer: 1000 * 1024 * 10}, function(error, stdout, stderr) {
if (error) {
reject(error);
} else {
resolve(stdout.toString());
}
});
});
}
async function readFileAsync(name: string, encoding: string): Promise<string> {
return new Promise(function(resolve, reject) {
fs.readFile(name, encoding, function(err, data) {
if (err) {
reject(err);
} else {
resolve(data);
}
});
});
}
type FileInfo = {
path: string,
deletedLines: Array<number>,
};
type WhitelistUser = {
name: string,
files: Array<string>,
skipTeamPrs: bool
};
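// A minimal sketch of a WhitelistUser entry as it might appear in the bot's
// config (the name and globs below are hypothetical, not part of this repo):
//
//   {
//     name: 'myorg/frontend-team',      // user or org/team to notify
//     files: ['src/**/*.js', 'docs/*'], // minimatch globs over changed paths
//     skipTeamPrs: true,                // skip PRs authored by the team's own members
//   }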
type TeamData = {
name: string,
id: number
};
type TeamMembership = {
name: string,
state: string
};
function startsWith(str, start) {
return str.substr(0, start.length) === start;
}
function parseDiffFile(lines: Array<string>): Array<FileInfo> {
// diff --git "a/path" "b/path" or rename to path/file or rename from path/file
var diffRegex = /^diff --git "?a\/(.+)"?\s/;
// @@ -from_line,from_count +to_line,to_count @@ first line
var offsetRegex = /^@@ -(\d+).+@@/;
var offset = null;
var current = 0;
return lines.reduce((f, line) => {
var match = diffRegex.exec(line);
if (match) {
offset = null;
f.push({path: match[1], deletedLines: []});
return f;
}
var offsetMatch = offsetRegex.exec(line);
if (offsetMatch) {
      offset = parseInt(offsetMatch[1], 10);
current = 0;
return f;
}
    if (line.startsWith('-') && offset) {
      f[f.length - 1].deletedLines.push(current + offset);
    }
    // Added lines don't exist in the old revision that blame is computed
    // against, so they don't advance the old-file line counter.
    if (!line.startsWith('+')) {
      current++;
    }
return f;
  }, []);
}
function parseDiff(diff: string): Array<FileInfo> {
var files = [];
// The algorithm is designed to be best effort. If the http request failed
// for some reason and we get an empty file, we should not crash.
if (!diff || !diff.match(/^diff/)) {
return files;
}
var lines = diff.trim().split('\n');
files = parseDiffFile(lines);
return files;
}
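// A worked example of the two parsers above, on a hypothetical diff:
//
//   parseDiff([
//     'diff --git a/src/foo.js b/src/foo.js',
//     '--- a/src/foo.js',
//     '+++ b/src/foo.js',
//     '@@ -10,7 +10,6 @@',
//     ' context',
//     '-deleted line',
//     ' context',
//   ].join('\n'));
//
// returns [{path: 'src/foo.js', deletedLines: [11]}]: the hunk starts at old
// line 10, the first context line is old line 10, so the '-' line is old
// line 11. Added ('+') lines never land in deletedLines and don't advance
// the old-file line counter.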
/**
 * Sadly, GitHub doesn't have an API to get line-by-line blame for a file.
 * We could git clone the repo and run blame, but it's annoying to have to
 * maintain a local git repo, and the blame is going to be name + email
 * instead of the GitHub username, so we'd have to do an http request anyway.
 *
 * There are two main ways to extract information from an HTML file:
 * - The first is to work like a browser: parse the html, build a DOM tree and
 *   use a jQuery-like API to traverse the DOM and extract what you need.
 *   The big issue is that creating the DOM is --extremely-- slow.
 * - The second is to use regexes to analyze the output html and find whatever
 *   we want.
 *
 * I(vjeux)'ve scraped hundreds of websites using both techniques, and both of
 * them have the same reliability when it comes to websites updating their
 * markup. If they change what you are extracting, you are screwed; if they
 * change something around it, both are resistant to it when written properly.
 * So, might as well use the faster of the two: regex :)
 */
function parseBlame(blame: string): Array<string> {
// The way the document is structured is that commits and lines are
// interleaved. So every time we see a commit we grab the author's name
// and every time we see a line we log the last seen author.
var re = /(<img alt="@([^"]+)" class="avatar blame-commit-avatar"|<tr class="blame-line")/g;
var currentAuthor = 'none';
var lines = [];
var match;
  while ((match = re.exec(blame))) {
if (match[2]) {
currentAuthor = match[2];
} else {
lines.push(currentAuthor);
}
}
return lines;
}
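// For instance, on markup shaped like this hypothetical, heavily simplified
// excerpt of GitHub's blame page:
//
//   <img alt="@alice" class="avatar blame-commit-avatar">
//   <tr class="blame-line">  <- line 1
//   <tr class="blame-line">  <- line 2
//   <img alt="@bob" class="avatar blame-commit-avatar">
//   <tr class="blame-line">  <- line 3
//
// parseBlame yields ['alice', 'alice', 'bob']: each blame-line row is
// attributed to the most recently seen commit avatar.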
function getDeletedOwners(
files: Array<FileInfo>,
blames: { [key: string]: Array<string> }
): { [key: string]: number } {
var owners = {};
files.forEach(function(file) {
var blame = blames[file['path']];
if (!blame) {
return;
}
file.deletedLines.forEach(function (line) {
      // In a perfect world, this should never fail. However, in practice, the
      // blame request may fail, the blame may be computed against master while
      // the pull request targets another branch, or the blame file was too big
      // and the curl wrapper only read the first n bytes...
      // Since the output of the algorithm is best effort, it's better to just
      // swallow errors and have a less accurate implementation than to crash.
var name = blame[line - 1];
if (!name) {
return;
}
owners[name] = (owners[name] || 0) + 1;
});
});
return owners;
}
function getAllOwners(
files: Array<FileInfo>,
blames: { [key: string]: Array<string> }
): { [key: string]: number } {
var owners = {};
files.forEach(function(file) {
var blame = blames[file.path];
if (!blame) {
return;
}
for (var i = 0; i < blame.length; ++i) {
var name = blame[i];
if (!name) {
return;
}
owners[name] = (owners[name] || 0) + 1;
}
});
return owners;
}
function getSortedOwners(
owners: { [key: string]: number }
): Array<string> {
var sorted_owners = Object.keys(owners);
sorted_owners.sort(function(a, b) {
var countA = owners[a];
var countB = owners[b];
return countA > countB ? -1 : (countA < countB ? 1 : 0);
});
return sorted_owners;
}
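// A small worked example of the three helpers above, on hypothetical data: if
// blames['src/foo.js'] is ['alice', 'alice', 'bob'] and the pull request
// deleted line 2 of that file, then
//
//   getDeletedOwners(files, blames)       -> {alice: 1}
//   getAllOwners(files, blames)           -> {alice: 2, bob: 1}
//   getSortedOwners({alice: 2, bob: 1})   -> ['alice', 'bob']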
async function getDiffForPullRequest(
owner: string,
repo: string,
id: number,
github: Object
): Promise<string> {
return new Promise(function(resolve, reject) {
github.pullRequests.get({
owner: owner,
repo: repo,
number: id,
headers: {Accept: 'application/vnd.github.diff'}
}, function (err, result) {
if (err) {
reject(err);
} else {
resolve(result.data);
}
});
});
}
async function getMatchingOwners(
files: Array<FileInfo>,
whitelist: Array<WhitelistUser>,
creator: string,
org: ?string,
github: Object
): Promise<Array<string>> {
var owners = [];
var users = whitelist || [];
users.forEach(function(user) {
let userHasChangedFile = false;
user.files.forEach(function(pattern) {
if (!userHasChangedFile) {
        userHasChangedFile = files.some(function(file) {
          return minimatch(file.path, pattern, { dot: true });
        });
}
});
if (userHasChangedFile && owners.indexOf(user.name) === -1) {
owners.push(user.name);
}
});
if (org) {
owners = await filterOwnTeam(users, owners, creator, org, github);
}
return owners;
}
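// Sketch of the matching above with a hypothetical whitelist entry: a user
// whose files are ['docs/**'] is pushed onto owners as soon as any changed
// path matches, e.g. minimatch('docs/api/readme.md', 'docs/**', {dot: true})
// is true, so a pull request touching that file notifies the user.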
async function filterOwnTeam(
users: Array<WhitelistUser>,
owners: Array<string>,
creator: string,
org: string,
github: Object
): Promise<Array<string>> {
if (!users.some(function(user) {
return user.skipTeamPrs;
})) {
return owners;
}
  // GitHub does not provide an API to look up a team by name.
  // Instead, get all teams, then filter down to the whitelist entries
  // that want to be excluded from their own PRs.
var teamData = await getTeams(org, github, 0);
teamData = teamData.filter(function(team) {
return users.some(function(user) {
return user.skipTeamPrs && user.name === team.name;
});
});
var promises = teamData.map(function(teamInfo) {
return getTeamMembership(creator, teamInfo, github);
});
var teamMemberships = await Promise.all(promises);
teamMemberships = teamMemberships.filter(function(membership) {
return membership.state === 'active';
});
return owners.filter(function(owner) {
return !teamMemberships.find(function(membership) {
return owner === membership.name;
});
});
}
/**
 * While developing/debugging the algorithm itself, it's very important not to
 * make http requests to github. Not only is it going to make the reload cycle
 * much slower, it's also going to get your IP temporarily/permanently banned,
 * and you won't be able to get any more work done when that happens :(
 */
async function fetch(url: string): Promise<string> {
const cacheKey = url.replace(/[^a-zA-Z0-9-_\.]/g, '-');
return cacheGet(cacheKey, () => downloadFileAsync(url, githubAuthCookies));
}
async function cacheGet(
cacheKey: string,
getFn: () => Promise<string>
): Promise<string> {
if (!module.exports.enableCachingForDebugging) {
return getFn();
}
const cacheDir = __dirname + '/cache/';
  if (!fs.existsSync(cacheDir)) {
    // fs.mkdir without a callback is deprecated and throws on newer Node;
    // the synchronous version matches the existsSync check above.
    fs.mkdirSync(cacheDir);
  }
cacheKey = cacheDir + cacheKey;
if (!fs.existsSync(cacheKey)) {
const contents = await getFn();
fs.writeFileSync(cacheKey, contents);
}
return readFileAsync(cacheKey, 'utf8');
}
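// When iterating locally you might flip the flag exported at the bottom of
// this file so repeated runs replay cached responses instead of re-fetching
// from GitHub (a sketch, assuming a scratch script drives this module):
//
//   var mentionBot = require('./mention-bot');
//   mentionBot.enableCachingForDebugging = true;
//   // fetch()/cacheGet() now read and write ./cache/<sanitized-key>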
async function getTeams(
org: string,
github: Object,
page: number
): Promise<Array<TeamData>> {
const perPage = 100;
return new Promise(function(resolve, reject) {
github.orgs.getTeams({
org: org,
page: page,
per_page: perPage
}, function(err, teams) {
if (err) {
reject(err);
} else {
var teamData = teams.map(function(team) {
return {
name: org + "/" + team.slug,
id: team.id
};
});
if (teamData.length === perPage) {
getTeams(org, github, ++page).then(function(results) {
resolve(teamData.concat(results));
})
.catch(reject);
} else {
resolve(teamData);
}
}
});
});
}
async function getOwnerOrgs(
username: string,
github: Object
): Promise<Array<string>> {
return new Promise(function(resolve, reject) {
github.orgs.getForUser({ username: username }, function(err, result) {
if (err) {
reject(err);
} else {
resolve(
          result.map(function(obj) {
return obj.login;
})
);
}
});
});
}
async function getMembersOfOrg(
org: string,
github: Object,
page: number
): Promise<Array<string>> {
const perPage = 100;
return new Promise(function(resolve, reject) {
github.orgs.getMembers({
org: org,
page: page,
per_page: perPage
}, function(err, members) {
if (err) {
reject(err);
} else {
        var logins = members.map(function(obj) {
          return obj.login;
        });
        if (logins.length === perPage) {
getMembersOfOrg(org, github, ++page).then(function(results) {
resolve(logins.concat(results));
})
.catch(reject);
} else {
resolve(logins);
}
}
});
});
}
async function filterRequiredOrgs(
owners: Array<string>,
config: Object,
github: Object
): Promise<Array<string>> {
var promises = config.requiredOrgs.map(function(reqOrg) {
return getMembersOfOrg(reqOrg, github, 0);
  });
var currentMembers = [].concat.apply([], await Promise.all(promises));
return owners.filter(function(owner) {
// User passes if they are in any of the required organizations
return currentMembers.indexOf(owner) >= 0;
});
}
async function getTeamMembership(
creator: string,
teamData: TeamData,
github: Object
): Promise<TeamMembership> {
return new Promise(function(resolve, reject) {
github.orgs.getTeamMembership({
id: teamData.id,
owner: creator
}, function(err, data) {
if (err) {
if (err.code === 404 &&
err.message === '{"message":"Not Found","documentation_url":"https://developer.github.com/v3"}') {
resolve({name: teamData.name, state: 'nonmember'});
} else {
reject(err);
}
} else {
resolve({name: teamData.name, state: data.state});
}
});
});
}
/**
 * If the repo is private then we should only mention users that are still
 * part of that org.
 * Otherwise we could end up with a situation where all the people mentioned
 * have left the org and none of the current staff get notified.
 */
async function filterPrivateRepo(
owners: Array<string>,
org: string,
github: Object
): Promise<Array<string>> {
var currentMembers = await getMembersOfOrg(org, github, 0);
return owners.filter(function(owner, index) {
// user passes if they are still in the org
return currentMembers.some(function(member) {
return member === owner;
});
});
}
/**
 * The problem at hand is to find a best effort set of three people that have
 * context on a pull request. It isn't (and actually can't be) perfect.
 *
 * The most precise information we have is when someone deletes or modifies
 * a line of code. We know who last touched those lines and they are most
 * likely good candidates for reviewing the code.
 * This is of course not always the case: people can codemod a big number of
 * lines and have very little context on that particular one, people move
 * files around and absorb all the blame...
 *
 * But, not all pull requests modify code, many of them just add new lines.
 * I first played with giving credit to the people blamed for the surrounding
 * lines, but it was unclear how to spread out the credit.
 * A much dumber strategy, which has proven to be effective, is to
 * completely ignore new lines and instead find the people that are blamed
 * for the biggest number of lines in the file.
 *
 * Given those two observations, the algorithm is as follows:
 * - For each line that has been deleted, give 1 point to the blamed author
 *   in a 'deletedOwners' pool.
 * - For each file that has been touched, for each line in that file, give 1
 *   point to the blamed author in an 'allOwners' pool.
 * Once you've got those two pools, sort them by number of points, dedupe
 * them, concat them and finally take the first 3 names.
 */
async function guessOwners(
files: Array<FileInfo>,
blames: { [key: string]: Array<string> },
creator: string,
defaultOwners: Array<string>,
fallbackOwners: Array<string>,
privateRepo: boolean,
org: ?string,
config: Object,
github: Object
): Promise<Array<string>> {
var deletedOwners = getDeletedOwners(files, blames);
var allOwners = getAllOwners(files, blames);
deletedOwners = getSortedOwners(deletedOwners);
allOwners = getSortedOwners(allOwners);
  // Remove owners that are also in deletedOwners
  var deletedOwnersSet = new Set(deletedOwners);
  allOwners = allOwners.filter(function(element) {
    return !deletedOwnersSet.has(element);
  });
var owners = []
.concat(deletedOwners)
.concat(allOwners)
.filter(function(owner) {
return owner !== 'none';
})
.filter(function(owner) {
return owner !== creator;
})
.filter(function(owner) {
return config.userBlacklist.indexOf(owner) === -1;
});
if (config.requiredOrgs.length > 0) {
owners = await filterRequiredOrgs(owners, config, github);
}
if (privateRepo && org != null) {
owners = await filterPrivateRepo(owners, org, github);
}
if (owners.length === 0) {
defaultOwners = defaultOwners.concat(fallbackOwners);
}
return owners
.slice(0, config.maxReviewers)
.concat(defaultOwners)
.filter(function(owner, index, ownersFound) {
return ownersFound.indexOf(owner) === index;
});
}
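// To make the ranking concrete, a hypothetical run: with sorted pools
// deletedOwners = ['alice'] and allOwners = ['alice', 'bob'], a creator of
// 'bob' and an empty userBlacklist, the dedupe drops 'alice' from allOwners
// and the creator filter drops 'bob', leaving ['alice']; with defaultOwners =
// ['carol'] and maxReviewers = 3, the promise resolves to ['alice', 'carol'].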
async function guessOwnersForPullRequest(
repoURL: string,
id: number,
creator: string,
targetBranch: string,
privateRepo: boolean,
org: ?string,
config: Object,
github: Object
): Promise<Array<string>> {
const ownerAndRepo = repoURL.split('/').slice(-2);
const cacheKey = `${repoURL}-pull-${id}.diff`.replace(/[^a-zA-Z0-9-_\.]/g, '-');
const diff = await cacheGet(
cacheKey,
() => getDiffForPullRequest(ownerAndRepo[0], ownerAndRepo[1], id, github)
);
var files = parseDiff(diff);
var defaultOwners = await getMatchingOwners(files, config.alwaysNotifyForPaths, creator, org, github);
var fallbackOwners = await getMatchingOwners(files, config.fallbackNotifyForPaths, creator, org, github);
if (!config.findPotentialReviewers) {
return defaultOwners;
}
  // There are going to be degenerate changes that end up modifying hundreds
  // of files. In theory, it would be good to actually run the algorithm on
  // all of them to get the best set of reviewers. In practice, we don't
  // want to do hundreds of http requests. Using the top few files
  // (config.numFilesToCheck, 5 by default) is enough to get us 3 people
  // that may have context.
files.sort(function(a, b) {
var countA = a.deletedLines.length;
var countB = b.deletedLines.length;
return countA > countB ? -1 : (countA < countB ? 1 : 0);
});
// remove files that match any of the globs in the file blacklist config
config.fileBlacklist.forEach(function(glob) {
files = files.filter(function(file) {
return !minimatch(file.path, glob);
});
});
files = files.slice(0, config.numFilesToCheck);
var blames = {};
// create blame promises (allows concurrent loading)
var promises = files.map(function(file) {
return fetch(repoURL + '/blame/' + targetBranch + '/' + file.path);
});
// wait for all promises to resolve
var results = await Promise.all(promises);
results.forEach(function(result, index) {
blames[files[index].path] = parseBlame(result);
});
// This is the line that implements the actual algorithm, all the lines
// before are there to fetch and extract the data needed.
return guessOwners(files, blames, creator, defaultOwners, fallbackOwners, privateRepo, org, config, github);
}
module.exports = {
enableCachingForDebugging: false,
parseDiff: parseDiff,
parseBlame: parseBlame,
guessOwnersForPullRequest: guessOwnersForPullRequest,
};
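// A minimal sketch of driving this module from a caller (hypothetical repo,
// PR number and config; `github` is assumed to be an authenticated client
// exposing the callback-style orgs/pullRequests methods used above):
//
//   var mentionBot = require('./mention-bot');
//   mentionBot.guessOwnersForPullRequest(
//     'https://github.com/someorg/somerepo', // repoURL
//     42,                                    // pull request number
//     'somecreator',                         // PR author; never self-mentioned
//     'master',                              // branch used for blame lookups
//     false,                                 // privateRepo
//     null,                                  // org, for team/private filtering
//     config,  // {maxReviewers: 3, numFilesToCheck: 5, userBlacklist: [], ...}
//     github
//   ).then(function(owners) {
//     console.log('would mention:', owners);
//   });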