mention-bot.js
/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*
* @flow
*/
'use strict';
var githubAuthCookies = require('./githubAuthCookies');
var fs = require('fs');
var minimatch = require('minimatch');
async function downloadFileAsync(url: string, cookies: ?string): Promise<string> {
return new Promise(function(resolve, reject) {
var args = ['--silent', '-L', url];
if (cookies) {
args.push('-H', `Cookie: ${cookies}`);
}
require('child_process')
.execFile('curl', args, {encoding: 'utf8', maxBuffer: 1000 * 1024 * 10}, function(error, stdout, stderr) {
if (error) {
reject(error);
} else {
resolve(stdout.toString());
}
});
});
}
async function readFileAsync(name: string, encoding: string): Promise<string> {
return new Promise(function(resolve, reject) {
fs.readFile(name, encoding, function(err, data) {
if (err) {
reject(err);
} else {
resolve(data);
}
});
});
}
type FileInfo = {
path: string,
deletedLines: Array<number>,
};
type WhitelistUser = {
name: string,
files: Array<string>,
skipTeamPrs: bool
};
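// A minimal sketch of a WhitelistUser entry as it might appear in the bot's
// config (the name and globs below are hypothetical, not part of this repo):
//
//   {
//     name: 'myorg/frontend-team',      // user or org/team to notify
//     files: ['src/**/*.js', 'docs/*'], // minimatch globs over changed paths
//     skipTeamPrs: true,                // skip PRs authored by the team's own members
//   }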
type TeamData = {
name: string,
id: number
};
type TeamMembership = {
name: string,
state: string
};
function startsWith(str, start) {
return str.substr(0, start.length) === start;
}
function parseDiffFile(lines: Array<string>): Array<FileInfo> {
// diff --git "a/path" "b/path" or rename to path/file or rename from path/file
var diffRegex = /^diff --git "?a\/(.+)"?\s/;
// @@ -from_line,from_count +to_line,to_count @@ first line
var offsetRegex = /^@@ -(\d+).+@@/;
var offset = null;
var current = 0;
return lines.reduce((f, line) => {
var match = diffRegex.exec(line);
if (match) {
offset = null;
f.push({path: match[1], deletedLines: []});
return f;
}
var offsetMatch = offsetRegex.exec(line);
if (offsetMatch) {
      offset = parseInt(offsetMatch[1], 10);
current = 0;
return f;
}
    if (line.startsWith('-') && offset) {
      f[f.length - 1].deletedLines.push(current + offset);
    }
    // Added lines don't exist in the old revision that blame is computed
    // against, so they don't advance the old-file line counter.
    if (!line.startsWith('+')) {
      current++;
    }
return f;
  }, []);
}
function parseDiff(diff: string): Array<FileInfo> {
var files = [];
// The algorithm is designed to be best effort. If the http request failed
// for some reason and we get an empty file, we should not crash.
if (!diff || !diff.match(/^diff/)) {
return files;
}
var lines = diff.trim().split('\n');
files = parseDiffFile(lines);
return files;
}
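// A worked example of the two parsers above, on a hypothetical diff:
//
//   parseDiff([
//     'diff --git a/src/foo.js b/src/foo.js',
//     '--- a/src/foo.js',
//     '+++ b/src/foo.js',
//     '@@ -10,7 +10,6 @@',
//     ' context',
//     '-deleted line',
//     ' context',
//   ].join('\n'));
//
// returns [{path: 'src/foo.js', deletedLines: [11]}]: the hunk starts at old
// line 10, the first context line is old line 10, so the '-' line is old
// line 11. Added ('+') lines never land in deletedLines and don't advance
// the old-file line counter.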
/**
 * Sadly, GitHub doesn't have an API to get line-by-line blame for a file.
 * We could git clone the repo and run blame, but it's annoying to have to
 * maintain a local git repo, and the blame is going to be name + email
 * instead of the GitHub username, so we'd have to do an http request anyway.
 *
 * There are two main ways to extract information from an HTML file:
 * - The first is to work like a browser: parse the html, build a DOM tree and
 *   use a jQuery-like API to traverse the DOM and extract what you need.
 *   The big issue is that creating the DOM is --extremely-- slow.
 * - The second is to use regexes to analyze the output html and find whatever
 *   we want.
 *
 * I(vjeux)'ve scraped hundreds of websites using both techniques, and both of
 * them have the same reliability when it comes to websites updating their
 * markup. If they change what you are extracting, you are screwed; if they
 * change something around it, both are resistant to it when written properly.
 * So, might as well use the faster of the two: regex :)
 */
function parseBlame(blame: string): Array<string> {
// The way the document is structured is that commits and lines are
// interleaved. So every time we see a commit we grab the author's name
// and every time we see a line we log the last seen author.
var re = /(<img alt="@([^"]+)" class="avatar blame-commit-avatar"|<tr class="blame-line")/g;
var currentAuthor = 'none';
var lines = [];
var match;
  while ((match = re.exec(blame))) {
if (match[2]) {
currentAuthor = match[2];
} else {
lines.push(currentAuthor);
}
}
return lines;
}
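// For instance, on markup shaped like this hypothetical, heavily simplified
// excerpt of GitHub's blame page:
//
//   <img alt="@alice" class="avatar blame-commit-avatar">
//   <tr class="blame-line">  <- line 1
//   <tr class="blame-line">  <- line 2
//   <img alt="@bob" class="avatar blame-commit-avatar">
//   <tr class="blame-line">  <- line 3
//
// parseBlame yields ['alice', 'alice', 'bob']: each blame-line row is
// attributed to the most recently seen commit avatar.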
function getDeletedOwners(
files: Array<FileInfo>,
blames: { [key: string]: Array<string> }
): { [key: string]: number } {
var owners = {};
files.forEach(function(file) {
var blame = blames[file['path']];
if (!blame) {
return;
}
file.deletedLines.forEach(function (line) {
      // In a perfect world, this should never fail. However, in practice, the
      // blame request may fail, the blame may be computed against master while
      // the pull request targets another branch, or the blame file was too big
      // and the curl wrapper only read the first n bytes...
      // Since the output of the algorithm is best effort, it's better to just
      // swallow errors and have a less accurate implementation than to crash.
var name = blame[line - 1];
if (!name) {
return;
}
owners[name] = (owners[name] || 0) + 1;
});
});
return owners;
}
function getAllOwners(
files: Array<FileInfo>,
blames: { [key: string]: Array<string> }
): { [key: string]: number } {
var owners = {};
files.forEach(function(file) {
var blame = blames[file.path];
if (!blame) {
return;
}
for (var i = 0; i < blame.length; ++i) {
var name = blame[i];
if (!name) {
return;
}
owners[name] = (owners[name] || 0) + 1;
}
});
return owners;
}
function getSortedOwners(
owners: { [key: string]: number }
): Array<string> {
var sorted_owners = Object.keys(owners);
sorted_owners.sort(function(a, b) {
var countA = owners[a];
var countB = owners[b];
return countA > countB ? -1 : (countA < countB ? 1 : 0);
});
return sorted_owners;
}
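// A small worked example of the three helpers above, on hypothetical data: if
// blames['src/foo.js'] is ['alice', 'alice', 'bob'] and the pull request
// deleted line 2 of that file, then
//
//   getDeletedOwners(files, blames)       -> {alice: 1}
//   getAllOwners(files, blames)           -> {alice: 2, bob: 1}
//   getSortedOwners({alice: 2, bob: 1})   -> ['alice', 'bob']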
async function getDiffForPullRequest(
owner: string,
repo: string,
id: number,
github: Object
): Promise<string> {
return new Promise(function(resolve, reject) {
github.pullRequests.get({
owner: owner,
repo: repo,
number: id,
headers: {Accept: 'application/vnd.github.diff'}
}, function (err, result) {
if (err) {
reject(err);
} else {
resolve(result.data);
}
});
});
}
async function getMatchingOwners(
files: Array<FileInfo>,
whitelist: Array<WhitelistUser>,
creator: string,
org: ?string,
github: Object
): Promise<Array<string>> {
var owners = [];
var users = whitelist || [];
users.forEach(function(user) {
let userHasChangedFile = false;
user.files.forEach(function(pattern) {
if (!userHasChangedFile) {
        userHasChangedFile = files.some(function(file) {
          return minimatch(file.path, pattern, { dot: true });
        });
}
});
if (userHasChangedFile && owners.indexOf(user.name) === -1) {
owners.push(user.name);
}
});
if (org) {
owners = await filterOwnTeam(users, owners, creator, org, github);
}
return owners;
}
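// Sketch of the matching above with a hypothetical whitelist entry: a user
// whose files are ['docs/**'] is pushed onto owners as soon as any changed
// path matches, e.g. minimatch('docs/api/readme.md', 'docs/**', {dot: true})
// is true, so a pull request touching that file notifies the user.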
async function filterOwnTeam(
users: Array<WhitelistUser>,
owners: Array<string>,
creator: string,
org: string,
github: Object
): Promise<Array<string>> {
if (!users.some(function(user) {
return user.skipTeamPrs;
})) {
return owners;
}
  // GitHub does not provide an API to look up a team by name.
  // Instead, get all teams, then filter down to the whitelist entries
  // that want to be excluded from their own PRs.
var teamData = await getTeams(org, github, 0);
teamData = teamData.filter(function(team) {
return users.some(function(user) {
return user.skipTeamPrs && user.name === team.name;
});
});
var promises = teamData.map(function(teamInfo) {
return getTeamMembership(creator, teamInfo, github);
});
var teamMemberships = await Promise.all(promises);
teamMemberships = teamMemberships.filter(function(membership) {
return membership.state === 'active';
});
return owners.filter(function(owner) {
return !teamMemberships.find(function(membership) {
return owner === membership.name;
});
});
}
/**
 * While developing/debugging the algorithm itself, it's very important not to
 * make http requests to github. Not only is it going to make the reload cycle
 * much slower, it's also going to get your IP temporarily/permanently banned,
 * and you won't be able to get any more work done when that happens :(
 */
async function fetch(url: string): Promise<string> {
const cacheKey = url.replace(/[^a-zA-Z0-9-_\.]/g, '-');
return cacheGet(cacheKey, () => downloadFileAsync(url, githubAuthCookies));
}
async function cacheGet(
cacheKey: string,
getFn: () => Promise<string>
): Promise<string> {
if (!module.exports.enableCachingForDebugging) {
return getFn();
}
const cacheDir = __dirname + '/cache/';
  if (!fs.existsSync(cacheDir)) {
    // fs.mkdir without a callback is deprecated and throws on newer Node;
    // the synchronous version matches the existsSync check above.
    fs.mkdirSync(cacheDir);
  }
cacheKey = cacheDir + cacheKey;
if (!fs.existsSync(cacheKey)) {
const contents = await getFn();
fs.writeFileSync(cacheKey, contents);
}
return readFileAsync(cacheKey, 'utf8');
}
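// When iterating locally you might flip the flag exported at the bottom of
// this file so repeated runs replay cached responses instead of re-fetching
// from GitHub (a sketch, assuming a scratch script drives this module):
//
//   var mentionBot = require('./mention-bot');
//   mentionBot.enableCachingForDebugging = true;
//   // fetch()/cacheGet() now read and write ./cache/<sanitized-key>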
async function getTeams(
org: string,
github: Object,
page: number
): Promise<Array<TeamData>> {
const perPage = 100;
return new Promise(function(resolve, reject) {
github.orgs.getTeams({
org: org,
page: page,
per_page: perPage
}, function(err, teams) {
if (err) {
reject(err);
} else {
var teamData = teams.map(function(team) {
return {
name: org + "/" + team.slug,
id: team.id
};
});
if (teamData.length === perPage) {
getTeams(org, github, ++page).then(function(results) {
resolve(teamData.concat(results));
})
.catch(reject);
} else {
resolve(teamData);
}
}
});
});
}
async function getOwnerOrgs(
username: string,
github: Object
): Promise<Array<string>> {
return new Promise(function(resolve, reject) {
github.orgs.getForUser({ username: username }, function(err, result) {
if (err) {
reject(err);
} else {
resolve(
          result.map(function(obj) {
return obj.login;
})
);
}
});
});
}
async function getMembersOfOrg(
org: string,
github: Object,
page: number
): Promise<Array<string>> {
const perPage = 100;
return new Promise(function(resolve, reject) {
github.orgs.getMembers({
org: org,
page: page,
per_page: perPage
}, function(err, members) {
if (err) {
reject(err);
} else {
        var logins = members.map(function(obj) {
          return obj.login;
        });
        if (logins.length === perPage) {
getMembersOfOrg(org, github, ++page).then(function(results) {
resolve(logins.concat(results));
})
.catch(reject);
} else {
resolve(logins);
}
}
});
});
}
async function filterRequiredOrgs(
owners: Array<string>,
config: Object,
github: Object
): Promise<Array<string>> {
var promises = config.requiredOrgs.map(function(reqOrg) {
return getMembersOfOrg(reqOrg, github, 0);
  });
var currentMembers = [].concat.apply([], await Promise.all(promises));
return owners.filter(function(owner) {
// User passes if they are in any of the required organizations
return currentMembers.indexOf(owner) >= 0;
});
}
async function getTeamMembership(
creator: string,
teamData: TeamData,
github: Object
): Promise<TeamMembership> {
return new Promise(function(resolve, reject) {
github.orgs.getTeamMembership({
id: teamData.id,
owner: creator
}, function(err, data) {
if (err) {
if (err.code === 404 &&
err.message === '{"message":"Not Found","documentation_url":"https://developer.github.com/v3"}') {
resolve({name: teamData.name, state: 'nonmember'});
} else {
reject(err);
}
} else {
resolve({name: teamData.name, state: data.state});
}
});
});
}
/**
 * If the repo is private then we should only mention users that are still
 * part of that org.
 * Otherwise we could end up with a situation where all the people mentioned
 * have left the org and none of the current staff get notified.
 */
async function filterPrivateRepo(
owners: Array<string>,
org: string,
github: Object
): Promise<Array<string>> {
var currentMembers = await getMembersOfOrg(org, github, 0);
return owners.filter(function(owner, index) {
// user passes if they are still in the org
return currentMembers.some(function(member) {
return member === owner;
});
});
}
/**
 * The problem at hand is to find a best effort set of three people that have
 * context on a pull request. It isn't (and actually can't be) perfect.
 *
 * The most precise information we have is when someone deletes or modifies
 * a line of code. We know who last touched those lines and they are most
 * likely good candidates for reviewing the code.
 * This is of course not always the case: people can codemod a big number of
 * lines and have very little context on that particular one, people move
 * files around and absorb all the blame...
 *
 * But, not all pull requests modify code, many of them just add new lines.
 * I first played with giving credit to the people blamed for the surrounding
 * lines, but it was unclear how to spread out the credit.
 * A much dumber strategy, which has proven to be effective, is to
 * completely ignore new lines and instead find the people that are blamed
 * for the biggest number of lines in the file.
 *
 * Given those two observations, the algorithm is as follows:
 * - For each line that has been deleted, give 1 point to the blamed author
 *   in a 'deletedOwners' pool.
 * - For each file that has been touched, for each line in that file, give 1
 *   point to the blamed author in an 'allOwners' pool.
 * Once you've got those two pools, sort them by number of points, dedupe
 * them, concat them and finally take the first 3 names.
 */
async function guessOwners(
files: Array<FileInfo>,
blames: { [key: string]: Array<string> },
creator: string,
defaultOwners: Array<string>,
fallbackOwners: Array<string>,
privateRepo: boolean,
org: ?string,
config: Object,
github: Object
): Promise<Array<string>> {
var deletedOwners = getDeletedOwners(files, blames);
var allOwners = getAllOwners(files, blames);
deletedOwners = getSortedOwners(deletedOwners);
allOwners = getSortedOwners(allOwners);
  // Remove owners that are also in deletedOwners
  var deletedOwnersSet = new Set(deletedOwners);
  allOwners = allOwners.filter(function(element) {
    return !deletedOwnersSet.has(element);
  });
var owners = []
.concat(deletedOwners)
.concat(allOwners)
.filter(function(owner) {
return owner !== 'none';
})
.filter(function(owner) {
return owner !== creator;
})
.filter(function(owner) {
return config.userBlacklist.indexOf(owner) === -1;
});
if (config.requiredOrgs.length > 0) {
owners = await filterRequiredOrgs(owners, config, github);
}
if (privateRepo && org != null) {
owners = await filterPrivateRepo(owners, org, github);
}
if (owners.length === 0) {
defaultOwners = defaultOwners.concat(fallbackOwners);
}
return owners
.slice(0, config.maxReviewers)
.concat(defaultOwners)
.filter(function(owner, index, ownersFound) {
return ownersFound.indexOf(owner) === index;
});
}
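// To make the ranking concrete, a hypothetical run: with sorted pools
// deletedOwners = ['alice'] and allOwners = ['alice', 'bob'], a creator of
// 'bob' and an empty userBlacklist, the dedupe drops 'alice' from allOwners
// and the creator filter drops 'bob', leaving ['alice']; with defaultOwners =
// ['carol'] and maxReviewers = 3, the promise resolves to ['alice', 'carol'].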
async function guessOwnersForPullRequest(
repoURL: string,
id: number,
creator: string,
targetBranch: string,
privateRepo: boolean,
org: ?string,
config: Object,
github: Object
): Promise<Array<string>> {
const ownerAndRepo = repoURL.split('/').slice(-2);
const cacheKey = `${repoURL}-pull-${id}.diff`.replace(/[^a-zA-Z0-9-_\.]/g, '-');
const diff = await cacheGet(
cacheKey,
() => getDiffForPullRequest(ownerAndRepo[0], ownerAndRepo[1], id, github)
);
var files = parseDiff(diff);
var defaultOwners = await getMatchingOwners(files, config.alwaysNotifyForPaths, creator, org, github);
var fallbackOwners = await getMatchingOwners(files, config.fallbackNotifyForPaths, creator, org, github);
if (!config.findPotentialReviewers) {
return defaultOwners;
}
  // There are going to be degenerate changes that end up modifying hundreds
  // of files. In theory, it would be good to actually run the algorithm on
  // all of them to get the best set of reviewers. In practice, we don't
  // want to do hundreds of http requests. Using the top few files
  // (config.numFilesToCheck, 5 by default) is enough to get us 3 people
  // that may have context.
files.sort(function(a, b) {
var countA = a.deletedLines.length;
var countB = b.deletedLines.length;
return countA > countB ? -1 : (countA < countB ? 1 : 0);
});
// remove files that match any of the globs in the file blacklist config
config.fileBlacklist.forEach(function(glob) {
files = files.filter(function(file) {
return !minimatch(file.path, glob);
});
});
files = files.slice(0, config.numFilesToCheck);
var blames = {};
// create blame promises (allows concurrent loading)
var promises = files.map(function(file) {
return fetch(repoURL + '/blame/' + targetBranch + '/' + file.path);
});
// wait for all promises to resolve
var results = await Promise.all(promises);
results.forEach(function(result, index) {
blames[files[index].path] = parseBlame(result);
});
// This is the line that implements the actual algorithm, all the lines
// before are there to fetch and extract the data needed.
return guessOwners(files, blames, creator, defaultOwners, fallbackOwners, privateRepo, org, config, github);
}
module.exports = {
enableCachingForDebugging: false,
parseDiff: parseDiff,
parseBlame: parseBlame,
guessOwnersForPullRequest: guessOwnersForPullRequest,
};
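// A minimal sketch of driving this module from a caller (hypothetical repo,
// PR number and config; `github` is assumed to be an authenticated client
// exposing the callback-style orgs/pullRequests methods used above):
//
//   var mentionBot = require('./mention-bot');
//   mentionBot.guessOwnersForPullRequest(
//     'https://github.com/someorg/somerepo', // repoURL
//     42,                                    // pull request number
//     'somecreator',                         // PR author; never self-mentioned
//     'master',                              // branch used for blame lookups
//     false,                                 // privateRepo
//     null,                                  // org, for team/private filtering
//     config,  // {maxReviewers: 3, numFilesToCheck: 5, userBlacklist: [], ...}
//     github
//   ).then(function(owners) {
//     console.log('would mention:', owners);
//   });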