From 6a17aac454c61cfc7f0dafb2e7735270812e70ba Mon Sep 17 00:00:00 2001 From: Andrew Date: Sat, 23 Dec 2017 22:22:05 +0000 Subject: [PATCH 1/3] Use try/catch around functions that might fail As we increase our API usage, these functions might occasionally fail because of rate limiting. We don't want the entire script to crash when that happens, so catch the errors and fall back to an empty value. --- lib/scrape.js | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/lib/scrape.js b/lib/scrape.js index 77ea57b..5443d22 100644 --- a/lib/scrape.js +++ b/lib/scrape.js @@ -369,7 +369,12 @@ async function findGitHubUser(displayName, org) { const username = await findGitHubUserInOrg(displayName, org) if (username) return username - const user = await getGitHubUser(shortName) + let user + try { + user = await getGitHubUser(shortName) + } catch (e) { + console.error(`Got error when finding user ${shortName}...`) + } if (!user) return const login = user.login @@ -381,10 +386,15 @@ async function findGitHubUser(displayName, org) { if (updatedTime.getTime() - openTime.getTime() < 0) return - const nov = await getGitHubUserHistory(login, '2017-11-28', '2017-11-30') - const dec = await getGitHubUserHistory(login, '2017-12-01', '2017-12-31') - const jan = await getGitHubUserHistory(login, '2018-01-01', '2018-01-17') - const orgs = [...nov, ...dec, ...jan].map(repo => repo.split('/')[0]) + let orgs = [] + try { + const nov = await getGitHubUserHistory(login, '2017-11-28', '2017-11-30') + const dec = await getGitHubUserHistory(login, '2017-12-01', '2017-12-31') + const jan = await getGitHubUserHistory(login, '2018-01-01', '2018-01-17') + orgs = [...nov, ...dec, ...jan].map(repo => repo.split('/')[0]) + } catch (e) { + console.error('Could not fetch user history...') + } if (orgs.includes(org)) { return user.login } @@ -418,7 +428,12 @@ async function freshenUserGitHubCache(user, existingUser, organization) { } if 
(checkGitHubUserCacheExpired(existingUser)) { - const exists = await checkGitHubUserExists(user.github_account) + let exists + try { + exists = await checkGitHubUserExists(user.github_account) + } catch (e) { + exists = false + } if (exists) { return { From b33962739d17a13bf8d16ca1606015efa9c6eabb Mon Sep 17 00:00:00 2001 From: Andrew Date: Sat, 23 Dec 2017 22:44:45 +0000 Subject: [PATCH 2/3] Ignore caching if no GitHub account --- lib/scrape.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/scrape.js b/lib/scrape.js index 5443d22..9721e7f 100644 --- a/lib/scrape.js +++ b/lib/scrape.js @@ -420,7 +420,13 @@ function checkGitHubUserCacheExpired(user) { } async function freshenUserGitHubCache(user, existingUser, organization) { - if (!(existingUser && existingUser.github_updated)) { + if ( + !( + existingUser && + existingUser.github_updated && + existingUser.github_account + ) + ) { return { login: await findGitHubUser(user.display_name, organization), updated: Date.now(), From 97acdb848ff554ead7eb094fc4395e86eec30e49 Mon Sep 17 00:00:00 2001 From: Andrew Date: Sat, 23 Dec 2017 22:23:51 +0000 Subject: [PATCH 3/3] Identify GCI users in repository information This begins to identify users in several meta repository stats, including stargazers, watchers and forks. bust-cache Closes https://github.com/coala/gci-leaders/issues/100 --- lib/queries/github_repo_info.graphql | 28 ++++++ lib/queries/index.js | 3 + lib/scrape.js | 133 +++++++++++++++++++++++++-- lib/utils.js | 8 ++ package-lock.json | 26 +++++- package.json | 1 + 6 files changed, 189 insertions(+), 10 deletions(-) create mode 100644 lib/queries/github_repo_info.graphql create mode 100644 lib/queries/index.js create mode 100644 lib/utils.js diff --git a/lib/queries/github_repo_info.graphql b/lib/queries/github_repo_info.graphql new file mode 100644 index 0000000..1c48ff3 --- /dev/null +++ b/lib/queries/github_repo_info.graphql @@ -0,0 +1,28 @@ +query($org: String!) 
{ + organization(login: $org) { + repositories(first: 100) { + nodes { + watchers(first: 100) { + nodes { + login + name + } + } + stargazers(last: 100) { + nodes { + login + name + } + } + forks(last: 100) { + nodes { + owner { + login + } + createdAt + } + } + } + } + } +} diff --git a/lib/queries/index.js b/lib/queries/index.js new file mode 100644 index 0000000..4f770c3 --- /dev/null +++ b/lib/queries/index.js @@ -0,0 +1,3 @@ +const { loadQuery } = require('../utils') + +module.exports.GITHUB_REPO_INFO_QUERY = loadQuery('github_repo_info') diff --git a/lib/scrape.js b/lib/scrape.js index 9721e7f..d251fd9 100644 --- a/lib/scrape.js +++ b/lib/scrape.js @@ -1,4 +1,5 @@ const fetch = require('node-fetch') +const GraphQL = require('graphql-client') const chattie = require('chattie') const fs = require('fs') const json2yaml = require('json2yaml') @@ -7,10 +8,14 @@ const validUsername = require('valid-github-username') const wdk = require('wikidata-sdk') const cheerio = require('cheerio') +const { GITHUB_REPO_INFO_QUERY } = require('./queries') +const { getLatestCommitMessage } = require('./utils') + const GH_BASE = 'https://github.com' const GH_USER_BASE = `${GH_BASE}/users` const GH_ORG_BASE = `${GH_BASE}/orgs` const GH_API_BASE = 'https://api.github.com' +const GH_GQL_BASE = 'https://api.github.com/graphql' const GCI_API_BASE = 'https://codein.withgoogle.com/api' const MIN_SEARCH_SCORE = 10 @@ -54,6 +59,13 @@ const GH_API_OPTIONS = { : {}, } +const GH_GQL_OPTIONS = { + url: GH_GQL_BASE, + headers: process.env.GITHUB_TOKEN + ? 
{ Authorization: `bearer ${process.env.GITHUB_TOKEN}` } + : {}, +} + const GH_WEB_OPTIONS = { headers: { Accept: 'text/html', @@ -65,6 +77,11 @@ const GH_WEB_OPTIONS = { compress: false, } +const client = GraphQL(GH_GQL_OPTIONS) + +let COMPETITION_OPEN +let BUST_GITHUB_CACHE + let existingData = [] try { existingData = JSON.parse( @@ -96,6 +113,71 @@ async function fetchLeaders(id) { return leaders } +let repositoryInfo = {} +async function fetchRepositoryInfo(org) { + if (repositoryInfo[org]) return repositoryInfo[org] + + const { data } = await client.query(GITHUB_REPO_INFO_QUERY, { org }) + + if (data) { + const info = data.organization.repositories.nodes.map(node => ({ + watchers: node.watchers.nodes, + stargazers: node.stargazers.nodes, + forks: node.forks.nodes, + })) + + repositoryInfo[org] = info + + return info + } else { + return [] + } +} + +async function getGitHubUserFromRepoInfo(org, displayName, shortName) { + let repos = [] + try { + repos = await fetchRepositoryInfo(org) + } catch (e) { + console.error(`Could not fetch repository info for ${org}...`) + } + + let logins = [] + let names = {} + + repos.forEach(repo => { + logins = logins + .concat(repo.watchers.map(u => u.login.toLowerCase())) + .concat(repo.stargazers.map(u => u.login.toLowerCase())) + .concat( + repo.forks + .map(u => { + const createdAt = new Date(u.createdAt) + if (createdAt.getTime() > COMPETITION_OPEN.getTime()) { + return u.owner.login.toLowerCase() + } + }) + .filter(login => login) + ) + + repo.watchers.forEach(watcher => { + if (watcher.name) { + names[watcher.name.toLowerCase()] = watcher.login + } + }) + }) + + logins = logins.filter((item, pos, self) => self.indexOf(item) === pos) + + if (logins.includes(shortName.toLowerCase())) { + return shortName + } + + if (names[displayName.toLowerCase()]) { + return names[displayName.toLowerCase()] + } +} + async function checkGitHubUserExists(user) { const res = await fetch(`${GH_BASE}/${user}`) return res.status === 200 @@ 
-366,25 +448,43 @@ async function findGitHubUser(displayName, org) { const shortName = validUsername(displayName) - const username = await findGitHubUserInOrg(displayName, org) - if (username) return username + let userInOrg + try { + userInOrg = await findGitHubUserInOrg(displayName, org) + } catch (e) { + console.error(`Failed to find user ${displayName} in org ${org}...`) + } + if (userInOrg) { + console.log(`${displayName}: ${userInOrg} (method: user in org)`) + return userInOrg + } let user try { user = await getGitHubUser(shortName) } catch (e) { - console.error(`Got error when finding user ${shortName}...`) + console.error(`Failed to find user ${shortName}...`) } - if (!user) return - const login = user.login + if (!user) { + const userFromRepo = await getGitHubUserFromRepoInfo( + org, + displayName, + shortName + ) - const { competition_open_starts } = await fetchProgram() + if (!userFromRepo) { + return + } + + user = userFromRepo + } + + const login = user.login const updatedTime = new Date(user.updated_at) - const openTime = new Date(competition_open_starts) - if (updatedTime.getTime() - openTime.getTime() < 0) return + if (updatedTime.getTime() - COMPETITION_OPEN.getTime() < 0) return let orgs = [] try { @@ -395,7 +495,9 @@ async function findGitHubUser(displayName, org) { } catch (e) { console.error('Could not fetch user history...') } + if (orgs.includes(org)) { + console.log(`${displayName}: ${user.login} (method: found user from name)`) return user.login } } @@ -425,7 +527,8 @@ async function freshenUserGitHubCache(user, existingUser, organization) { existingUser && existingUser.github_updated && existingUser.github_account - ) + ) || + BUST_GITHUB_CACHE ) { return { login: await findGitHubUser(user.display_name, organization), @@ -480,6 +583,8 @@ async function fetchOrgsWithData() { const orgWiki = await Promise.all(fetchingWiki) const fetchingAll = orgs.map(async (org, index) => { + await fetchRepositoryInfo(orgGitHub[index]) + const existingOrg = 
existingData.find(existing => existing.id === org.id) const fetchingUsers = orgLeaders[index].map(async user => { let existingUser @@ -526,6 +631,16 @@ async function fetchDates() { } ;(async () => { + const { competition_open_starts } = await fetchProgram() + COMPETITION_OPEN = new Date(competition_open_starts) + + const { stdout } = await getLatestCommitMessage() + BUST_GITHUB_CACHE = stdout.toLowerCase().includes('bust-cache') + + if (BUST_GITHUB_CACHE) { + console.log('Busting cache...') + } + const orgs = await fetchOrgsWithData() const dates = await fetchDates() diff --git a/lib/utils.js b/lib/utils.js new file mode 100644 index 0000000..4950ae1 --- /dev/null +++ b/lib/utils.js @@ -0,0 +1,8 @@ +const fs = require('fs') +const util = require('util') +const exec = util.promisify(require('child_process').exec) + +module.exports.getLatestCommitMessage = () => exec('git log -1 --pretty=%B') + +module.exports.loadQuery = name => + fs.readFileSync(`${__dirname}/queries/${name}.graphql`).toString() diff --git a/package-lock.json b/package-lock.json index 8965e4f..15d3fe5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3742,7 +3742,8 @@ "fs.realpath": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", - "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=" + "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=", + "dev": true }, "function-bind": { "version": "1.1.1", @@ -3852,6 +3853,14 @@ "integrity": "sha1-Dovf5NHduIVNZOBOp8AOKgJuVlg=", "dev": true }, + "graphql-client": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/graphql-client/-/graphql-client-2.0.0.tgz", + "integrity": "sha512-C89fj9FcIVhxbai9qDa2QLrkyKFOI7wc83rp/EhNF+f48R2fMr4Vnq5XrT31+kaZyBhDMz01PfpDoyRTfOP1DQ==", + "requires": { + "isomorphic-fetch": "2.2.1" + } + }, "growly": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/growly/-/growly-1.3.0.tgz", @@ -4576,6 +4585,15 @@ "isarray": "1.0.0" } }, + "isomorphic-fetch": { + 
"version": "2.2.1", + "resolved": "https://registry.npmjs.org/isomorphic-fetch/-/isomorphic-fetch-2.2.1.tgz", + "integrity": "sha1-YRrhrPFPXoH3KVB0coGf6XM1WKk=", + "requires": { + "node-fetch": "1.7.3", + "whatwg-fetch": "2.0.3" + } + }, "isstream": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz", @@ -5776,6 +5794,7 @@ "version": "3.0.4", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", "integrity": "sha1-UWbihkV/AzBgZL5Ul+jbsMPTIIM=", + "dev": true, "requires": { "brace-expansion": "1.1.8" } @@ -9613,6 +9632,11 @@ "iconv-lite": "0.4.19" } }, + "whatwg-fetch": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/whatwg-fetch/-/whatwg-fetch-2.0.3.tgz", + "integrity": "sha1-nITsLc9oGH/wC8ZOEnS0QhduHIQ=" + }, "whatwg-url": { "version": "6.4.0", "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-6.4.0.tgz", diff --git a/package.json b/package.json index b55c55c..019c92d 100644 --- a/package.json +++ b/package.json @@ -26,6 +26,7 @@ "feed-read-parser": "^0.0.6", "find-rss": "^1.6.4", "glob": "^7.1.2", + "graphql-client": "^2.0.0", "jquery": "^3.2.1", "jquery.i18n": "git+https://github.com/wikimedia/jquery.i18n.git", "json2yaml": "^1.1.0",