# Repository interaction data for one GitHub organization.
#
# Variables:
#   $org - login of the organization to inspect (required).
#
# For up to 100 repositories of the organization this returns the users
# watching the repository, the users who starred it and its forks.
# No cursors or pageInfo are requested, so each connection yields a single
# page of at most 100 entries.
query($org: String!) {
  organization(login: $org) {
    repositories(first: 100) {
      nodes {
        # First 100 watchers: login plus profile name.
        watchers(first: 100) {
          nodes {
            login
            name
          }
        }
        # Last 100 stargazers: login plus profile name.
        stargazers(last: 100) {
          nodes {
            login
            name
          }
        }
        # Last 100 forks: owner login and the time the fork was created.
        forks(last: 100) {
          nodes {
            owner {
              login
            }
            createdAt
          }
        }
      }
    }
  }
}
// Cache of repository interaction data, keyed by GitHub organization login.
// Created without a prototype so an org login such as "constructor" can never
// be mistaken for a cached entry.
let repositoryInfo = Object.create(null)

/**
 * Fetches the watchers, stargazers and forks of an organization's
 * repositories through the GraphQL client and caches the result per org.
 *
 * @param {string} org - GitHub organization login.
 * @returns {Promise<Array<{watchers: Array, stargazers: Array, forks: Array}>>}
 *   One entry per repository; an empty array when no org is given or the
 *   response carries no organization data. Empty results are not cached.
 * @throws Whatever `client.query` rejects with.
 */
async function fetchRepositoryInfo(org) {
  // Nothing to look up, so do not spend a request on it.
  if (!org) return []

  if (repositoryInfo[org]) return repositoryInfo[org]

  const response = await client.query(GITHUB_REPO_INFO_QUERY, { org })
  const data = response && response.data

  // An unknown login comes back as `organization: null`; treat that like a
  // missing payload instead of dereferencing null.
  const repositories =
    data && data.organization && data.organization.repositories
  if (!repositories || !Array.isArray(repositories.nodes)) return []

  // Normalizes a connection to a plain array without null entries.
  const nodesOf = connection =>
    connection && Array.isArray(connection.nodes)
      ? connection.nodes.filter(Boolean)
      : []

  const info = repositories.nodes.filter(Boolean).map(node => ({
    watchers: nodesOf(node.watchers),
    stargazers: nodesOf(node.stargazers),
    forks: nodesOf(node.forks),
  }))

  repositoryInfo[org] = info

  return info
}

/**
 * Tries to identify a GitHub login from the people interacting with an
 * organization's repositories.
 *
 * A login counts as known when it watches or starred a repository, or owns a
 * fork created after COMPETITION_OPEN. If `shortName` is a known login it is
 * returned as given; otherwise `displayName` is compared (case-insensitively)
 * with the profile names of watchers.
 *
 * NOTE(review): the result is a plain login string. The caller visible in
 * this patch (findGitHubUser) goes on to read `.login` and `.updated_at` from
 * the value it gets back, which a string does not have - confirm which side
 * should change.
 *
 * @param {string} org - GitHub organization login.
 * @param {string} displayName - Display name of the person being looked up.
 * @param {string} shortName - Candidate GitHub login derived from the name.
 * @returns {Promise<string|undefined>} The matching login, or undefined.
 */
async function getGitHubUserFromRepoInfo(org, displayName, shortName) {
  let repos = []
  try {
    repos = await fetchRepositoryInfo(org)
  } catch (e) {
    // Best effort: a failed lookup simply means "no match".
    console.error(
      `Could not fetch repository info for ${org}...`,
      e && e.message
    )
  }

  // Set/Map instead of array + object literal: constant-time membership and
  // no accidental hits on inherited keys such as "constructor".
  const logins = new Set()
  const names = new Map()

  repos.forEach(repo => {
    repo.watchers.forEach(watcher => {
      logins.add(watcher.login.toLowerCase())
      if (watcher.name) {
        names.set(watcher.name.toLowerCase(), watcher.login)
      }
    })

    repo.stargazers.forEach(stargazer => {
      logins.add(stargazer.login.toLowerCase())
    })

    repo.forks.forEach(fork => {
      // Only forks made after the competition opened are evidence of
      // participation; forks whose owner is unavailable are ignored.
      const createdAt = new Date(fork.createdAt)
      if (
        fork.owner &&
        createdAt.getTime() > COMPETITION_OPEN.getTime()
      ) {
        logins.add(fork.owner.login.toLowerCase())
      }
    })
  })

  const wantedLogin =
    typeof shortName === 'string' ? shortName.toLowerCase() : ''
  if (wantedLogin && logins.has(wantedLogin)) {
    return shortName
  }

  const wantedName =
    typeof displayName === 'string' ? displayName.toLowerCase() : ''
  return names.get(wantedName)
}
-366,26 +448,56 @@ async function findGitHubUser(displayName, org) { const shortName = validUsername(displayName) - const username = await findGitHubUserInOrg(displayName, org) - if (username) return username + let userInOrg + try { + userInOrg = await findGitHubUserInOrg(displayName, org) + } catch (e) { + console.error(`Failed to find user ${displayName} in org ${org}...`) + } + if (userInOrg) { + console.log(`${displayName}: ${userInOrg} (method: user in org)`) + return userInOrg + } - const user = await getGitHubUser(shortName) - if (!user) return + let user + try { + user = await getGitHubUser(shortName) + } catch (e) { + console.error(`Failed to find user ${shortName}...`) + } - const login = user.login + if (!user) { + const userFromRepo = await getGitHubUserFromRepoInfo( + org, + displayName, + shortName + ) - const { competition_open_starts } = await fetchProgram() + if (!userFromRepo) { + return + } + + user = userFromRepo + } + + const login = user.login const updatedTime = new Date(user.updated_at) - const openTime = new Date(competition_open_starts) - if (updatedTime.getTime() - openTime.getTime() < 0) return + if (updatedTime.getTime() - COMPETITION_OPEN.getTime() < 0) return + + let orgs = [] + try { + const nov = await getGitHubUserHistory(login, '2017-11-28', '2017-11-30') + const dec = await getGitHubUserHistory(login, '2017-12-01', '2017-12-31') + const jan = await getGitHubUserHistory(login, '2018-01-01', '2018-01-17') + orgs = [...nov, ...dec, ...jan].map(repo => repo.split('/')[0]) + } catch (e) { + console.error('Could not fetch user history...') + } - const nov = await getGitHubUserHistory(login, '2017-11-28', '2017-11-30') - const dec = await getGitHubUserHistory(login, '2017-12-01', '2017-12-31') - const jan = await getGitHubUserHistory(login, '2018-01-01', '2018-01-17') - const orgs = [...nov, ...dec, ...jan].map(repo => repo.split('/')[0]) if (orgs.includes(org)) { + console.log(`${displayName}: ${user.login} (method: found user from 
const { readFileSync } = require('fs')
const { promisify } = require('util')
const childProcess = require('child_process')

// Promise-returning variant of child_process.exec.
const runCommand = promisify(childProcess.exec)

/**
 * Asks git for the message of the most recent commit.
 *
 * @returns {Promise<{stdout: string, stderr: string}>} Output of the git
 *   command; the commit message is in `stdout`.
 */
function getLatestCommitMessage() {
  return runCommand('git log -1 --pretty=%B')
}

/**
 * Reads a GraphQL document from the `queries` directory next to this module.
 *
 * @param {string} name - File name of the query without the `.graphql`
 *   extension.
 * @returns {string} The contents of the query file.
 */
function loadQuery(name) {
  const queryPath = `${__dirname}/queries/${name}.graphql`
  const contents = readFileSync(queryPath)
  return contents.toString()
}

module.exports.getLatestCommitMessage = getLatestCommitMessage
module.exports.loadQuery = loadQuery
b/package.json index b55c55c..019c92d 100644 --- a/package.json +++ b/package.json @@ -26,6 +26,7 @@ "feed-read-parser": "^0.0.6", "find-rss": "^1.6.4", "glob": "^7.1.2", + "graphql-client": "^2.0.0", "jquery": "^3.2.1", "jquery.i18n": "git+https://github.com/wikimedia/jquery.i18n.git", "json2yaml": "^1.1.0",