-
Notifications
You must be signed in to change notification settings - Fork 45
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Identify GCI users in repository information #114
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
query($org: String!) { | ||
organization(login: $org) { | ||
repositories(first: 100) { | ||
nodes { | ||
watchers(first: 100) { | ||
nodes { | ||
login | ||
name | ||
} | ||
} | ||
stargazers(last: 100) { | ||
nodes { | ||
login | ||
name | ||
} | ||
} | ||
forks(last: 100) { | ||
nodes { | ||
owner { | ||
login | ||
} | ||
createdAt | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
const { loadQuery } = require('../utils') | ||
|
||
module.exports.GITHUB_REPO_INFO_QUERY = loadQuery('github_repo_info') |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
const fetch = require('node-fetch') | ||
const GraphQL = require('graphql-client') | ||
const chattie = require('chattie') | ||
const fs = require('fs') | ||
const json2yaml = require('json2yaml') | ||
|
@@ -7,10 +8,14 @@ const validUsername = require('valid-github-username') | |
const wdk = require('wikidata-sdk') | ||
const cheerio = require('cheerio') | ||
|
||
const { GITHUB_REPO_INFO_QUERY } = require('./queries') | ||
const { getLatestCommitMessage } = require('./utils') | ||
|
||
const GH_BASE = 'https://github.com' | ||
const GH_USER_BASE = `${GH_BASE}/users` | ||
const GH_ORG_BASE = `${GH_BASE}/orgs` | ||
const GH_API_BASE = 'https://api.github.com' | ||
const GH_GQL_BASE = 'https://api.github.com/graphql' | ||
const GCI_API_BASE = 'https://codein.withgoogle.com/api' | ||
|
||
const MIN_SEARCH_SCORE = 10 | ||
|
@@ -54,6 +59,13 @@ const GH_API_OPTIONS = { | |
: {}, | ||
} | ||
|
||
const GH_GQL_OPTIONS = { | ||
url: GH_GQL_BASE, | ||
headers: process.env.GITHUB_TOKEN | ||
? { Authorization: `bearer ${process.env.GITHUB_TOKEN}` } | ||
: {}, | ||
} | ||
|
||
const GH_WEB_OPTIONS = { | ||
headers: { | ||
Accept: 'text/html', | ||
|
@@ -65,6 +77,11 @@ const GH_WEB_OPTIONS = { | |
compress: false, | ||
} | ||
|
||
const client = GraphQL(GH_GQL_OPTIONS) | ||
|
||
let COMPETITION_OPEN | ||
let BUST_GITHUB_CACHE | ||
|
||
let existingData = [] | ||
try { | ||
existingData = JSON.parse( | ||
|
@@ -96,6 +113,71 @@ async function fetchLeaders(id) { | |
return leaders | ||
} | ||
|
||
let repositoryInfo = {} | ||
async function fetchRepositoryInfo(org) { | ||
if (repositoryInfo[org]) return repositoryInfo[org] | ||
|
||
const { data } = await client.query(GITHUB_REPO_INFO_QUERY, { org }) | ||
|
||
if (data) { | ||
const info = data.organization.repositories.nodes.map(node => ({ | ||
watchers: node.watchers.nodes, | ||
stargazers: node.stargazers.nodes, | ||
forks: node.forks.nodes, | ||
})) | ||
|
||
repositoryInfo[org] = info | ||
|
||
return info | ||
} else { | ||
return [] | ||
} | ||
} | ||
|
||
async function getGitHubUserFromRepoInfo(org, displayName, shortName) { | ||
let repos = [] | ||
try { | ||
repos = await fetchRepositoryInfo(org) | ||
} catch (e) { | ||
console.error(`Could not fetch repository info for ${org}...`) | ||
} | ||
|
||
let logins = [] | ||
let names = {} | ||
|
||
repos.forEach(repo => { | ||
logins = logins | ||
.concat(repo.watchers.map(u => u.login.toLowerCase())) | ||
.concat(repo.stargazers.map(u => u.login.toLowerCase())) | ||
.concat( | ||
repo.forks | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so I could create account https://github.com/WinningIt , fork a Ubuntu repo, and my fake account gets linked? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suppose you could if you wanted to be malicious, but with the current implementation you could create that same fake account and make a BS issue on any Ubuntu repository and the account would be linked. Or even just do a PR review. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. An issue is a very noticable action. It is a new thing, and the malicious person is the creator. stars/watches/forks are almost invisible. And it is very hard to file 'abuse' with GitHub for doing those actions. Your issue is a specific attempt to do #8 . The biggest problem is that so far this algorithm has a lower confidence level than the previous algorithms, but it is being used first. This could be used to override the existing algorithms, which are more reliable. Once you have a potential match, which wasnt found using the existing more reliable algorithms, you need to look at the match profiles to determine how you can increase the confidence level of your match. The more effort you require of the abuser, the more likely their abuse can only be viewed as intentional or at least highly suspicious. Probably also a good idea to annotate each match with the matching method used. API hits isn't relevant now, as the hits can grow over time. You can include probable matches in the yaml which are not included in the rendered page, as they are needing more analysis deferred until a subsequent build has extra API calls to use. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about running the same checks we do on the user found this way as we do for users found by removing spaces in the display name? i.e. we make sure the user has at least a commit, issue, PR or review on a repository of the org That way we can have the same confidence in this prediction as we do for all the others There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problem with ^ is that it cuts the number of matches in half and gets rid of some valid matches, but it does result in a much more accurate result. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the total number of matches increased? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe it only increases by 1 if we verify using our current method, but will increase by 3 if we allow contributions to any of the GCI orgs to count (not just the one they are a leader for). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 1 is enough to meet the requirements of the task |
||
.map(u => { | ||
const createdAt = new Date(u.createdAt) | ||
if (createdAt.getTime() > COMPETITION_OPEN.getTime()) { | ||
return u.owner.login.toLowerCase() | ||
} | ||
}) | ||
.filter(login => login) | ||
) | ||
|
||
repo.watchers.forEach(watcher => { | ||
if (watcher.name) { | ||
names[watcher.name.toLowerCase()] = watcher.login | ||
} | ||
}) | ||
}) | ||
|
||
logins = logins.filter((item, pos, self) => self.indexOf(item) === pos) | ||
|
||
if (logins.includes(shortName.toLowerCase())) { | ||
return shortName | ||
} | ||
|
||
if (names[displayName.toLowerCase()]) { | ||
return names[displayName.toLowerCase()] | ||
} | ||
} | ||
|
||
async function checkGitHubUserExists(user) { | ||
const res = await fetch(`${GH_BASE}/${user}`) | ||
return res.status === 200 | ||
|
@@ -366,26 +448,56 @@ async function findGitHubUser(displayName, org) { | |
|
||
const shortName = validUsername(displayName) | ||
|
||
const username = await findGitHubUserInOrg(displayName, org) | ||
if (username) return username | ||
let userInOrg | ||
try { | ||
userInOrg = await findGitHubUserInOrg(displayName, org) | ||
} catch (e) { | ||
console.error(`Failed to find user ${displayName} in org ${org}...`) | ||
} | ||
if (userInOrg) { | ||
console.log(`${displayName}: ${userInOrg} (method: user in org)`) | ||
return userInOrg | ||
} | ||
|
||
const user = await getGitHubUser(shortName) | ||
if (!user) return | ||
let user | ||
try { | ||
user = await getGitHubUser(shortName) | ||
} catch (e) { | ||
console.error(`Failed to find user ${shortName}...`) | ||
} | ||
|
||
const login = user.login | ||
if (!user) { | ||
const userFromRepo = await getGitHubUserFromRepoInfo( | ||
org, | ||
displayName, | ||
shortName | ||
) | ||
|
||
const { competition_open_starts } = await fetchProgram() | ||
if (!userFromRepo) { | ||
return | ||
} | ||
|
||
user = userFromRepo | ||
} | ||
|
||
const login = user.login | ||
|
||
const updatedTime = new Date(user.updated_at) | ||
const openTime = new Date(competition_open_starts) | ||
|
||
if (updatedTime.getTime() - openTime.getTime() < 0) return | ||
if (updatedTime.getTime() - COMPETITION_OPEN.getTime() < 0) return | ||
|
||
let orgs = [] | ||
try { | ||
const nov = await getGitHubUserHistory(login, '2017-11-28', '2017-11-30') | ||
const dec = await getGitHubUserHistory(login, '2017-12-01', '2017-12-31') | ||
const jan = await getGitHubUserHistory(login, '2018-01-01', '2018-01-17') | ||
orgs = [...nov, ...dec, ...jan].map(repo => repo.split('/')[0]) | ||
} catch (e) { | ||
console.error('Could not fetch user history...') | ||
} | ||
|
||
const nov = await getGitHubUserHistory(login, '2017-11-28', '2017-11-30') | ||
const dec = await getGitHubUserHistory(login, '2017-12-01', '2017-12-31') | ||
const jan = await getGitHubUserHistory(login, '2018-01-01', '2018-01-17') | ||
const orgs = [...nov, ...dec, ...jan].map(repo => repo.split('/')[0]) | ||
if (orgs.includes(org)) { | ||
console.log(`${displayName}: ${user.login} (method: found user from name)`) | ||
return user.login | ||
} | ||
} | ||
|
@@ -410,15 +522,27 @@ function checkGitHubUserCacheExpired(user) { | |
} | ||
|
||
async function freshenUserGitHubCache(user, existingUser, organization) { | ||
if (!(existingUser && existingUser.github_updated)) { | ||
if ( | ||
!( | ||
existingUser && | ||
existingUser.github_updated && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree with Airbnb, but this is a styling thing in our ESLint config that we should probably change in another PR. |
||
existingUser.github_account | ||
) || | ||
BUST_GITHUB_CACHE | ||
) { | ||
return { | ||
login: await findGitHubUser(user.display_name, organization), | ||
updated: Date.now(), | ||
} | ||
} | ||
|
||
if (checkGitHubUserCacheExpired(existingUser)) { | ||
const exists = await checkGitHubUserExists(user.github_account) | ||
let exists | ||
try { | ||
exists = await checkGitHubUserExists(user.github_account) | ||
} catch (e) { | ||
exists = false | ||
} | ||
|
||
if (exists) { | ||
return { | ||
|
@@ -459,6 +583,8 @@ async function fetchOrgsWithData() { | |
const orgWiki = await Promise.all(fetchingWiki) | ||
|
||
const fetchingAll = orgs.map(async (org, index) => { | ||
await fetchRepositoryInfo(orgGitHub[index]) | ||
|
||
const existingOrg = existingData.find(existing => existing.id === org.id) | ||
const fetchingUsers = orgLeaders[index].map(async user => { | ||
let existingUser | ||
|
@@ -505,6 +631,16 @@ async function fetchDates() { | |
} | ||
|
||
;(async () => { | ||
const { competition_open_starts } = await fetchProgram() | ||
COMPETITION_OPEN = new Date(competition_open_starts) | ||
|
||
const { stdout } = await getLatestCommitMessage() | ||
BUST_GITHUB_CACHE = stdout.toLowerCase().includes('bust-cache') | ||
|
||
if (BUST_GITHUB_CACHE) { | ||
console.log('Busting cache...') | ||
} | ||
|
||
const orgs = await fetchOrgsWithData() | ||
const dates = await fetchDates() | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
const fs = require('fs') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
const util = require('util') | ||
const exec = util.promisify(require('child_process').exec) | ||
|
||
module.exports.getLatestCommitMessage = () => exec('git log -1 --pretty=%B') | ||
|
||
module.exports.loadQuery = name => | ||
fs.readFileSync(`${__dirname}/queries/${name}.graphql`).toString() |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
GCI Leaders now using Babel, isn't it? Use
import
instead?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Only frontend code (
/static
) uses Babel at the moment, so this isn't yet possible.