Skip to content

Commit

Permalink
Identify GCI users in repository information
Browse files Browse the repository at this point in the history
This begins to identify users in several meta repository stats,
including stargazers, watchers and forks.

bust-cache

Closes #100
  • Loading branch information
andrewda committed Dec 25, 2017
1 parent d6ec5df commit a36b823
Show file tree
Hide file tree
Showing 6 changed files with 215 additions and 944 deletions.
3 changes: 3 additions & 0 deletions lib/queries/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
const { loadQuery } = require('../utils')

// GraphQL document text for the repository-info query, read once at load time
// from the `repo_info.graphql` file via loadQuery().
const REPO_INFO_QUERY = loadQuery('repo_info')

module.exports.REPO_INFO_QUERY = REPO_INFO_QUERY
28 changes: 28 additions & 0 deletions lib/queries/repo_info.graphql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Repository "meta" information for one organization, looked up by its login
# ($org): for each returned repository, who watches it, who starred it and who
# forked it (and when).
#
# NOTE(review): no cursors / pageInfo are requested, so only the entries within
# the limits below are returned; larger organizations are truncated.
query($org: String!) {
organization(login: $org) {
# At most 100 repositories of the organization.
repositories(first: 100) {
nodes {
# At most 100 watchers (`first`), with login and profile name.
watchers(first: 100) {
nodes {
login
name
}
}
# At most 100 stargazers; note this uses `last`, unlike watchers.
stargazers(last: 100) {
nodes {
login
name
}
}
# At most 100 forks (`last`), with the fork owner's login and the fork's
# creation time.
forks(last: 100) {
nodes {
owner {
login
}
createdAt
}
}
}
}
}
}
133 changes: 124 additions & 9 deletions lib/scrape.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
const fetch = require('node-fetch')
const GraphQL = require('graphql-client')
const chattie = require('chattie')
const fs = require('fs')
const json2yaml = require('json2yaml')
const validUsername = require('valid-github-username')
const wdk = require('wikidata-sdk')
const cheerio = require('cheerio')

const { REPO_INFO_QUERY } = require('./queries')
const { getLatestCommitMessage } = require('./utils')

// GitHub website URLs.
const GH_BASE = 'https://github.com'
const GH_USER_BASE = `${GH_BASE}/users`
const GH_ORG_BASE = `${GH_BASE}/orgs`

// GitHub API endpoints; the GraphQL endpoint lives under the API host.
const GH_API_BASE = 'https://api.github.com'
const GH_GQL_BASE = `${GH_API_BASE}/graphql`

// Google Code-in API endpoint.
const GCI_API_BASE = 'https://codein.withgoogle.com/api'

// NOTE(review): presumably the minimum score a search hit needs to be
// accepted; its use is outside this section - confirm.
const MIN_SEARCH_SCORE = 10
Expand Down Expand Up @@ -43,6 +48,13 @@ const GH_API_OPTIONS = {
: {},
}

// Options handed to the GraphQL client: the GitHub GraphQL endpoint plus, when
// the GITHUB_TOKEN environment variable is set, a bearer Authorization header.
// Without a token the headers object stays empty.
const GH_GQL_OPTIONS = {
  url: GH_GQL_BASE,
  headers: {},
}

if (process.env.GITHUB_TOKEN) {
  GH_GQL_OPTIONS.headers.Authorization = `bearer ${process.env.GITHUB_TOKEN}`
}

const GH_WEB_OPTIONS = {
headers: {
Accept: 'text/html',
Expand All @@ -54,6 +66,11 @@ const GH_WEB_OPTIONS = {
compress: false,
}

// GraphQL client built from GH_GQL_OPTIONS (GitHub GraphQL endpoint and
// optional auth header).
const client = GraphQL(GH_GQL_OPTIONS)

// Date at which the competition opens. Assigned by the startup code from
// fetchProgram()'s `competition_open_starts` before fetchOrgsWithData() runs.
let COMPETITION_OPEN
// True when the latest git commit message contains "bust-cache"
// (case-insensitive). Assigned by the startup code.
let BUST_GITHUB_CACHE

let existingData = []
try {
existingData = JSON.parse(
Expand All @@ -80,6 +97,71 @@ async function fetchLeaders(id) {
return leaders
}

// Cache of successfully fetched repository info, keyed by organization login.
let repositoryInfo = {}

/**
 * Fetches the watchers, stargazers and forks of an organization's
 * repositories through the GitHub GraphQL API (REPO_INFO_QUERY).
 *
 * Successful results are cached per organization. Lookups that yield no usable
 * data are not cached, so a later call queries again.
 *
 * @param {string} org - GitHub organization login.
 * @returns {Promise<Array<{watchers: Object[], stargazers: Object[], forks: Object[]}>>}
 *   One entry per repository; an empty array when the response carries no
 *   organization/repository data.
 */
async function fetchRepositoryInfo(org) {
  // Own-property check so that logins such as "constructor" do not hit
  // properties inherited from Object.prototype.
  if (Object.prototype.hasOwnProperty.call(repositoryInfo, org)) {
    return repositoryInfo[org]
  }

  const response = await client.query(REPO_INFO_QUERY, { org })
  const data = response && response.data

  // `organization` is nullable in the response (e.g. the login does not
  // resolve to an organization); treat that like "no data" instead of
  // throwing a TypeError while dereferencing it.
  const repositories =
    data && data.organization && data.organization.repositories
  if (!repositories || !Array.isArray(repositories.nodes)) {
    return []
  }

  // Connection node lists may be missing or contain null entries.
  const nodesOf = connection =>
    connection && Array.isArray(connection.nodes)
      ? connection.nodes.filter(Boolean)
      : []

  const info = repositories.nodes.filter(Boolean).map(node => ({
    watchers: nodesOf(node.watchers),
    stargazers: nodesOf(node.stargazers),
    forks: nodesOf(node.forks),
  }))

  repositoryInfo[org] = info

  return info
}

/**
 * Tries to match a participant to a GitHub login using the watchers,
 * stargazers and forks of the organization's repositories.
 *
 * Matching order (all comparisons are case-insensitive):
 *  1. `shortName` equals the login of a watcher, a stargazer, or the owner of
 *     a fork created after COMPETITION_OPEN: `shortName` is returned as given.
 *  2. `displayName` equals the profile name of a watcher: that watcher's
 *     login is returned.
 *
 * If the repository info cannot be fetched, the error is logged and the lookup
 * proceeds with no repositories (so nothing matches).
 *
 * NOTE(review): findGitHubUser assigns this return value to `user` and then
 * reads `user.login` / `user.updated_at`, which a string does not have -
 * confirm the intended return shape.
 * NOTE(review): only watcher names are indexed, although stargazers are
 * queried with `name` as well - confirm whether that is intentional.
 *
 * @param {string} org - GitHub organization login.
 * @param {string} displayName - Participant's display name.
 * @param {string} shortName - Candidate GitHub username derived from the name.
 * @returns {Promise<string|undefined>} Matched login, or undefined.
 */
async function getGitHubUserFromRepoInfo(org, displayName, shortName) {
  let repos = []
  try {
    repos = await fetchRepositoryInfo(org)
  } catch (e) {
    console.error(`Could not fetch repository info for ${org}...`)
  }

  // Set gives de-duplication and O(1) membership; Map avoids matching
  // properties inherited from Object.prototype (a display name such as
  // "constructor" must not resolve to a function).
  const logins = new Set()
  const names = new Map()

  repos.forEach(repo => {
    repo.watchers.forEach(watcher => {
      logins.add(watcher.login.toLowerCase())
      if (watcher.name) {
        names.set(watcher.name.toLowerCase(), watcher.login)
      }
    })

    repo.stargazers.forEach(stargazer => {
      logins.add(stargazer.login.toLowerCase())
    })

    // Only forks made after the competition opened count as a signal.
    repo.forks.forEach(fork => {
      const createdAt = new Date(fork.createdAt)
      if (createdAt.getTime() > COMPETITION_OPEN.getTime()) {
        logins.add(fork.owner.login.toLowerCase())
      }
    })
  })

  if (logins.has(shortName.toLowerCase())) {
    return shortName
  }

  const loginByName = names.get(displayName.toLowerCase())
  if (loginByName) {
    return loginByName
  }
}

async function checkGitHubUserExists(user) {
const res = await fetch(`${GH_BASE}/${user}`)
return res.status === 200
Expand Down Expand Up @@ -280,25 +362,43 @@ async function findGitHubUser(displayName, org) {

const shortName = validUsername(displayName)

const username = await findGitHubUserInOrg(displayName, org)
if (username) return username
let userInOrg
try {
userInOrg = await findGitHubUserInOrg(displayName, org)
} catch (e) {
console.error(`Failed to find user ${displayName} in org ${org}...`)
}
if (userInOrg) {
console.log(`${displayName}: ${userInOrg} (method: user in org)`)
return userInOrg
}

let user
try {
user = await getGitHubUser(shortName)
} catch (e) {
console.error(`Got error when finding user ${shortName}...`)
console.error(`Failed to find user ${shortName}...`)
}
if (!user) return

const login = user.login
if (!user) {
const userFromRepo = await getGitHubUserFromRepoInfo(
org,
displayName,
shortName
)

const { competition_open_starts } = await fetchProgram()
if (!userFromRepo) {
return
}

user = userFromRepo
}

const login = user.login

const updatedTime = new Date(user.updated_at)
const openTime = new Date(competition_open_starts)

if (updatedTime.getTime() - openTime.getTime() < 0) return
if (updatedTime.getTime() - COMPETITION_OPEN.getTime() < 0) return

let orgs = []
try {
Expand All @@ -309,7 +409,9 @@ async function findGitHubUser(displayName, org) {
} catch (e) {
console.error('Could not fetch user history...')
}

if (orgs.includes(org)) {
console.log(`${displayName}: ${user.login} (method: found user from name)`)
return user.login
}
}
Expand Down Expand Up @@ -339,7 +441,8 @@ async function freshenUserGitHubCache(user, existingUser, organization) {
existingUser &&
existingUser.github_updated &&
existingUser.github_account
)
) ||
BUST_GITHUB_CACHE
) {
return {
login: await findGitHubUser(user.display_name, organization),
Expand Down Expand Up @@ -394,6 +497,8 @@ async function fetchOrgsWithData() {
const orgWiki = await Promise.all(fetchingWiki)

const fetchingAll = orgs.map(async (org, index) => {
await fetchRepositoryInfo(orgGitHub[index])

const existingOrg = existingData.find(existing => existing.id === org.id)
const fetchingUsers = orgLeaders[index].map(async user => {
let existingUser
Expand Down Expand Up @@ -439,6 +544,16 @@ async function fetchDates() {
}

;(async () => {
const { competition_open_starts } = await fetchProgram()
COMPETITION_OPEN = new Date(competition_open_starts)

const { stdout } = await getLatestCommitMessage()
BUST_GITHUB_CACHE = stdout.toLowerCase().includes('bust-cache')

if (BUST_GITHUB_CACHE) {
console.log('Busting cache...')
}

const data = await fetchOrgsWithData()
const dates = await fetchDates()

Expand Down
8 changes: 8 additions & 0 deletions lib/utils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
const fs = require('fs')
const util = require('util')
const exec = util.promisify(require('child_process').exec)

/**
 * Runs `git log -1 --pretty=%B` through the promisified `exec`.
 *
 * @returns {Promise<{stdout: string, stderr: string}>} Output of the git
 *   command; `stdout` holds the body of the most recent commit message.
 */
module.exports.getLatestCommitMessage = function getLatestCommitMessage() {
  return exec('git log -1 --pretty=%B')
}

/**
 * Synchronously reads a GraphQL document from the `queries` directory that
 * sits next to this module.
 *
 * @param {string} name - File name without the `.graphql` extension.
 * @returns {string} The file's contents.
 */
module.exports.loadQuery = function loadQuery(name) {
  const queryPath = `${__dirname}/queries/${name}.graphql`
  const contents = fs.readFileSync(queryPath)
  return contents.toString()
}
Loading

0 comments on commit a36b823

Please sign in to comment.