Skip to content

Commit

Permalink
feat: enrich url entities
Browse files Browse the repository at this point in the history
  • Loading branch information
transitive-bullshit committed Feb 25, 2024
1 parent c4b59dd commit b89c76d
Show file tree
Hide file tree
Showing 7 changed files with 180 additions and 27 deletions.
78 changes: 75 additions & 3 deletions bin/debug-scrape-url.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pMap from 'p-map'

import '../src/config.js'
import { ScraperClient } from '../src/services/scraper-client.js'
import { omit } from '../src/utils.js'
Expand All @@ -8,10 +10,80 @@ import { omit } from '../src/utils.js'
async function main() {
const scraperClient = new ScraperClient()

const res = await scraperClient.scrapeUrl(
'https://www.nytimes.com/2023/05/31/magazine/ai-start-up-accelerator-san-francisco.html'
const urls = [
'https://www.nytimes.com/2023/05/31/magazine/ai-start-up-accelerator-san-francisco.html',
'https://www.youtube.com/watch?v=NNgdcn4Ux1k&ab_channel=LexClips',
'https://digg.com/memes-ranked/link/best-memes-ranked-pussy-in-bio-mandela-effect-room-space?utm_source=digg',
'https://platform.openai.com/docs/guides/vision',
'https://en.wikipedia.org/wiki/Larry_Page',
'https://www.flowrestling.org/articles/12162675-oklahoma-state-wrestling-on-the-hunt-for-upsets-against-iowa',
'https://github.com/transitive-bullshit/lqip-modern',
'https://www.gatesnotes.com/AI-agents',
'https://blog.eladgil.com/p/early-days-of-ai',
'https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/',
'https://www.bbc.com/news/business-68387018',
'https://www.bbc.com/sport/football/68395310',
'https://www.kayak.com/',
'https://marmelab.com/blog/2024/01/23/react-19-new-hooks.html?ref=labnotes.org',
'https://www.foxnews.com/us/ai-technology-could-help-us-allies-monitor-chinas-taiwan-invasion-intensions',
'https://twitter.com/paulg/status/1761731253764579573',
'https://twitter.com/transitive_bs',
'https://transitivebullsh.it/chatgpt-twitter-bot-lessons',
'https://www.swyx.io/learn-in-public',
'https://leerob.io/blog/developer-experience-examples',
'https://rauchg.com/2021/making-the-web-faster',
'https://blog.google/products/gemini/bard-gemini-advanced-app/',
'https://apnews.com/article/2024-qatar-swimming-worlds-underwater-camera-splash',
'https://www.amazon.com/Deepness-Sky-Zones-Thought-Book-ebook/dp/B002H8ORKM/?_encoding=UTF8&pd_rd_w=4N09q&content-id=amzn1.sym.379956f8-690b-4143-ad17-ba606cbec0c1&pf_rd_p=379956f8-690b-4143-ad17-ba606cbec0c1&pf_rd_r=NXZSG4MAQ5P40FP5T5ZR&pd_rd_wg=t7KmU&pd_rd_r=5c051a29-61a2-468a-bc68-ad2754e52d05&ref_=pd_gw_bmx27b',
'https://www.reddit.com/r/MadeMeSmile/comments/u33nuc/he_finally_got_his_acorn/',
'https://www.reddit.com/r/Damnthatsinteresting/comments/ujl32z/this_is_jeanbaptiste_kempf_the_creator_of_vlc/',
'https://news.ycombinator.com/item?id=35154527',
'https://news.ycombinator.com/item?id=11116274',
'https://www.bbc.com/news/uk-43396008',
'https://www.apple.com/customer-letter/',
'https://openai.com/blog/openai-announces-leadership-transition',
'https://www.apple.com/stevejobs/', // output includes some weird #{ref} stuff
'https://groups.google.com/g/vim_announce/c/tWahca9zkt4?pli=1',
'https://bensbites.beehiiv.com/',
'https://bensbites.beehiiv.com/p/open-ai-serious-building-new-app-store',
'https://anilist.co/anime/1/Cowboy-Bebop/',
'https://dexa.ai/',
'https://dexa.ai/s/S7RDMg3f',
'https://www.quora.com/What-can-I-learn-know-right-now-in-10-minutes-that-will-be-useful-for-the-rest-of-my-life',
'https://www.quora.com/How-do-top-students-study',
'https://www.quora.com/What-are-the-most-surreal-places-to-visit',
'https://www.instagram.com/p/BTKd8z2jM14/?img_index=1',
'https://www.linkedin.com/in/fisch2/',
'https://www.facebook.com/zuck/',
'https://github.com/sindresorhus',
'https://www.pornhub.com/',
'https://www.tiktok.com/@zachking/video/6768504823336815877?embed_source=71929438%2C121374463%2C121351166%2C121331973%2C120811592%2C120810756%3Bnull%3Bembed_blank&refer=embed&referer_url=metricool.com%2Ftiktoks-most-viral-videos%2F&referer_video_id=6768504823336815877',
'https://www.tiktok.com/@zachking/video/6749520869598481669'
]

const results = (
await pMap(
urls,
async (url) => {
try {
return await scraperClient.scrapeUrl(url)
} catch (err: any) {
console.error('error processing url', url, err.toString())
}
},
{
concurrency: 4
}
)
).filter(Boolean)

console.log(
JSON.stringify(
results.map((res) => omit(res, 'content', 'rawHtml')),
null,
2
)
)
console.log(JSON.stringify(omit(res, 'content', 'rawHtml'), null, 2))
}

main()
Expand Down
5 changes: 1 addition & 4 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,13 @@ Flags:
## TODO
- understand why mentions from non-verified accounts aren't being reported by the twitter api
- support quote tweet and retweet context
- support user entity context
- add test fixtures testing these different use cases
- fix support for empty mentions
- currently works but duplicates the previous tweet's contents
- support `url` entities
- expand them with metadata
- support `media` entities
- populate media entities
- openai use gpt-4-vision-preview
- for openai, use gpt-4-vision-preview
- conditionally preprocess images using `sharp` to ensure they are supported by gpt4v
- improve openai answer engine
- dalle tool
Expand Down
10 changes: 7 additions & 3 deletions src/answer-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import {
sanitizeTweetText,
stripUserMentions
} from './twitter-utils.js'
import { assert } from './utils.js'
import { assert, pick } from './utils.js'

export abstract class AnswerEngine {
readonly type: types.AnswerEngineType
Expand All @@ -30,7 +30,10 @@ export abstract class AnswerEngine {
ctx: types.AnswerEngineContext
) {
const query = await this.resolveMessageThread(message, ctx)
console.log(`>>> ${this.type} answer engine`, query)
console.log(
`\n>>> ${this.type} answer engine`,
pick(query, 'message', 'chatMessages', 'tweets', 'entityMap')
)

message.response = await this.generateResponseForQuery(query, ctx)

Expand Down Expand Up @@ -255,7 +258,8 @@ export abstract class AnswerEngine {

const rawEntityMap: types.RawEntityMap = {
users: {},
tweets: {}
tweets: {},
urls: entityMap.urls ?? {}
}

if (entityMap?.users) {
Expand Down
107 changes: 92 additions & 15 deletions src/entities.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import pMap from 'p-map'
import pMemoize from 'p-memoize'
import { z } from 'zod'

import * as db from './db.js'
import type * as types from './types.js'
import { ScraperClient } from './services/scraper-client.js'

export const URLEntitySchema = z.object({
type: z.literal('url'),
Expand All @@ -12,7 +15,10 @@ export const URLEntitySchema = z.object({
// Will only exist if this URL references a known media entity
mediaId: z.string().optional(),
title: z.string().optional(),
description: z.string().optional()
description: z.string().optional(),
markdownContent: z.string().optional(),
siteName: z.string().optional(),
author: z.string().optional()
})
export type URLEntity = z.infer<typeof URLEntitySchema>

Expand Down Expand Up @@ -86,7 +92,8 @@ export type MediaEntity = z.infer<typeof MediaEntitySchema>
export const EntityMapSchema = z.object({
users: z.record(UserEntitySchema).optional(),
tweets: z.record(TweetEntitySchema).optional(),
media: z.record(MediaEntitySchema).optional()
media: z.record(MediaEntitySchema).optional(),
urls: z.record(URLEntitySchema).optional()
})
export type EntityMap = z.infer<typeof EntityMapSchema>

Expand Down Expand Up @@ -127,70 +134,94 @@ export async function convertTweetToEntityMap(
fetchMissingEntities?: boolean
} = {}
): Promise<EntityMap> {
const EntityMap: Required<EntityMap> = {
const entityMap: Required<EntityMap> = {
users: {},
tweets: {},
// TODO: currently not resolving media entities
media: {}
media: {},
urls: {}
}
const tweetEntity = convertTweetToEntity(tweet)
EntityMap.tweets[tweetEntity.id] = tweetEntity
entityMap.tweets[tweetEntity.id] = tweetEntity

const urls: Record<string, URLEntity> = {}
const referencedUserIds = new Set<string>()
const referencedTweetIds = new Set<string>()

if (tweetEntity.repliedToUserId)
if (tweetEntity.repliedToUserId) {
referencedUserIds.add(tweetEntity.repliedToUserId)
if (tweetEntity.quotedTweetId)
}

if (tweetEntity.quotedTweetId) {
referencedTweetIds.add(tweetEntity.quotedTweetId)
if (tweetEntity.retweetedTweetId)
}

if (tweetEntity.retweetedTweetId) {
referencedTweetIds.add(tweetEntity.retweetedTweetId)
}

// Attempt to resolve any referenced tweets
for (const tweetId of referencedTweetIds) {
if (EntityMap.tweets[tweetId]) continue
if (entityMap.tweets[tweetId]) continue

const referencedTweet = await db.tryGetTweetById(tweetId, ctx, {
fetchFromTwitter: !!fetchMissingEntities
})
if (!referencedTweet) continue

EntityMap.tweets[referencedTweet.id] = convertTweetToEntity(referencedTweet)
entityMap.tweets[referencedTweet.id] = convertTweetToEntity(referencedTweet)
}

for (const tweet of Object.values(EntityMap.tweets)) {
for (const tweet of Object.values(entityMap.tweets)) {
if (tweet.repliedToUserId) referencedUserIds.add(tweet.repliedToUserId)
if (tweet.authorId) referencedUserIds.add(tweet.authorId)
}

// Attempt to resolve any referenced users
for (const userId of referencedUserIds) {
if (EntityMap.users[userId]) continue
if (entityMap.users[userId]) continue

const user = await db.tryGetUserById(userId)
if (!user) continue

const userEntity = (EntityMap.users[user.id] =
const userEntity = (entityMap.users[user.id] =
convertTwitterUserToEntity(user))
if (userEntity.twitterPinnedTweetId) {
referencedTweetIds.add(userEntity.twitterPinnedTweetId)
}
}

return EntityMap
for (const tweetEntity of Object.values(entityMap.tweets)) {
if (!tweetEntity.urls) continue
for (const urlEntity of tweetEntity.urls) {
urls[urlEntity.url] = urlEntity
}
}

for (const userEntity of Object.values(entityMap.users)) {
if (!userEntity.urls) continue
for (const urlEntity of userEntity.urls) {
urls[urlEntity.url] = urlEntity
}
}

entityMap.urls = await enrichEntityUrls(Object.values(urls))
return entityMap
}

export function mergeEntityMaps(...entityMaps: EntityMap[]): EntityMap {
const result: Required<EntityMap> = {
users: {},
tweets: {},
media: {}
media: {},
urls: {}
}

for (const entityMap of entityMaps) {
Object.assign(result.users, entityMap.users)
Object.assign(result.tweets, entityMap.tweets)
Object.assign(result.media, entityMap.media)
Object.assign(result.urls, entityMap.urls)
}

return result
Expand Down Expand Up @@ -272,3 +303,49 @@ export function convertTwitterUrlToEntity(url: types.TwitterUrl): URLEntity {
mediaId: url.media_key
}
}

export async function enrichEntityUrls(
urls: URLEntity[],
{
concurrency = 5
}: {
concurrency?: number
} = {}
): Promise<Record<string, URLEntity>> {
const enrichedUrls: Record<string, URLEntity> = {}

await pMap(
urls,
async (urlEntity) => {
if (urlEntity.mediaId) return

const scrapedUrl = await scrapeUrl(urlEntity.url)
if (!scrapedUrl) return

urlEntity.title = scrapedUrl.title
urlEntity.description = scrapedUrl.description
urlEntity.author = scrapedUrl.author
urlEntity.siteName = scrapedUrl.siteName
// urlEntity.markdownContent = scrapedUrl.markdownContent

enrichedUrls[urlEntity.url] = urlEntity
},
{
concurrency
}
)

return enrichedUrls
}

// Shared scraper client; results are memoized per URL by p-memoize so the
// same page is never fetched twice within a process.
const scraperClient = new ScraperClient()
export const scrapeUrl = pMemoize(scrapeUrlImpl)

/**
 * Scrapes a single URL, returning `null` on failure instead of throwing so
 * callers can treat enrichment as best-effort.
 *
 * Fix: catch the error as `unknown` (strict-mode idiom) and narrow before
 * reading `.message`, instead of the unchecked `err: any` access.
 */
async function scrapeUrlImpl(url: string) {
  try {
    return await scraperClient.scrapeUrl(url)
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err)
    console.warn('error scraping url', url, message)
    return null
  }
}
3 changes: 2 additions & 1 deletion src/mentions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,9 @@ export async function getTweetMentionsBatch(
batch.mentions,
async (mention) => {
const message = await db.messages.get(mention.id)
// console.log('mention', { mention, message })

if (message && (!message.error || message.isErrorFinal)) {
if (message && (message.response || message.isErrorFinal)) {
const isDebugTweet =
!ctx.debugAnswerEngine && ctx.debugTweetIds?.includes(mention.id)

Expand Down
1 change: 1 addition & 0 deletions src/respond-to-new-mentions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ export async function respondToNewMentions(ctx: types.Context) {
delete message.errorStatus
delete message.isErrorFinal

console.log()
console.log('message', message)
console.log()

Expand Down
3 changes: 2 additions & 1 deletion src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import type { AsyncReturnType, SetOptional, Simplify } from 'type-fest'

import type { AnswerEngine } from './answer-engine.js'
import type { BotErrorType } from './bot-error.js'
import type { Entities, EntityMap } from './entities.js'
import type { Entities, EntityMap, URLEntity } from './entities.js'

Check warning on line 8 in src/types.ts

View workflow job for this annotation

GitHub Actions / Test Node.js 20

'Entities' is defined but never used

Check warning on line 8 in src/types.ts

View workflow job for this annotation

GitHub Actions / Test Node.js 20

'Entities' is defined but never used

export type { TwitterClient }

Expand Down Expand Up @@ -204,6 +204,7 @@ export type AnswerEngineQuery = {
export type RawEntityMap = {
users: Record<string, Partial<TwitterUser>>
tweets: Record<string, Partial<Tweet>>
urls: Record<string, URLEntity>
}

export type AnswerEngineContext = Pick<
Expand Down

0 comments on commit b89c76d

Please sign in to comment.