Skip to content

Commit

Permalink
feat: enrich url entities
Browse files Browse the repository at this point in the history
  • Loading branch information
transitive-bullshit committed Feb 25, 2024
1 parent c4b59dd commit b89c76d
Show file tree
Hide file tree
Showing 7 changed files with 180 additions and 27 deletions.
78 changes: 75 additions & 3 deletions bin/debug-scrape-url.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pMap from 'p-map'

import '../src/config.js'
import { ScraperClient } from '../src/services/scraper-client.js'
import { omit } from '../src/utils.js'
Expand All @@ -8,10 +10,80 @@ import { omit } from '../src/utils.js'
async function main() {
const scraperClient = new ScraperClient()

const res = await scraperClient.scrapeUrl(
'https://www.nytimes.com/2023/05/31/magazine/ai-start-up-accelerator-san-francisco.html'
const urls = [
'https://www.nytimes.com/2023/05/31/magazine/ai-start-up-accelerator-san-francisco.html',
'https://www.youtube.com/watch?v=NNgdcn4Ux1k&ab_channel=LexClips',
'https://digg.com/memes-ranked/link/best-memes-ranked-pussy-in-bio-mandela-effect-room-space?utm_source=digg',
'https://platform.openai.com/docs/guides/vision',
'https://en.wikipedia.org/wiki/Larry_Page',
'https://www.flowrestling.org/articles/12162675-oklahoma-state-wrestling-on-the-hunt-for-upsets-against-iowa',
'https://github.com/transitive-bullshit/lqip-modern',
'https://www.gatesnotes.com/AI-agents',
'https://blog.eladgil.com/p/early-days-of-ai',
'https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/',
'https://www.bbc.com/news/business-68387018',
'https://www.bbc.com/sport/football/68395310',
'https://www.kayak.com/',
'https://marmelab.com/blog/2024/01/23/react-19-new-hooks.html?ref=labnotes.org',
'https://www.foxnews.com/us/ai-technology-could-help-us-allies-monitor-chinas-taiwan-invasion-intensions',
'https://twitter.com/paulg/status/1761731253764579573',
'https://twitter.com/transitive_bs',
'https://transitivebullsh.it/chatgpt-twitter-bot-lessons',
'https://www.swyx.io/learn-in-public',
'https://leerob.io/blog/developer-experience-examples',
'https://rauchg.com/2021/making-the-web-faster',
'https://blog.google/products/gemini/bard-gemini-advanced-app/',
'https://apnews.com/article/2024-qatar-swimming-worlds-underwater-camera-splash',
'https://www.amazon.com/Deepness-Sky-Zones-Thought-Book-ebook/dp/B002H8ORKM/?_encoding=UTF8&pd_rd_w=4N09q&content-id=amzn1.sym.379956f8-690b-4143-ad17-ba606cbec0c1&pf_rd_p=379956f8-690b-4143-ad17-ba606cbec0c1&pf_rd_r=NXZSG4MAQ5P40FP5T5ZR&pd_rd_wg=t7KmU&pd_rd_r=5c051a29-61a2-468a-bc68-ad2754e52d05&ref_=pd_gw_bmx27b',
'https://www.reddit.com/r/MadeMeSmile/comments/u33nuc/he_finally_got_his_acorn/',
'https://www.reddit.com/r/Damnthatsinteresting/comments/ujl32z/this_is_jeanbaptiste_kempf_the_creator_of_vlc/',
'https://news.ycombinator.com/item?id=35154527',
'https://news.ycombinator.com/item?id=11116274',
'https://www.bbc.com/news/uk-43396008',
'https://www.apple.com/customer-letter/',
'https://openai.com/blog/openai-announces-leadership-transition',
'https://www.apple.com/stevejobs/', // output includes some weird #{ref} stuff
'https://groups.google.com/g/vim_announce/c/tWahca9zkt4?pli=1',
'https://bensbites.beehiiv.com/',
'https://bensbites.beehiiv.com/p/open-ai-serious-building-new-app-store',
'https://anilist.co/anime/1/Cowboy-Bebop/',
'https://dexa.ai/',
'https://dexa.ai/s/S7RDMg3f',
'https://www.quora.com/What-can-I-learn-know-right-now-in-10-minutes-that-will-be-useful-for-the-rest-of-my-life',
'https://www.quora.com/How-do-top-students-study',
'https://www.quora.com/What-are-the-most-surreal-places-to-visit',
'https://www.instagram.com/p/BTKd8z2jM14/?img_index=1',
'https://www.linkedin.com/in/fisch2/',
'https://www.facebook.com/zuck/',
'https://github.com/sindresorhus',
'https://www.pornhub.com/',
'https://www.tiktok.com/@zachking/video/6768504823336815877?embed_source=71929438%2C121374463%2C121351166%2C121331973%2C120811592%2C120810756%3Bnull%3Bembed_blank&refer=embed&referer_url=metricool.com%2Ftiktoks-most-viral-videos%2F&referer_video_id=6768504823336815877',
'https://www.tiktok.com/@zachking/video/6749520869598481669'
]

const results = (
await pMap(
urls,
async (url) => {
try {
return await scraperClient.scrapeUrl(url)
} catch (err: any) {
console.error('error processing url', url, err.toString())
}
},
{
concurrency: 4
}
)
).filter(Boolean)

console.log(
JSON.stringify(
results.map((res) => omit(res, 'content', 'rawHtml')),
null,
2
)
)
console.log(JSON.stringify(omit(res, 'content', 'rawHtml'), null, 2))
}

main()
Expand Down
5 changes: 1 addition & 4 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,13 @@ Flags:
## TODO
- understand why mentions from non-verified accounts aren't being reported by the twitter api
- support quote tweet and retweet context
- support user entity context
- add test fixtures testing these different use cases
- fix support for empty mentions
- currently works but duplicates the previous tweet's contents
- support `url` entities
- expand them with metadata
- support `media` entities
- populate media entities
- openai use gpt-4-vision-preview
- for openai, use gpt-4-vision-preview
- conditionally preprocess images using `sharp` to ensure they are supported by gpt4v
- improve openai answer engine
- dalle tool
Expand Down
10 changes: 7 additions & 3 deletions src/answer-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import {
sanitizeTweetText,
stripUserMentions
} from './twitter-utils.js'
import { assert } from './utils.js'
import { assert, pick } from './utils.js'

export abstract class AnswerEngine {
readonly type: types.AnswerEngineType
Expand All @@ -30,7 +30,10 @@ export abstract class AnswerEngine {
ctx: types.AnswerEngineContext
) {
const query = await this.resolveMessageThread(message, ctx)
console.log(`>>> ${this.type} answer engine`, query)
console.log(
`\n>>> ${this.type} answer engine`,
pick(query, 'message', 'chatMessages', 'tweets', 'entityMap')
)

message.response = await this.generateResponseForQuery(query, ctx)

Expand Down Expand Up @@ -255,7 +258,8 @@ export abstract class AnswerEngine {

const rawEntityMap: types.RawEntityMap = {
users: {},
tweets: {}
tweets: {},
urls: entityMap.urls ?? {}
}

if (entityMap?.users) {
Expand Down
107 changes: 92 additions & 15 deletions src/entities.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import pMap from 'p-map'
import pMemoize from 'p-memoize'
import { z } from 'zod'

import * as db from './db.js'
import type * as types from './types.js'
import { ScraperClient } from './services/scraper-client.js'

export const URLEntitySchema = z.object({
type: z.literal('url'),
Expand All @@ -12,7 +15,10 @@ export const URLEntitySchema = z.object({
// Will only exist if this URL references a known media entity
mediaId: z.string().optional(),
title: z.string().optional(),
description: z.string().optional()
description: z.string().optional(),
markdownContent: z.string().optional(),
siteName: z.string().optional(),
author: z.string().optional()
})
export type URLEntity = z.infer<typeof URLEntitySchema>

Expand Down Expand Up @@ -86,7 +92,8 @@ export type MediaEntity = z.infer<typeof MediaEntitySchema>
export const EntityMapSchema = z.object({
users: z.record(UserEntitySchema).optional(),
tweets: z.record(TweetEntitySchema).optional(),
media: z.record(MediaEntitySchema).optional()
media: z.record(MediaEntitySchema).optional(),
urls: z.record(URLEntitySchema).optional()
})
export type EntityMap = z.infer<typeof EntityMapSchema>

Expand Down Expand Up @@ -127,70 +134,94 @@ export async function convertTweetToEntityMap(
fetchMissingEntities?: boolean
} = {}
): Promise<EntityMap> {
const EntityMap: Required<EntityMap> = {
const entityMap: Required<EntityMap> = {
users: {},
tweets: {},
// TODO: currently not resolving media entities
media: {}
media: {},
urls: {}
}
const tweetEntity = convertTweetToEntity(tweet)
EntityMap.tweets[tweetEntity.id] = tweetEntity
entityMap.tweets[tweetEntity.id] = tweetEntity

const urls: Record<string, URLEntity> = {}
const referencedUserIds = new Set<string>()
const referencedTweetIds = new Set<string>()

if (tweetEntity.repliedToUserId)
if (tweetEntity.repliedToUserId) {
referencedUserIds.add(tweetEntity.repliedToUserId)
if (tweetEntity.quotedTweetId)
}

if (tweetEntity.quotedTweetId) {
referencedTweetIds.add(tweetEntity.quotedTweetId)
if (tweetEntity.retweetedTweetId)
}

if (tweetEntity.retweetedTweetId) {
referencedTweetIds.add(tweetEntity.retweetedTweetId)
}

// Attempt to resolve any referenced tweets
for (const tweetId of referencedTweetIds) {
if (EntityMap.tweets[tweetId]) continue
if (entityMap.tweets[tweetId]) continue

const referencedTweet = await db.tryGetTweetById(tweetId, ctx, {
fetchFromTwitter: !!fetchMissingEntities
})
if (!referencedTweet) continue

EntityMap.tweets[referencedTweet.id] = convertTweetToEntity(referencedTweet)
entityMap.tweets[referencedTweet.id] = convertTweetToEntity(referencedTweet)
}

for (const tweet of Object.values(EntityMap.tweets)) {
for (const tweet of Object.values(entityMap.tweets)) {
if (tweet.repliedToUserId) referencedUserIds.add(tweet.repliedToUserId)
if (tweet.authorId) referencedUserIds.add(tweet.authorId)
}

// Attempt to resolve any referenced users
for (const userId of referencedUserIds) {
if (EntityMap.users[userId]) continue
if (entityMap.users[userId]) continue

const user = await db.tryGetUserById(userId)
if (!user) continue

const userEntity = (EntityMap.users[user.id] =
const userEntity = (entityMap.users[user.id] =
convertTwitterUserToEntity(user))
if (userEntity.twitterPinnedTweetId) {
referencedTweetIds.add(userEntity.twitterPinnedTweetId)
}
}

return EntityMap
for (const tweetEntity of Object.values(entityMap.tweets)) {
if (!tweetEntity.urls) continue
for (const urlEntity of tweetEntity.urls) {
urls[urlEntity.url] = urlEntity
}
}

for (const userEntity of Object.values(entityMap.users)) {
if (!userEntity.urls) continue
for (const urlEntity of userEntity.urls) {
urls[urlEntity.url] = urlEntity
}
}

entityMap.urls = await enrichEntityUrls(Object.values(urls))
return entityMap
}

export function mergeEntityMaps(...entityMaps: EntityMap[]): EntityMap {
const result: Required<EntityMap> = {
users: {},
tweets: {},
media: {}
media: {},
urls: {}
}

for (const entityMap of entityMaps) {
Object.assign(result.users, entityMap.users)
Object.assign(result.tweets, entityMap.tweets)
Object.assign(result.media, entityMap.media)
Object.assign(result.urls, entityMap.urls)
}

return result
Expand Down Expand Up @@ -272,3 +303,49 @@ export function convertTwitterUrlToEntity(url: types.TwitterUrl): URLEntity {
mediaId: url.media_key
}
}

export async function enrichEntityUrls(
urls: URLEntity[],
{
concurrency = 5
}: {
concurrency?: number
} = {}
): Promise<Record<string, URLEntity>> {
const enrichedUrls: Record<string, URLEntity> = {}

await pMap(
urls,
async (urlEntity) => {
if (urlEntity.mediaId) return

const scrapedUrl = await scrapeUrl(urlEntity.url)
if (!scrapedUrl) return

urlEntity.title = scrapedUrl.title
urlEntity.description = scrapedUrl.description
urlEntity.author = scrapedUrl.author
urlEntity.siteName = scrapedUrl.siteName
// urlEntity.markdownContent = scrapedUrl.markdownContent

enrichedUrls[urlEntity.url] = urlEntity
},
{
concurrency
}
)

return enrichedUrls
}

// Shared scraper client; results are memoized per URL by p-memoize so the
// same page is never fetched twice within a process.
const scraperClient = new ScraperClient()
export const scrapeUrl = pMemoize(scrapeUrlImpl)

/**
 * Scrapes a single URL, returning `null` on failure instead of throwing so
 * callers can treat enrichment as best-effort.
 *
 * Fix: catch the error as `unknown` (strict-mode idiom) and narrow before
 * reading `.message`, instead of the unchecked `err: any` access.
 */
async function scrapeUrlImpl(url: string) {
  try {
    return await scraperClient.scrapeUrl(url)
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err)
    console.warn('error scraping url', url, message)
    return null
  }
}
3 changes: 2 additions & 1 deletion src/mentions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,9 @@ export async function getTweetMentionsBatch(
batch.mentions,
async (mention) => {
const message = await db.messages.get(mention.id)
// console.log('mention', { mention, message })

if (message && (!message.error || message.isErrorFinal)) {
if (message && (message.response || message.isErrorFinal)) {
const isDebugTweet =
!ctx.debugAnswerEngine && ctx.debugTweetIds?.includes(mention.id)

Expand Down
1 change: 1 addition & 0 deletions src/respond-to-new-mentions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ export async function respondToNewMentions(ctx: types.Context) {
delete message.errorStatus
delete message.isErrorFinal

console.log()
console.log('message', message)
console.log()

Expand Down
3 changes: 2 additions & 1 deletion src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import type { AsyncReturnType, SetOptional, Simplify } from 'type-fest'

import type { AnswerEngine } from './answer-engine.js'
import type { BotErrorType } from './bot-error.js'
import type { Entities, EntityMap } from './entities.js'
import type { Entities, EntityMap, URLEntity } from './entities.js'

Check warning on line 8 in src/types.ts

View workflow job for this annotation

GitHub Actions / Test Node.js 20

'Entities' is defined but never used

Check warning on line 8 in src/types.ts

View workflow job for this annotation

GitHub Actions / Test Node.js 20

'Entities' is defined but never used

export type { TwitterClient }

Expand Down Expand Up @@ -204,6 +204,7 @@ export type AnswerEngineQuery = {
export type RawEntityMap = {
users: Record<string, Partial<TwitterUser>>
tweets: Record<string, Partial<Tweet>>
urls: Record<string, URLEntity>
}

export type AnswerEngineContext = Pick<
Expand Down

0 comments on commit b89c76d

Please sign in to comment.