From b89c76d470097fcee8c61868436e20d797209713 Mon Sep 17 00:00:00 2001 From: Travis Fischer Date: Sun, 25 Feb 2024 15:26:18 -0600 Subject: [PATCH] feat: enrich url entities --- bin/debug-scrape-url.ts | 78 +++++++++++++++++++++++- readme.md | 5 +- src/answer-engine.ts | 10 ++- src/entities.ts | 107 ++++++++++++++++++++++++++++----- src/mentions.ts | 3 +- src/respond-to-new-mentions.ts | 1 + src/types.ts | 3 +- 7 files changed, 180 insertions(+), 27 deletions(-) diff --git a/bin/debug-scrape-url.ts b/bin/debug-scrape-url.ts index d1a53ff..f21c5a1 100644 --- a/bin/debug-scrape-url.ts +++ b/bin/debug-scrape-url.ts @@ -1,3 +1,5 @@ +import pMap from 'p-map' + import '../src/config.js' import { ScraperClient } from '../src/services/scraper-client.js' import { omit } from '../src/utils.js' @@ -8,10 +10,80 @@ import { omit } from '../src/utils.js' async function main() { const scraperClient = new ScraperClient() - const res = await scraperClient.scrapeUrl( - 'https://www.nytimes.com/2023/05/31/magazine/ai-start-up-accelerator-san-francisco.html' + const urls = [ + 'https://www.nytimes.com/2023/05/31/magazine/ai-start-up-accelerator-san-francisco.html', + 'https://www.youtube.com/watch?v=NNgdcn4Ux1k&ab_channel=LexClips', + 'https://digg.com/memes-ranked/link/best-memes-ranked-pussy-in-bio-mandela-effect-room-space?utm_source=digg', + 'https://platform.openai.com/docs/guides/vision', + 'https://en.wikipedia.org/wiki/Larry_Page', + 'https://www.flowrestling.org/articles/12162675-oklahoma-state-wrestling-on-the-hunt-for-upsets-against-iowa', + 'https://github.com/transitive-bullshit/lqip-modern', + 'https://www.gatesnotes.com/AI-agents', + 'https://blog.eladgil.com/p/early-days-of-ai', + 'https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/', + 'https://www.bbc.com/news/business-68387018', + 'https://www.bbc.com/sport/football/68395310', + 'https://www.kayak.com/', + 'https://marmelab.com/blog/2024/01/23/react-19-new-hooks.html?ref=labnotes.org', + 'https://www.foxnews.com/us/ai-technology-could-help-us-allies-monitor-chinas-taiwan-invasion-intensions', + 'https://twitter.com/paulg/status/1761731253764579573', + 'https://twitter.com/transitive_bs', + 'https://transitivebullsh.it/chatgpt-twitter-bot-lessons', + 'https://www.swyx.io/learn-in-public', + 'https://leerob.io/blog/developer-experience-examples', + 'https://rauchg.com/2021/making-the-web-faster', + 'https://blog.google/products/gemini/bard-gemini-advanced-app/', + 'https://apnews.com/article/2024-qatar-swimming-worlds-underwater-camera-splash', + 'https://www.amazon.com/Deepness-Sky-Zones-Thought-Book-ebook/dp/B002H8ORKM/?_encoding=UTF8&pd_rd_w=4N09q&content-id=amzn1.sym.379956f8-690b-4143-ad17-ba606cbec0c1&pf_rd_p=379956f8-690b-4143-ad17-ba606cbec0c1&pf_rd_r=NXZSG4MAQ5P40FP5T5ZR&pd_rd_wg=t7KmU&pd_rd_r=5c051a29-61a2-468a-bc68-ad2754e52d05&ref_=pd_gw_bmx27b', + 'https://www.reddit.com/r/MadeMeSmile/comments/u33nuc/he_finally_got_his_acorn/', + 'https://www.reddit.com/r/Damnthatsinteresting/comments/ujl32z/this_is_jeanbaptiste_kempf_the_creator_of_vlc/', + 'https://news.ycombinator.com/item?id=35154527', + 'https://news.ycombinator.com/item?id=11116274', + 'https://www.bbc.com/news/uk-43396008', + 'https://www.apple.com/customer-letter/', + 'https://openai.com/blog/openai-announces-leadership-transition', + 'https://www.apple.com/stevejobs/', // output includes some weird #{ref} stuff + 'https://groups.google.com/g/vim_announce/c/tWahca9zkt4?pli=1', + 'https://bensbites.beehiiv.com/', + 'https://bensbites.beehiiv.com/p/open-ai-serious-building-new-app-store', + 'https://anilist.co/anime/1/Cowboy-Bebop/', + 'https://dexa.ai/', + 'https://dexa.ai/s/S7RDMg3f', + 'https://www.quora.com/What-can-I-learn-know-right-now-in-10-minutes-that-will-be-useful-for-the-rest-of-my-life', + 'https://www.quora.com/How-do-top-students-study', + 'https://www.quora.com/What-are-the-most-surreal-places-to-visit', + 'https://www.instagram.com/p/BTKd8z2jM14/?img_index=1', + 'https://www.linkedin.com/in/fisch2/', + 'https://www.facebook.com/zuck/', + 'https://github.com/sindresorhus', + 'https://www.pornhub.com/', + 'https://www.tiktok.com/@zachking/video/6768504823336815877?embed_source=71929438%2C121374463%2C121351166%2C121331973%2C120811592%2C120810756%3Bnull%3Bembed_blank&refer=embed&referer_url=metricool.com%2Ftiktoks-most-viral-videos%2F&referer_video_id=6768504823336815877', + 'https://www.tiktok.com/@zachking/video/6749520869598481669' + ] + + const results = ( + await pMap( + urls, + async (url) => { + try { + return await scraperClient.scrapeUrl(url) + } catch (err: any) { + console.error('error processing url', url, err.toString()) + } + }, + { + concurrency: 4 + } + ) + ).filter(Boolean) + + console.log( + JSON.stringify( + results.map((res) => omit(res, 'content', 'rawHtml')), + null, + 2 + ) ) - console.log(JSON.stringify(omit(res, 'content', 'rawHtml'), null, 2)) } main() diff --git a/readme.md b/readme.md index f33e619..9019988 100644 --- a/readme.md +++ b/readme.md @@ -82,16 +82,13 @@ Flags: ## TODO - understand why mentions from non-verified accounts aren't being reported by the twitter api -- support quote tweet and retweet context -- support user entity context -- add test fixtures testing these different use cases - fix support for empty mentions - currently works but duplicates the previous tweet's contents - support `url` entities - expand them with metadata - support `media` entities - populate media entities - - openai use gpt-4-vision-preview + - for openai, use gpt-4-vision-preview - conditionally preprocess images using `sharp` to ensure they are supported by gpt4v - improve openai answer engine - dalle tool diff --git a/src/answer-engine.ts b/src/answer-engine.ts index 01f9dd4..6e8eb88 100644 --- a/src/answer-engine.ts +++ b/src/answer-engine.ts @@ -16,7 +16,7 @@ import { sanitizeTweetText, stripUserMentions } from './twitter-utils.js' -import { assert } from './utils.js' +import { assert, pick } from './utils.js' export abstract class AnswerEngine { readonly type: types.AnswerEngineType @@ -30,7 +30,10 @@ export abstract class AnswerEngine { ctx: types.AnswerEngineContext ) { const query = await this.resolveMessageThread(message, ctx) - console.log(`>>> ${this.type} answer engine`, query) + console.log( + `\n>>> ${this.type} answer engine`, + pick(query, 'message', 'chatMessages', 'tweets', 'entityMap') + ) message.response = await this.generateResponseForQuery(query, ctx) @@ -255,7 +258,8 @@ export abstract class AnswerEngine { const rawEntityMap: types.RawEntityMap = { users: {}, - tweets: {} + tweets: {}, + urls: entityMap.urls ?? {} } if (entityMap?.users) { diff --git a/src/entities.ts b/src/entities.ts index bdb041a..971ded6 100644 --- a/src/entities.ts +++ b/src/entities.ts @@ -1,7 +1,10 @@ +import pMap from 'p-map' +import pMemoize from 'p-memoize' import { z } from 'zod' import * as db from './db.js' import type * as types from './types.js' +import { ScraperClient } from './services/scraper-client.js' export const URLEntitySchema = z.object({ type: z.literal('url'), @@ -12,7 +15,10 @@ export const URLEntitySchema = z.object({ // Will only exist if this URL references a known media entity mediaId: z.string().optional(), title: z.string().optional(), - description: z.string().optional() + description: z.string().optional(), + markdownContent: z.string().optional(), + siteName: z.string().optional(), + author: z.string().optional() }) export type URLEntity = z.infer @@ -86,7 +92,8 @@ export type MediaEntity = z.infer export const EntityMapSchema = z.object({ users: z.record(UserEntitySchema).optional(), tweets: z.record(TweetEntitySchema).optional(), - media: z.record(MediaEntitySchema).optional() + media: z.record(MediaEntitySchema).optional(), + urls: z.record(URLEntitySchema).optional() }) export type EntityMap = z.infer @@ -127,70 +134,94 @@ export async function convertTweetToEntityMap( fetchMissingEntities?: boolean } = {} ): Promise { - const EntityMap: Required = { + const entityMap: Required = { users: {}, tweets: {}, // TODO: currently not resolving media entities - media: {} + media: {}, + urls: {} } const tweetEntity = convertTweetToEntity(tweet) - EntityMap.tweets[tweetEntity.id] = tweetEntity + entityMap.tweets[tweetEntity.id] = tweetEntity + const urls: Record = {} const referencedUserIds = new Set() const referencedTweetIds = new Set() - if (tweetEntity.repliedToUserId) + if (tweetEntity.repliedToUserId) { referencedUserIds.add(tweetEntity.repliedToUserId) - if (tweetEntity.quotedTweetId) + } + + if (tweetEntity.quotedTweetId) { referencedTweetIds.add(tweetEntity.quotedTweetId) - if (tweetEntity.retweetedTweetId) + } + + if (tweetEntity.retweetedTweetId) { referencedTweetIds.add(tweetEntity.retweetedTweetId) + } // Attempt to resolve any referenced tweets for (const tweetId of referencedTweetIds) { - if (EntityMap.tweets[tweetId]) continue + if (entityMap.tweets[tweetId]) continue const referencedTweet = await db.tryGetTweetById(tweetId, ctx, { fetchFromTwitter: !!fetchMissingEntities }) if (!referencedTweet) continue - EntityMap.tweets[referencedTweet.id] = convertTweetToEntity(referencedTweet) + entityMap.tweets[referencedTweet.id] = convertTweetToEntity(referencedTweet) } - for (const tweet of Object.values(EntityMap.tweets)) { + for (const tweet of Object.values(entityMap.tweets)) { if (tweet.repliedToUserId) referencedUserIds.add(tweet.repliedToUserId) if (tweet.authorId) referencedUserIds.add(tweet.authorId) } // Attempt to resolve any referenced users for (const userId of referencedUserIds) { - if (EntityMap.users[userId]) continue + if (entityMap.users[userId]) continue const user = await db.tryGetUserById(userId) if (!user) continue - const userEntity = (EntityMap.users[user.id] = + const userEntity = (entityMap.users[user.id] = convertTwitterUserToEntity(user)) if (userEntity.twitterPinnedTweetId) { referencedTweetIds.add(userEntity.twitterPinnedTweetId) } } - return EntityMap + for (const tweetEntity of Object.values(entityMap.tweets)) { + if (!tweetEntity.urls) continue + for (const urlEntity of tweetEntity.urls) { + urls[urlEntity.url] = urlEntity + } + } + + for (const userEntity of Object.values(entityMap.users)) { + if (!userEntity.urls) continue + for (const urlEntity of userEntity.urls) { + urls[urlEntity.url] = urlEntity + } + } + + entityMap.urls = await enrichEntityUrls(Object.values(urls)) + return entityMap } export function mergeEntityMaps(...entityMaps: EntityMap[]): EntityMap { const result: Required = { users: {}, tweets: {}, - media: {} + media: {}, + urls: {} } for (const entityMap of entityMaps) { Object.assign(result.users, entityMap.users) Object.assign(result.tweets, entityMap.tweets) Object.assign(result.media, entityMap.media) + Object.assign(result.urls, entityMap.urls) } return result @@ -272,3 +303,49 @@ export function convertTwitterUrlToEntity(url: types.TwitterUrl): URLEntity { mediaId: url.media_key } } + +export async function enrichEntityUrls( + urls: URLEntity[], + { + concurrency = 5 + }: { + concurrency?: number + } = {} +): Promise> { + const enrichedUrls: Record = {} + + await pMap( + urls, + async (urlEntity) => { + if (urlEntity.mediaId) return + + const scrapedUrl = await scrapeUrl(urlEntity.url) + if (!scrapedUrl) return + + urlEntity.title = scrapedUrl.title + urlEntity.description = scrapedUrl.description + urlEntity.author = scrapedUrl.author + urlEntity.siteName = scrapedUrl.siteName + // urlEntity.markdownContent = scrapedUrl.markdownContent + + enrichedUrls[urlEntity.url] = urlEntity + }, + { + concurrency + } + ) + + return enrichedUrls +} + +const scraperClient = new ScraperClient() +export const scrapeUrl = pMemoize(scrapeUrlImpl) + +async function scrapeUrlImpl(url: string) { + try { + return await scraperClient.scrapeUrl(url) + } catch (err: any) { + console.warn('error scraping url', url, err.message) + return null + } +} diff --git a/src/mentions.ts b/src/mentions.ts index 6721394..68429f9 100644 --- a/src/mentions.ts +++ b/src/mentions.ts @@ -65,8 +65,9 @@ export async function getTweetMentionsBatch( batch.mentions, async (mention) => { const message = await db.messages.get(mention.id) + // console.log('mention', { mention, message }) - if (message && (!message.error || message.isErrorFinal)) { + if (message && (message.response || message.isErrorFinal)) { const isDebugTweet = !ctx.debugAnswerEngine && ctx.debugTweetIds?.includes(mention.id) diff --git a/src/respond-to-new-mentions.ts b/src/respond-to-new-mentions.ts index c4b51de..85629e1 100644 --- a/src/respond-to-new-mentions.ts +++ b/src/respond-to-new-mentions.ts @@ -229,6 +229,7 @@ export async function respondToNewMentions(ctx: types.Context) { delete message.errorStatus delete message.isErrorFinal + console.log() console.log('message', message) console.log() diff --git a/src/types.ts b/src/types.ts index 4130cae..b4ac77f 100644 --- a/src/types.ts +++ b/src/types.ts @@ -5,7 +5,7 @@ import type { AsyncReturnType, SetOptional, Simplify } from 'type-fest' import type { AnswerEngine } from './answer-engine.js' import type { BotErrorType } from './bot-error.js' -import type { Entities, EntityMap } from './entities.js' +import type { Entities, EntityMap, URLEntity } from './entities.js' export type { TwitterClient } @@ -204,6 +204,7 @@ export type AnswerEngineQuery = { export type RawEntityMap = { users: Record> tweets: Record> + urls: Record } export type AnswerEngineContext = Pick<