Skip to content

Commit

Permalink
Merge pull request #3 from dexaai/feature/answer-engine-use-raw-tweet…
Browse files Browse the repository at this point in the history
…-data
  • Loading branch information
transitive-bullshit authored Feb 26, 2024
2 parents a8d22cd + c608492 commit 1e7fade
Show file tree
Hide file tree
Showing 16 changed files with 2,538 additions and 1,241 deletions.
1 change: 1 addition & 0 deletions bin/debug-answer-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ async function main() {
}
)

console.log(`logging ${batch.messages.length} message threads to stderr...`)
console.log()
console.warn(JSON.stringify(answerEngineQueries, null, 2))
}
Expand Down
78 changes: 75 additions & 3 deletions bin/debug-scrape-url.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pMap from 'p-map'

import '../src/config.js'
import { ScraperClient } from '../src/services/scraper-client.js'
import { omit } from '../src/utils.js'
Expand All @@ -8,10 +10,80 @@ import { omit } from '../src/utils.js'
async function main() {
const scraperClient = new ScraperClient()

const res = await scraperClient.scrapeUrl(
'https://www.nytimes.com/2023/05/31/magazine/ai-start-up-accelerator-san-francisco.html'
const urls = [
'https://www.nytimes.com/2023/05/31/magazine/ai-start-up-accelerator-san-francisco.html',
'https://www.youtube.com/watch?v=NNgdcn4Ux1k&ab_channel=LexClips',
'https://digg.com/memes-ranked/link/best-memes-ranked-pussy-in-bio-mandela-effect-room-space?utm_source=digg',
'https://platform.openai.com/docs/guides/vision',
'https://en.wikipedia.org/wiki/Larry_Page',
'https://www.flowrestling.org/articles/12162675-oklahoma-state-wrestling-on-the-hunt-for-upsets-against-iowa',
'https://github.com/transitive-bullshit/lqip-modern',
'https://www.gatesnotes.com/AI-agents',
'https://blog.eladgil.com/p/early-days-of-ai',
'https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/',
'https://www.bbc.com/news/business-68387018',
'https://www.bbc.com/sport/football/68395310',
'https://www.kayak.com/',
'https://marmelab.com/blog/2024/01/23/react-19-new-hooks.html?ref=labnotes.org',
'https://www.foxnews.com/us/ai-technology-could-help-us-allies-monitor-chinas-taiwan-invasion-intensions',
'https://twitter.com/paulg/status/1761731253764579573',
'https://twitter.com/transitive_bs',
'https://transitivebullsh.it/chatgpt-twitter-bot-lessons',
'https://www.swyx.io/learn-in-public',
'https://leerob.io/blog/developer-experience-examples',
'https://rauchg.com/2021/making-the-web-faster',
'https://blog.google/products/gemini/bard-gemini-advanced-app/',
'https://apnews.com/article/2024-qatar-swimming-worlds-underwater-camera-splash',
'https://www.amazon.com/Deepness-Sky-Zones-Thought-Book-ebook/dp/B002H8ORKM/?_encoding=UTF8&pd_rd_w=4N09q&content-id=amzn1.sym.379956f8-690b-4143-ad17-ba606cbec0c1&pf_rd_p=379956f8-690b-4143-ad17-ba606cbec0c1&pf_rd_r=NXZSG4MAQ5P40FP5T5ZR&pd_rd_wg=t7KmU&pd_rd_r=5c051a29-61a2-468a-bc68-ad2754e52d05&ref_=pd_gw_bmx27b',
'https://www.reddit.com/r/MadeMeSmile/comments/u33nuc/he_finally_got_his_acorn/',
'https://www.reddit.com/r/Damnthatsinteresting/comments/ujl32z/this_is_jeanbaptiste_kempf_the_creator_of_vlc/',
'https://news.ycombinator.com/item?id=35154527',
'https://news.ycombinator.com/item?id=11116274',
'https://www.bbc.com/news/uk-43396008',
'https://www.apple.com/customer-letter/',
'https://openai.com/blog/openai-announces-leadership-transition',
'https://www.apple.com/stevejobs/', // output includes some weird #{ref} stuff
'https://groups.google.com/g/vim_announce/c/tWahca9zkt4?pli=1',
'https://bensbites.beehiiv.com/',
'https://bensbites.beehiiv.com/p/open-ai-serious-building-new-app-store',
'https://anilist.co/anime/1/Cowboy-Bebop/',
'https://dexa.ai/',
'https://dexa.ai/s/S7RDMg3f',
'https://www.quora.com/What-can-I-learn-know-right-now-in-10-minutes-that-will-be-useful-for-the-rest-of-my-life',
'https://www.quora.com/How-do-top-students-study',
'https://www.quora.com/What-are-the-most-surreal-places-to-visit',
'https://www.instagram.com/p/BTKd8z2jM14/?img_index=1',
'https://www.linkedin.com/in/fisch2/',
'https://www.facebook.com/zuck/',
'https://github.com/sindresorhus',
'https://www.pornhub.com/',
'https://www.tiktok.com/@zachking/video/6768504823336815877?embed_source=71929438%2C121374463%2C121351166%2C121331973%2C120811592%2C120810756%3Bnull%3Bembed_blank&refer=embed&referer_url=metricool.com%2Ftiktoks-most-viral-videos%2F&referer_video_id=6768504823336815877',
'https://www.tiktok.com/@zachking/video/6749520869598481669'
]

const results = (
await pMap(
urls,
async (url) => {
try {
return await scraperClient.scrapeUrl(url)
} catch (err: any) {
console.error('error processing url', url, err.toString())
}
},
{
concurrency: 4
}
)
).filter(Boolean)

console.log(
JSON.stringify(
results.map((res) => omit(res, 'content', 'rawHtml')),
null,
2
)
)
console.log(JSON.stringify(omit(res, 'content', 'rawHtml'), null, 2))
}

main()
Expand Down
5 changes: 1 addition & 4 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,13 @@ Flags:
## TODO
- understand why mentions from non-verified accounts aren't being reported by the twitter api
- support quote tweet and retweet context
- support user entity context
- add test fixtures testing these different use cases
- fix support for empty mentions
- currently works but duplicates the previous tweet's contents
- support `url` entities
- expand them with metadata
- support `media` entities
- populate media entities
- openai use gpt-4-vision-preview
- for openai, use gpt-4-vision-preview
- conditionally preprocess images using `sharp` to ensure they are supported by gpt4v
- improve openai answer engine
- dalle tool
Expand Down
2 changes: 2 additions & 0 deletions src/__snapshots__/entities.test.ts.snap
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ exports[`mergeEntityMaps 1`] = `
"type": "tweet",
},
},
"urls": {},
"users": {
"1235525929335689217": {
"name": "Lofi Grind",
Expand Down Expand Up @@ -67,6 +68,7 @@ exports[`mergeEntityMaps 2`] = `
"1760384146004996333": {},
"test": {},
},
"urls": {},
"users": {
"1235525929335689217": {},
"327034465": {},
Expand Down
9 changes: 6 additions & 3 deletions src/answer-engine.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ import { getTwitterClient } from './services/twitter-client.js'
import { rUrl } from './utils.js'

const fixtures = fixturesData as unknown as types.AnswerEngineQuery[]
// const answerEngines = [new OpenAIAnswerEngine(), new DexaAnswerEngine()]
const answerEngines = [new OpenAIAnswerEngine()]
const answerEngines = [new OpenAIAnswerEngine(), new DexaAnswerEngine()]
// const answerEngines = [new OpenAIAnswerEngine()]

for (const answerEngine of answerEngines) {
describe(`${answerEngine.type} answer engine`, async () => {
Expand Down Expand Up @@ -40,7 +40,10 @@ for (const answerEngine of answerEngines) {
ctx
)

console.log(`${answerEngine.type} tweet ${tweetUrl} ⇒`, response)
console.log(
`\n**QUESTION** ${tweetUrl}\n\n**ANSWER**\n\n${response}\n\n`
)

assert(response.length > 0, 'response should not be empty')
assert(response.trim() === response, 'response should be trimmed')

Expand Down
117 changes: 72 additions & 45 deletions src/answer-engine.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
import { Msg } from '@dexaai/dexter'
import { Msg, stringifyForModel } from '@dexaai/dexter'
import pMap from 'p-map'

import * as config from '../src/config.js'
import * as db from './db.js'
import type * as types from './types.js'
import { BotError } from './bot-error.js'
import {
type EntitiesMap,
convertTweetToEntitiesMap,
type EntityMap,
convertTweetToEntityMap,
mergeEntityMaps
} from './entities.js'
import { sanitizeTweetText, stripUserMentions } from './twitter-utils.js'
import { assert } from './utils.js'
import {
getPrunedTweet,
getPrunedTwitterUser,
sanitizeTweetText,
stripUserMentions
} from './twitter-utils.js'
import { assert, pick } from './utils.js'

export abstract class AnswerEngine {
readonly type: types.AnswerEngineType
Expand All @@ -25,7 +30,10 @@ export abstract class AnswerEngine {
ctx: types.AnswerEngineContext
) {
const query = await this.resolveMessageThread(message, ctx)
console.log(`>>> ${this.type} answer engine`, query)
console.log(
`\n>>> ${this.type} answer engine`,
pick(query, 'message', 'chatMessages', 'tweets', 'entityMap')
)

message.response = await this.generateResponseForQuery(query, ctx)

Expand All @@ -46,6 +54,9 @@ export abstract class AnswerEngine {
}

try {
// replace markdown lists with unicode bullet points
response = response.replaceAll(/^\s*-\s+/gm, '• ')

response = sanitizeTweetText(response, {
label: `generated by answer engine "${this.type}"`
})
Expand Down Expand Up @@ -161,10 +172,7 @@ export abstract class AnswerEngine {
...Msg.user(tweet.text, {
name: userIdToUsernameMap[tweet.author_id!]
}),

entities: {
tweetIds: [tweet.id]
}
tweetId: tweet.id
})
)

Expand All @@ -175,23 +183,15 @@ export abstract class AnswerEngine {
...Msg.user(message.prompt, {
name: userIdToUsernameMap[message.promptUserId]
}),

entities: {
tweetIds: [message.promptTweetId]
}
tweetId: message.promptTweetId
},

message.response && message !== leafMessage
? {
...Msg.assistant(message.response!, {
name: userIdToUsernameMap[ctx.twitterBotUserId]
}),

entities: {
tweetIds: message.responseTweetId
? [message.responseTweetId!]
: []
}
tweetId: message.responseTweetId!
}
: null
].filter(Boolean)
Expand All @@ -210,56 +210,83 @@ export abstract class AnswerEngine {
.reverse()
}

const chatMessages = answerEngineMessages.map(
({ tweetId, ...message }) => message
)

// Resolve all entity maps for the tweets and messages in the thread and then
// condense them into a single, normalized entity map
let entityMap: EntitiesMap = {}

for (const answerEngineMessage of answerEngineMessages) {
if (!answerEngineMessage.entities?.tweetIds) continue

for (const tweetId of answerEngineMessage.entities.tweetIds) {
if (entityMap.tweets?.[tweetId]) continue

const tweet = await db.tryGetTweetById(tweetId, ctx, {
fetchFromTwitter: false
})
if (!tweet) continue

const tweetEntityMap = await convertTweetToEntitiesMap(tweet, ctx, {
fetchMissingEntities: true
})

entityMap = mergeEntityMaps(entityMap, tweetEntityMap)
}
}
let entityMap: EntityMap = {}

// Construct a raw array of tweets to pass to the answer engine, which may
// be easier to work with than our structured AnswerEngineMessage format
// be easier to work with than our AnswerEngineMessage format
const tweets = (
await pMap(
answerEngineMessages,
async (message) => {
const tweetId = message.entities?.tweetIds?.[0]
const { tweetId } = message
assert(tweetId)

const tweet = await db.tryGetTweetById(tweetId, ctx, {
fetchFromTwitter: true
})
if (!tweet) return

return tweet
const tweetEntityMap = await convertTweetToEntityMap(tweet, ctx, {
fetchMissingEntities: true
})

entityMap = mergeEntityMaps(entityMap, tweetEntityMap)

return getPrunedTweet(tweet)
},
{
concurrency: 8
}
)
).filter(Boolean)

const rawChatMessages = tweets.map((tweet) =>
tweet.author_id === ctx.twitterBotUserId
? Msg.assistant(stringifyForModel(tweet), {
name: userIdToUsernameMap[tweet.author_id!]
})
: Msg.user(stringifyForModel(tweet), {
name: userIdToUsernameMap[tweet.author_id!]
})
)

const rawEntityMap: types.RawEntityMap = {
users: {},
tweets: {},
urls: entityMap.urls ?? {}
}

if (entityMap?.users) {
for (const user of Object.values(entityMap.users)) {
assert(user.twitterId)
const twitterUser = await db.tryGetUserById(user.twitterId)
if (!twitterUser) continue
rawEntityMap.users[user.twitterId] = getPrunedTwitterUser(twitterUser)
}
}

if (entityMap?.tweets) {
for (const tweet of Object.values(entityMap.tweets)) {
assert(tweet.id)
const twittertweet = await db.tryGetTweetById(tweet.id, ctx)
if (!twittertweet) continue
rawEntityMap.tweets[tweet.id] = getPrunedTweet(twittertweet)
}
}

return {
message,
answerEngineMessages,
chatMessages,
rawChatMessages,
tweets,
entityMap
entityMap,
rawEntityMap
}
}
}
2 changes: 1 addition & 1 deletion src/answer-engines/dexa-answer-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ export class DexaAnswerEngine extends AnswerEngine {
ctx: types.AnswerEngineContext
): Promise<string> {
return this._dexaClient.generateResponse({
messages: query.answerEngineMessages,
messages: query.chatMessages,
entityMap: query.entityMap
})
}
Expand Down
Loading

0 comments on commit 1e7fade

Please sign in to comment.