diff --git a/lib/errors/index.test.ts b/lib/errors/index.test.ts index e1af5105878618..2538b7a8876b09 100644 --- a/lib/errors/index.test.ts +++ b/lib/errors/index.test.ts @@ -66,13 +66,13 @@ describe('route throws an error', () => { expect(value).toBe('9'); break; case 'Hot Routes:': - expect(value).toBe('6 /test/:id
'); + expect(value).toBe('6 /test/:id/:params?
'); break; case 'Hot Paths:': expect(value).toBe('2 /test/error
2 /test/slow
1 /test/httperror
1 /test/config-not-found-error
1 /test/invalid-parameter-error
1 /thisDoesNotExist
1 /
'); break; case 'Hot Error Routes:': - expect(value).toBe('5 /test/:id
'); + expect(value).toBe('5 /test/:id/:params?
'); break; case 'Hot Error Paths:': expect(value).toBe('2 /test/error
1 /test/httperror
1 /test/slow
1 /test/config-not-found-error
1 /test/invalid-parameter-error
1 /thisDoesNotExist
'); diff --git a/lib/routes/test/index.ts b/lib/routes/test/index.ts index 138e3691b4896c..2990917bd7bcd8 100644 --- a/lib/routes/test/index.ts +++ b/lib/routes/test/index.ts @@ -3,13 +3,14 @@ import { config } from '@/config'; import got from '@/utils/got'; import wait from '@/utils/wait'; import cache from '@/utils/cache'; +import { fetchArticle } from '@/utils/wechat-mp'; import ConfigNotFoundError from '@/errors/types/config-not-found'; import InvalidParameterError from '@/errors/types/invalid-parameter'; let cacheIndex = 0; export const route: Route = { - path: '/:id', + path: '/:id/:params?', name: 'Unknown', maintainers: ['DIYgod', 'NeverBehave'], handler, @@ -384,6 +385,15 @@ async function handler(ctx) { ]; } + if (ctx.req.param('id') === 'wechat-mp') { + const params = ctx.req.param('params'); + if (!params) { + throw new InvalidParameterError('Invalid parameter'); + } + const mpUrl = 'https://mp.weixin.qq.com/s' + (params.includes('&') ? '?' : '/') + params; + item = [await fetchArticle(mpUrl)]; + } + return { title: `Test ${ctx.req.param('id')}`, itunes_author: ctx.req.param('id') === 'enclosure' ? 'DIYgod' : null, diff --git a/lib/setup.test.ts b/lib/setup.test.ts index 192d9fe8275c3f..a06f67c51c2a30 100644 --- a/lib/setup.test.ts +++ b/lib/setup.test.ts @@ -2,6 +2,29 @@ import { afterAll, afterEach } from 'vitest'; import { setupServer } from 'msw/node'; import { http, HttpResponse } from 'msw'; +const genWeChatMpPage = (rich_media_content: string, scripts: string[] | string) => { + if (!Array.isArray(scripts)) { + scripts = [scripts]; + } + let pageHtml = ` + + + + + + +
mpName
`; + for (const script of scripts) { + pageHtml += ` +`; + } + return pageHtml; +}; + const server = setupServer( http.post(`https://api.openai.mock/v1/chat/completions`, () => HttpResponse.json({ @@ -33,21 +56,106 @@ const server = setupServer( `) ), - http.get(`https://mp.weixin.qq.com/rsshub_test/wechatMp_fetchArticle`, () => + http.get(`https://mp.weixin.qq.com/rsshub_test/appMsg`, () => + HttpResponse.text( + genWeChatMpPage( + ` +description + + +`, + ` +var item_show_type = "0"; +var real_item_show_type = "0"; +var appmsg_type = "9"; +var ct = "${1_636_626_300}"; +var msg_source_url = "https://mp.weixin.qq.com/rsshub_test/fake"; +window.ip_wording = { + countryName: '中国', + countryId: '156', + provinceName: '福建', + provinceId: '', + cityName: '', + cityId: '' +};` + ) + ) + ), + http.get(`https://mp.weixin.qq.com/rsshub_test/img`, () => + HttpResponse.text( + genWeChatMpPage('fake_description', [ + ` +var item_show_type = "8"; +var real_item_show_type = "8"; +var appmsg_type = "9"; +var ct = "${1_636_626_300}"; +`, + ` +window.picture_page_info_list = [ +{ + cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg', +}, +{ + cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg', +}, +].slice(0, 20); +`, + ]) + ) + ), + http.get(`https://mp.weixin.qq.com/rsshub_test/audio`, () => + HttpResponse.text( + genWeChatMpPage('fake_description', [ + ` +var item_show_type = "7"; +var real_item_show_type = "7"; +var appmsg_type = "9"; +var ct = "${1_636_626_300}"; +`, + ` +reportOpt = { + voiceid: "", + uin: "", + biz: "", + mid: "", + idx: "" +}; +window.cgiData = { + voiceid: "rsshub_test_voiceid_1", + duration: "6567" * 1, +}; +`, + ]) + ) + ), + http.get(`https://mp.weixin.qq.com/rsshub_test/video`, () => + HttpResponse.text( + genWeChatMpPage( + 'fake_description', + ` +var item_show_type = "5"; +var real_item_show_type = "5"; +var appmsg_type = "9"; +var ct = "${1_636_626_300}"; +` + ) + ) + ), + 
http.get(`https://mp.weixin.qq.com/rsshub_test/fallback`, () => HttpResponse.text( - '\n' + - '\n' + - '\n' + - '\n' + - '\n' + - '\n' + - '
mpName
\n' + - '' + genWeChatMpPage( + 'fake_description', + ` +var item_show_type = "99988877"; +var real_item_show_type = "99988877"; +var appmsg_type = "9"; +var ct = "${1_636_626_300}"; +` + ) ) ), + http.get(`https://mp.weixin.qq.com/s/rsshub_test`, () => HttpResponse.text(genWeChatMpPage('', ''))), + http.get(`https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1`, () => HttpResponse.text(genWeChatMpPage('', ''))), http.get(`http://rsshub.test/headers`, ({ request }) => HttpResponse.json({ ...Object.fromEntries(request.headers.entries()), diff --git a/lib/utils/wechat-mp.test.ts b/lib/utils/wechat-mp.test.ts index e700a6e425bdc6..2343ee69c6b9e7 100644 --- a/lib/utils/wechat-mp.test.ts +++ b/lib/utils/wechat-mp.test.ts @@ -1,6 +1,27 @@ -import { describe, expect, it } from 'vitest'; +import { describe, expect, it, vi } from 'vitest'; import { load } from 'cheerio'; -import { fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl } from '@/utils/wechat-mp'; +import Parser from 'rss-parser'; +import InvalidParameterError from '@/errors/types/invalid-parameter'; +import { exportedForTestingOnly, fetchArticle, finishArticleItem, fixArticleContent, normalizeUrl } from '@/utils/wechat-mp'; +const { ExtractMetadata, showTypeMapReverse } = exportedForTestingOnly; + +vi.mock('@/utils/request-rewriter', () => ({ default: null })); +const { default: app } = await import('@/app'); +const parser = new Parser(); + +const expectedItem: { + title: string; + summary: string; + author: string; + mpName: string; + link: string; +} = { + title: 'title', + summary: 'summary', + author: 'author', + mpName: 'mpName', + link: '', // to be filled +}; // date from the cache will be an ISO8601 string, so we need to use this function const compareDate = (date1, date2) => { @@ -8,8 +29,227 @@ const compareDate = (date1, date2) => { date2 = typeof date2 === 'string' ? 
new Date(date2) : date2; return date1.getTime() === date2.getTime(); }; +const genScriptHtmlStr = (script: string) => ` + + + +`; +const testFetchArticleFinishArticleItem = async (path: string, { setMpNameAsAuthor = false, skipLink = false } = {}) => { + const ct = 1_636_626_300; + const httpsUrl = `https://mp.weixin.qq.com/rsshub_test${path}`; + const httpUrl = 'http' + httpsUrl.slice(5); + + const expectedDate = new Date(ct * 1000); + + const expectedItem_ = { + ...expectedItem, + link: httpsUrl, + }; + + const fetchArticleItem = await fetchArticle(httpUrl); + expect(compareDate(fetchArticleItem.pubDate, expectedDate)).toBe(true); + expect(fetchArticleItem).toMatchObject(expectedItem_); + + const ToBeFinishedArticleItem = { link: httpUrl }; + const expectedFinishedArticleItem = { ...fetchArticleItem }; + expectedFinishedArticleItem.author = setMpNameAsAuthor ? expectedFinishedArticleItem.mpName : expectedFinishedArticleItem.author; + expectedFinishedArticleItem.link = skipLink ? ToBeFinishedArticleItem.link : expectedFinishedArticleItem.link; + + const finishedArticleItem = await finishArticleItem(ToBeFinishedArticleItem, setMpNameAsAuthor, skipLink); + expect(compareDate(finishedArticleItem.pubDate, fetchArticleItem.pubDate)).toBe(true); + delete expectedFinishedArticleItem.pubDate; + expect(finishedArticleItem).toMatchObject(expectedFinishedArticleItem); + + return fetchArticleItem; +}; describe('wechat-mp', () => { + it('ExtractMetadata.common', () => { + expect(ExtractMetadata.common(load(''))).toStrictEqual({}); + + expect( + ExtractMetadata.common( + load( + genScriptHtmlStr(` + window.fake_item_show_type = '5' || ''; + window.fake_real_item_show_type = '5' || ''; + window.fake_ct = '1713009660' || ''; + `) + ) + ) + ).toMatchObject({}); + + expect( + ExtractMetadata.common( + load( + genScriptHtmlStr(` + window.item_show_type = '5' || ''; + window.real_item_show_type = '5' || ''; + window.ct = '1713009660' || ''; + `) + ) + ) + ).toMatchObject({ + 
showType: showTypeMapReverse['5'], + realShowType: showTypeMapReverse['5'], + createTime: '1713009660', + }); + + expect( + ExtractMetadata.common( + load( + genScriptHtmlStr(` + var item_show_type = "5"; + var real_item_show_type = "5"; + var ct = "1713009660"; + var msg_source_url = 'https://mp.weixin.qq.com/rsshub_test/fake'; + `) + ) + ) + ).toMatchObject({ + showType: showTypeMapReverse['5'], + realShowType: showTypeMapReverse['5'], + createTime: '1713009660', + sourceUrl: 'https://mp.weixin.qq.com/rsshub_test/fake', + }); + + expect( + ExtractMetadata.common( + load( + genScriptHtmlStr(` + var item_show_type = "998877665544332211"; + var real_item_show_type = "112233445566778899"; + var ct = "1713009660"; + `) + ) + ) + ).toMatchObject({ + showType: '998877665544332211', + realShowType: '112233445566778899', + createTime: '1713009660', + }); + }); + it('ExtractMetadata.img', () => { + expect(ExtractMetadata.img(load(''))).toStrictEqual({}); + + expect( + ExtractMetadata.img( + load( + genScriptHtmlStr(` + window.picture_page_info_list = [ + { + cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg', + }, + { + cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg', + }, + ].slice(0, 20); + `) + ) + ) + ).toMatchObject({ + imgUrls: ['https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg', 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg'], + }); + }); + it('ExtractMetadata.audio', () => { + expect(ExtractMetadata.audio(load(''))).toStrictEqual({}); + + expect( + ExtractMetadata.audio( + load( + genScriptHtmlStr(` + reportOpt = { + voiceid: "", + uin: "", + biz: "", + mid: "", + idx: "" + }; + `) + ) + ) + ).toMatchObject({}); + + expect( + ExtractMetadata.audio( + load( + genScriptHtmlStr(` + window.cgiData = { + voiceid: "rsshub_test_voiceid_1", + duration: "6567" * 1, + }; + `) + ) + ) + ).toMatchObject({ + voiceId: 'rsshub_test_voiceid_1', + duration: '6567', + }); + + expect( + ExtractMetadata.audio( + load( + 
genScriptHtmlStr(` + window.cgiData = { + voiceid: "rsshub_test_voiceid_1", + }; + `) + ) + ) + ).toMatchObject({ + voiceId: 'rsshub_test_voiceid_1', + duration: null, + }); + + expect( + ExtractMetadata.audio( + load( + genScriptHtmlStr(` + reportOpt = { + voiceid: "", + uin: "", + biz: "", + mid: "", + idx: "" + }; + window.cgiData = { + voiceid: "rsshub_test_voiceid_1", + duration: "6567" * 1, + }; + `) + ) + ) + ).toMatchObject({ + voiceId: 'rsshub_test_voiceid_1', + duration: '6567', + }); + }); + it('ExtractMetadata.location', () => { + expect(ExtractMetadata.location(load(''))).toStrictEqual({}); + + expect( + ExtractMetadata.location( + load( + genScriptHtmlStr(` + window.ip_wording = { + countryName: '中国', + countryId: '156', + provinceName: '广东', + provinceId: '', + cityName: '', + cityId: '' + }; + `) + ) + ) + ).toMatchObject({ + countryName: '中国', + provinceName: '广东', + cityName: '', + }); + }); it('fixArticleContent', () => { const divHeader = '
'; const divFooter = '
'; @@ -88,37 +328,105 @@ describe('wechat-mp', () => { expect(normalizeUrl(notWechatMp, true)).toBe(notWechatMp); }); - it('fetchArticle_&_finishArticleItem', async () => { - const ct = 1_636_626_300; - const httpsUrl = 'https://mp.weixin.qq.com/rsshub_test/wechatMp_fetchArticle'; - const httpUrl = httpsUrl.replace(/^https:\/\//, 'http://'); - - const expectedItem: { - title: string; - summary: string; - author: string; - description: string; - mpName?: string; - link: string; - } = { + it('fetchArticle_&_finishArticleItem_appMsg', async () => { + const fetchArticleItem = await testFetchArticleFinishArticleItem('/appMsg'); + const $ = load(fetchArticleItem.description); + expect($('iframe').attr()).toMatchObject({ + src: + 'https://v.qq.com/txp/iframe/player.html?origin=https%3A%2F%2Fmp.weixin.qq.com' + + '&containerId=js_tx_video_container_0.3863487104715233&vid=fake&width=677&height=380.8125' + + '&autoplay=false&allowFullScreen=true&chid=17&full=true&show1080p=false&isDebugIframe=false', + width: '677', + height: '380.8125', + }); + expect($('audio').attr()).toMatchObject({ + src: 'https://res.wx.qq.com/voice/getvoice?mediaid=rsshub_test', + title: 'title', + }); + expect($('a').attr()).toMatchObject({ + href: 'https://mp.weixin.qq.com/rsshub_test/fake', + }); + expect(fetchArticleItem.description).toContain('description'); + expect(fetchArticleItem.description).toContain('📍发表于:中国 福建'); + expect(fetchArticleItem.description).toContain('🔗️ 阅读原文'); + }); + + it('fetchArticle_&_finishArticleItem_img', async () => { + const fetchArticleItem = await testFetchArticleFinishArticleItem('/img'); + const $ = load(fetchArticleItem.description); + expect($.text()).toBe('summary'); + expect($('img:nth-of-type(1)').attr()).toMatchObject({ + src: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg', + }); + expect($('img:nth-of-type(2)').attr()).toMatchObject({ + src: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg', + }); + }); + + 
it('fetchArticle_&_finishArticleItem_audio', async () => { + const fetchArticleItem = await testFetchArticleFinishArticleItem('/audio'); + const $ = load(fetchArticleItem.description); + expect($.text()).toBe('summary'); + expect($('audio').attr()).toMatchObject({ + controls: '', + src: 'https://res.wx.qq.com/voice/getvoice?mediaid=rsshub_test_voiceid_1', + style: 'width:100%', title: 'title', - summary: 'summary', - author: 'author', - description: 'description', - mpName: 'mpName', - link: httpsUrl, + }); + expect(fetchArticleItem).toMatchObject({ + enclosure_type: 'audio/mp3', + enclosure_url: 'https://res.wx.qq.com/voice/getvoice?mediaid=rsshub_test_voiceid_1', + itunes_duration: '6567', + }); + }); + + it('fetchArticle_&_finishArticleItem_video', async () => { + const fetchArticleItem = await testFetchArticleFinishArticleItem('/video'); + const $ = load(fetchArticleItem.description); + expect($.text()).toBe('summary'); + expect($('img').attr()).toMatchObject({ + src: 'https://mmbiz.qpic.cn/rsshub_test/og_img_1/0?wx_fmt=jpeg', + }); + }); + + it('fetchArticle_&_finishArticleItem_fallback', async () => { + const fetchArticleItem = await testFetchArticleFinishArticleItem('/fallback'); + const $ = load(fetchArticleItem.description); + expect($.text()).toBe('summary'); + expect($('img').attr()).toMatchObject({ + src: 'https://mmbiz.qpic.cn/rsshub_test/og_img_1/0?wx_fmt=jpeg', + }); + }); + + it('finishArticleItem_param', async () => { + await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: false, skipLink: false }); + await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: true, skipLink: false }); + await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: false, skipLink: true }); + await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: true, skipLink: true }); + }); + + it('route_test', async () => { + try { + await app.request('/test/wechat-mp'); + } catch (error) { + 
expect(error).toBeInstanceOf(InvalidParameterError); + } + + const responseShort = await app.request('/test/wechat-mp/rsshub_test'); + const parsedShort = await parser.parseString(await responseShort.text()); + const expectedItemShort = { + author: expectedItem.author, + title: expectedItem.title, + link: 'https://mp.weixin.qq.com/s/rsshub_test', + }; + expect(parsedShort.items[0]).toMatchObject(expectedItemShort); + + const responseLong = await app.request('/test/wechat-mp/__biz=rsshub_test&mid=1&idx=1&sn=1'); + const parsedLong = await parser.parseString(await responseLong.text()); + const expectedItemLong = { + ...expectedItemShort, + link: 'https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1', }; - const expectedDate = new Date(ct * 1000); - - const fetchArticleItem = await fetchArticle(httpUrl); - expect(compareDate(fetchArticleItem.pubDate, expectedDate)).toBe(true); - delete fetchArticleItem.pubDate; - expect(fetchArticleItem).toEqual(expectedItem); - - delete expectedItem.mpName; - const finishedArticleItem = await finishArticleItem({ link: httpUrl }); - expect(compareDate(finishedArticleItem.pubDate, expectedDate)).toBe(true); - delete finishedArticleItem.pubDate; - expect(finishedArticleItem).toEqual(expectedItem); + expect(parsedLong.items[0]).toMatchObject(expectedItemLong); }); }); diff --git a/lib/utils/wechat-mp.ts b/lib/utils/wechat-mp.ts index 0615ab1c944950..268b41578697b7 100644 --- a/lib/utils/wechat-mp.ts +++ b/lib/utils/wechat-mp.ts @@ -26,9 +26,215 @@ */ import ofetch from '@/utils/ofetch'; -import { load, type Cheerio, type Element } from 'cheerio'; +import { type Cheerio, type CheerioAPI, type Element, load } from 'cheerio'; import { parseDate } from '@/utils/parse-date'; import cache from '@/utils/cache'; +import logger from '@/utils/logger'; + +const MAINTAINERS = ['Rongronggg9']; + +const warn = (reason: string, details: string) => + logger.warn(`wechat-mp: ${reason}: ${details}, +consider raise an issue (mentioning 
${MAINTAINERS.join(', ')}) with the article URL for further investigation`); + +const replaceReturnNewline = (() => { + const returnRegExp = /\r|\\(r|x0d)/g; + const newlineRegExp = /\n|\\(n|x0a)/g; + return (text: string, replaceReturnWith = '', replaceNewlineWith = '
') => text.replaceAll(returnRegExp, replaceReturnWith).replaceAll(newlineRegExp, replaceNewlineWith); +})(); +const fixUrl = (() => { + const ampRegExp = /(&|\\x26)amp;/g; + return (text: string) => text.replaceAll(ampRegExp, '&'); +})(); + +class LoopContinue extends Error { + constructor() { + super(''); + this.name = 'LoopContinue'; + } +} + +class LoopReturn extends Error { + to_return: any; + + constructor(to_return: any) { + super(''); + this.name = 'LoopReturn'; + this.to_return = to_return; + } +} + +const forEachScript = ($: CheerioAPI | string, callback: (script) => void, defaultReturn: any = null, selector = 'script[nonce][type="text/javascript"]') => { + const scripts = typeof $ === 'string' ? [$] : $(selector).toArray(); + for (const script of scripts) { + try { + callback(script); + } catch (error) { + if (error instanceof LoopReturn) { + return error.to_return; + } else if (error instanceof LoopContinue) { + continue; + } + throw error; + } + } + return defaultReturn; +}; + +// view-source a *_SHARE_PAGE type article and search for `ITEM_SHOW_TYPE_MAP` +// Please update the comments below if you find new types or new examples +const showTypeMap = { + // "Article". 
+ // May be combined with media, but type won't change + // Combined with audio and iframe: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw + APP_MSG_PAGE: '0', + // https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532942&idx=1&sn=a84e4adbe49fdb39e4d4c1b5c12a4c3f + VIDEO_SHARE_PAGE: '5', + MUSIC_SHARE_PAGE: '6', + // https://mp.weixin.qq.com/s/FY6yQC_e4NMAxK0FBr6jwQ + AUDIO_SHARE_PAGE: '7', + // https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ + // https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532936&idx=4&sn=624054c20ded6ee85c6632f419c6f758 + IMG_SHARE_PAGE: '8', + TEXT_SHARE_PAGE: '10', + SHORT_CONTENT_PAGE: '17', +}; +const showTypeMapReverse = Object.fromEntries(Object.entries(showTypeMap).map(([k, v]) => [v, k])); + +class ExtractMetadata { + private static genAssignmentRegExp = (varName: string, valuePattern: string, assignPattern: string) => RegExp(`\\b${varName}\\s*${assignPattern}\\s*(?["'])(?${valuePattern})\\k`, 'mg'); + + private static genExtractFunc = ( + varName: string, + { + valuePattern = '\\w+', + assignPattern = '=', + allowNotFound = false, + multiple = false, + }: { + valuePattern?: string; + assignPattern?: string; + allowNotFound?: boolean; + multiple?: boolean; + } + ) => { + const regExp = this.genAssignmentRegExp(varName, valuePattern, assignPattern); + return (str: string) => { + const values: string[] = []; + for (const match of str.matchAll(regExp)) { + const value = match.groups?.value; + if (!multiple) { + return value; + } + values.push(value); + } + if (!allowNotFound && values.length === 0) { + throw new LoopContinue(); + } + return multiple ? 
values : null; + }; + }; + + private static doExtract = (metadataToBeExtracted: Record string | string[] | null | undefined>, scriptText: string) => { + const metadataExtracted: Record = {}; + for (const [key, extractFunc] of Object.entries(metadataToBeExtracted)) { + metadataExtracted[key] = extractFunc(scriptText); + } + metadataExtracted._extractedFrom = scriptText; + return metadataExtracted; + }; + + private static commonMetadataToBeExtracted = { + showType: this.genExtractFunc('item_show_type', { valuePattern: '\\d+' }), + realShowType: this.genExtractFunc('real_item_show_type', { valuePattern: '\\d+' }), + createTime: this.genExtractFunc('ct', { valuePattern: '\\d+' }), + sourceUrl: this.genExtractFunc('msg_source_url', { valuePattern: `https?://[^'"]*`, allowNotFound: true }), + }; + + static common = ($: CheerioAPI) => + forEachScript( + $, + (script) => { + const scriptText = $(script).text(); + const metadataExtracted = > this.doExtract(this.commonMetadataToBeExtracted, scriptText); + const showType = showTypeMapReverse[metadataExtracted.showType]; + const realShowType = showTypeMapReverse[metadataExtracted.realShowType]; + metadataExtracted.sourceUrl = metadataExtracted.sourceUrl && fixUrl(metadataExtracted.sourceUrl); + if (showType) { + metadataExtracted.showType = showType; + } else { + warn('showType not found', `item_show_type=${metadataExtracted.showType}`); + } + if (realShowType) { + metadataExtracted.realShowType = realShowType; + } else { + warn('realShowType not found', `real_item_show_type=${metadataExtracted.realShowType}`); + } + if (metadataExtracted.showType !== metadataExtracted.realShowType) { + // never seen this happen, waiting for examples + warn('showType mismatch', `item_show_type=${metadataExtracted.showType}, real_item_show_type=${metadataExtracted.realShowType}`); + } + throw new LoopReturn(metadataExtracted); + }, + {}, + 'script[nonce][type="text/javascript"]:contains("real_item_show_type")' + ); + + private static 
audioMetadataToBeExtracted = { + voiceId: this.genExtractFunc('voiceid', { assignPattern: ':' }), + duration: this.genExtractFunc('duration', { valuePattern: '\\d*', assignPattern: ':', allowNotFound: true }), + }; + + // never seen a audio article containing multiple audio, waiting for examples + static audio = ($: CheerioAPI) => + forEachScript( + $, + (script) => { + const scriptText = $(script).text(); + const metadataExtracted = > this.doExtract(this.audioMetadataToBeExtracted, scriptText); + throw new LoopReturn(metadataExtracted); + }, + {}, + 'script[nonce][type="text/javascript"]:contains("voiceid")' + ); + + private static imgMetadataToBeExtracted = { + imgUrls: this.genExtractFunc('cdn_url', { valuePattern: `https?://[^'"]*`, assignPattern: ':', multiple: true }), + }; + + static img = ($: CheerioAPI) => + forEachScript( + $, + (script) => { + const scriptText = $(script).text(); + const metadataExtracted = > this.doExtract(this.imgMetadataToBeExtracted, scriptText); + if (Array.isArray(metadataExtracted.imgUrls)) { + metadataExtracted.imgUrls = metadataExtracted.imgUrls.map((url) => fixUrl(url)); + } + throw new LoopReturn(metadataExtracted); + }, + {}, + 'script[nonce][type="text/javascript"]:contains("picture_page_info_list")' + ); + + private static locationMetadataToBeExtracted = { + countryName: this.genExtractFunc('countryName', { valuePattern: `[^'"]*`, assignPattern: ':' }), + provinceName: this.genExtractFunc('provinceName', { valuePattern: `[^'"]*`, assignPattern: ':' }), + cityName: this.genExtractFunc('cityName', { valuePattern: `[^'"]*`, assignPattern: ':' }), + }; + + static location = ($: CheerioAPI) => + forEachScript( + $, + (script) => { + const scriptText = $(script).text(); + const metadataExtracted = this.doExtract(this.locationMetadataToBeExtracted, scriptText); + throw new LoopReturn(metadataExtracted); + }, + {}, + 'script[nonce][type="text/javascript"]:contains("countryName")' + ); +} const replaceTag = ($, oldTag, newTagName) 
=> { oldTag = $(oldTag); @@ -55,15 +261,23 @@ const detectOriginalArticleUrl = ($) => { return null; }; -const detectSourceUrl = ($) => { - const matchs = $.root() - .html() - .match(/msg_source_url = '(.+)';/); - - if (matchs) { - return matchs[1]; - } - return null; +const genAudioSrc = (voiceId: string) => `https://res.wx.qq.com/voice/getvoice?mediaid=${voiceId}`; +const genAudioTag = (src: string, title: string) => `