diff --git a/lib/config.ts b/lib/config.ts index 0d55e685f9a76d..2f1793201e599f 100644 --- a/lib/config.ts +++ b/lib/config.ts @@ -230,6 +230,9 @@ export type Config = { pkubbs: { cookie?: string; }; + qingting: { + id?: string; + }; saraba1st: { cookie?: string; }; @@ -575,6 +578,9 @@ const calculateValue = () => { pkubbs: { cookie: envs.PKUBBS_COOKIE, }, + qingting: { + id: envs.QINGTING_ID, + }, saraba1st: { cookie: envs.SARABA1ST_COOKIE, }, diff --git a/lib/errors/index.test.ts b/lib/errors/index.test.ts index e1af5105878618..2538b7a8876b09 100644 --- a/lib/errors/index.test.ts +++ b/lib/errors/index.test.ts @@ -66,13 +66,13 @@ describe('route throws an error', () => { expect(value).toBe('9'); break; case 'Hot Routes:': - expect(value).toBe('6 /test/:id
'); + expect(value).toBe('6 /test/:id/:params?
'); break; case 'Hot Paths:': expect(value).toBe('2 /test/error
2 /test/slow
1 /test/httperror
1 /test/config-not-found-error
1 /test/invalid-parameter-error
1 /thisDoesNotExist
1 /
'); break; case 'Hot Error Routes:': - expect(value).toBe('5 /test/:id
'); + expect(value).toBe('5 /test/:id/:params?
'); break; case 'Hot Error Paths:': expect(value).toBe('2 /test/error
1 /test/httperror
1 /test/slow
1 /test/config-not-found-error
1 /test/invalid-parameter-error
1 /thisDoesNotExist
');
diff --git a/lib/routes/0x80/index.ts b/lib/routes/0x80/index.ts
new file mode 100644
index 00000000000000..d6aa248a436061
--- /dev/null
+++ b/lib/routes/0x80/index.ts
@@ -0,0 +1,90 @@
+import { Route } from '@/types';
+
+import cache from '@/utils/cache';
+import got from '@/utils/got';
+import { load } from 'cheerio';
+import { parseDate } from '@/utils/parse-date';
+
+export const route: Route = {
+    path: '/blog',
+    categories: ['blog'],
+    example: '/0x80/blog',
+    url: '0x80.pl/notesen.html',
+    name: 'Articles',
+    maintainers: ['xnum'],
+    handler,
+};
+
+/** Returns the first `YYYY-MM-DD` date found in `url`, or `null` when there is none. */
+function extractDateFromURL(url: string) {
+    const regex = /\d{4}-\d{2}-\d{2}/;
+    const match = url.match(regex);
+
+    return match ? match[0] : null;
+}
+
+/**
+ * Builds the feed from the article index at `notesen.html`: collects the links starting with `notesen`,
+ * then loads every article page (through the cache) for its author, date and body.
+ */
+async function handler() {
+    // The TLS cert is invalid, we are limited to use HTTP unfortunately.
+    const baseUrl = 'http://0x80.pl/';
+    const targetUrl = `${baseUrl}notesen.html`;
+
+    const response = await got({
+        method: 'get',
+        url: targetUrl,
+    });
+
+    const $ = load(response.data);
+
+    const list = $('a.reference.external')
+        .toArray()
+        .map((element) => {
+            const anchor = $(element);
+            const link = anchor.attr('href') || '';
+
+            return {
+                title: anchor.text() || '',
+                link,
+                pubDate: extractDateFromURL(link),
+                category: 'Uncategoried',
+            };
+        })
+        .filter((item) => item.link.startsWith('notesen'));
+
+    const items = await Promise.all(
+        list.map((item) =>
+            cache.tryGet(item.link, async () => {
+                const articleUrl = `${baseUrl}${item.link}`;
+                const response = await got({
+                    method: 'get',
+                    url: articleUrl,
+                });
+
+                const $ = load(response.data);
+
+                const author = $('tr.author.field td.field-body').text();
+                const articlePubDate = $('tr.added-on.field td.field-body').text();
+                // The date in the URL wins; the article's added-on field is the fallback.
+                // Either source can be missing, so the raw date may still be empty here.
+                const rawDate = item.pubDate || articlePubDate;
+
+                return {
+                    ...item,
+                    author,
+                    // Only parse when a date was found: an empty string is not a parseable date.
+                    pubDate: rawDate ? parseDate(rawDate) : undefined,
+                    description: $('div.document').first().html(),
+                };
+            })
+        )
+    );
+
+    return {
+        title: '0x80.pl articles',
+        link: targetUrl,
+        item: items,
+    };
+}
diff --git a/lib/routes/0x80/namespace.ts b/lib/routes/0x80/namespace.ts
new file mode 100644
index 00000000000000..8866d08e7919a9
--- /dev/null
+++ b/lib/routes/0x80/namespace.ts
@@ -0,0 +1,7 @@
+import type { Namespace } from '@/types';
+
+export const namespace: Namespace = {
+    name: 'Wojciech Muła',
+    url: '0x80.pl',
+    description: '',
+};
diff --git a/lib/routes/apple/podcast.ts b/lib/routes/apple/podcast.ts
new file mode 100644
index 00000000000000..07d1acd36e875b
--- /dev/null
+++ b/lib/routes/apple/podcast.ts
@@ -0,0 +1,68 @@
+import { Route } from '@/types';
+import got from '@/utils/got';
+import { load } from 'cheerio';
+import { parseDate } from '@/utils/parse-date';
+
+export const route: Route = {
+    path: '/podcast/:id',
+    categories: ['multimedia'],
+    example: '/apple/podcast/id1559695855',
+    parameters: { id: '播客id,可以在 Apple 播客app 内分享的播客的 URL 中找到' },
+    features: {
+        requireConfig: false,
+        requirePuppeteer: false,
+        antiCrawler: false,
+        supportBT: false,
+        supportPodcast: true,
+        supportScihub: false,
+    },
+    radar: [
+        {
+            source: ['podcasts.apple.com/cn/podcast/:id'],
+        },
+    ],
+    name: '播客',
+    maintainers: ['Acring'],
+    handler,
+    url: 'https://www.apple.com.cn/apple-podcasts/',
+};
+
+/**
+ * Scrapes the podcast page for `:id` and maps the episode list embedded in it to feed items.
+ */
+async function handler(ctx) {
+    const link = `https://podcasts.apple.com/cn/podcast/${ctx.req.param('id')}`;
+    const response = await got({
+        method: 'get',
+        url: link,
+    });
+
+    const $ = load(response.data);
+
+    // The element holds JSON whose values are themselves JSON strings, hence the two parse steps.
+    const pageData = JSON.parse($('#shoebox-media-api-cache-amp-podcasts').text());
+
+    const data = JSON.parse(pageData[Object.keys(pageData)[0]]).d[0];
+    const attributes = data.attributes;
+
+    const episodes = data.relationships.episodes.data.map((item) => {
+        const attr = item.attributes;
+        return {
+            title: attr.name,
+            enclosure_url: attr.assetUrl,
+            itunes_duration: attr.durationInMilliseconds / 1000,
+            enclosure_type: 'audio/mp4',
+            link: attr.url,
+            pubDate: parseDate(attr.releaseDateTime),
+            description: attr.description.standard.replaceAll('\n', '<br>'),
+        };
+    });
+
+    return {
+        title: attributes.name,
+        link: attributes.url,
+        itunes_author: attributes.artistName,
+        item: episodes,
+        description: attributes.description.standard,
+    };
+}
diff --git a/lib/routes/bilibili/cache.ts b/lib/routes/bilibili/cache.ts
index 86699ef7e5ef33..01f58714b6b1e1 100644
--- a/lib/routes/bilibili/cache.ts
+++ b/lib/routes/bilibili/cache.ts
@@ -27,7 +27,7 @@ const getCookie = () => {
         await page.goto('https://space.bilibili.com/1/dynamic');
         const cookieString = await waitForRequest;
         logger.debug(`Got bilibili cookie: ${cookieString}`);
-
+        await browser.close();
         return cookieString;
     });
 };
diff --git a/lib/routes/qingting/podcast.ts b/lib/routes/qingting/podcast.ts
index 9211f3e8c9df41..17d1c6b4bab23a 100644
--- a/lib/routes/qingting/podcast.ts
+++ b/lib/routes/qingting/podcast.ts
@@ -1,9 +1,12 @@
-import { Route } from '@/types';
+import type { DataItem, Route } from '@/types';
 import cache from '@/utils/cache';
 import crypto from 'crypto';
 import got from '@/utils/got';
 import timezone from '@/utils/timezone';
 import { parseDate } from '@/utils/parse-date';
+import { config } from '@/config';
+
+const qingtingId = config.qingting.id ??
'';
 
 export const route: Route = {
     path: '/podcast/:id',
@@ -11,12 +14,14 @@ export const route: Route = {
     example: '/qingting/podcast/293411',
     parameters: { id: '专辑id, 可在专辑页 URL 中找到' },
     features: {
-        requireConfig: false,
-        requirePuppeteer: false,
-        antiCrawler: false,
-        supportBT: false,
         supportPodcast: true,
-        supportScihub: false,
+        requireConfig: [
+            {
+                name: 'QINGTING_ID',
+                optional: true,
+                description: '用户id, 部分专辑需要会员身份,用户id可以通过从网页端登录蜻蜓fm后使用开发者工具,在控制台中运行JSON.parse(localStorage.getItem("user")).qingting_id获取',
+            },
+        ],
     },
     radar: [
         {
@@ -29,21 +34,39 @@ export const route: Route = {
     description: `获取的播放 URL 有效期只有 1 天,需要开启播客 APP 的自动下载功能。`,
 };
 
+/**
+ * Builds the signed audio redirect URL for one program of a channel.
+ * The signature is an HMAC-MD5 over the path and query string, which includes the current timestamp.
+ */
+function getMediaUrl(channelId: string, mediaId: string) {
+    const path = `/audiostream/redirect/${channelId}/${mediaId}?access_token=&device_id=MOBILESITE&qingting_id=${qingtingId}&t=${Date.now()}`;
+    const sign = crypto.createHmac('md5', 'fpMn12&38f_2e').update(path).digest('hex');
+    return `https://audio.qingting.fm${path}&sign=${sign}`;
+}
+
 async function handler(ctx) {
-    const channelUrl = `https://i.qingting.fm/capi/v3/channel/${ctx.req.param('id')}`;
-    let response = await got({
+    const channelId = ctx.req.param('id');
+
+    const channelUrl = `https://i.qingting.fm/capi/v3/channel/${channelId}`;
+    const response = await got({
         method: 'get',
         url: channelUrl,
         headers: {
             Referer: 'https://www.qingting.fm/',
         },
     });
+
     const title = response.data.data.title;
     const channel_img = response.data.data.thumbs['400_thumb'];
     const authors = response.data.data.podcasters.map((author) => author.nick_name).join(',');
     const desc = response.data.data.description;
-    const programUrl = `https://i.qingting.fm/capi/channel/${ctx.req.param('id')}/programs/${response.data.data.v}?curpage=1&pagesize=10&order=asc`;
-    response = await got({
+    const programUrl = `https://i.qingting.fm/capi/channel/${channelId}/programs/${response.data.data.v}?curpage=1&pagesize=10&order=asc`;
+
+    const {
+        data: {
+            data: { programs },
+        },
+    } = await got({
         method: 'get',
         url: programUrl,
         headers: {
@@ -51,45 +74,51 @@ async function handler(ctx) {
         },
     });
 
+    const {
+        data: { data: channelInfo },
+    } = await got(`https://i.qingting.fm/capi/v3/channel/${channelId}?user_id=${qingtingId}`);
+
+    // NOTE(review): presumably item_type 0 marks a free channel; a missing `purchase` counts as charged here.
+    const isCharged = channelInfo.purchase?.item_type !== 0;
+
+    const isPaid = channelInfo.user_relevance?.sale_status === 'paid';
+
     const resultItems = await Promise.all(
-        response.data.data.programs.map((item) =>
-            cache.tryGet(`qingting:podcast:${ctx.req.param('id')}:${item.id}`, async () => {
-                const link = `https://www.qingting.fm/channels/${ctx.req.param('id')}/programs/${item.id}/`;
-
-                const path = `/audiostream/redirect/${ctx.req.param('id')}/${item.id}?access_token=&device_id=MOBILESITE&qingting_id=&t=${Date.now()}`;
-                const sign = crypto.createHmac('md5', 'fpMn12&38f_2e').update(path).digest('hex').toString();
-
-                const [detailRes, mediaRes] = await Promise.all([
-                    got({
-                        method: 'get',
-                        url: link,
-                        headers: {
-                            Referer: 'https://www.qingting.fm/',
-                        },
-                    }),
-                    got({
-                        method: 'get',
-                        url: `https://audio.qingting.fm${path}&sign=${sign}`,
-                        headers: {
-                            Referer: 'https://www.qingting.fm/',
-                        },
-                    }),
-                ]);
+        programs.map(async (item) => {
+            const data = (await cache.tryGet(`qingting:podcast:${channelId}:${item.id}`, async () => {
+                const link = `https://www.qingting.fm/channels/${channelId}/programs/${item.id}/`;
+
+                const detailRes = await got({
+                    method: 'get',
+                    url: link,
+                    headers: {
+                        Referer: 'https://www.qingting.fm/',
+                    },
+                });
 
                 const detail = JSON.parse(detailRes.data.match(/},"program":(.*?),"plist":/)[1]);
 
-                return {
+                const rssItem = {
                     title: item.title,
                     link,
                     itunes_item_image: item.cover,
                     itunes_duration: item.duration,
                     pubDate: timezone(parseDate(item.update_time), +8),
                     description: detail.richtext,
-                    enclosure_url: mediaRes.url,
-                    enclosure_type: 'audio/x-m4a',
                 };
-            })
-        )
+
+                return rssItem;
+            })) as DataItem;
+
+            // Attached after the cache lookup on purpose: the URL is signed with the current timestamp
+            // (see getMediaUrl), so it is regenerated on every request instead of being cached.
+            if (!isCharged || isPaid || item.isfree) {
+                data.enclosure_url = getMediaUrl(channelId, item.id);
+                data.enclosure_type = 'audio/x-m4a';
+            }
+
+            return data;
+        })
     );
 
     return {
@@ -97,7 +126,7 @@ async function handler(ctx) {
         description: desc,
         itunes_author: authors,
         image: channel_img,
-        link: `https://www.qingting.fm/channels/${ctx.req.param('id')}`,
+        link: `https://www.qingting.fm/channels/${channelId}`,
         item: resultItems,
     };
 }
diff --git a/lib/routes/test/index.ts b/lib/routes/test/index.ts
index 138e3691b4896c..2990917bd7bcd8 100644
--- a/lib/routes/test/index.ts
+++ b/lib/routes/test/index.ts
@@ -3,13 +3,14 @@ import { config } from '@/config';
 import got from '@/utils/got';
 import wait from '@/utils/wait';
 import cache from '@/utils/cache';
+import { fetchArticle } from '@/utils/wechat-mp';
 import ConfigNotFoundError from '@/errors/types/config-not-found';
 import InvalidParameterError from '@/errors/types/invalid-parameter';
 
 let cacheIndex = 0;
 
 export const route: Route = {
-    path: '/:id',
+    path: '/:id/:params?',
     name: 'Unknown',
     maintainers: ['DIYgod', 'NeverBehave'],
     handler,
@@ -384,6 +385,16 @@ async function handler(ctx) {
         ];
     }
 
+    if (ctx.req.param('id') === 'wechat-mp') {
+        const params = ctx.req.param('params');
+        if (!params) {
+            throw new InvalidParameterError('Invalid parameter');
+        }
+        // Params without `&` are appended as a path segment (`/s/…`); params containing `&` become the query string (`/s?…`).
+        const mpUrl = 'https://mp.weixin.qq.com/s' + (params.includes('&') ? '?' : '/') + params;
+        item = [await fetchArticle(mpUrl)];
+    }
+
     return {
         title: `Test ${ctx.req.param('id')}`,
         itunes_author: ctx.req.param('id') === 'enclosure' ? 'DIYgod' : null,
diff --git a/lib/setup.test.ts b/lib/setup.test.ts
index 192d9fe8275c3f..a06f67c51c2a30 100644
--- a/lib/setup.test.ts
+++ b/lib/setup.test.ts
@@ -2,6 +2,29 @@ import { afterAll, afterEach } from 'vitest';
 import { setupServer } from 'msw/node';
 import { http, HttpResponse } from 'msw';
 
+const genWeChatMpPage = (rich_media_content: string, scripts: string[] | string) => {
+    if (!Array.isArray(scripts)) {
+        scripts = [scripts];
+    }
+    let pageHtml = `
+
+
+
+
+
+
mpName
`; + for (const script of scripts) { + pageHtml += ` +`; + } + return pageHtml; +}; + const server = setupServer( http.post(`https://api.openai.mock/v1/chat/completions`, () => HttpResponse.json({ @@ -33,21 +56,106 @@ const server = setupServer( `) ), - http.get(`https://mp.weixin.qq.com/rsshub_test/wechatMp_fetchArticle`, () => + http.get(`https://mp.weixin.qq.com/rsshub_test/appMsg`, () => + HttpResponse.text( + genWeChatMpPage( + ` +description + + +`, + ` +var item_show_type = "0"; +var real_item_show_type = "0"; +var appmsg_type = "9"; +var ct = "${1_636_626_300}"; +var msg_source_url = "https://mp.weixin.qq.com/rsshub_test/fake"; +window.ip_wording = { + countryName: '中国', + countryId: '156', + provinceName: '福建', + provinceId: '', + cityName: '', + cityId: '' +};` + ) + ) + ), + http.get(`https://mp.weixin.qq.com/rsshub_test/img`, () => + HttpResponse.text( + genWeChatMpPage('fake_description', [ + ` +var item_show_type = "8"; +var real_item_show_type = "8"; +var appmsg_type = "9"; +var ct = "${1_636_626_300}"; +`, + ` +window.picture_page_info_list = [ +{ + cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg', +}, +{ + cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg', +}, +].slice(0, 20); +`, + ]) + ) + ), + http.get(`https://mp.weixin.qq.com/rsshub_test/audio`, () => + HttpResponse.text( + genWeChatMpPage('fake_description', [ + ` +var item_show_type = "7"; +var real_item_show_type = "7"; +var appmsg_type = "9"; +var ct = "${1_636_626_300}"; +`, + ` +reportOpt = { + voiceid: "", + uin: "", + biz: "", + mid: "", + idx: "" +}; +window.cgiData = { + voiceid: "rsshub_test_voiceid_1", + duration: "6567" * 1, +}; +`, + ]) + ) + ), + http.get(`https://mp.weixin.qq.com/rsshub_test/video`, () => + HttpResponse.text( + genWeChatMpPage( + 'fake_description', + ` +var item_show_type = "5"; +var real_item_show_type = "5"; +var appmsg_type = "9"; +var ct = "${1_636_626_300}"; +` + ) + ) + ), + 
http.get(`https://mp.weixin.qq.com/rsshub_test/fallback`, () => HttpResponse.text( - '\n' + - '\n' + - '\n' + - '\n' + - '\n' + - '\n' + - '
mpName
\n' + - '' + genWeChatMpPage( + 'fake_description', + ` +var item_show_type = "99988877"; +var real_item_show_type = "99988877"; +var appmsg_type = "9"; +var ct = "${1_636_626_300}"; +` + ) ) ), + http.get(`https://mp.weixin.qq.com/s/rsshub_test`, () => HttpResponse.text(genWeChatMpPage('', ''))), + http.get(`https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1`, () => HttpResponse.text(genWeChatMpPage('', ''))), http.get(`http://rsshub.test/headers`, ({ request }) => HttpResponse.json({ ...Object.fromEntries(request.headers.entries()), diff --git a/lib/utils/wechat-mp.test.ts b/lib/utils/wechat-mp.test.ts index e700a6e425bdc6..2343ee69c6b9e7 100644 --- a/lib/utils/wechat-mp.test.ts +++ b/lib/utils/wechat-mp.test.ts @@ -1,6 +1,27 @@ -import { describe, expect, it } from 'vitest'; +import { describe, expect, it, vi } from 'vitest'; import { load } from 'cheerio'; -import { fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl } from '@/utils/wechat-mp'; +import Parser from 'rss-parser'; +import InvalidParameterError from '@/errors/types/invalid-parameter'; +import { exportedForTestingOnly, fetchArticle, finishArticleItem, fixArticleContent, normalizeUrl } from '@/utils/wechat-mp'; +const { ExtractMetadata, showTypeMapReverse } = exportedForTestingOnly; + +vi.mock('@/utils/request-rewriter', () => ({ default: null })); +const { default: app } = await import('@/app'); +const parser = new Parser(); + +const expectedItem: { + title: string; + summary: string; + author: string; + mpName: string; + link: string; +} = { + title: 'title', + summary: 'summary', + author: 'author', + mpName: 'mpName', + link: '', // to be filled +}; // date from the cache will be an ISO8601 string, so we need to use this function const compareDate = (date1, date2) => { @@ -8,8 +29,227 @@ const compareDate = (date1, date2) => { date2 = typeof date2 === 'string' ? 
new Date(date2) : date2; return date1.getTime() === date2.getTime(); }; +const genScriptHtmlStr = (script: string) => ` + + + +`; +const testFetchArticleFinishArticleItem = async (path: string, { setMpNameAsAuthor = false, skipLink = false } = {}) => { + const ct = 1_636_626_300; + const httpsUrl = `https://mp.weixin.qq.com/rsshub_test${path}`; + const httpUrl = 'http' + httpsUrl.slice(5); + + const expectedDate = new Date(ct * 1000); + + const expectedItem_ = { + ...expectedItem, + link: httpsUrl, + }; + + const fetchArticleItem = await fetchArticle(httpUrl); + expect(compareDate(fetchArticleItem.pubDate, expectedDate)).toBe(true); + expect(fetchArticleItem).toMatchObject(expectedItem_); + + const ToBeFinishedArticleItem = { link: httpUrl }; + const expectedFinishedArticleItem = { ...fetchArticleItem }; + expectedFinishedArticleItem.author = setMpNameAsAuthor ? expectedFinishedArticleItem.mpName : expectedFinishedArticleItem.author; + expectedFinishedArticleItem.link = skipLink ? ToBeFinishedArticleItem.link : expectedFinishedArticleItem.link; + + const finishedArticleItem = await finishArticleItem(ToBeFinishedArticleItem, setMpNameAsAuthor, skipLink); + expect(compareDate(finishedArticleItem.pubDate, fetchArticleItem.pubDate)).toBe(true); + delete expectedFinishedArticleItem.pubDate; + expect(finishedArticleItem).toMatchObject(expectedFinishedArticleItem); + + return fetchArticleItem; +}; describe('wechat-mp', () => { + it('ExtractMetadata.common', () => { + expect(ExtractMetadata.common(load(''))).toStrictEqual({}); + + expect( + ExtractMetadata.common( + load( + genScriptHtmlStr(` + window.fake_item_show_type = '5' || ''; + window.fake_real_item_show_type = '5' || ''; + window.fake_ct = '1713009660' || ''; + `) + ) + ) + ).toMatchObject({}); + + expect( + ExtractMetadata.common( + load( + genScriptHtmlStr(` + window.item_show_type = '5' || ''; + window.real_item_show_type = '5' || ''; + window.ct = '1713009660' || ''; + `) + ) + ) + ).toMatchObject({ + 
showType: showTypeMapReverse['5'], + realShowType: showTypeMapReverse['5'], + createTime: '1713009660', + }); + + expect( + ExtractMetadata.common( + load( + genScriptHtmlStr(` + var item_show_type = "5"; + var real_item_show_type = "5"; + var ct = "1713009660"; + var msg_source_url = 'https://mp.weixin.qq.com/rsshub_test/fake'; + `) + ) + ) + ).toMatchObject({ + showType: showTypeMapReverse['5'], + realShowType: showTypeMapReverse['5'], + createTime: '1713009660', + sourceUrl: 'https://mp.weixin.qq.com/rsshub_test/fake', + }); + + expect( + ExtractMetadata.common( + load( + genScriptHtmlStr(` + var item_show_type = "998877665544332211"; + var real_item_show_type = "112233445566778899"; + var ct = "1713009660"; + `) + ) + ) + ).toMatchObject({ + showType: '998877665544332211', + realShowType: '112233445566778899', + createTime: '1713009660', + }); + }); + it('ExtractMetadata.img', () => { + expect(ExtractMetadata.img(load(''))).toStrictEqual({}); + + expect( + ExtractMetadata.img( + load( + genScriptHtmlStr(` + window.picture_page_info_list = [ + { + cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg', + }, + { + cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg', + }, + ].slice(0, 20); + `) + ) + ) + ).toMatchObject({ + imgUrls: ['https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg', 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg'], + }); + }); + it('ExtractMetadata.audio', () => { + expect(ExtractMetadata.audio(load(''))).toStrictEqual({}); + + expect( + ExtractMetadata.audio( + load( + genScriptHtmlStr(` + reportOpt = { + voiceid: "", + uin: "", + biz: "", + mid: "", + idx: "" + }; + `) + ) + ) + ).toMatchObject({}); + + expect( + ExtractMetadata.audio( + load( + genScriptHtmlStr(` + window.cgiData = { + voiceid: "rsshub_test_voiceid_1", + duration: "6567" * 1, + }; + `) + ) + ) + ).toMatchObject({ + voiceId: 'rsshub_test_voiceid_1', + duration: '6567', + }); + + expect( + ExtractMetadata.audio( + load( + 
genScriptHtmlStr(` + window.cgiData = { + voiceid: "rsshub_test_voiceid_1", + }; + `) + ) + ) + ).toMatchObject({ + voiceId: 'rsshub_test_voiceid_1', + duration: null, + }); + + expect( + ExtractMetadata.audio( + load( + genScriptHtmlStr(` + reportOpt = { + voiceid: "", + uin: "", + biz: "", + mid: "", + idx: "" + }; + window.cgiData = { + voiceid: "rsshub_test_voiceid_1", + duration: "6567" * 1, + }; + `) + ) + ) + ).toMatchObject({ + voiceId: 'rsshub_test_voiceid_1', + duration: '6567', + }); + }); + it('ExtractMetadata.location', () => { + expect(ExtractMetadata.location(load(''))).toStrictEqual({}); + + expect( + ExtractMetadata.location( + load( + genScriptHtmlStr(` + window.ip_wording = { + countryName: '中国', + countryId: '156', + provinceName: '广东', + provinceId: '', + cityName: '', + cityId: '' + }; + `) + ) + ) + ).toMatchObject({ + countryName: '中国', + provinceName: '广东', + cityName: '', + }); + }); it('fixArticleContent', () => { const divHeader = '
'; const divFooter = '
'; @@ -88,37 +328,105 @@ describe('wechat-mp', () => { expect(normalizeUrl(notWechatMp, true)).toBe(notWechatMp); }); - it('fetchArticle_&_finishArticleItem', async () => { - const ct = 1_636_626_300; - const httpsUrl = 'https://mp.weixin.qq.com/rsshub_test/wechatMp_fetchArticle'; - const httpUrl = httpsUrl.replace(/^https:\/\//, 'http://'); - - const expectedItem: { - title: string; - summary: string; - author: string; - description: string; - mpName?: string; - link: string; - } = { + it('fetchArticle_&_finishArticleItem_appMsg', async () => { + const fetchArticleItem = await testFetchArticleFinishArticleItem('/appMsg'); + const $ = load(fetchArticleItem.description); + expect($('iframe').attr()).toMatchObject({ + src: + 'https://v.qq.com/txp/iframe/player.html?origin=https%3A%2F%2Fmp.weixin.qq.com' + + '&containerId=js_tx_video_container_0.3863487104715233&vid=fake&width=677&height=380.8125' + + '&autoplay=false&allowFullScreen=true&chid=17&full=true&show1080p=false&isDebugIframe=false', + width: '677', + height: '380.8125', + }); + expect($('audio').attr()).toMatchObject({ + src: 'https://res.wx.qq.com/voice/getvoice?mediaid=rsshub_test', + title: 'title', + }); + expect($('a').attr()).toMatchObject({ + href: 'https://mp.weixin.qq.com/rsshub_test/fake', + }); + expect(fetchArticleItem.description).toContain('description'); + expect(fetchArticleItem.description).toContain('📍发表于:中国 福建'); + expect(fetchArticleItem.description).toContain('🔗️ 阅读原文'); + }); + + it('fetchArticle_&_finishArticleItem_img', async () => { + const fetchArticleItem = await testFetchArticleFinishArticleItem('/img'); + const $ = load(fetchArticleItem.description); + expect($.text()).toBe('summary'); + expect($('img:nth-of-type(1)').attr()).toMatchObject({ + src: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg', + }); + expect($('img:nth-of-type(2)').attr()).toMatchObject({ + src: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg', + }); + }); + + 
it('fetchArticle_&_finishArticleItem_audio', async () => { + const fetchArticleItem = await testFetchArticleFinishArticleItem('/audio'); + const $ = load(fetchArticleItem.description); + expect($.text()).toBe('summary'); + expect($('audio').attr()).toMatchObject({ + controls: '', + src: 'https://res.wx.qq.com/voice/getvoice?mediaid=rsshub_test_voiceid_1', + style: 'width:100%', title: 'title', - summary: 'summary', - author: 'author', - description: 'description', - mpName: 'mpName', - link: httpsUrl, + }); + expect(fetchArticleItem).toMatchObject({ + enclosure_type: 'audio/mp3', + enclosure_url: 'https://res.wx.qq.com/voice/getvoice?mediaid=rsshub_test_voiceid_1', + itunes_duration: '6567', + }); + }); + + it('fetchArticle_&_finishArticleItem_video', async () => { + const fetchArticleItem = await testFetchArticleFinishArticleItem('/video'); + const $ = load(fetchArticleItem.description); + expect($.text()).toBe('summary'); + expect($('img').attr()).toMatchObject({ + src: 'https://mmbiz.qpic.cn/rsshub_test/og_img_1/0?wx_fmt=jpeg', + }); + }); + + it('fetchArticle_&_finishArticleItem_fallback', async () => { + const fetchArticleItem = await testFetchArticleFinishArticleItem('/fallback'); + const $ = load(fetchArticleItem.description); + expect($.text()).toBe('summary'); + expect($('img').attr()).toMatchObject({ + src: 'https://mmbiz.qpic.cn/rsshub_test/og_img_1/0?wx_fmt=jpeg', + }); + }); + + it('finishArticleItem_param', async () => { + await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: false, skipLink: false }); + await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: true, skipLink: false }); + await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: false, skipLink: true }); + await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: true, skipLink: true }); + }); + + it('route_test', async () => { + try { + await app.request('/test/wechat-mp'); + } catch (error) { + 
expect(error).toBeInstanceOf(InvalidParameterError);
+        }
+
+        const responseShort = await app.request('/test/wechat-mp/rsshub_test');
+        const parsedShort = await parser.parseString(await responseShort.text());
+        const expectedItemShort = {
+            author: expectedItem.author,
+            title: expectedItem.title,
+            link: 'https://mp.weixin.qq.com/s/rsshub_test',
+        };
+        expect(parsedShort.items[0]).toMatchObject(expectedItemShort);
+
+        const responseLong = await app.request('/test/wechat-mp/__biz=rsshub_test&mid=1&idx=1&sn=1');
+        const parsedLong = await parser.parseString(await responseLong.text());
+        const expectedItemLong = {
+            ...expectedItemShort,
+            link: 'https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1',
         };
-        const expectedDate = new Date(ct * 1000);
-
-        const fetchArticleItem = await fetchArticle(httpUrl);
-        expect(compareDate(fetchArticleItem.pubDate, expectedDate)).toBe(true);
-        delete fetchArticleItem.pubDate;
-        expect(fetchArticleItem).toEqual(expectedItem);
-
-        delete expectedItem.mpName;
-        const finishedArticleItem = await finishArticleItem({ link: httpUrl });
-        expect(compareDate(finishedArticleItem.pubDate, expectedDate)).toBe(true);
-        delete finishedArticleItem.pubDate;
-        expect(finishedArticleItem).toEqual(expectedItem);
+        expect(parsedLong.items[0]).toMatchObject(expectedItemLong);
     });
 });
diff --git a/lib/utils/wechat-mp.ts b/lib/utils/wechat-mp.ts
index 0615ab1c944950..a45ee305027e56 100644
--- a/lib/utils/wechat-mp.ts
+++ b/lib/utils/wechat-mp.ts
@@ -26,9 +26,238 @@
  */
 
 import ofetch from '@/utils/ofetch';
-import { load, type Cheerio, type Element } from 'cheerio';
+import { type Cheerio, type CheerioAPI, type Element, load } from 'cheerio';
 import { parseDate } from '@/utils/parse-date';
 import cache from '@/utils/cache';
+import logger from '@/utils/logger';
+
+const MAINTAINERS = ['Rongronggg9'];
+
+/** Logs a warning that asks users to report unrecognised article layouts to the maintainers. */
+const warn = (reason: string, details: string) =>
+    logger.warn(`wechat-mp: ${reason}: ${details},
+consider raise an issue (mentioning ${MAINTAINERS.join(', ')}) with the article URL for further investigation`);
+
+/**
+ * Replaces carriage returns and newlines, both the real characters and their escaped spellings
+ * (a backslash followed by `r`, `x0d`, `n` or `x0a`), with the given replacements.
+ * The regular expressions are built once and captured by the returned closure.
+ */
+const replaceReturnNewline = (() => {
+    const returnRegExp = /\r|\\(r|x0d)/g;
+    const newlineRegExp = /\n|\\(n|x0a)/g;
+    return (text: string, replaceReturnWith = '', replaceNewlineWith = '<br>') => text.replaceAll(returnRegExp, replaceReturnWith).replaceAll(newlineRegExp, replaceNewlineWith);
+})();
+/** Strips the `amp;` that follows an `&` (or a literal `\x26`), leaving a plain `&`. */
+const fixUrl = (() => {
+    const ampRegExp = /(&|\\x26)amp;/g;
+    return (text: string) => text.replaceAll(ampRegExp, '&');
+})();
+
+/** Thrown inside a `forEachScript` callback to skip the current script and try the next one. */
+class LoopContinue extends Error {
+    constructor() {
+        super('');
+        this.name = 'LoopContinue';
+    }
+}
+
+/** Thrown inside a `forEachScript` callback to stop iterating and make `forEachScript` return `to_return`. */
+class LoopReturn extends Error {
+    to_return: any;
+
+    constructor(to_return: any) {
+        super('');
+        this.name = 'LoopReturn';
+        this.to_return = to_return;
+    }
+}
+
+/**
+ * Runs `callback` on each script element matched by `selector` (or on `$` itself when it is a string).
+ * A callback ends the loop by throwing `LoopReturn`, or skips to the next script by throwing
+ * `LoopContinue`; any other error propagates. Returns `defaultReturn` when no callback threw `LoopReturn`.
+ */
+const forEachScript = ($: CheerioAPI | string, callback: (script) => void, defaultReturn: any = null, selector = 'script[nonce][type="text/javascript"]') => {
+    const scripts = typeof $ === 'string' ? [$] : $(selector).toArray();
+    for (const script of scripts) {
+        try {
+            callback(script);
+        } catch (error) {
+            if (error instanceof LoopReturn) {
+                return error.to_return;
+            } else if (error instanceof LoopContinue) {
+                continue;
+            }
+            throw error;
+        }
+    }
+    return defaultReturn;
+};
+
+// view-source a *_SHARE_PAGE type article and search for `ITEM_SHOW_TYPE_MAP`
+// Please update the comments below if you find new types or new examples
+const showTypeMap = {
+    // "Article".
+    // May be combined with media, but type won't change
+    // Combined with audio and iframe: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
+    APP_MSG_PAGE: '0',
+    // https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532942&idx=1&sn=a84e4adbe49fdb39e4d4c1b5c12a4c3f
+    VIDEO_SHARE_PAGE: '5',
+    MUSIC_SHARE_PAGE: '6',
+    // https://mp.weixin.qq.com/s/FY6yQC_e4NMAxK0FBr6jwQ
+    AUDIO_SHARE_PAGE: '7',
+    // https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
+    // https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532936&idx=4&sn=624054c20ded6ee85c6632f419c6f758
+    IMG_SHARE_PAGE: '8',
+    TEXT_SHARE_PAGE: '10',
+    SHORT_CONTENT_PAGE: '17',
+};
+const showTypeMapReverse = Object.fromEntries(Object.entries(showTypeMap).map(([k, v]) => [v, k]));
+
+/**
+ * Regex-based extractors that read article metadata out of inline script text.
+ * Each public extractor scans the scripts matching its selector and returns the metadata from the
+ * first script in which every required variable is found, or `{}` when none qualifies.
+ */
+class ExtractMetadata {
+    // Matches `varName <assign> "value"` or 'value', requiring the closing quote to equal the opening one.
+    private static genAssignmentRegExp = (varName: string, valuePattern: string, assignPattern: string) => RegExp(`\\b${varName}\\s*${assignPattern}\\s*(?<quote>["'])(?<value>${valuePattern})\\k<quote>`, 'mg');
+
+    // Builds an extractor for one variable: it returns the first match, or every match when `multiple`.
+    // With no match it throws `LoopContinue` unless `allowNotFound`, which yields `null` (or `[]` when `multiple`).
+    private static genExtractFunc = (
+        varName: string,
+        {
+            valuePattern = '\\w+',
+            assignPattern = '=',
+            allowNotFound = false,
+            multiple = false,
+        }: {
+            valuePattern?: string;
+            assignPattern?: string;
+            allowNotFound?: boolean;
+            multiple?: boolean;
+        }
+    ) => {
+        const regExp = this.genAssignmentRegExp(varName, valuePattern, assignPattern);
+        return (str: string) => {
+            const values: string[] = [];
+            for (const match of str.matchAll(regExp)) {
+                const value = match.groups?.value;
+                if (!multiple) {
+                    return value;
+                }
+                values.push(value);
+            }
+            if (!allowNotFound && values.length === 0) {
+                throw new LoopContinue();
+            }
+            return multiple ? values : null;
+        };
+    };
+
+    // Runs every extractor against one script and records the script text under `_extractedFrom`.
+    private static doExtract = (metadataToBeExtracted: Record<string, (str: string) => string | string[] | null | undefined>, scriptText: string) => {
+        const metadataExtracted: Record<string, string | string[] | null | undefined> = {};
+        for (const [key, extractFunc] of Object.entries(metadataToBeExtracted)) {
+            metadataExtracted[key] = extractFunc(scriptText);
+        }
+        metadataExtracted._extractedFrom = scriptText;
+        return metadataExtracted;
+    };
+
+    private static commonMetadataToBeExtracted = {
+        showType: this.genExtractFunc('item_show_type', { valuePattern: '\\d+' }),
+        realShowType: this.genExtractFunc('real_item_show_type', { valuePattern: '\\d+' }),
+        createTime: this.genExtractFunc('ct', { valuePattern: '\\d+' }),
+        sourceUrl: this.genExtractFunc('msg_source_url', { valuePattern: `https?://[^'"]*`, allowNotFound: true }),
+    };
+
+    static common = ($: CheerioAPI) =>
+        forEachScript(
+            $,
+            (script) => {
+                const scriptText = $(script).text();
+                const metadataExtracted = <Record<string, string>>this.doExtract(this.commonMetadataToBeExtracted, scriptText);
+                const showType = showTypeMapReverse[metadataExtracted.showType];
+                const realShowType = showTypeMapReverse[metadataExtracted.realShowType];
+                metadataExtracted.sourceUrl = metadataExtracted.sourceUrl && fixUrl(metadataExtracted.sourceUrl);
+                if (showType) {
+                    metadataExtracted.showType = showType;
+                } else {
+                    warn('showType not found', `item_show_type=${metadataExtracted.showType}`);
+                }
+                if (realShowType) {
+                    metadataExtracted.realShowType = realShowType;
+                } else {
+                    warn('realShowType not found', `real_item_show_type=${metadataExtracted.realShowType}`);
+                }
+                if (metadataExtracted.showType !== metadataExtracted.realShowType) {
+                    // never seen this happen, waiting for examples
+                    warn('showType mismatch', `item_show_type=${metadataExtracted.showType}, real_item_show_type=${metadataExtracted.realShowType}`);
+                }
+                throw new LoopReturn(metadataExtracted);
+            },
+            {},
+            'script[nonce][type="text/javascript"]:contains("real_item_show_type")'
+        );
+
+    private static audioMetadataToBeExtracted = {
+        voiceId: this.genExtractFunc('voiceid', { assignPattern: ':' }),
+        duration: this.genExtractFunc('duration', { valuePattern: '\\d*', assignPattern: ':', allowNotFound: true }),
+    };
+
+    // never seen a audio article containing multiple audio, waiting for examples
+    static audio = ($: CheerioAPI) =>
+        forEachScript(
+            $,
+            (script) => {
+                const scriptText = $(script).text();
+                const metadataExtracted = <Record<string, string>>this.doExtract(this.audioMetadataToBeExtracted, scriptText);
+                throw new LoopReturn(metadataExtracted);
+            },
+            {},
+            'script[nonce][type="text/javascript"]:contains("voiceid")'
+        );
+
+    private static imgMetadataToBeExtracted = {
+        imgUrls: this.genExtractFunc('cdn_url', { valuePattern: `https?://[^'"]*`, assignPattern: ':', multiple: true }),
+    };
+
+    static img = ($: CheerioAPI) =>
+        forEachScript(
+            $,
+            (script) => {
+                const scriptText = $(script).text();
+                const metadataExtracted = <Record<string, string[]>>this.doExtract(this.imgMetadataToBeExtracted, scriptText);
+                if (Array.isArray(metadataExtracted.imgUrls)) {
+                    metadataExtracted.imgUrls = metadataExtracted.imgUrls.map((url) => fixUrl(url));
+                }
+                throw new LoopReturn(metadataExtracted);
+            },
+            {},
+            'script[nonce][type="text/javascript"]:contains("picture_page_info_list")'
+        );
+
+    private static locationMetadataToBeExtracted = {
+        countryName: this.genExtractFunc('countryName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
+        provinceName: this.genExtractFunc('provinceName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
+        cityName: this.genExtractFunc('cityName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
+    };
+
+    static location = ($: CheerioAPI) =>
+        forEachScript(
+            $,
+            (script) => {
+                const scriptText = $(script).text();
+                const metadataExtracted = this.doExtract(this.locationMetadataToBeExtracted, scriptText);
+                throw new LoopReturn(metadataExtracted);
+            },
+            {},
+            'script[nonce][type="text/javascript"]:contains("countryName")'
+        );
+}
 
 const replaceTag = ($, oldTag, newTagName) => {
     oldTag = $(oldTag);
@@ -55,15 +284,23 @@ const detectOriginalArticleUrl = ($) => {
     return null;
 };
 
-const detectSourceUrl = ($) => {
-    const matchs = $.root()
-        .html()
-        .match(/msg_source_url = '(.+)';/);
-
-    if (matchs) {
-        return matchs[1];
-    }
-    return null;
+const genAudioSrc = (voiceId: string) => `https://res.wx.qq.com/voice/getvoice?mediaid=${voiceId}`;
+const genAudioTag = (src: string, title: string) => `