diff --git a/lib/errors/index.test.ts b/lib/errors/index.test.ts
index e1af5105878618..2538b7a8876b09 100644
--- a/lib/errors/index.test.ts
+++ b/lib/errors/index.test.ts
@@ -66,13 +66,13 @@ describe('route throws an error', () => {
expect(value).toBe('9');
break;
case 'Hot Routes:':
- expect(value).toBe('6 /test/:id ');
+ expect(value).toBe('6 /test/:id/:params? ');
break;
case 'Hot Paths:':
expect(value).toBe('2 /test/error 2 /test/slow 1 /test/httperror 1 /test/config-not-found-error 1 /test/invalid-parameter-error 1 /thisDoesNotExist 1 / ');
break;
case 'Hot Error Routes:':
- expect(value).toBe('5 /test/:id ');
+ expect(value).toBe('5 /test/:id/:params? ');
break;
case 'Hot Error Paths:':
expect(value).toBe('2 /test/error 1 /test/httperror 1 /test/slow 1 /test/config-not-found-error 1 /test/invalid-parameter-error 1 /thisDoesNotExist ');
diff --git a/lib/routes/test/index.ts b/lib/routes/test/index.ts
index 138e3691b4896c..2990917bd7bcd8 100644
--- a/lib/routes/test/index.ts
+++ b/lib/routes/test/index.ts
@@ -3,13 +3,14 @@ import { config } from '@/config';
import got from '@/utils/got';
import wait from '@/utils/wait';
import cache from '@/utils/cache';
+import { fetchArticle } from '@/utils/wechat-mp';
import ConfigNotFoundError from '@/errors/types/config-not-found';
import InvalidParameterError from '@/errors/types/invalid-parameter';
let cacheIndex = 0;
export const route: Route = {
- path: '/:id',
+ path: '/:id/:params?',
name: 'Unknown',
maintainers: ['DIYgod', 'NeverBehave'],
handler,
@@ -384,6 +385,15 @@ async function handler(ctx) {
];
}
+ if (ctx.req.param('id') === 'wechat-mp') {
+ const params = ctx.req.param('params');
+ if (!params) {
+ throw new InvalidParameterError('Invalid parameter');
+ }
+ const mpUrl = 'https:/mp.weixin.qq.com/s' + (params.includes('&') ? '?' : '/') + params;
+ item = [await fetchArticle(mpUrl)];
+ }
+
return {
title: `Test ${ctx.req.param('id')}`,
itunes_author: ctx.req.param('id') === 'enclosure' ? 'DIYgod' : null,
diff --git a/lib/setup.test.ts b/lib/setup.test.ts
index 192d9fe8275c3f..a06f67c51c2a30 100644
--- a/lib/setup.test.ts
+++ b/lib/setup.test.ts
@@ -2,6 +2,29 @@ import { afterAll, afterEach } from 'vitest';
import { setupServer } from 'msw/node';
import { http, HttpResponse } from 'msw';
+const genWeChatMpPage = (rich_media_content: string, scripts: string[] | string) => {
+ if (!Array.isArray(scripts)) {
+ scripts = [scripts];
+ }
+ let pageHtml = `
+
+
+
+
+
+
+${rich_media_content}
+
+mpName
`;
+ for (const script of scripts) {
+ pageHtml += `
+`;
+ }
+ return pageHtml;
+};
+
const server = setupServer(
http.post(`https://api.openai.mock/v1/chat/completions`, () =>
HttpResponse.json({
@@ -33,21 +56,106 @@ const server = setupServer(
`)
),
- http.get(`https://mp.weixin.qq.com/rsshub_test/wechatMp_fetchArticle`, () =>
+ http.get(`https://mp.weixin.qq.com/rsshub_test/appMsg`, () =>
+ HttpResponse.text(
+ genWeChatMpPage(
+ `
+description
+
+
+`,
+ `
+var item_show_type = "0";
+var real_item_show_type = "0";
+var appmsg_type = "9";
+var ct = "${1_636_626_300}";
+var msg_source_url = "https://mp.weixin.qq.com/rsshub_test/fake";
+window.ip_wording = {
+ countryName: '中国',
+ countryId: '156',
+ provinceName: '福建',
+ provinceId: '',
+ cityName: '',
+ cityId: ''
+};`
+ )
+ )
+ ),
+ http.get(`https://mp.weixin.qq.com/rsshub_test/img`, () =>
+ HttpResponse.text(
+ genWeChatMpPage('fake_description', [
+ `
+var item_show_type = "8";
+var real_item_show_type = "8";
+var appmsg_type = "9";
+var ct = "${1_636_626_300}";
+`,
+ `
+window.picture_page_info_list = [
+{
+ cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg',
+},
+{
+ cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg',
+},
+].slice(0, 20);
+`,
+ ])
+ )
+ ),
+ http.get(`https://mp.weixin.qq.com/rsshub_test/audio`, () =>
+ HttpResponse.text(
+ genWeChatMpPage('fake_description', [
+ `
+var item_show_type = "7";
+var real_item_show_type = "7";
+var appmsg_type = "9";
+var ct = "${1_636_626_300}";
+`,
+ `
+reportOpt = {
+ voiceid: "",
+ uin: "",
+ biz: "",
+ mid: "",
+ idx: ""
+};
+window.cgiData = {
+ voiceid: "rsshub_test_voiceid_1",
+ duration: "6567" * 1,
+};
+`,
+ ])
+ )
+ ),
+ http.get(`https://mp.weixin.qq.com/rsshub_test/video`, () =>
+ HttpResponse.text(
+ genWeChatMpPage(
+ 'fake_description',
+ `
+var item_show_type = "5";
+var real_item_show_type = "5";
+var appmsg_type = "9";
+var ct = "${1_636_626_300}";
+`
+ )
+ )
+ ),
+ http.get(`https://mp.weixin.qq.com/rsshub_test/fallback`, () =>
HttpResponse.text(
- '\n' +
- ' \n' +
- ' \n' +
- ' \n' +
- ' \n' +
- 'description
\n' +
- 'mpName
\n' +
- ''
+ genWeChatMpPage(
+ 'fake_description',
+ `
+var item_show_type = "99988877";
+var real_item_show_type = "99988877";
+var appmsg_type = "9";
+var ct = "${1_636_626_300}";
+`
+ )
)
),
+ http.get(`https://mp.weixin.qq.com/s/rsshub_test`, () => HttpResponse.text(genWeChatMpPage('', ''))),
+ http.get(`https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1`, () => HttpResponse.text(genWeChatMpPage('', ''))),
http.get(`http://rsshub.test/headers`, ({ request }) =>
HttpResponse.json({
...Object.fromEntries(request.headers.entries()),
diff --git a/lib/utils/wechat-mp.test.ts b/lib/utils/wechat-mp.test.ts
index e700a6e425bdc6..2343ee69c6b9e7 100644
--- a/lib/utils/wechat-mp.test.ts
+++ b/lib/utils/wechat-mp.test.ts
@@ -1,6 +1,27 @@
-import { describe, expect, it } from 'vitest';
+import { describe, expect, it, vi } from 'vitest';
import { load } from 'cheerio';
-import { fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl } from '@/utils/wechat-mp';
+import Parser from 'rss-parser';
+import InvalidParameterError from '@/errors/types/invalid-parameter';
+import { exportedForTestingOnly, fetchArticle, finishArticleItem, fixArticleContent, normalizeUrl } from '@/utils/wechat-mp';
+const { ExtractMetadata, showTypeMapReverse } = exportedForTestingOnly;
+
+vi.mock('@/utils/request-rewriter', () => ({ default: null }));
+const { default: app } = await import('@/app');
+const parser = new Parser();
+
+const expectedItem: {
+ title: string;
+ summary: string;
+ author: string;
+ mpName: string;
+ link: string;
+} = {
+ title: 'title',
+ summary: 'summary',
+ author: 'author',
+ mpName: 'mpName',
+ link: '', // to be filled
+};
// date from the cache will be an ISO8601 string, so we need to use this function
const compareDate = (date1, date2) => {
@@ -8,8 +29,227 @@ const compareDate = (date1, date2) => {
date2 = typeof date2 === 'string' ? new Date(date2) : date2;
return date1.getTime() === date2.getTime();
};
+const genScriptHtmlStr = (script: string) => `
+
+
+
+`;
+const testFetchArticleFinishArticleItem = async (path: string, { setMpNameAsAuthor = false, skipLink = false } = {}) => {
+ const ct = 1_636_626_300;
+ const httpsUrl = `https://mp.weixin.qq.com/rsshub_test${path}`;
+ const httpUrl = 'http' + httpsUrl.slice(5);
+
+ const expectedDate = new Date(ct * 1000);
+
+ const expectedItem_ = {
+ ...expectedItem,
+ link: httpsUrl,
+ };
+
+ const fetchArticleItem = await fetchArticle(httpUrl);
+ expect(compareDate(fetchArticleItem.pubDate, expectedDate)).toBe(true);
+ expect(fetchArticleItem).toMatchObject(expectedItem_);
+
+ const ToBeFinishedArticleItem = { link: httpUrl };
+ const expectedFinishedArticleItem = { ...fetchArticleItem };
+ expectedFinishedArticleItem.author = setMpNameAsAuthor ? expectedFinishedArticleItem.mpName : expectedFinishedArticleItem.author;
+ expectedFinishedArticleItem.link = skipLink ? ToBeFinishedArticleItem.link : expectedFinishedArticleItem.link;
+
+ const finishedArticleItem = await finishArticleItem(ToBeFinishedArticleItem, setMpNameAsAuthor, skipLink);
+ expect(compareDate(finishedArticleItem.pubDate, fetchArticleItem.pubDate)).toBe(true);
+ delete expectedFinishedArticleItem.pubDate;
+ expect(finishedArticleItem).toMatchObject(expectedFinishedArticleItem);
+
+ return fetchArticleItem;
+};
describe('wechat-mp', () => {
+ it('ExtractMetadata.common', () => {
+ expect(ExtractMetadata.common(load(''))).toStrictEqual({});
+
+ expect(
+ ExtractMetadata.common(
+ load(
+ genScriptHtmlStr(`
+ window.fake_item_show_type = '5' || '';
+ window.fake_real_item_show_type = '5' || '';
+ window.fake_ct = '1713009660' || '';
+ `)
+ )
+ )
+ ).toMatchObject({});
+
+ expect(
+ ExtractMetadata.common(
+ load(
+ genScriptHtmlStr(`
+ window.item_show_type = '5' || '';
+ window.real_item_show_type = '5' || '';
+ window.ct = '1713009660' || '';
+ `)
+ )
+ )
+ ).toMatchObject({
+ showType: showTypeMapReverse['5'],
+ realShowType: showTypeMapReverse['5'],
+ createTime: '1713009660',
+ });
+
+ expect(
+ ExtractMetadata.common(
+ load(
+ genScriptHtmlStr(`
+ var item_show_type = "5";
+ var real_item_show_type = "5";
+ var ct = "1713009660";
+ var msg_source_url = 'https://mp.weixin.qq.com/rsshub_test/fake';
+ `)
+ )
+ )
+ ).toMatchObject({
+ showType: showTypeMapReverse['5'],
+ realShowType: showTypeMapReverse['5'],
+ createTime: '1713009660',
+ sourceUrl: 'https://mp.weixin.qq.com/rsshub_test/fake',
+ });
+
+ expect(
+ ExtractMetadata.common(
+ load(
+ genScriptHtmlStr(`
+ var item_show_type = "998877665544332211";
+ var real_item_show_type = "112233445566778899";
+ var ct = "1713009660";
+ `)
+ )
+ )
+ ).toMatchObject({
+ showType: '998877665544332211',
+ realShowType: '112233445566778899',
+ createTime: '1713009660',
+ });
+ });
+ it('ExtractMetadata.img', () => {
+ expect(ExtractMetadata.img(load(''))).toStrictEqual({});
+
+ expect(
+ ExtractMetadata.img(
+ load(
+ genScriptHtmlStr(`
+ window.picture_page_info_list = [
+ {
+ cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg',
+ },
+ {
+ cdn_url: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg',
+ },
+ ].slice(0, 20);
+ `)
+ )
+ )
+ ).toMatchObject({
+ imgUrls: ['https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg', 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg'],
+ });
+ });
+ it('ExtractMetadata.audio', () => {
+ expect(ExtractMetadata.audio(load(''))).toStrictEqual({});
+
+ expect(
+ ExtractMetadata.audio(
+ load(
+ genScriptHtmlStr(`
+ reportOpt = {
+ voiceid: "",
+ uin: "",
+ biz: "",
+ mid: "",
+ idx: ""
+ };
+ `)
+ )
+ )
+ ).toMatchObject({});
+
+ expect(
+ ExtractMetadata.audio(
+ load(
+ genScriptHtmlStr(`
+ window.cgiData = {
+ voiceid: "rsshub_test_voiceid_1",
+ duration: "6567" * 1,
+ };
+ `)
+ )
+ )
+ ).toMatchObject({
+ voiceId: 'rsshub_test_voiceid_1',
+ duration: '6567',
+ });
+
+ expect(
+ ExtractMetadata.audio(
+ load(
+ genScriptHtmlStr(`
+ window.cgiData = {
+ voiceid: "rsshub_test_voiceid_1",
+ };
+ `)
+ )
+ )
+ ).toMatchObject({
+ voiceId: 'rsshub_test_voiceid_1',
+ duration: null,
+ });
+
+ expect(
+ ExtractMetadata.audio(
+ load(
+ genScriptHtmlStr(`
+ reportOpt = {
+ voiceid: "",
+ uin: "",
+ biz: "",
+ mid: "",
+ idx: ""
+ };
+ window.cgiData = {
+ voiceid: "rsshub_test_voiceid_1",
+ duration: "6567" * 1,
+ };
+ `)
+ )
+ )
+ ).toMatchObject({
+ voiceId: 'rsshub_test_voiceid_1',
+ duration: '6567',
+ });
+ });
+ it('ExtractMetadata.location', () => {
+ expect(ExtractMetadata.location(load(''))).toStrictEqual({});
+
+ expect(
+ ExtractMetadata.location(
+ load(
+ genScriptHtmlStr(`
+ window.ip_wording = {
+ countryName: '中国',
+ countryId: '156',
+ provinceName: '广东',
+ provinceId: '',
+ cityName: '',
+ cityId: ''
+ };
+ `)
+ )
+ )
+ ).toMatchObject({
+ countryName: '中国',
+ provinceName: '广东',
+ cityName: '',
+ });
+ });
it('fixArticleContent', () => {
const divHeader = '';
const divFooter = '
';
@@ -88,37 +328,105 @@ describe('wechat-mp', () => {
expect(normalizeUrl(notWechatMp, true)).toBe(notWechatMp);
});
- it('fetchArticle_&_finishArticleItem', async () => {
- const ct = 1_636_626_300;
- const httpsUrl = 'https://mp.weixin.qq.com/rsshub_test/wechatMp_fetchArticle';
- const httpUrl = httpsUrl.replace(/^https:\/\//, 'http://');
-
- const expectedItem: {
- title: string;
- summary: string;
- author: string;
- description: string;
- mpName?: string;
- link: string;
- } = {
+ it('fetchArticle_&_finishArticleItem_appMsg', async () => {
+ const fetchArticleItem = await testFetchArticleFinishArticleItem('/appMsg');
+ const $ = load(fetchArticleItem.description);
+ expect($('iframe').attr()).toMatchObject({
+ src:
+ 'https://v.qq.com/txp/iframe/player.html?origin=https%3A%2F%2Fmp.weixin.qq.com' +
+ '&containerId=js_tx_video_container_0.3863487104715233&vid=fake&width=677&height=380.8125' +
+ '&autoplay=false&allowFullScreen=true&chid=17&full=true&show1080p=false&isDebugIframe=false',
+ width: '677',
+ height: '380.8125',
+ });
+ expect($('audio').attr()).toMatchObject({
+ src: 'https://res.wx.qq.com/voice/getvoice?mediaid=rsshub_test',
+ title: 'title',
+ });
+ expect($('a').attr()).toMatchObject({
+ href: 'https://mp.weixin.qq.com/rsshub_test/fake',
+ });
+ expect(fetchArticleItem.description).toContain('description');
+ expect(fetchArticleItem.description).toContain('📍发表于:中国 福建');
+ expect(fetchArticleItem.description).toContain('🔗️ 阅读原文');
+ });
+
+ it('fetchArticle_&_finishArticleItem_img', async () => {
+ const fetchArticleItem = await testFetchArticleFinishArticleItem('/img');
+ const $ = load(fetchArticleItem.description);
+ expect($.text()).toBe('summary');
+ expect($('img:nth-of-type(1)').attr()).toMatchObject({
+ src: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_1/0?wx_fmt=jpeg',
+ });
+ expect($('img:nth-of-type(2)').attr()).toMatchObject({
+ src: 'https://mmbiz.qpic.cn/rsshub_test/fake_img_2/0?wx_fmt=jpeg',
+ });
+ });
+
+ it('fetchArticle_&_finishArticleItem_audio', async () => {
+ const fetchArticleItem = await testFetchArticleFinishArticleItem('/audio');
+ const $ = load(fetchArticleItem.description);
+ expect($.text()).toBe('summary');
+ expect($('audio').attr()).toMatchObject({
+ controls: '',
+ src: 'https://res.wx.qq.com/voice/getvoice?mediaid=rsshub_test_voiceid_1',
+ style: 'width:100%',
title: 'title',
- summary: 'summary',
- author: 'author',
- description: 'description',
- mpName: 'mpName',
- link: httpsUrl,
+ });
+ expect(fetchArticleItem).toMatchObject({
+ enclosure_type: 'audio/mp3',
+ enclosure_url: 'https://res.wx.qq.com/voice/getvoice?mediaid=rsshub_test_voiceid_1',
+ itunes_duration: '6567',
+ });
+ });
+
+ it('fetchArticle_&_finishArticleItem_video', async () => {
+ const fetchArticleItem = await testFetchArticleFinishArticleItem('/video');
+ const $ = load(fetchArticleItem.description);
+ expect($.text()).toBe('summary');
+ expect($('img').attr()).toMatchObject({
+ src: 'https://mmbiz.qpic.cn/rsshub_test/og_img_1/0?wx_fmt=jpeg',
+ });
+ });
+
+ it('fetchArticle_&_finishArticleItem_fallback', async () => {
+ const fetchArticleItem = await testFetchArticleFinishArticleItem('/fallback');
+ const $ = load(fetchArticleItem.description);
+ expect($.text()).toBe('summary');
+ expect($('img').attr()).toMatchObject({
+ src: 'https://mmbiz.qpic.cn/rsshub_test/og_img_1/0?wx_fmt=jpeg',
+ });
+ });
+
+ it('finishArticleItem_param', async () => {
+ await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: false, skipLink: false });
+ await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: true, skipLink: false });
+ await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: false, skipLink: true });
+ await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: true, skipLink: true });
+ });
+
+ it('route_test', async () => {
+ try {
+ await app.request('/test/wechat-mp');
+ } catch (error) {
+ expect(error).toBeInstanceOf(InvalidParameterError);
+ }
+
+ const responseShort = await app.request('/test/wechat-mp/rsshub_test');
+ const parsedShort = await parser.parseString(await responseShort.text());
+ const expectedItemShort = {
+ author: expectedItem.author,
+ title: expectedItem.title,
+ link: 'https://mp.weixin.qq.com/s/rsshub_test',
+ };
+ expect(parsedShort.items[0]).toMatchObject(expectedItemShort);
+
+ const responseLong = await app.request('/test/wechat-mp/__biz=rsshub_test&mid=1&idx=1&sn=1');
+ const parsedLong = await parser.parseString(await responseLong.text());
+ const expectedItemLong = {
+ ...expectedItemShort,
+ link: 'https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1',
};
- const expectedDate = new Date(ct * 1000);
-
- const fetchArticleItem = await fetchArticle(httpUrl);
- expect(compareDate(fetchArticleItem.pubDate, expectedDate)).toBe(true);
- delete fetchArticleItem.pubDate;
- expect(fetchArticleItem).toEqual(expectedItem);
-
- delete expectedItem.mpName;
- const finishedArticleItem = await finishArticleItem({ link: httpUrl });
- expect(compareDate(finishedArticleItem.pubDate, expectedDate)).toBe(true);
- delete finishedArticleItem.pubDate;
- expect(finishedArticleItem).toEqual(expectedItem);
+ expect(parsedLong.items[0]).toMatchObject(expectedItemLong);
});
});
diff --git a/lib/utils/wechat-mp.ts b/lib/utils/wechat-mp.ts
index 0615ab1c944950..268b41578697b7 100644
--- a/lib/utils/wechat-mp.ts
+++ b/lib/utils/wechat-mp.ts
@@ -26,9 +26,215 @@
*/
import ofetch from '@/utils/ofetch';
-import { load, type Cheerio, type Element } from 'cheerio';
+import { type Cheerio, type CheerioAPI, type Element, load } from 'cheerio';
import { parseDate } from '@/utils/parse-date';
import cache from '@/utils/cache';
+import logger from '@/utils/logger';
+
+const MAINTAINERS = ['Rongronggg9'];
+
+const warn = (reason: string, details: string) =>
+ logger.warn(`wechat-mp: ${reason}: ${details},
+consider raise an issue (mentioning ${MAINTAINERS.join(', ')}) with the article URL for further investigation`);
+
+const replaceReturnNewline = (() => {
+ const returnRegExp = /\r|\\(r|x0d)/g;
+ const newlineRegExp = /\n|\\(n|x0a)/g;
+ return (text: string, replaceReturnWith = '', replaceNewlineWith = ' ') => text.replaceAll(returnRegExp, replaceReturnWith).replaceAll(newlineRegExp, replaceNewlineWith);
+})();
+const fixUrl = (() => {
+ const ampRegExp = /(&|\\x26)amp;/g;
+ return (text: string) => text.replaceAll(ampRegExp, '&');
+})();
+
+class LoopContinue extends Error {
+ constructor() {
+ super('');
+ this.name = 'LoopContinue';
+ }
+}
+
+class LoopReturn extends Error {
+ to_return: any;
+
+ constructor(to_return: any) {
+ super('');
+ this.name = 'LoopReturn';
+ this.to_return = to_return;
+ }
+}
+
+const forEachScript = ($: CheerioAPI | string, callback: (script) => void, defaultReturn: any = null, selector = 'script[nonce][type="text/javascript"]') => {
+ const scripts = typeof $ === 'string' ? [$] : $(selector).toArray();
+ for (const script of scripts) {
+ try {
+ callback(script);
+ } catch (error) {
+ if (error instanceof LoopReturn) {
+ return error.to_return;
+ } else if (error instanceof LoopContinue) {
+ continue;
+ }
+ throw error;
+ }
+ }
+ return defaultReturn;
+};
+
+// view-source a *_SHARE_PAGE type article and search for `ITEM_SHOW_TYPE_MAP`
+// Please update the comments below if you find new types or new examples
+const showTypeMap = {
+ // "Article".
+ // May be combined with media, but type won't change
+ // Combined with audio and iframe: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
+ APP_MSG_PAGE: '0',
+ // https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532942&idx=1&sn=a84e4adbe49fdb39e4d4c1b5c12a4c3f
+ VIDEO_SHARE_PAGE: '5',
+ MUSIC_SHARE_PAGE: '6',
+ // https://mp.weixin.qq.com/s/FY6yQC_e4NMAxK0FBr6jwQ
+ AUDIO_SHARE_PAGE: '7',
+ // https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
+ // https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532936&idx=4&sn=624054c20ded6ee85c6632f419c6f758
+ IMG_SHARE_PAGE: '8',
+ TEXT_SHARE_PAGE: '10',
+ SHORT_CONTENT_PAGE: '17',
+};
+const showTypeMapReverse = Object.fromEntries(Object.entries(showTypeMap).map(([k, v]) => [v, k]));
+
+class ExtractMetadata {
+ private static genAssignmentRegExp = (varName: string, valuePattern: string, assignPattern: string) => RegExp(`\\b${varName}\\s*${assignPattern}\\s*(?["'])(?${valuePattern})\\k`, 'mg');
+
+ private static genExtractFunc = (
+ varName: string,
+ {
+ valuePattern = '\\w+',
+ assignPattern = '=',
+ allowNotFound = false,
+ multiple = false,
+ }: {
+ valuePattern?: string;
+ assignPattern?: string;
+ allowNotFound?: boolean;
+ multiple?: boolean;
+ }
+ ) => {
+ const regExp = this.genAssignmentRegExp(varName, valuePattern, assignPattern);
+ return (str: string) => {
+ const values: string[] = [];
+ for (const match of str.matchAll(regExp)) {
+ const value = match.groups?.value;
+ if (!multiple) {
+ return value;
+ }
+ values.push(value);
+ }
+ if (!allowNotFound && values.length === 0) {
+ throw new LoopContinue();
+ }
+ return multiple ? values : null;
+ };
+ };
+
+ private static doExtract = (metadataToBeExtracted: Record string | string[] | null | undefined>, scriptText: string) => {
+ const metadataExtracted: Record = {};
+ for (const [key, extractFunc] of Object.entries(metadataToBeExtracted)) {
+ metadataExtracted[key] = extractFunc(scriptText);
+ }
+ metadataExtracted._extractedFrom = scriptText;
+ return metadataExtracted;
+ };
+
+ private static commonMetadataToBeExtracted = {
+ showType: this.genExtractFunc('item_show_type', { valuePattern: '\\d+' }),
+ realShowType: this.genExtractFunc('real_item_show_type', { valuePattern: '\\d+' }),
+ createTime: this.genExtractFunc('ct', { valuePattern: '\\d+' }),
+ sourceUrl: this.genExtractFunc('msg_source_url', { valuePattern: `https?://[^'"]*`, allowNotFound: true }),
+ };
+
+ static common = ($: CheerioAPI) =>
+ forEachScript(
+ $,
+ (script) => {
+ const scriptText = $(script).text();
+ const metadataExtracted = > this.doExtract(this.commonMetadataToBeExtracted, scriptText);
+ const showType = showTypeMapReverse[metadataExtracted.showType];
+ const realShowType = showTypeMapReverse[metadataExtracted.realShowType];
+ metadataExtracted.sourceUrl = metadataExtracted.sourceUrl && fixUrl(metadataExtracted.sourceUrl);
+ if (showType) {
+ metadataExtracted.showType = showType;
+ } else {
+ warn('showType not found', `item_show_type=${metadataExtracted.showType}`);
+ }
+ if (realShowType) {
+ metadataExtracted.realShowType = realShowType;
+ } else {
+ warn('realShowType not found', `real_item_show_type=${metadataExtracted.realShowType}`);
+ }
+ if (metadataExtracted.showType !== metadataExtracted.realShowType) {
+ // never seen this happen, waiting for examples
+ warn('showType mismatch', `item_show_type=${metadataExtracted.showType}, real_item_show_type=${metadataExtracted.realShowType}`);
+ }
+ throw new LoopReturn(metadataExtracted);
+ },
+ {},
+ 'script[nonce][type="text/javascript"]:contains("real_item_show_type")'
+ );
+
+ private static audioMetadataToBeExtracted = {
+ voiceId: this.genExtractFunc('voiceid', { assignPattern: ':' }),
+ duration: this.genExtractFunc('duration', { valuePattern: '\\d*', assignPattern: ':', allowNotFound: true }),
+ };
+
+ // never seen a audio article containing multiple audio, waiting for examples
+ static audio = ($: CheerioAPI) =>
+ forEachScript(
+ $,
+ (script) => {
+ const scriptText = $(script).text();
+ const metadataExtracted = > this.doExtract(this.audioMetadataToBeExtracted, scriptText);
+ throw new LoopReturn(metadataExtracted);
+ },
+ {},
+ 'script[nonce][type="text/javascript"]:contains("voiceid")'
+ );
+
+ private static imgMetadataToBeExtracted = {
+ imgUrls: this.genExtractFunc('cdn_url', { valuePattern: `https?://[^'"]*`, assignPattern: ':', multiple: true }),
+ };
+
+ static img = ($: CheerioAPI) =>
+ forEachScript(
+ $,
+ (script) => {
+ const scriptText = $(script).text();
+ const metadataExtracted = > this.doExtract(this.imgMetadataToBeExtracted, scriptText);
+ if (Array.isArray(metadataExtracted.imgUrls)) {
+ metadataExtracted.imgUrls = metadataExtracted.imgUrls.map((url) => fixUrl(url));
+ }
+ throw new LoopReturn(metadataExtracted);
+ },
+ {},
+ 'script[nonce][type="text/javascript"]:contains("picture_page_info_list")'
+ );
+
+ private static locationMetadataToBeExtracted = {
+ countryName: this.genExtractFunc('countryName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
+ provinceName: this.genExtractFunc('provinceName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
+ cityName: this.genExtractFunc('cityName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
+ };
+
+ static location = ($: CheerioAPI) =>
+ forEachScript(
+ $,
+ (script) => {
+ const scriptText = $(script).text();
+ const metadataExtracted = this.doExtract(this.locationMetadataToBeExtracted, scriptText);
+ throw new LoopReturn(metadataExtracted);
+ },
+ {},
+ 'script[nonce][type="text/javascript"]:contains("countryName")'
+ );
+}
const replaceTag = ($, oldTag, newTagName) => {
oldTag = $(oldTag);
@@ -55,15 +261,23 @@ const detectOriginalArticleUrl = ($) => {
return null;
};
-const detectSourceUrl = ($) => {
- const matchs = $.root()
- .html()
- .match(/msg_source_url = '(.+)';/);
-
- if (matchs) {
- return matchs[1];
- }
- return null;
+const genAudioSrc = (voiceId: string) => `https://res.wx.qq.com/voice/getvoice?mediaid=${voiceId}`;
+const genAudioTag = (src: string, title: string) => ` `;
+const genVideoSrc = (videoId: string) => {
+ const newSearchParams = new URLSearchParams({
+ origin: 'https://mp.weixin.qq.com',
+ containerId: 'js_tx_video_container_0.3863487104715233',
+ vid: videoId,
+ width: '677',
+ height: '380.8125',
+ autoplay: 'false',
+ allowFullScreen: 'true',
+ chid: '17',
+ full: 'true',
+ show1080p: 'false',
+ isDebugIframe: 'false',
+ });
+ return `https://v.qq.com/txp/iframe/player.html?${newSearchParams.toString()}`;
};
/**
@@ -99,6 +313,33 @@ const fixArticleContent = (html?: string | Cheerio, skipImg = false) =>
}
});
}
+ // fix audio: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
+ $('mpvoice[voice_encode_fileid]').each((_, voice) => {
+ const $voice = $(voice);
+ const voiceId = $voice.attr('voice_encode_fileid');
+ if (voiceId) {
+ const title = $voice.attr('name') || 'Audio';
+ $voice.replaceWith(genAudioTag(genAudioSrc(voiceId), title));
+ }
+ });
+ // fix iframe: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
+ $('iframe.video_iframe[data-src]').each((_, iframe) => {
+ const $iframe = $(iframe);
+ const dataSrc = $iframe.attr('data-src');
+ const srcUrlObj = new URL(dataSrc);
+ if (srcUrlObj.host === 'v.qq.com' && srcUrlObj.searchParams.has('vid')) {
+ const newSrc = genVideoSrc(srcUrlObj.searchParams.get('vid'));
+ $iframe.attr('src', newSrc);
+ $iframe.removeAttr('data-src');
+ const width = $iframe.attr('data-w');
+ const ratio = $iframe.attr('data-ratio');
+ if (width && ratio) {
+ const width_ = Math.min(Number.parseInt(width), 677);
+ $iframe.attr('width', width_.toString());
+ $iframe.attr('height', (width_ / Number.parseFloat(ratio)).toString());
+ }
+ } // else {} FIXME: https://mp.weixin.qq.com/s?__biz=Mzg5Mjk3MzE4OQ==&mid=2247549515&idx=2&sn=a608fca597f0589c1aebd6d0b82ff6e9
+ });
// fix section
$('section').each((_, section) => {
const $section = $(section);
@@ -122,17 +363,6 @@ const fixArticleContent = (html?: string | Cheerio, skipImg = false) =>
// clear line index tags in code section
$('.code-snippet__line-index').remove();
- // fix single picture article
- // example: https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
- $('script').each((_, script) => {
- const $script = $(script);
- const matchs = $script.html()?.match(/document\.getElementById\('js_image_desc'\)\.innerHTML = "(.*)"\.replace/);
-
- if (matchs) {
- $script.replaceWith(matchs[1].replaceAll('\r', '').replaceAll('\n', ' ').replaceAll('\\x0d', '').replaceAll('\\x0a', ' '));
- }
- });
-
// clean scripts
$('script').remove();
return $.html();
@@ -184,51 +414,124 @@ const normalizeUrl = (url, bypassHostCheck = false) => {
return urlObj.href;
};
+class PageParsers {
+ private static common = ($: CheerioAPI, commonMetadata: Record) => {
+ const title = replaceReturnNewline($('meta[property="og:title"]').attr('content') || '', '', ' ');
+ const author = replaceReturnNewline($('meta[name=author]').attr('content') || '', '', ' ');
+ const pubDate = commonMetadata.createTime ? parseDate(Number.parseInt(commonMetadata.createTime) * 1000) : undefined;
+ const mpName = $('.wx_follow_nickname').first().text()?.trim();
+
+ let summary = replaceReturnNewline($('meta[name=description]').attr('content') || '');
+ const description = summary;
+ summary = summary.replaceAll(' ', ' ') === title ? '' : summary;
+
+ return { title, author, description, summary, pubDate, mpName } as {
+ title: string;
+ author: string;
+ description: string;
+ summary: string;
+ pubDate?: Date;
+ mpName?: string;
+ enclosure_url?: string;
+ itunes_duration?: string | number;
+ enclosure_type?: string;
+ };
+ };
+ private static appMsg = async ($: CheerioAPI, commonMetadata: Record) => {
+ const page = PageParsers.common($, commonMetadata);
+ page.description = fixArticleContent($('#js_content'));
+ const originalArticleUrl = detectOriginalArticleUrl($);
+ if (originalArticleUrl) {
+ // No article or article is too short, try to fetch the description from the original article
+ const data = await ofetch(normalizeUrl(originalArticleUrl));
+ const original$ = load(data);
+ page.description += fixArticleContent(original$('#js_content'));
+ }
+ return page;
+ };
+ private static img = ($: CheerioAPI, commonMetadata: Record) => {
+ const page = PageParsers.common($, commonMetadata);
+ const imgUrls = ExtractMetadata.img($)?.imgUrls;
+ let imgHtml = '';
+ if (Array.isArray(imgUrls) && imgUrls.length > 0) {
+ for (const imgUrl of imgUrls) {
+ imgHtml += ` `;
+ }
+ }
+ page.description += imgHtml;
+ return page;
+ };
+ private static audio = ($: CheerioAPI, commonMetadata: Record) => {
+ const page = PageParsers.common($, commonMetadata);
+ const audioMetadata = ExtractMetadata.audio($);
+ const audioUrl = genAudioSrc(audioMetadata.voiceId);
+ page.enclosure_url = audioUrl;
+ page.itunes_duration = audioMetadata.duration;
+ page.enclosure_type = 'audio/mp3'; // FIXME: may it be other types?
+ page.description += ' ' + genAudioTag(audioUrl, page.title);
+ return page;
+ };
+ private static fallback = ($: CheerioAPI, commonMetadata: Record) => {
+ const page = PageParsers.common($, commonMetadata);
+ const image = $('meta[property="og:image"]').attr('content');
+ if (image) {
+ page.description += ` `;
+ }
+ return page;
+ };
+ static dispatch = async ($: CheerioAPI) => {
+ const commonMetadata = ExtractMetadata.common($);
+ let page: Record;
+ switch (commonMetadata.showType) {
+ case 'APP_MSG_PAGE':
+ page = await PageParsers.appMsg($, commonMetadata);
+ break;
+ case 'AUDIO_SHARE_PAGE':
+ page = PageParsers.audio($, commonMetadata);
+ break;
+ case 'IMG_SHARE_PAGE':
+ page = PageParsers.img($, commonMetadata);
+ break;
+ case 'VIDEO_SHARE_PAGE':
+ page = PageParsers.fallback($, commonMetadata);
+ break;
+ default:
+ warn('new showType, trying fallback method', `showType=${commonMetadata.showType}`);
+ page = PageParsers.fallback($, commonMetadata);
+ }
+ const locationMetadata = ExtractMetadata.location($);
+ let location = '';
+ for (const loc of [locationMetadata.countryName, locationMetadata.provinceName, locationMetadata.cityName]) {
+ if (loc) {
+ location += loc + ' ';
+ }
+ }
+ location = location.trim();
+ if (location) {
+ page.description += `📍发表于:${location}
`;
+ }
+ if (commonMetadata.sourceUrl) {
+ page.description += `🔗️ 阅读原文
`;
+ }
+ return page;
+ };
+}
+
/**
* Fetch article and its metadata from WeChat MP (mp.weixin.qq.com).
*
* If you use this function, no need to call `fixArticleContent`
- * @param {object} ctx - The context object.
- * @param {string} url - The url of the article.
- * @param {boolean} bypassHostCheck - Whether to bypass host check.
- * @return {Promise} - An object containing the article and its metadata.
+ * @param url - The url of the article.
+ * @param bypassHostCheck - Whether to bypass host check.
+ * @return - An object containing the article and its metadata.
*/
-const fetchArticle = (url, bypassHostCheck = false) => {
+const fetchArticle = (url: string, bypassHostCheck: boolean = false) => {
url = normalizeUrl(url, bypassHostCheck);
return cache.tryGet(url, async () => {
const data = await ofetch(url);
const $ = load(data);
-
- const title = ($('meta[property="og:title"]').attr('content') || '').replaceAll('\\r', '').replaceAll('\\n', ' ');
- const author = $('meta[name=author]').attr('content');
- let summary = $('meta[name=description]').attr('content');
- summary = summary === title ? '' : summary;
- let description = fixArticleContent($('#js_content'));
- // No article get or article is too short, try the original url
- const originalUrl = detectOriginalArticleUrl($);
- if (originalUrl) {
- // try to fetch the description from the original article
- const data = await ofetch(normalizeUrl(originalUrl, bypassHostCheck));
- const original$ = load(data);
- description += fixArticleContent(original$('#js_content'));
- }
-
- const sourceUrl = detectSourceUrl($);
- if (sourceUrl) {
- description += `阅读原文 `;
- }
-
- let pubDate;
- const publish_time_script = $('script[nonce][type="text/javascript"]:contains("var ct")').text();
- const publish_time_match = publish_time_script && publish_time_script.match(/var ct *= *"?(\d{10})"?/);
- const publish_timestamp = publish_time_match && publish_time_match[1];
- if (publish_timestamp) {
- pubDate = parseDate(Number.parseInt(publish_timestamp) * 1000);
- }
-
- let mpName = $('.profile_nickname').first().text();
- mpName = mpName && mpName.trim();
- return { title, author, description, summary, pubDate, mpName, link: url };
+ const page = await PageParsers.dispatch($);
+ return { ...page, link: url };
}) as Promise<{
title: string;
author: string;
@@ -237,6 +540,9 @@ const fetchArticle = (url, bypassHostCheck = false) => {
pubDate?: Date;
mpName?: string;
link: string;
+ enclosure_type?: string;
+ enclosure_url?: string;
+ itunes_duration?: string | number;
}>;
};
@@ -257,18 +563,23 @@ const fetchArticle = (url, bypassHostCheck = false) => {
* @return {Promise} - The incoming `item` object, with the article and its metadata filled in.
*/
const finishArticleItem = async (item, setMpNameAsAuthor = false, skipLink = false) => {
- const { title, author, description, summary, pubDate, mpName, link } = await fetchArticle(item.link);
- item.title = title || item.title;
- item.description = description || item.description;
- item.summary = summary || item.summary;
- item.pubDate = pubDate || item.pubDate;
- item.author = setMpNameAsAuthor
- ? mpName || item.author // the Official Account itself. if your route return articles from different accounts, you may want to use this
- : author || item.author; // the real author of the article. if your route return articles from a certain account, use this
- if (!skipLink) {
- item.link = link || item.link;
+ const fetchedItem = await fetchArticle(item.link);
+ for (const key in fetchedItem) {
+ switch (key) {
+ case 'author':
+ item.author = setMpNameAsAuthor
+ ? fetchedItem.mpName || item.author // the Official Account itself. if your route return articles from different accounts, you may want to use this
+ : fetchedItem.author || item.author; // the real author of the article. if your route return articles from a certain account, use this
+ break;
+ case 'link':
+ item.link = skipLink ? item.link : fetchedItem.link || item.link;
+ break;
+ default:
+ item[key] = item[key] || fetchedItem[key];
+ }
}
return item;
};
-export { fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };
+const exportedForTestingOnly = { ExtractMetadata, showTypeMapReverse };
+export { exportedForTestingOnly, fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };