diff --git a/__tests__/ExpensiMark-Markdown-Truncate-test.js b/__tests__/ExpensiMark-Markdown-Truncate-test.js new file mode 100644 index 00000000..37bbda4c --- /dev/null +++ b/__tests__/ExpensiMark-Markdown-Truncate-test.js @@ -0,0 +1,108 @@ +import ExpensiMark from '../lib/ExpensiMark'; + +const parser = new ExpensiMark(); + +describe('truncateHTML', () => { + test('should return original text as HTML if it does not exceed the limit', () => { + const markdown = 'This is a *short* text that does not exceed the character limit.'; + const limit = 100; + const html = parser.replace(markdown); + const result = parser.truncateHTML(html, limit, {ellipsis: true}); + expect(result).toBe(html); + }); + + test('should truncate HTML and add ellipsis if it exceeds the limit', () => { + const markdown = + 'This is a *long* text that exceeds the character limit. It contains multiple sentences to test the truncation functionality. The truncation should occur at the specified character count.'; + const limit = 80; + const html = parser.replace(markdown); + const result = parser.truncateHTML(html, limit, {ellipsis: true}); + expect(result).toBe('This is a long text that exceeds the character limit. It contains multiple sente...'); + }); + + test('should handle HTML with multiple markdown elements', () => { + const markdown = + 'This is a *long* text with _multiple_ markdown ~elements~. It includes *bold*, _italic_, and ~strikethrough~ formatting. The truncation should preserve the markdown syntax.'; + const limit = 150; + const html = parser.replace(markdown); + const result = parser.truncateHTML(html, limit, {ellipsis: true}); + expect(result).toBe( + 'This is a long text with multiple markdown elements. It includes bold, italic, and strikethrough formatting. The truncation should preserve the markdo...', + ); + }); + + test('should handle HTML with nested markdown elements', () => { + const markdown = 'This is a *long _nested_ markdown* text. It contains *_bold italic_* and ~_strikethrough italic_~ formatting. The truncation should handle the nesting correctly.'; + const limit = 120; + const html = parser.replace(markdown); + const result = parser.truncateHTML(html, limit, {ellipsis: true}); + expect(result).toBe( + 'This is a long nested markdown text. It contains bold italic and strikethrough italic formatting. The truncation should ...', + ); + }); + + test('should handle HTML with links', () => { + const markdown = + 'This is a text with [a link](https://example.com) that exceeds the limit. The link should be preserved in the truncated text. Additionally, it includes [another link](https://example.org) for testing purposes.'; + const limit = 141; + const html = parser.replace(markdown); + const result = parser.truncateHTML(html, limit, {ellipsis: true}); + expect(result).toBe( + `This is a text with a link that exceeds the limit. The link should be preserved in the truncated text. Additionally, it includes another link...`, + ); + }); + + test('should handle HTML with inline code', () => { + const markdown = 'This is a text with `inline code`. The inline code should be preserved in the truncated text.'; + const limit = 25; + const html = parser.replace(markdown); + const result = parser.truncateHTML(html, limit, {ellipsis: true}); + expect(result).toBe('This is a text with inlin...'); + }); + + test('should handle HTML with headings', () => { + const markdown = + '# Heading 1\n\nThis is a text with headings that exceeds the limit. The headings should be preserved in the truncated text. Here is an example of a heading:\n\n## Heading 2\n\nThe truncation should handle headings correctly.'; + const limit = 100; + const html = parser.replace(markdown); + const result = parser.truncateHTML(html, limit, {ellipsis: true}); + expect(result).toBe('

Heading 1


This is a text with headings that exceeds the limit. The headings should be preserved in th...'); + }); + + test('should handle HTML with lists', () => { + const markdown = ` + This is a text with lists. Here are examples: + + Unordered list: + - Item 1 + - Item 2 + - Item 3 + + Ordered list: + 1. First item + 2. Second item + 3. Third item + `; + const limit = 250; // Increased to accommodate full lists + const html = parser.replace(markdown); + const result = parser.truncateHTML(html, limit, {ellipsis: false}); + expect(result).toBe(`
This is a text with lists. Here are examples:

Unordered list:
- Item 1
- Item 2
- Item 3

Ordered list:
1. First item
2. Second item
3. Third item
`); + }); + + test('should handle HTML with horizontal rules', () => { + const markdown = 'This is a text with horizontal rules. Here is an example of a horizontal rule:\n\n---\n\nThe truncation should handle horizontal rules correctly.'; + const limit = 100; + const html = parser.replace(markdown); + const result = parser.truncateHTML(html, limit, {ellipsis: true}); + expect(result).toBe('This is a text with horizontal rules. Here is an example of a horizontal rule:

---

The truncation shou...'); + }); + + test('should handle HTML with images', () => { + const markdown = + 'This is a text with an image ![alt text](https://example.com/image.jpg) that exceeds the limit. The image should be preserved in the truncated text. Additionally, it includes another image ![alt text 2](https://example.org/image2.jpg) for testing purposes.'; + const limit = 80; + const html = parser.replace(markdown); + const result = parser.truncateHTML(html, limit, {ellipsis: true}); + expect(result).toBe('This is a text with an image alt text that exceeds the limit. The image should be preser...'); + }); +}); diff --git a/lib/ExpensiMark.ts b/lib/ExpensiMark.ts index 311974cb..dd04a638 100644 --- a/lib/ExpensiMark.ts +++ b/lib/ExpensiMark.ts @@ -42,6 +42,13 @@ type ReplaceOptions = { shouldKeepRawInput?: boolean; }; +type TruncateOptions = { + ellipsis?: string; + truncateLastWord?: boolean; + slop?: number; + removeImageTag?: boolean; +}; + const MARKDOWN_LINK_REGEX = new RegExp(`\\[([^\\][]*(?:\\[[^\\][]*][^\\][]*)*)]\\(${UrlPatterns.MARKDOWN_URL_REGEX}\\)(?![^<]*(<\\/pre>|<\\/code>))`, 'gi'); const MARKDOWN_IMAGE_REGEX = new RegExp(`\\!(?:\\[([^\\][]*(?:\\[[^\\][]*][^\\][]*)*)])?\\(${UrlPatterns.MARKDOWN_URL_REGEX}\\)(?![^<]*(<\\/pre>|<\\/code>))`, 'gi'); @@ -1233,6 +1240,198 @@ export default class ExpensiMark { return Utils.escape(originalContent); } + /** + * Determines the end position to truncate the HTML content while considering word boundaries. + * + * @param {string} content - The HTML content to be truncated. + * @param {number} tailPosition - The position up to which the content should be considered. + * @param {number} maxLength - The maximum length of the truncated content. + * @param {number} totalLength - The length of the content processed so far. + * @param {Object} opts - Options to customize the truncation. + * @returns {number} The calculated position to truncate the content. + */ + getEndPosition(content: string, tailPosition: number | undefined, maxLength: number, totalLength: number, opts: TruncateOptions) { + const WORD_BREAK_REGEX = /\W+/g; + + // Calculate the default position to truncate based on the maximum length and the length of the content processed so far + const defaultPosition = maxLength - totalLength; + + // Define the slop value, which determines the tolerance for cutting off content near the maximum length + const slop = opts.slop; + if (!slop) return defaultPosition; + + // Initialize the position to the default position + let position = defaultPosition; + + // Determine if the default position is considered "short" based on the slop value + const isShort = defaultPosition < slop; + + // Calculate the position within the slop range + const slopPos = isShort ? defaultPosition : slop - 1; + + // Extract the substring to analyze for word boundaries, considering the slop and tail position + const substr = content.slice(isShort ? 0 : defaultPosition - slop, tailPosition !== undefined ? tailPosition : defaultPosition + slop); + + // Find the first word boundary within the substring + const wordBreakMatch = WORD_BREAK_REGEX.exec(substr); + + // Adjust the position to avoid truncating in the middle of a word if the option is enabled + if (!opts.truncateLastWord) { + if (tailPosition && substr.length <= tailPosition) { + // If tail position is defined and the substring length is within the tail position, set position to the substring length + position = substr.length; + } else { + // Iterate through word boundary matches to adjust the position + while (wordBreakMatch !== null) { + if (wordBreakMatch.index < slopPos) { + // If the word boundary is before the slop position, adjust position backward + position = defaultPosition - (slopPos - wordBreakMatch.index); + if (wordBreakMatch.index === 0 && defaultPosition <= 1) { + break; + } + } else if (wordBreakMatch.index === slopPos) { + // If the word boundary is at the slop position, set position to the default position + position = defaultPosition; + break; + } else { + // If the word boundary is after the slop position, adjust position forward + position = defaultPosition + (wordBreakMatch.index - slopPos); + break; + } + } + } + // If the character at the determined position is a whitespace, adjust position backward + if (content.charAt(position - 1).match(/\s$/)) { + position--; + } + } + + // Return the calculated position to truncate the content + return position; + } + + /** + * Truncate HTML string and keep tag safe. + * pulled from https://github.com/huang47/nodejs-html-truncate/blob/master/lib/truncate.js + * + * @param {string} html - The string that needs to be truncated + * @param {number} maxLength - Length of truncated string + * @param {Object} [options] - Optional configuration options + * @returns {string} The truncated string + */ + truncateHTML(html: string, maxLength: number, options?: TruncateOptions) { + const EMPTY_STRING = ''; + const DEFAULT_TRUNCATE_SYMBOL = '...'; + const DEFAULT_SLOP = Math.min(10, maxLength); + const tagsStack = []; + const KEY_VALUE_REGEX = '((?:\\s+(?:\\w+|-)+(?:\\s*=\\s*(?:"(?:\\\\.|[^"\\\\])*"|\'(?:\\\\.|[^\'\\\\])*\'|[^\'">\\s]+))?)*)'; + const IS_CLOSE_REGEX = '\\s*\\/?\\s*'; + const CLOSE_REGEX = '\\s*\\/\\s*'; + const SELF_CLOSE_REGEX = new RegExp(`<\\/?(\\w+)${KEY_VALUE_REGEX}${CLOSE_REGEX}>`); + const HTML_TAG_REGEX = new RegExp(`<\\/?(\\w+)${KEY_VALUE_REGEX}${IS_CLOSE_REGEX}>`); + const URL_REGEX = /(((ftp|https?):\/\/)[\\-\w@:%_\\+.~#?,&\\/\\/=]+)|((mailto:)?[_.\w\\-]+@([\w][\w\\-]+\.)+[a-zA-Z]{2,3})/g; + const IMAGE_TAG_REGEX = new RegExp(``); + let truncatedContent = EMPTY_STRING; + let totalLength = 0; + let matches = HTML_TAG_REGEX.exec(html); + let endResult; + let index; + let tag; + let selfClose = null; + let htmlString = html; + + const opts = { + ellipsis: DEFAULT_TRUNCATE_SYMBOL, + truncateLastWord: true, + slop: DEFAULT_SLOP, + ...options, + }; + + function removeImageTag(content: string): string { + const match = IMAGE_TAG_REGEX.exec(content); + if (!match) { + return content; + } + + const matchIndex = match.index; + const matchLength = match[0].length; + + return content.substring(0, matchIndex) + content.substring(matchIndex + matchLength); + } + + function closeTags(tags: string[]): string { + return tags + .reverse() + .map((mappedTag) => { + return ``; + }) + .join(''); + } + + while (matches) { + matches = HTML_TAG_REGEX.exec(htmlString); + + if (!matches) { + if (totalLength >= maxLength) { + break; + } + + matches = URL_REGEX.exec(htmlString); + if (!matches || matches.index >= maxLength) { + truncatedContent += htmlString.substring(0, this.getEndPosition(htmlString, undefined, maxLength, totalLength, opts)); + break; + } + + while (matches) { + endResult = matches[0]; + if (endResult !== null) { + index = matches.index; + truncatedContent += htmlString.substring(0, index + endResult.length - totalLength); + htmlString = htmlString.substring(index + endResult.length); + matches = URL_REGEX.exec(htmlString); + } + } + break; + } + + endResult = matches[0]; + index = matches.index; + + if (totalLength + index > maxLength) { + truncatedContent += htmlString.substring(0, this.getEndPosition(htmlString, index, maxLength, totalLength, opts)); + break; + } else { + totalLength += index; + truncatedContent += htmlString.substring(0, index); + } + + if (endResult[1] === '/') { + tagsStack.pop(); + selfClose = null; + } else { + selfClose = SELF_CLOSE_REGEX.exec(endResult); + if (!selfClose) { + tag = matches[1]; + tagsStack.push(tag); + } + } + + truncatedContent += selfClose ? selfClose[0] : endResult; + htmlString = htmlString.substring(index + endResult.length); // Update htmlString + } + + if (htmlString.length > maxLength - totalLength && opts.ellipsis) { + truncatedContent += opts.ellipsis ? '...' : ''; + } + truncatedContent += closeTags(tagsStack); + + if (opts.removeImageTag) { + truncatedContent = removeImageTag(truncatedContent); + } + + return truncatedContent; + } + /** * Replaces text with a replacement based on a regex * @param text - The text to replace