feature: support emojis and symbols (#9)

lukePeavey · Nov 22, 2020 · f6f9984 · f6f9984
1 parent ede1a9a
commit f6f9984
Show file tree

Hide file tree

Showing 3 changed files with 175 additions and 18 deletions.
diff --git a/__tests__/split.test.js b/__tests__/split.test.js
@@ -104,7 +104,7 @@ describe('split(element, options)', () => {
     expect(result.lines[0].textContent).toEqual(textContent)
   })
 
-  test(`Splits text into lines and words`, () => {
+  it(`Splits text into lines and words`, () => {
     const settings = { ...defaults, types: 'lines, words' }
     const elem = createElement('div', { textContent })
     const result = split(elem, settings)
@@ -126,7 +126,7 @@ describe('split(element, options)', () => {
     expect(result.lines[0].textContent).toEqual(textContent)
   })
 
-  test(`Splits text into lines and characters`, () => {
+  it(`Splits text into lines and characters`, () => {
     const settings = { ...defaults, types: 'lines, chars' }
     const elem = createElement('div', { textContent })
     const result = split(elem, settings)
@@ -148,7 +148,7 @@ describe('split(element, options)', () => {
     expect(result.lines[0].textContent).toEqual(textContent)
   })
 
-  test(`Splits text into lines, words and characters`, () => {
+  it(`Splits text into lines, words and characters`, () => {
     const settings = defaults
     const elem = createElement('div', { textContent })
     const result = split(elem, settings)
@@ -176,4 +176,10 @@ describe('split(element, options)', () => {
     expect(result.lines[0].classList.contains(settings.lineClass)).toBe(true)
     expect(result.lines[0].textContent).toEqual(textContent)
   })
+
+  it(`Splits text containing unicode characters`, () => {
+    const elem = createElement('div', { textContent: '📌foo' })
+    const result = split(elem)
+    expect(result.chars[0].textContent).toEqual('📌')
+  })
 })
diff --git a/__tests__/utils/toChars.test.js b/__tests__/utils/toChars.test.js
@@ -1,11 +1,32 @@
 import toChars from '../../lib/utils/toChars'
 
 describe('utils.toChars(value)', () => {
-  it(`Splits a string into characters`, () => {
+  it(`Splits a string into an array of characters`, () => {
     expect(toChars('foo')).toEqual(['f', 'o', 'o'])
     expect(toChars('foo bar')).toEqual(['f', 'o', 'o', ' ', 'b', 'a', 'r'])
     expect(toChars('f-o-o', /-/)).toEqual(['f', 'o', 'o'])
     expect(toChars('f-o-o', '-')).toEqual(['f', 'o', 'o'])
     expect(toChars()).toEqual([])
   })
-})
+
+  it(`Splits a string containing emojis`, () => {
+    expect(toChars('👋🏽😀✂️')).toEqual(['👋🏽', '😀', '✂️'])
+    expect(toChars('foo😀')).toEqual(['f', 'o', 'o', '😀'])
+  })
+
+  it(`Splits a string containing non-english characters`, () => {
+    expect(toChars('ふりが')).toEqual(['ふ', 'り', 'が'])
+  })
+})
+
+describe('utils.toChars(value, separator)', () => {
+  it(`Splits a string using a custom string separator`, () => {
+    expect(toChars('f-o-o', '-')).toEqual(['f', 'o', 'o'])
+    expect(toChars('👋🏽-😀-✂️', '-')).toEqual(['👋🏽', '😀', '✂️'])
+  })
+
+  it(`Splits a string using a RegExp separator pattern`, () => {
+    expect(toChars('f-o_o', /-|_/)).toEqual(['f', 'o', 'o'])
+    expect(toChars('👋🏽-😀_✂️', /-|_/)).toEqual(['👋🏽', '😀', '✂️'])
+  })
+})
diff --git a/lib/utils/toChars.js b/lib/utils/toChars.js
@@ -1,26 +1,156 @@
 /**
- * Splits a string into an array of characters
+ * Based on lodash#split <https://lodash.com/license>
+ * Copyright jQuery Foundation and other contributors <https://jquery.org/>
+ * Copyright Jeremy Ashkenas, DocumentCloud and Investigative Reporters &
+ * Editors
+ */
+
+/* eslint-disable prefer-template */
+/* eslint-disable no-misleading-character-class */
+import isString from './isString'
+
+const rsAstralRange = '\\ud800-\\udfff'
+const rsComboMarksRange = '\\u0300-\\u036f\\ufe20-\\ufe23'
+const rsComboSymbolsRange = '\\u20d0-\\u20f0'
+const rsVarRange = '\\ufe0e\\ufe0f'
+
+/** Used to compose unicode capture groups. */
+const rsAstral = `[${rsAstralRange}]`
+const rsCombo = `[${rsComboMarksRange}${rsComboSymbolsRange}]`
+const rsFitz = '\\ud83c[\\udffb-\\udfff]'
+const rsModifier = `(?:${rsCombo}|${rsFitz})`
+const rsNonAstral = `[^${rsAstralRange}]`
+const rsRegional = '(?:\\ud83c[\\udde6-\\uddff]){2}'
+const rsSurrPair = '[\\ud800-\\udbff][\\udc00-\\udfff]'
+const rsZWJ = '\\u200d'
+
+/** Used to compose unicode regexes. */
+const reOptMod = `${rsModifier}?`
+const rsOptVar = `[${rsVarRange}]?`
+const rsOptJoin =
+  '(?:' +
+  rsZWJ +
+  '(?:' +
+  [rsNonAstral, rsRegional, rsSurrPair].join('|') +
+  ')' +
+  rsOptVar +
+  reOptMod +
+  ')*'
+const rsSeq = rsOptVar + reOptMod + rsOptJoin
+// Keep `)` on the interpolation line: a line break inside the template literal would become a literal "\n" in the pattern.
+const rsSymbol = `(?:${[
+  `${rsNonAstral}${rsCombo}?`,
+  rsCombo,
+  rsRegional,
+  rsSurrPair,
+  rsAstral,
+].join('|')})`
+
+/** Used to match [string symbols](https://mathiasbynens.be/notes/javascript-unicode). */
+const reUnicode = RegExp(`${rsFitz}(?=${rsFitz})|${rsSymbol}${rsSeq}`, 'g')
+
+/** Used to detect strings with [zero-width joiners or code points from the astral planes](http://eev.ee/blog/2015/09/12/dark-corners-of-unicode/). */
+const unicodeRange = [
+  rsZWJ,
+  rsAstralRange,
+  rsComboMarksRange,
+  rsComboSymbolsRange,
+  rsVarRange,
+]
+const reHasUnicode = RegExp(`[${unicodeRange.join('')}]`)
+
+/**
+ * Converts an ASCII `string` to an array.
+ *
+ * @private
+ * @param {string} string The string to convert.
+ * @returns {Array} Returns the converted array.
+ */
+function asciiToArray(string) {
+  return string.split('')
+}
+
+/**
+ * Checks if `string` contains Unicode symbols.
+ *
+ * @private
+ * @param {string} string The string to inspect.
+ * @returns {boolean} Returns `true` if a symbol is found, else `false`.
+ */
+function hasUnicode(string) {
+  return reHasUnicode.test(string)
+}
+
+/**
+ * Converts a Unicode `string` to an array.
  *
- * TODO:
- * Add support strings that contain unicode characters (ie Emojis and symbols)
+ * @private
+ * @param {string} string The string to convert.
+ * @returns {Array} Returns the converted array.
+ */
+function unicodeToArray(string) {
+  return string.match(reUnicode) || []
+}
+
+/**
+ * Converts `string` to an array.
  *
- * @param {string} string the string to split
- * @param {string|RegExp} [separator = ''] Pattern used to separate characters
- * @return {string[]} the array of chars
+ * @private
+ * @param {string} string The string to convert.
+ * @returns {Array} Returns the converted array.
+ */
+export function stringToArray(string) {
+  return hasUnicode(string) ? unicodeToArray(string) : asciiToArray(string)
+}
+
+/**
+ * Converts `value` to a string. An empty string is returned for `null`
+ * and `undefined` values.
+ *
+ * @param {*} value The value to process.
+ * @returns {string} Returns the string.
  * @example
- * toChars('foo')
+ *
+ * toString(null);
+ * // => ''
+ *
+ * toString([1, 2, 3]);
+ * // => '1,2,3'
+ */
+function toString(value) {
+  return value == null ? '' : String(value)
+}
+
+/**
+ * Splits `string` into an array of characters. If `separator` is omitted,
+ * it behaves like string.split('').
+ *
+ * Unlike native string.split(''), it can split strings that contain unicode
+ * characters like emojis and symbols.
+ *
+ * @param {string} [string=''] The string to split.
+ * @param {RegExp|string} [separator=''] The separator pattern to split by.
+ * @returns {Array} Returns the string segments.
+ * @example
+ * toChars('foo');
  * // => ['f', 'o', 'o']
  *
- * toChars('foo bar')
+ * toChars('foo bar');
  * // => ["f", "o", "o", " ", "b", "a", "r"]
  *
- * toChars('f-o-o', /-/)
- * // => ['f', 'o', 'o']
+ * toChars('f😀o');
+ * // => ['f', '😀', 'o']
+ *
+ * toChars('f-😀-o', /-/);
+ * // => ['f', '😀', 'o']
  *
- * toChars()
- * // => []
  */
 export default function toChars(string, separator = '') {
-  string = string == null ? '' : String(string)
+  string = toString(string)
+  if (string && isString(string)) {
+    if (!separator && hasUnicode(string)) {
+      return stringToArray(string)
+    }
+  }
   return string.split(separator)
 }