From ac0f0c4e64cf1a96ff5feac568cd38039ac1c3ec Mon Sep 17 00:00:00 2001 From: JiuqingSong Date: Tue, 19 May 2026 13:24:58 -0700 Subject: [PATCH] Filter out invisible unicode characters --- .../lib/modelApi/creators/createText.ts | 17 ++++- .../test/endToEndTest.ts | 34 ++++++++++ .../test/modelApi/creators/creatorsTest.ts | 68 +++++++++++++++++++ 3 files changed, 118 insertions(+), 1 deletion(-) diff --git a/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts b/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts index c837d11f5c96..4b8a3e48ae95 100644 --- a/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts +++ b/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts @@ -19,9 +19,10 @@ export function createText( link?: ReadonlyContentModelLink, code?: ReadonlyContentModelCode ): ContentModelText { + const filterText = stripInvisibleUnicode(text); const result: ContentModelText = { segmentType: 'Text', - text: text, + text: filterText, format: { ...format }, }; @@ -35,3 +36,17 @@ export function createText( return result; } + +// According to https://embracethered.com/blog/posts/2024/hiding-and-finding-text-with-unicode-tags/ +// invisible Unicode tag characters (U+E0000 to U+E007F) can be used to hide text in HTML. The regex below strips the whole range U+E0000 to U+EFFFF. NOTE(review): that range also covers the Variation Selectors Supplement (U+E0100 to U+E01EF) - confirm stripping those is intended. +// We strip these characters in createText whenever a text segment is created, otherwise they would be treated as normal text and cause unexpected behavior. 
+const INVISIBLE_UNICODE_REGEX = /[\u{E0000}-\u{EFFFF}]/gu; + +/** + * Strip invisible unicode characters from the given string + * @param value The string to be processed + * @returns The string with invisible unicode characters removed + */ +function stripInvisibleUnicode(value: string): string { + return value.replace(INVISIBLE_UNICODE_REGEX, ''); +} diff --git a/packages/roosterjs-content-model-dom/test/endToEndTest.ts b/packages/roosterjs-content-model-dom/test/endToEndTest.ts index 8d55177d6929..ff6bcc5af49a 100644 --- a/packages/roosterjs-content-model-dom/test/endToEndTest.ts +++ b/packages/roosterjs-content-model-dom/test/endToEndTest.ts @@ -3028,6 +3028,40 @@ describe('End to end test for DOM => Model => DOM/TEXT', () => { ); }); + it('Text with invisible unicode tag characters is stripped, meaningful invisible chars preserved', () => { + // Source HTML contains U+E0041 / U+E0042 (unicode tag range — must be stripped) + // mixed with U+200B (ZWSP), U+200D (ZWJ), U+202E (RLO), U+202C (PDF) + // which must be preserved. + runTest( + '

a\u{E0041}b\u{200B}c\u{E0042}d\u{202E}evil\u{202C}e

', + { + blockGroupType: 'Document', + blocks: [ + { + blockType: 'Paragraph', + segments: [ + { + segmentType: 'Text', + text: 'ab\u{200B}cd\u{202E}evil\u{202C}e', + format: {}, + }, + ], + format: { + marginTop: '1em', + marginBottom: '1em', + }, + decorator: { + tagName: 'p', + format: {}, + }, + }, + ], + }, + 'ab\u{200B}cd\u{202E}evil\u{202C}e', + '

ab\u{200B}cd\u{202E}evil\u{202C}e

' + ); + }); + it('LI without UL followed by other blocks', () => { runTest( '
  • test
  • other
    ', diff --git a/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts b/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts index 2429877479b3..e40927c6f377 100644 --- a/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts +++ b/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts @@ -233,6 +233,74 @@ describe('Creators', () => { }); }); + it('createText with invisible unicode characters', () => { + const text = 'a\u{E0041}b\u{E0042}c'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: 'abc', + }); + }); + + it('createText with only invisible unicode characters', () => { + const text = '\u{E0000}\u{E007F}\u{EFFFF}'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: '', + }); + }); + + it('createText with invisible unicode at boundary range', () => { + const text = '\u{DFFFF}start\u{E0000}mid\u{EFFFF}end\u{F0000}'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: '\u{DFFFF}startmidend\u{F0000}', + }); + }); + + it('createText preserves meaningful invisible characters outside the tag range', () => { + // ​ = Zero-Width Space, ‍ = Zero-Width Joiner, + // ‮ = Right-to-Left Override, ‬ = Pop Directional Formatting + const text = 'a​b‍c‮d‬e'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: 'a​b‍c‮d‬e', + }); + }); + + it('createText strips only tag-range chars, keeps meaningful invisible chars', () => { + const text = 'a​\u{E0041}b‮\u{E0042}c'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: 'a​b‮c', + }); + }); + + it('createText does not strip visible characters', () => { + const text = 'hello world 你好   ​'; + const result = createText(text); + + expect(result).toEqual({ + 
segmentType: 'Text', + format: {}, + text: 'hello world 你好   ​', + }); + }); + it('createTableRow', () => { const row = createTableRow();