Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ export function createText(
link?: ReadonlyContentModelLink,
code?: ReadonlyContentModelCode
): ContentModelText {
const filterText = stripInvisibleUnicode(text);
const result: ContentModelText = {
segmentType: 'Text',
text: text,
text: filterText,
format: { ...format },
};

Expand All @@ -35,3 +36,17 @@ export function createText(

return result;
}

// According to https://embracethered.com/blog/posts/2024/hiding-and-finding-text-with-unicode-tags/
// invisible Unicode characters in the range U+E0000 to U+EFFFF (which includes the Unicode tag characters) can be used to hide text in HTML.
// We strip them out whenever a text segment is created; otherwise they would be treated as normal text and cause unexpected behavior.
const INVISIBLE_UNICODE_REGEX = /[\u{E0000}-\u{EFFFF}]/gu;

/**
* Strip invisible unicode characters from the given string
* @param value The string to be processed
* @returns The string with invisible unicode characters removed
*/
function stripInvisibleUnicode(value: string): string {
return value.replace(INVISIBLE_UNICODE_REGEX, '');
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I remember this was originally limited to content that was inserted as the initial content of the editor, and to links. Do we have any perf concerns with applying the regex to all created text on every call?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a valid concern.

I investigated the original security issue and realized that the attack can come from any kind of source, as long as the content is put into the editor. So both manual operations (new editor, paste) and 3rd-party code (calling formatContentModel()) can trigger it. Of course, the manual operations are easier to perform.

What do you think? Should we limit the check to manual operation only? I'm open to any suggestion.

@romanisa fyi.

}
34 changes: 34 additions & 0 deletions packages/roosterjs-content-model-dom/test/endToEndTest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3028,6 +3028,40 @@ describe('End to end test for DOM => Model => DOM/TEXT', () => {
);
});

it('Text with invisible unicode tag characters is stripped, meaningful invisible chars preserved', () => {
    // The input HTML mixes U+E0041 / U+E0042 (unicode tag range, expected to be stripped)
    // with U+200B (ZWSP), U+202E (RLO) and U+202C (PDF), which are expected to survive
    // in every expected value handed to runTest.
    const inputHtml = '<p>a\u{E0041}b\u{200B}c\u{E0042}d\u{202E}evil\u{202C}e</p>';
    const survivingText = 'ab\u{200B}cd\u{202E}evil\u{202C}e';

    runTest(
        inputHtml,
        {
            blockGroupType: 'Document',
            blocks: [
                {
                    blockType: 'Paragraph',
                    segments: [{ segmentType: 'Text', text: survivingText, format: {} }],
                    format: { marginTop: '1em', marginBottom: '1em' },
                    decorator: { tagName: 'p', format: {} },
                },
            ],
        },
        survivingText,
        `<p>${survivingText}</p>`
    );
});

it('LI without UL followed by other blocks', () => {
runTest(
'<li>test</li><div>other</div>',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,74 @@ describe('Creators', () => {
});
});

it('createText with invisible unicode characters', () => {
    // U+E0041 and U+E0042 sit between ordinary letters; only the letters are expected to remain.
    const segment = createText('a\u{E0041}b\u{E0042}c');

    expect(segment).toEqual({
        segmentType: 'Text',
        text: 'abc',
        format: {},
    });
});

it('createText with only invisible unicode characters', () => {
    // Every code point of the input lies inside U+E0000..U+EFFFF, so the expected text is empty.
    const segment = createText('\u{E0000}\u{E007F}\u{EFFFF}');

    expect(segment).toEqual({ segmentType: 'Text', text: '', format: {} });
});

it('createText with invisible unicode at boundary range', () => {
    // U+DFFFF and U+F0000 are the code points immediately outside the stripped range
    // and are expected to be kept; the endpoints U+E0000 and U+EFFFF are expected to go.
    const justBelow = '\u{DFFFF}';
    const justAbove = '\u{F0000}';

    const segment = createText(`${justBelow}start\u{E0000}mid\u{EFFFF}end${justAbove}`);

    expect(segment).toEqual({
        segmentType: 'Text',
        text: `${justBelow}startmidend${justAbove}`,
        format: {},
    });
});

it('createText preserves meaningful invisible characters outside the tag range', () => {
    // U+200B = Zero-Width Space, U+200D = Zero-Width Joiner,
    // U+202E = Right-to-Left Override, U+202C = Pop Directional Formatting.
    // They are spelled as escapes (not raw characters) so that they are visible in the
    // source and the bidi controls cannot change how this line is displayed in an editor.
    const text = 'a\u{200B}b\u{200D}c\u{202E}d\u{202C}e';
    const result = createText(text);

    // None of these code points is in U+E0000..U+EFFFF, so the text must come back unchanged.
    expect(result).toEqual({
        segmentType: 'Text',
        format: {},
        text: 'a\u{200B}b\u{200D}c\u{202E}d\u{202C}e',
    });
});

it('createText strips only tag-range chars, keeps meaningful invisible chars', () => {
    // Input interleaves characters to keep (U+200B Zero-Width Space, U+202E Right-to-Left
    // Override) with tag-range characters to drop (U+E0041, U+E0042).
    // All of them are spelled as escapes so that the invisible/bidi code points are
    // readable in source and cannot affect how this line is displayed.
    const text = 'a\u{200B}\u{E0041}b\u{202E}\u{E0042}c';
    const result = createText(text);

    expect(result).toEqual({
        segmentType: 'Text',
        format: {},
        text: 'a\u{200B}b\u{202E}c',
    });
});

it('createText does not strip visible characters', () => {
    // ASCII, CJK and the trailing whitespace-like characters are all expected to pass through untouched.
    // NOTE(review): the characters after the CJK text are raw non-ASCII code points; consider
    // spelling them as \u{...} escapes once their exact values are confirmed.
    const untouched = 'hello world 你好   ​';

    expect(createText(untouched)).toEqual({
        segmentType: 'Text',
        text: untouched,
        format: {},
    });
});

it('createTableRow', () => {
const row = createTableRow();

Expand Down
Loading