7 changes: 5 additions & 2 deletions benchmark/util/text-decoder-stream.js
@@ -2,21 +2,23 @@

const common = require('../common.js');

const MAX_N = 256 * 1024;
const TOTAL_SIZE = 256 * 1024 * 1024;
Comment on lines +5 to +6

@ChALkeR (Member, Author) on Feb 18, 2026:

This is quite slow, especially when run 30 times on CI.
I wanted to denoise it, though, as otherwise it's still noisy even with e.g. 2x smaller numbers.
If slow benchmarks are an issue, I can exclude this change.

const bench = common.createBenchmark(main, {
encoding: ['utf-8', 'utf-16le'],
ignoreBOM: [0, 1],
fatal: [0, 1],
unicode: [0, 1],
len: [256, 1024 * 16, 1024 * 128],
chunks: [10],
n: [1e3],
type: ['SharedArrayBuffer', 'ArrayBuffer', 'Buffer'],
});

const UNICODE_ALPHA = 'Blåbærsyltetøy';
const ASCII_ALPHA = 'Blueberry jam';

function main({ encoding, len, unicode, chunks, n, ignoreBOM, type, fatal }) {
function main({ encoding, len, unicode, chunks, ignoreBOM, type, fatal }) {
const decoder = new TextDecoder(encoding, { ignoreBOM, fatal });
let buf;

@@ -41,6 +43,7 @@ function main({ encoding, len, unicode, chunks, n, ignoreBOM, type, fatal }) {

const chunk = Math.ceil(len / chunks);
const max = len - chunk;
const n = Math.min(MAX_N, Math.ceil(TOTAL_SIZE / len));
bench.start();
for (let i = 0; i < n; i++) {
let pos = 0;
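For context on the sizing comment above: the new formula targets a roughly constant total number of decoded bytes per run, capping the iteration count for small inputs. A quick sketch of the values it yields for this benchmark's len settings (derived from the constants in the diff; the totals are an observation, not part of the PR):

for (const len of [256, 1024 * 16, 1024 * 128]) {
  // n = min(MAX_N, ceil(TOTAL_SIZE / len))
  const n = Math.min(256 * 1024, Math.ceil((256 * 1024 * 1024) / len));
  console.log(len, n, (n * len) / (1024 * 1024)); // 256 -> 262144 (64 MiB); 16384 -> 16384 (256 MiB); 131072 -> 2048 (256 MiB)
}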
4 changes: 2 additions & 2 deletions benchmark/util/text-decoder.js
@@ -3,7 +3,7 @@
const common = require('../common.js');

const bench = common.createBenchmark(main, {
encoding: ['utf-8', 'windows-1252', 'iso-8859-3'],
encoding: ['utf-8', 'utf-16le', 'utf-16be', 'windows-1252', 'iso-8859-3'],
ignoreBOM: [0, 1],
fatal: [0, 1],
len: [256, 1024 * 16, 1024 * 128],
Expand All @@ -25,7 +25,7 @@ function main({ encoding, len, n, ignoreBOM, type, fatal }) {
break;
}
case 'Buffer': {
buf = Buffer.allocUnsafe(len);
buf = Buffer.alloc(len);
break;
}
}
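A note on the Buffer.allocUnsafe to Buffer.alloc switch above (the rationale here is an assumption; the PR does not state it): allocUnsafe returns uninitialized memory, so the benchmark input would vary between runs, and arbitrary bytes can even make a { fatal: true } decoder throw. alloc zero-fills:

const deterministic = Buffer.alloc(16);   // always 16 zero bytes
const arbitrary = Buffer.allocUnsafe(16); // contents unspecified; may differ run to run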
89 changes: 46 additions & 43 deletions lib/internal/encoding.js
@@ -20,8 +20,8 @@ const { FastBuffer } = require('internal/buffer');
const {
ERR_ENCODING_NOT_SUPPORTED,
ERR_INVALID_ARG_TYPE,
ERR_ENCODING_INVALID_ENCODED_DATA,
ERR_INVALID_THIS,
ERR_NO_ICU,
} = require('internal/errors').codes;
const kSingleByte = Symbol('single-byte');
const kHandle = Symbol('handle');
@@ -30,11 +30,11 @@ const kEncoding = Symbol('encoding');
const kDecoder = Symbol('decoder');
const kChunk = Symbol('chunk');
const kFatal = Symbol('kFatal');
const kUTF8FastPath = Symbol('kUTF8FastPath');
const kUnicode = Symbol('kUnicode');
const kIgnoreBOM = Symbol('kIgnoreBOM');

const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte');
const { unfinishedBytesUtf8, mergePrefixUtf8 } = require('internal/encoding/util');
const { unfinishedBytes, mergePrefix } = require('internal/encoding/util');

const {
getConstructorOf,
@@ -419,11 +419,33 @@

const kBOMSeen = Symbol('BOM seen');

let StringDecoder;
function lazyStringDecoder() {
if (StringDecoder === undefined)
({ StringDecoder } = require('string_decoder'));
return StringDecoder;
function decodeUTF16bufferLE(le, ignoreBom, fatal, encoding) {
let suffix = '';
if (le.length % 2 !== 0) {
if (fatal) throw new ERR_ENCODING_INVALID_ENCODED_DATA(encoding, undefined);
le = le.subarray(0, -1);
suffix = '\ufffd';
}
if (le.length === 0) return suffix;
let res = le.ucs2Slice();
if (!ignoreBom && res[0] === '\ufeff') res = StringPrototypeSlice(res, 1);
if (!fatal) return res.toWellFormed() + suffix;
if (!res.isWellFormed()) throw new ERR_ENCODING_INVALID_ENCODED_DATA(encoding, undefined);
return res;
}

function decodeUTF16le(input, ignoreBom, fatal) {
const le = parseInput(input);
return decodeUTF16bufferLE(le, ignoreBom, fatal, 'utf-16le');
}

function decodeUTF16be(input, ignoreBom, fatal) {
const be = parseInput(input);
const le = new FastBuffer(be.length);
le.set(be);
const swap = le.length % 2 === 0 ? le : le.subarray(0, -1);
swap.swap16();
return decodeUTF16bufferLE(le, ignoreBom, fatal, 'utf-16be');
}

class TextDecoder {
@@ -446,33 +468,29 @@ class TextDecoder {
this[kEncoding] = enc;
this[kIgnoreBOM] = Boolean(options?.ignoreBOM);
this[kFatal] = Boolean(options?.fatal);
this[kUTF8FastPath] = false;
this[kUnicode] = undefined;
this[kHandle] = undefined;
this[kSingleByte] = undefined; // Does not care about streaming or BOM
this[kChunk] = null; // A copy of previous streaming tail or null

if (enc === 'utf-8') {
this[kUTF8FastPath] = true;
this[kUnicode] = decodeUTF8;
this[kBOMSeen] = false;
} else if (enc === 'utf-16le') {
this[kUnicode] = decodeUTF16le;
this[kBOMSeen] = false;
} else if (enc === 'utf-16be') {
this[kUnicode] = decodeUTF16be;
this[kBOMSeen] = false;
} else if (isSinglebyteEncoding(enc)) {
this[kSingleByte] = createSinglebyteDecoder(enc, this[kFatal]);
} else {
this.#prepareConverter(); // Need to throw early if we don't support the encoding
}
}

#prepareConverter() {
if (hasIntl) {
} else if (hasIntl) {
let icuEncoding = this[kEncoding];
if (icuEncoding === 'gbk') icuEncoding = 'gb18030'; // 10.1.1. GBK's decoder is gb18030's decoder
const handle = icuGetConverter(icuEncoding, this[kFlags]);
if (handle === undefined)
throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]);
this[kHandle] = handle;
} else if (this[kEncoding] === 'utf-16le') {
if (this[kFatal]) throw new ERR_NO_ICU('"fatal" option');
this[kHandle] = new (lazyStringDecoder())(this[kEncoding]);
this[kBOMSeen] = false;
} else {
throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]);
}
@@ -485,19 +503,19 @@ class TextDecoder {
if (this[kSingleByte]) return this[kSingleByte](parseInput(input));

const stream = options?.stream;
if (this[kUTF8FastPath]) {
if (this[kUnicode]) {
const chunk = this[kChunk];
const ignoreBom = this[kIgnoreBOM] || this[kBOMSeen];
if (!stream) {
this[kBOMSeen] = false;
if (!chunk) return decodeUTF8(input, ignoreBom, this[kFatal]);
if (!chunk) return this[kUnicode](input, ignoreBom, this[kFatal]);
}

let u = parseInput(input);
if (u.length === 0 && stream) return ''; // no state change
let prefix;
if (chunk) {
const merged = mergePrefixUtf8(u, this[kChunk]);
const merged = mergePrefix(u, this[kChunk], this[kEncoding]);
if (u.length < 3) {
u = merged; // Might be unfinished, but fully consumed old u
} else {
@@ -510,7 +528,7 @@
}

if (stream) {
const trail = unfinishedBytesUtf8(u, u.length);
const trail = unfinishedBytes(u, u.length, this[kEncoding]);
if (trail > 0) {
this[kChunk] = new FastBuffer(u.subarray(-trail)); // copy
if (!prefix && trail === u.length) return ''; // No further state change
@@ -519,8 +537,8 @@
}

try {
const res = (prefix ? decodeUTF8(prefix, ignoreBom, this[kFatal]) : '') +
decodeUTF8(u, ignoreBom || prefix, this[kFatal]);
const res = (prefix ? this[kUnicode](prefix, ignoreBom, this[kFatal]) : '') +
this[kUnicode](u, ignoreBom || prefix, this[kFatal]);

// "BOM seen" is set on the current decode call only if it did not error,
// in "serialize I/O queue" after decoding
@@ -541,22 +559,7 @@
return icuDecode(this[kHandle], input, flags, this[kEncoding]);
}

input = parseInput(input);

let result = stream ? this[kHandle].write(input) : this[kHandle].end(input);

if (result.length > 0 && !this[kBOMSeen] && !this[kIgnoreBOM]) {
// If the very first result in the stream is a BOM, and we are not
// explicitly told to ignore it, then we discard it.
if (result[0] === '\ufeff') {
result = StringPrototypeSlice(result, 1);
}
this[kBOMSeen] = true;
}

if (!stream) this[kBOMSeen] = false;

return result;
// Unreachable
}
}

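The UTF-16 paths above rely on JS strings being sequences of UTF-16 code units: UTF-16LE decodes as a near-reinterpretation of the bytes (ucs2Slice), and UTF-16BE only needs a byte swap first. A minimal standalone sketch of the same technique using public APIs (Buffer.prototype.swap16 and the ES2024 well-formed string methods; an illustration, not the internal code):

// '€' (U+20AC) in UTF-16BE is the byte pair 0x20 0xAC.
const be = Buffer.from([0x20, 0xac]);
const le = Buffer.from(be); // copy first: swap16 mutates in place and requires an even length
le.swap16();                // bytes are now UTF-16LE: 0xAC 0x20
console.log(le.toString('utf16le'));                // '€'
console.log(le.toString('utf16le').isWellFormed()); // true (no lone surrogates)

In the decoder itself, toWellFormed() substitutes U+FFFD for lone surrogates on the non-fatal path, while isWellFormed() gates the fatal path's ERR_ENCODING_INVALID_ENCODED_DATA.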
57 changes: 36 additions & 21 deletions lib/internal/encoding/util.js
@@ -7,39 +7,54 @@ const {
Uint8Array,
} = primordials;


/**
* Get a number of last bytes in an Uint8Array `data` ending at `len` that don't
* form a codepoint yet, but can be a part of a single codepoint on more data.
* @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes
* @param {Uint8Array} data Uint8Array of potentially UTF-8/UTF-16 bytes
* @param {number} len Position to look behind from
* @returns {number} Number of unfinished potentially valid UTF-8 bytes ending at position `len`
 * @param {string} enc Encoding to use: utf-8, utf-16le, or utf-16be
* @returns {number} Number (0-3) of unfinished potentially valid UTF bytes ending at position `len`
*/
function unfinishedBytesUtf8(data, len) {
// 0-3
let pos = 0;
while (pos < 2 && pos < len && (data[len - pos - 1] & 0xc0) === 0x80) pos++; // Go back 0-2 trailing bytes
if (pos === len) return 0; // no space for lead
const lead = data[len - pos - 1];
if (lead < 0xc2 || lead > 0xf4) return 0; // not a lead
if (pos === 0) return 1; // Nothing to recheck, we have only lead, return it. 2-byte must return here
if (lead < 0xe0 || (lead < 0xf0 && pos >= 2)) return 0; // 2-byte, or 3-byte or less and we already have 2 trailing
const lower = lead === 0xf0 ? 0x90 : lead === 0xe0 ? 0xa0 : 0x80;
const upper = lead === 0xf4 ? 0x8f : lead === 0xed ? 0x9f : 0xbf;
const next = data[len - pos];
return next >= lower && next <= upper ? pos + 1 : 0;
function unfinishedBytes(data, len, enc) {
switch (enc) {
case 'utf-8': {
// 0-3
let pos = 0;
while (pos < 2 && pos < len && (data[len - pos - 1] & 0xc0) === 0x80) pos++; // Go back 0-2 trailing bytes
if (pos === len) return 0; // no space for lead
const lead = data[len - pos - 1];
if (lead < 0xc2 || lead > 0xf4) return 0; // not a lead
if (pos === 0) return 1; // Nothing to recheck, we have only lead, return it. 2-byte must return here
      if (lead < 0xe0 || (lead < 0xf0 && pos >= 2)) return 0; // 2-byte, or 3-byte or less and we already have 2 trailing
const lower = lead === 0xf0 ? 0x90 : lead === 0xe0 ? 0xa0 : 0x80;
const upper = lead === 0xf4 ? 0x8f : lead === 0xed ? 0x9f : 0xbf;
const next = data[len - pos];
return next >= lower && next <= upper ? pos + 1 : 0;
}

case 'utf-16le':
case 'utf-16be': {
// 0-3
const uneven = len % 2; // Uneven byte length adds 1
if (len < 2) return uneven;
const l = len - uneven - 1;
const last = enc === 'utf-16le' ? (data[l] << 8) ^ data[l - 1] : (data[l - 1] << 8) ^ data[l];
return last >= 0xd8_00 && last < 0xdc_00 ? uneven + 2 : uneven; // lone lead adds 2
}
}
}

/**
* Merge prefix `chunk` with `data` and return new combined prefix.
* For data.length < 3, fully consumes data and can return unfinished data,
* otherwise returns a prefix with no unfinished bytes
* @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes
* @param {Uint8Array} data Uint8Array of potentially UTF-8/UTF-16 bytes
* @param {Uint8Array} chunk Prefix to prepend before `data`
 * @param {string} enc Encoding to use: utf-8, utf-16le, or utf-16be
* @returns {Uint8Array} If data.length >= 3: an Uint8Array containing `chunk` and a slice of `data`
* so that the result has no unfinished UTF-8 codepoints. If data.length < 3: concat(chunk, data).
* so that the result has no unfinished codepoints. If data.length < 3: concat(chunk, data).
*/
function mergePrefixUtf8(data, chunk) {
function mergePrefix(data, chunk, enc) {
if (data.length === 0) return chunk;
if (data.length < 3) {
// No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence
@@ -57,7 +72,7 @@ function mergePrefixUtf8(data, chunk) {
// Stop at the first offset where unfinished bytes reaches 0 or fits into data
// If that doesn't happen (data too short), just concat chunk and data completely (above)
for (let i = 1; i <= 3; i++) {
const unfinished = unfinishedBytesUtf8(temp, chunk.length + i); // 0-3
const unfinished = unfinishedBytes(temp, chunk.length + i, enc); // 0-3
if (unfinished <= i) {
// Always reachable at 3, but we still need 'unfinished' value for it
const add = i - unfinished; // 0-3
@@ -69,4 +84,4 @@ function mergePrefixUtf8(data, chunk) {
return null;
}

module.exports = { unfinishedBytesUtf8, mergePrefixUtf8 };
module.exports = { unfinishedBytes, mergePrefix };
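To make the contract of unfinishedBytes concrete, a few sample calls (expected results follow from the JSDoc and branch logic above; shown here as an illustration):

// UTF-8: 0xF0 0x9F 0x92 are the first 3 bytes of a 4-byte sequence (e.g. U+1F4A9).
unfinishedBytes(new Uint8Array([0x41, 0xf0, 0x9f, 0x92]), 4, 'utf-8');   // 3
// UTF-16LE: 0x3D 0xD8 is a lone lead surrogate (0xD83D), so 2 bytes are pending.
unfinishedBytes(new Uint8Array([0x3d, 0xd8]), 2, 'utf-16le');            // 2
// UTF-16: an odd byte count adds 1.
unfinishedBytes(new Uint8Array([0x74, 0x00, 0x65]), 3, 'utf-16le');      // 1
// Complete sequences leave nothing pending.
unfinishedBytes(new Uint8Array([0x41, 0x42]), 2, 'utf-8');               // 0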
2 changes: 1 addition & 1 deletion test/parallel/test-whatwg-encoding-custom-textdecoder.js
@@ -101,7 +101,7 @@ assert(TextDecoder);
}

// Test TextDecoder, UTF-16be
if (common.hasIntl) {
{
const dec = new TextDecoder('utf-16be');
const res = dec.decode(Buffer.from('test€', 'utf-16le').swap16());
assert.strictEqual(res, 'test€');
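The dropped common.hasIntl guard reflects that UTF-16BE decoding no longer requires ICU. A sketch of what the test exercises:

const dec = new TextDecoder('utf-16be');
console.log(dec.decode(Buffer.from([0x20, 0xac]))); // '€' (U+20AC from big-endian bytes), with or without ICU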