Skip to content

Commit 9a0d32c

Browse files
committed
Implement UTF-32 encode/decode and fix UTF-16 empty encode
- Add UTF-32, UTF-32-LE, UTF-32-BE encode/decode in _pycodecs.py - Register utf_32 codec functions in codecs.rs via delegate_pycodecs - Fix PyUnicode_EncodeUTF16 returning "" instead of [] for empty input - Remove resolved expectedFailure decorators in test_codecs.py - Add failure reasons to remaining expectedFailure comments
1 parent 5e732c5 commit 9a0d32c

File tree

3 files changed

+250
-109
lines changed

3 files changed

+250
-109
lines changed

Lib/_pycodecs.py

Lines changed: 140 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,145 @@ def utf_16_be_decode( data, errors='strict', byteorder=0, final = 0):
357357
return res, consumed
358358

359359

360+
def STORECHAR32(ch, byteorder):
    """Split a 32-bit value into its four bytes.

    Returns a list of four ints in the range 0-255.  The least significant
    byte comes first when *byteorder* is 'little'; any other value produces
    big-endian order (most significant byte first).
    """
    # Extract the bytes least-significant first, then flip for big-endian.
    octets = [(ch >> shift) & 0xff for shift in (0, 8, 16, 24)]
    if byteorder != 'little':
        octets.reverse()
    return octets
370+
371+
372+
def PyUnicode_EncodeUTF32(s, size, errors, byteorder='little'):
    """Encode the str *s* as UTF-32 and return the bytes as a list of ints.

    Parameters:
        s: the string to encode.
        size: number of characters in *s*; only used to detect empty input.
        errors: name of the error handler applied to lone surrogates
            (``None`` is treated as 'strict').
        byteorder: 'little' or 'big' encode in that order without a BOM;
            'native' encodes in ``sys.byteorder`` order and prepends a BOM;
            any other value encodes in ``sys.byteorder`` order without a BOM.

    Returns:
        A list of ints (0-255) holding the encoded bytes.

    Raises:
        UnicodeEncodeError: *s* contains a surrogate code point and the
            error handler does not allow it.
        IndexError: a custom error handler returned an out-of-range resume
            position.
    """
    if byteorder == 'little':
        order, encoding = 'little', 'utf-32-le'
    elif byteorder == 'big':
        order, encoding = 'big', 'utf-32-be'
    else:
        order, encoding = sys.byteorder, 'utf-32'

    p = []
    if byteorder == 'native':
        p += (0xFEFF).to_bytes(4, order)

    # Empty input still carries the BOM in 'native' mode, matching
    # ''.encode('utf-32').
    if size == 0:
        return p

    # Work on code points in a list so that an error handler may move the
    # resume position without re-scanning the string.
    codepoints = [ord(c) for c in s]
    length = len(codepoints)
    pos = 0
    while pos < length:
        ch = codepoints[pos]
        if not 0xD800 <= ch <= 0xDFFF or errors == 'surrogatepass':
            # UTF-32 stores every code point directly in four bytes.
            p += ch.to_bytes(4, order)
            pos += 1
            continue

        # Lone surrogates are not valid in UTF-32; defer to the handler.
        exc = UnicodeEncodeError(encoding, s, pos, pos + 1,
                                 'surrogates not allowed')
        if errors is None or errors == 'strict':
            raise exc
        if errors == 'ignore':
            pos += 1
        elif errors == 'replace':
            p += (0x3F).to_bytes(4, order)  # '?'
            pos += 1
        else:
            import codecs
            rep, newpos = codecs.lookup_error(errors)(exc)
            if isinstance(rep, str):
                # A str replacement must be pure ASCII so that it can be
                # encoded without triggering another error.
                if any(ord(r) > 0x7F for r in rep):
                    raise exc
                for r in rep:
                    p += ord(r).to_bytes(4, order)
            else:
                # A bytes replacement is copied verbatim and must consist of
                # whole 32-bit units.
                if len(rep) % 4:
                    raise exc
                p += rep
            if newpos < 0:
                newpos += length
            if not 0 <= newpos <= length:
                raise IndexError(
                    'position %d from error handler out of bounds' % newpos)
            pos = newpos

    return p
396+
397+
398+
def utf_32_encode(obj, errors='strict'):
    """Encode *obj* as UTF-32 using the encoder's 'native' mode.

    Returns a ``(encoded_bytes, len(obj))`` tuple.
    """
    length = len(obj)
    encoded = PyUnicode_EncodeUTF32(obj, length, errors, 'native')
    return bytes(encoded), length
403+
404+
405+
def utf_32_le_encode(obj, errors='strict'):
    """Encode *obj* as little-endian UTF-32 (no BOM requested).

    Returns a ``(encoded_bytes, len(obj))`` tuple.
    """
    length = len(obj)
    encoded = PyUnicode_EncodeUTF32(obj, length, errors, 'little')
    return bytes(encoded), length
410+
411+
412+
def utf_32_be_encode(obj, errors='strict'):
    """Encode *obj* as big-endian UTF-32 (no BOM requested).

    Returns a ``(encoded_bytes, len(obj))`` tuple.
    """
    length = len(obj)
    encoded = PyUnicode_EncodeUTF32(obj, length, errors, 'big')
    return bytes(encoded), length
417+
418+
419+
def PyUnicode_DecodeUTF32Stateful(data, size, errors, byteorder='little', final=0):
    """Decode UTF-32 bytes into a list of strings.

    Parameters:
        data: bytes-like input holding the encoded text (no BOM handling is
            done here).
        size: number of bytes of *data* to decode.
        errors: name of the error handler (``None`` is treated as 'strict').
        byteorder: 'little' for little-endian input; any other value is
            decoded as big-endian.
        final: when false, an incomplete trailing unit (1-3 bytes) is left
            unconsumed; when true it is reported as 'truncated data'.

    Returns:
        ``(pieces, consumed, 0)`` where *pieces* is a list of str fragments
        to be joined by the caller and *consumed* is the number of input
        bytes processed.  The third element is always 0.

    Raises:
        UnicodeDecodeError: invalid or truncated input under a handler that
            does not recover from it.
        IndexError: a custom error handler returned an out-of-range resume
            position.
    """
    order = 'little' if byteorder == 'little' else 'big'
    encoding = 'utf-32-le' if order == 'little' else 'utf-32-be'

    result = []
    pos = 0
    while pos < size:
        if size - pos < 4:
            if not final:
                # Wait for the rest of the unit in the next chunk.
                break
            end = size
            reason = 'truncated data'
        else:
            end = pos + 4
            ch = int.from_bytes(data[pos:end], order)
            if ch > 0x10FFFF:
                reason = 'code point not in range(0x110000)'
            elif 0xD800 <= ch <= 0xDFFF and errors != 'surrogatepass':
                reason = ('code point in surrogate code point '
                          'range(0xd800, 0xe000)')
            else:
                result.append(chr(ch))
                pos = end
                continue

        # Invalid or truncated unit at data[pos:end].
        if errors is None or errors == 'strict':
            raise UnicodeDecodeError(encoding, bytes(data), pos, end, reason)
        if errors == 'ignore':
            pos = end
        elif errors == 'replace':
            result.append('\ufffd')
            pos = end
        else:
            import codecs
            exc = UnicodeDecodeError(encoding, bytes(data), pos, end, reason)
            replacement, newpos = codecs.lookup_error(errors)(exc)
            if newpos < 0:
                newpos += size
            if not 0 <= newpos <= size:
                raise IndexError(
                    'position %d from error handler out of bounds' % newpos)
            result.append(replacement)
            pos = newpos

    return result, pos, 0
463+
464+
465+
def utf_32_decode(data, errors='strict', final=0):
    """Decode UTF-32 bytes, honouring a leading byte-order mark.

    If *data* starts with the little- or big-endian UTF-32 BOM, the BOM is
    skipped and the rest is decoded in the announced byte order.  Without a
    BOM the input is decoded in the platform's native byte order
    (``sys.byteorder``).

    Returns:
        ``(text, consumed)`` where *consumed* counts bytes of *data*,
        including the BOM when one was present.

    Raises:
        UnicodeDecodeError: propagated from the decoder; offsets and the
            reported object always refer to the full *data* buffer.
    """
    prefix = data[0:4]
    if prefix == b'\xff\xfe\x00\x00':
        order, skip = 'little', 4
    elif prefix == b'\x00\x00\xfe\xff':
        order, skip = 'big', 4
    else:
        # No BOM: fall back to the native byte order.
        order, skip = sys.byteorder, 0

    payload = data[skip:] if skip else data
    try:
        chars, consumed, _ = PyUnicode_DecodeUTF32Stateful(
            payload, len(payload), errors, order, final)
    except UnicodeDecodeError as exc:
        if not skip:
            raise
        # The decoder only saw the BOM-less slice; shift the error back so
        # that it points into the caller's original buffer.
        raise UnicodeDecodeError(exc.encoding, bytes(data),
                                 exc.start + skip, exc.end + skip,
                                 exc.reason) from None
    return ''.join(chars), consumed + skip
485+
486+
487+
def utf_32_le_decode(data, errors='strict', final=0):
    """Decode *data* as little-endian UTF-32 without looking for a BOM.

    Returns a ``(text, consumed)`` tuple.
    """
    pieces, consumed, _ = PyUnicode_DecodeUTF32Stateful(
        data, len(data), errors, 'little', final)
    return ''.join(pieces), consumed
492+
493+
494+
def utf_32_be_decode(data, errors='strict', final=0):
    """Decode *data* as big-endian UTF-32 without looking for a BOM.

    Returns a ``(text, consumed)`` tuple.
    """
    pieces, consumed, _ = PyUnicode_DecodeUTF32Stateful(
        data, len(data), errors, 'big', final)
    return ''.join(pieces), consumed
360499

361500

362501
# ----------------------------------------------------------------------
@@ -815,7 +954,7 @@ def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'):
815954
p += STORECHAR(0xFEFF, bom)
816955

817956
if (size == 0):
818-
return ""
957+
return []
819958

820959
if (byteorder == 'little' ):
821960
bom = 'little'

0 commit comments

Comments
 (0)