From 4411b1e21682c0fc7493a03205499fd98f825bb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20=C4=8Cern=C3=BD?= Date: Thu, 14 May 2026 13:58:30 +0200 Subject: [PATCH] Sanitize strings to prevent UTF errors If a string is collected by a probe, we will sanitize this string before converting it to SEXP. This will ensure that no errors related to UTF-8 encoding can happen during the whole processing. This way we will fix multiple errors emitted by pcre2 or libxml2 libraries that are caused by providing non-UTF-8 strings. Resolves: https://redhat.atlassian.net/browse/RHEL-171005 --- src/OVAL/probes/SEAP/sexp-manip_r.c | 22 +++ src/common/oscap_utf8.c | 130 ++++++++++++++++++ src/common/oscap_utf8.h | 41 ++++++ tests/API/OVAL/unittests/CMakeLists.txt | 1 + .../unittests/test_pcre_nonutf_characters.sh | 26 ++++ .../unittests/test_pcre_nonutf_characters.xml | 32 +++++ tests/API/probes/CMakeLists.txt | 1 + tests/probes/file/test_probes_file.sh | 2 +- 8 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 src/common/oscap_utf8.c create mode 100644 src/common/oscap_utf8.h create mode 100755 tests/API/OVAL/unittests/test_pcre_nonutf_characters.sh create mode 100644 tests/API/OVAL/unittests/test_pcre_nonutf_characters.xml diff --git a/src/OVAL/probes/SEAP/sexp-manip_r.c b/src/OVAL/probes/SEAP/sexp-manip_r.c index 7d2e2c5f2a3..e7420d65d85 100644 --- a/src/OVAL/probes/SEAP/sexp-manip_r.c +++ b/src/OVAL/probes/SEAP/sexp-manip_r.c @@ -21,6 +21,7 @@ */ #include +#include #include #include @@ -29,6 +30,7 @@ #include "_sexp-rawptr.h" #include "public/sexp-manip_r.h" #include "debug_priv.h" +#include "oscap_utf8.h" SEXP_t *SEXP_init(SEXP_t *sexp_mem) { @@ -206,20 +208,31 @@ SEXP_t *SEXP_number_newb_r(SEXP_t *sexp_mem, bool n) SEXP_t *SEXP_string_new_r (SEXP_t *sexp_mem, const void *string, size_t length) { SEXP_val_t v_dsc; + size_t sanitized_len = 0; + char *sanitized = oscap_sanitize_utf8(string, length, &sanitized_len); if (sexp_mem == NULL) { + free(sanitized); errno = 
EFAULT; return (NULL); } + if (sanitized != NULL) { + dW("Replaced invalid UTF-8 byte sequence(s) with the replacement character (U+FFFD) in '%s'.", sanitized); + string = sanitized; + length = sanitized_len; + } + if (SEXP_val_new (&v_dsc, sizeof (char) * length, SEXP_VALTYPE_STRING) != 0) { + free(sanitized); /* TODO: handle this */ return (NULL); } memcpy (v_dsc.mem, string, sizeof (char) * length); + free(sanitized); SEXP_init(sexp_mem); sexp_mem->s_type = NULL; @@ -266,9 +279,18 @@ SEXP_t *SEXP_string_newf_rv(SEXP_t *sexp_mem, const char *format, va_list ap) return NULL; } + char *sanitized = oscap_sanitize_utf8(v_string, v_strlen, NULL); + if (sanitized != NULL) { + dW("Replaced invalid UTF-8 byte sequence(s) with the replacement character (U+FFFD) in '%s'.", sanitized); + free(v_string); + v_string = sanitized; + v_strlen = strlen(sanitized); + } + if (SEXP_val_new (&v_dsc, sizeof (char) * v_strlen, SEXP_VALTYPE_STRING) != 0) { + free(v_string); /* TODO: handle this */ return (NULL); } diff --git a/src/common/oscap_utf8.c b/src/common/oscap_utf8.c new file mode 100644 index 00000000000..b4c5cf62062 --- /dev/null +++ b/src/common/oscap_utf8.c @@ -0,0 +1,130 @@ +/* + * Copyright 2026 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: + * Jan Černý + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include + +#include "oscap_utf8.h" + +static int _utf8_char_len(const uint8_t *s, size_t remaining) +{ + uint8_t b = s[0]; + + if (b <= 0x7F) + return 1; + + if (b >= 0xC2 && b <= 0xDF) { + if (remaining < 2 || (s[1] & 0xC0) != 0x80) + return -1; + return 2; + } + + if (b >= 0xE0 && b <= 0xEF) { + if (remaining < 3 || (s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80) + return -1; + if (b == 0xE0 && s[1] < 0xA0) + return -1; + if (b == 0xED && s[1] > 0x9F) + return -1; + return 3; + } + + if (b >= 0xF0 && b <= 0xF4) { + if (remaining < 4 || (s[1] & 0xC0) != 0x80 || + (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80) + return -1; + if (b == 0xF0 && s[1] < 0x90) + return -1; + if (b == 0xF4 && s[1] > 0x8F) + return -1; + return 4; + } + + return -1; +} + +char *oscap_sanitize_utf8(const char *input, size_t input_len, size_t *output_len) +{ + const uint8_t *in = (const uint8_t *)input; + size_t i = 0; + + while (i < input_len) { + int clen = _utf8_char_len(in + i, input_len - i); + if (clen < 0) + break; + i += clen; + } + + if (i == input_len) + return NULL; + + size_t alloc = input_len + 64; + uint8_t *out = malloc(alloc + 1); + if (out == NULL) + return NULL; + + if (i > 0) + memcpy(out, in, i); + size_t o = i; + + while (i < input_len) { + int clen = _utf8_char_len(in + i, input_len - i); + if (clen < 0) { + if (o + 3 > alloc) { + alloc = alloc * 2; + uint8_t *tmp = realloc(out, alloc + 1); + if (tmp == NULL) { + free(out); + return NULL; + } + out = tmp; + } + out[o++] = 0xEF; + out[o++] = 0xBF; + out[o++] = 0xBD; + i++; + } else { + if (o + clen > alloc) { + alloc = alloc * 2; + uint8_t *tmp = realloc(out, alloc + 1); + if (tmp == NULL) { + 
free(out); + return NULL; + } + out = tmp; + } + memcpy(out + o, in + i, clen); + o += clen; + i += clen; + } + } + + out[o] = '\0'; + if (output_len != NULL) + *output_len = o; + return (char *)out; +} diff --git a/src/common/oscap_utf8.h b/src/common/oscap_utf8.h new file mode 100644 index 00000000000..872a907b027 --- /dev/null +++ b/src/common/oscap_utf8.h @@ -0,0 +1,41 @@ +/* + * Copyright 2026 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: + * Jan Černý + */ + +#ifndef OSCAP_UTF8_H_ +#define OSCAP_UTF8_H_ + +#include + +/** + * Sanitize a string by replacing invalid UTF-8 byte sequences with the + * Unicode replacement character (U+FFFD). 
+ * + * @param input the input string (not necessarily null-terminated) + * @param input_len byte length of the input + * @param output_len if not NULL, set to the byte length of the sanitized string (left untouched when NULL is returned) + * @return newly allocated null-terminated sanitized string if any invalid + * sequences were found (caller must free), or NULL if the input + * is already valid UTF-8 or the output buffer could not be allocated + */ +char *oscap_sanitize_utf8(const char *input, size_t input_len, size_t *output_len); + +#endif diff --git a/tests/API/OVAL/unittests/CMakeLists.txt b/tests/API/OVAL/unittests/CMakeLists.txt index d081b32b27b..4d8f4501b61 100644 --- a/tests/API/OVAL/unittests/CMakeLists.txt +++ b/tests/API/OVAL/unittests/CMakeLists.txt @@ -26,6 +26,7 @@ add_oscap_test("test_ipv6_super_set_of.sh") add_oscap_test("test_item_not_exist.sh") add_oscap_test("test_object_component_type.sh") add_oscap_test("test_oval_empty_variable_evaluation.sh") +add_oscap_test("test_pcre_nonutf_characters.sh") add_oscap_test("test_platform_version.sh") add_oscap_test("test_recursive_extend_def.sh") add_oscap_test("test_skip_valid.sh") diff --git a/tests/API/OVAL/unittests/test_pcre_nonutf_characters.sh b/tests/API/OVAL/unittests/test_pcre_nonutf_characters.sh new file mode 100755 index 00000000000..844e3e059fc --- /dev/null +++ b/tests/API/OVAL/unittests/test_pcre_nonutf_characters.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +. 
$builddir/tests/test_common.sh + +set -e -o pipefail + +oval_def="$(mktemp)" +stdout="$(mktemp)" +stderr="$(mktemp)" +result="$(mktemp)" +temp_dir="$(mktemp -d)" +cp $srcdir/test_pcre_nonutf_characters.xml "$oval_def" +sed -i "s;TEMP_DIR_PLACEHOLDER;$temp_dir;" "$oval_def" +normal_file="$temp_dir/normal_filename" +touch "$normal_file" +evil_file=$temp_dir/$(printf "evil_filename_\334_non_utf8_character") +touch "$evil_file" + +$OSCAP oval eval --results "$result" "$oval_def" >"$stdout" 2>"$stderr" + +assert_exists 2 '/oval_results/results/system/oval_system_characteristics/system_data/unix-sys:file_item' +grep -q "Definition oval:org.mitre.oval.test:def:1: true" "$stdout" +grep -q "W: oscap: Replaced invalid UTF-8 byte sequence(s) with the replacement character (U+FFFD) in .*" "$stderr" + +rm -f "$oval_def" +rm -f "$stdout" "$stderr" "$result" +rm -rf "$temp_dir" diff --git a/tests/API/OVAL/unittests/test_pcre_nonutf_characters.xml b/tests/API/OVAL/unittests/test_pcre_nonutf_characters.xml new file mode 100644 index 00000000000..cefa1d53750 --- /dev/null +++ b/tests/API/OVAL/unittests/test_pcre_nonutf_characters.xml @@ -0,0 +1,32 @@ + + + + 5.10 + 2009-01-12T10:41:00-05:00 + + + + + Test for pattern match operation on file names + The definition should not produce any error during the evaluation + + + + + + + + + + + + + + + + TEMP_DIR_PLACEHOLDER + ^.*$ + + + + diff --git a/tests/API/probes/CMakeLists.txt b/tests/API/probes/CMakeLists.txt index 5f31ebbb84f..c6e24545de5 100644 --- a/tests/API/probes/CMakeLists.txt +++ b/tests/API/probes/CMakeLists.txt @@ -28,6 +28,7 @@ add_oscap_test_executable(oval_fts_list "${CMAKE_SOURCE_DIR}/src/OVAL/probes/probe/entcmp.c" "${CMAKE_SOURCE_DIR}/src/common/util.c" "${CMAKE_SOURCE_DIR}/src/common/oscap_pcre.c" + "${CMAKE_SOURCE_DIR}/src/common/oscap_utf8.c" "${OVAL_RESULTS_SOURCES}" ) target_include_directories(oval_fts_list PUBLIC diff --git a/tests/probes/file/test_probes_file.sh b/tests/probes/file/test_probes_file.sh index 
3595a4580d1..8ae9787f2a9 100755 --- a/tests/probes/file/test_probes_file.sh +++ b/tests/probes/file/test_probes_file.sh @@ -112,7 +112,7 @@ function test_probes_file_invalid_utf8 { $OSCAP oval validate $result || ret_val=1 assert_exists 1 '//results//criterion' || ret_val=1 - assert_exists 1 '//results//criterion[@result="error"]' || ret_val=1 + assert_exists 1 '//results//criterion[@result="true"]' || ret_val=1 rm $DF_INJECTED rm -rf "$files_dir"