From a19a41b8447d48ecc75cbf1a48accdb50dac2bda Mon Sep 17 00:00:00 2001 From: Ivan K Date: Tue, 3 Mar 2026 16:31:13 +0300 Subject: [PATCH 1/4] Test case --- inst/tests/tests.Rraw | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f144532cf..293eed980 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -20671,9 +20671,11 @@ DT = data.table(factor(rep("\uf8", 3))) # identical() to V1's only level but stored in a different CHARSXP samelevel = iconv(levels(DT$V1), from = "UTF-8", to = "latin1") DT[1, V1 := samelevel] -test(2311.1, nlevels(DT$V1), 1L) # used to be 2 +# used to fail to look up the new level, resulting in an invalid factor, #7648 +test(2311.1, as.integer(DT$V1), rep(1L, 3)) +test(2311.2, nlevels(DT$V1), 1L) # used to be 2 DT[1, V1 := factor("a", levels = c("a", samelevel))] -test(2311.2, nlevels(DT$V1), 2L) # used to be 3 +test(2311.3, nlevels(DT$V1), 2L) # used to be 3 # avoid translateChar*() in OpenMP threads, #6883 DF = list(rep(iconv("\uf8", from = "UTF-8", to = "latin1"), 2e6)) From 99fffd88c5f2e0f1a166680c89e2d732ea54618d Mon Sep 17 00:00:00 2001 From: Ivan K Date: Tue, 3 Mar 2026 16:31:29 +0300 Subject: [PATCH 2/4] memrecycle: look up source from converted vector When UTF-8 conversion is performed during assignment to a factor, make sure that the source strings are also looked up in their UTF-8 form, not the original vector. Fixes: #7648 --- src/assign.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/assign.c b/src/assign.c index 05a55cb5a..148532e36 100644 --- a/src/assign.c +++ b/src/assign.c @@ -806,7 +806,8 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con newSourceD[i] = val==NA_INTEGER ? NA_INTEGER : -hash_lookup(marks, sourceLevelsD[val-1], 0); // retains NA factor levels here via TL(NA_STRING); e.g. 
ordered factor } } else { - const SEXP *sourceD = STRING_PTR_RO(source); + // for character input, "levels" correspond to the source vector pre-converted to UTF-8 + const SEXP *sourceD = sourceLevelsD; for (int i=0; i Date: Tue, 3 Mar 2026 16:42:08 +0300 Subject: [PATCH 3/4] NEWS entry --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 7cf63f0b7..6877436d5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,6 +40,8 @@ 5. Non-equi joins combining an equality condition with two inequality conditions on the same column (e.g., `on = .(id == id, val >= lo, val <= hi)`) no longer error, [#7641](https://github.com/Rdatatable/data.table/issues/7641). The internal `chmatchdup` remapping of duplicate `rightcols` was overwriting the original column indices, causing downstream code to reference non-existent columns. Thanks @tarun-t for the report and fix, and @aitap for the diagnosis. +6. By-reference sub-assignments of strings to factor columns now _actually_ match the levels in UTF-8 when required and now don't result in invalid factors being created, [#7648](https://github.com/Rdatatable/data.table/issues/7648), amending a previous incomplete fix to [#6886](https://github.com/Rdatatable/data.table/issues/6886) in v1.17.2. Thanks @BASS-JN for the report and @aitap for the fix. + ### Notes 1. {data.table} now depends on R 3.5.0 (2018). From e1c2a2897932ee7fe63249b0cc10267b768b717a Mon Sep 17 00:00:00 2001 From: Ivan K Date: Tue, 3 Mar 2026 22:51:08 +0300 Subject: [PATCH 4/4] Clarify comment, drop a temporary value Co-Authored-By: Michael Chirico --- src/assign.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/assign.c b/src/assign.c index 148532e36..1b474072c 100644 --- a/src/assign.c +++ b/src/assign.c @@ -806,10 +806,9 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con newSourceD[i] = val==NA_INTEGER ? 
NA_INTEGER : -hash_lookup(marks, sourceLevelsD[val-1], 0); // retains NA factor levels here via TL(NA_STRING); e.g. ordered factor } } else { - // for character input, "levels" correspond to the source vector pre-converted to UTF-8 - const SEXP *sourceD = sourceLevelsD; for (int i=0; i