From 89f3845fa2b3ebdb472afae276f5874efab2f7b3 Mon Sep 17 00:00:00 2001 From: Robert Yokota Date: Sat, 14 Mar 2026 14:14:07 -0700 Subject: [PATCH 1/3] optimize Tokenizer charAt --- include/jsonata/Tokenizer.h | 3 +++ src/jsonata/Tokenizer.cpp | 45 +++++++++++++++++-------------------- src/jsonata/Utils.cpp | 2 -- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/include/jsonata/Tokenizer.h b/include/jsonata/Tokenizer.h index 5a15b40..351f9c1 100644 --- a/include/jsonata/Tokenizer.h +++ b/include/jsonata/Tokenizer.h @@ -30,6 +30,7 @@ #include #include #include +#include namespace jsonata { @@ -57,6 +58,8 @@ class Tokenizer { // Instance variables std::string path_; + std::vector codepoints_; // Pre-computed codepoints for O(1) access + std::vector byte_offsets_; // Byte offset of each codepoint in path_ size_t position_; size_t length_; int64_t depth_; diff --git a/src/jsonata/Tokenizer.cpp b/src/jsonata/Tokenizer.cpp index 70733f4..066e2ad 100644 --- a/src/jsonata/Tokenizer.cpp +++ b/src/jsonata/Tokenizer.cpp @@ -55,8 +55,15 @@ std::unordered_map Tokenizer::createEscapes() { Tokenizer::Tokenizer(const std::string& path) : path_(path), position_(0), depth_(0) { - // Calculate length in codepoints using utfcpp - length_ = utf8::unchecked::distance(path_.begin(), path_.end()); + // Pre-compute codepoints and byte offsets for O(1) charAt access + auto it = path_.begin(); + while (it != path_.end()) { + byte_offsets_.push_back(static_cast(it - path_.begin())); + codepoints_.push_back(static_cast(utf8::unchecked::next(it))); + } + // Store sentinel byte offset for end-of-string + byte_offsets_.push_back(static_cast(it - path_.begin())); + length_ = codepoints_.size(); } std::unique_ptr Tokenizer::create(const std::string& type, @@ -69,11 +76,7 @@ int32_t Tokenizer::charAt(size_t index) const { if (index >= length_) { return 0; // Match Java behavior: out of bounds returns 0/null char } - - // Use utfcpp to advance to the correct codepoint - auto it = 
path_.begin(); - utf8::unchecked::advance(it, index); - return static_cast(utf8::unchecked::next(it)); + return codepoints_[index]; } // Helper method to extract substring by codepoint indices (like Java substring) @@ -88,14 +91,8 @@ std::string Tokenizer::substring(size_t start, size_t end) const { return ""; } - // Use utfcpp to find byte positions for the codepoint range - auto startIt = path_.begin(); - utf8::unchecked::advance(startIt, start); - - auto endIt = startIt; - utf8::unchecked::advance(endIt, end - start); - - return std::string(startIt, endIt); + // Use pre-computed byte offsets for O(1) lookup + return path_.substr(byte_offsets_[start], byte_offsets_[end] - byte_offsets_[start]); } bool Tokenizer::isClosingSlash(size_t position) const { @@ -371,23 +368,21 @@ std::unique_ptr Tokenizer::next(bool prefix) { throw JException("S0101", static_cast(position_)); } - // Test for numbers using codepoint-based substring (matches Java logic) + // Test for numbers using byte-level regex on remaining input (matches Java logic) { static const std::regex numregex( "^-?(0|([1-9][0-9]*))(\\.[0-9]+)?([Ee][-+]?[0-9]+)?"); - std::string remaining = substring(position_, length_); - std::smatch match; - if (std::regex_search(remaining, match, numregex) && + // Use byte offsets to get raw pointers into path_ without copying (data() + size() is a valid end pointer; dereferencing cend() is undefined) + const char* byteStart = path_.data() + byte_offsets_[position_]; + const char* byteEnd = path_.data() + path_.size(); + std::cmatch match; + if (std::regex_search(byteStart, byteEnd, match, numregex) && match.position() == 0) { std::string numStr = match.str(0); double num = std::stod(numStr); if (!std::isnan(num) && std::isfinite(num)) { - // Convert byte length to codepoint length for position - // advancement - size_t byteLen = numStr.length(); - size_t codepointLen = - utf8::unchecked::distance(numStr.begin(), numStr.end()); - position_ += codepointLen; + // Number literals are ASCII, so byte length == codepoint length + position_ +=
numStr.length(); return create("number", Utils::convertNumber(num)); } throw JException("S0102", static_cast(position_)); diff --git a/src/jsonata/Utils.cpp b/src/jsonata/Utils.cpp index 4085e7b..5556ffe 100644 --- a/src/jsonata/Utils.cpp +++ b/src/jsonata/Utils.cpp @@ -182,8 +182,6 @@ std::optional Utils::type(const std::any& value) { return "null"; } else if (isNumber(value)) { return "number"; - } else if (isNullValue(value)) { - return "null"; } else if (isString(value)) { return "string"; } else if (isBoolean(value)) { From a391b3f839655afcf7f27db03b8ba0117d0ce6a0 Mon Sep 17 00:00:00 2001 From: Robert Yokota Date: Sat, 14 Mar 2026 14:21:19 -0700 Subject: [PATCH 2/3] fix copy entire vectors --- src/jsonata/Utils.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/jsonata/Utils.cpp b/src/jsonata/Utils.cpp index 5556ffe..a216c81 100644 --- a/src/jsonata/Utils.cpp +++ b/src/jsonata/Utils.cpp @@ -202,7 +202,7 @@ bool Utils::isArrayOfStrings(const std::any& v) { // Check for JList first if (v.type() == typeid(Utils::JList)) { - auto vec = arrayify(v); + const auto& vec = std::any_cast(v); for (const auto& item : vec) { if (!item.has_value() || item.type() != typeid(std::string)) { return false; @@ -214,7 +214,7 @@ bool Utils::isArrayOfStrings(const std::any& v) { // Also check for std::vector for backward compatibility (tests, // JSON parsing) if (v.type() == typeid(std::vector)) { - auto vec = std::any_cast>(v); + const auto& vec = std::any_cast&>(v); for (const auto& item : vec) { if (!item.has_value() || item.type() != typeid(std::string)) { return false; @@ -231,7 +231,7 @@ bool Utils::isArrayOfNumbers(const std::any& v) { // Check for JList first if (v.type() == typeid(Utils::JList)) { - auto vec = arrayify(v); + const auto& vec = std::any_cast(v); for (const auto& item : vec) { if (!isNumeric(item)) { return false; @@ -243,7 +243,7 @@ bool Utils::isArrayOfNumbers(const std::any& v) { // Also check for std::vector for 
backward compatibility (tests, // JSON parsing) if (v.type() == typeid(std::vector)) { - auto vec = std::any_cast>(v); + const auto& vec = std::any_cast&>(v); for (const auto& item : vec) { if (!isNumeric(item)) { return false; @@ -310,12 +310,10 @@ Utils::JList Utils::createSequence(const std::any& el) { } bool Utils::isSequence(const std::any& result) { - try { - auto jlist = std::any_cast(result); - return jlist.sequence; - } catch (const std::bad_any_cast&) { + if (!result.has_value() || result.type() != typeid(JList)) { return false; } + return std::any_cast(result).sequence; } std::any Utils::convertNumber(const std::any& n) { @@ -351,7 +349,7 @@ Utils::JList Utils::arrayify(const std::any& value) { // If it's a JList, handle both range and regular cases if (value.type() == typeid(JList)) { - const auto& jlist = std::any_cast(value); + const auto& jlist = std::any_cast(value); if (jlist.isRange()) { // Materialize the range into a regular JList @@ -361,7 +359,7 @@ Utils::JList Utils::arrayify(const std::any& value) { } return result; } else { - // For regular JList, return as-is + // For regular JList, return a copy return jlist; } } @@ -369,7 +367,7 @@ Utils::JList Utils::arrayify(const std::any& value) { // If it's a std::vector, convert to JList for backward // compatibility if (value.type() == typeid(std::vector)) { - const auto& vec = std::any_cast>(value); + const auto& vec = std::any_cast&>(value); JList result(vec); return result; } From 8d23c019c5b296206aa7e530548dbab0bedea458 Mon Sep 17 00:00:00 2001 From: Robert Yokota Date: Sat, 14 Mar 2026 14:36:04 -0700 Subject: [PATCH 3/3] Remove dead code --- src/jsonata/Functions.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/jsonata/Functions.cpp b/src/jsonata/Functions.cpp index cb3d2d0..0f1032c 100644 --- a/src/jsonata/Functions.cpp +++ b/src/jsonata/Functions.cpp @@ -2685,14 +2685,6 @@ bool Functions::contains(const std::string& str, const std::any& token) { } // Check if it's a regex 
object (stored as map with "type" = "regex") else { - try { - auto regex = std::any_cast(token); - auto matches = evaluateMatcher(regex, str); - return !matches.empty(); - } catch (const std::bad_any_cast&) { - // Not a regex object, fall through - } - // Java line 703: else throw new Error("unknown type to match: // "+token); For C++, fall back to string conversion as fallback auto tokenStr = string(token);