Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/jsonata/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>

namespace jsonata {

Expand Down Expand Up @@ -57,6 +58,8 @@ class Tokenizer {

// Instance variables
std::string path_;
std::vector<int32_t> codepoints_; // Pre-computed codepoints for O(1) access
std::vector<size_t> byte_offsets_; // Byte offset of each codepoint in path_
size_t position_;
size_t length_;
int64_t depth_;
Expand Down
8 changes: 0 additions & 8 deletions src/jsonata/Functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2685,14 +2685,6 @@ bool Functions::contains(const std::string& str, const std::any& token) {
}
// Check if it's a regex object (stored as map with "type" = "regex")
else {
try {
auto regex = std::any_cast<std::regex>(token);
auto matches = evaluateMatcher(regex, str);
return !matches.empty();
} catch (const std::bad_any_cast&) {
// Not a regex object, fall through
}

// Java line 703: else throw new Error("unknown type to match:
// "+token); For C++, fall back to string conversion as fallback
auto tokenStr = string(token);
Expand Down
45 changes: 20 additions & 25 deletions src/jsonata/Tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,15 @@ std::unordered_map<std::string, std::string> Tokenizer::createEscapes() {

Tokenizer::Tokenizer(const std::string& path)
    : path_(path), position_(0), depth_(0) {
    // Pre-compute the codepoint table and the byte offset of each codepoint
    // so that charAt() and substring() become O(1) instead of re-walking the
    // UTF-8 string with utf8::unchecked::advance on every call.
    //
    // NOTE(review): utf8::unchecked::next assumes path_ is valid UTF-8; a
    // malformed byte sequence here is undefined behavior — confirm that all
    // callers validate/trust their input before constructing a Tokenizer.
    auto it = path_.begin();
    while (it != path_.end()) {
        // Byte offset where the codepoint about to be decoded starts.
        byte_offsets_.push_back(static_cast<size_t>(it - path_.begin()));
        // next() decodes one codepoint and advances `it` past it.
        codepoints_.push_back(static_cast<int32_t>(utf8::unchecked::next(it)));
    }
    // Sentinel offset for end-of-string: lets substring(start, length_)
    // compute a byte range without a special case for end == length_.
    byte_offsets_.push_back(static_cast<size_t>(it - path_.begin()));
    // Length is measured in codepoints (Java String semantics), not bytes.
    length_ = codepoints_.size();
}

std::unique_ptr<Tokenizer::Token> Tokenizer::create(const std::string& type,
Expand All @@ -69,11 +76,7 @@ int32_t Tokenizer::charAt(size_t index) const {
if (index >= length_) {
return 0; // Match Java behavior: out of bounds returns 0/null char
}

// Use utfcpp to advance to the correct codepoint
auto it = path_.begin();
utf8::unchecked::advance(it, index);
return static_cast<int32_t>(utf8::unchecked::next(it));
return codepoints_[index];
}

// Helper method to extract substring by codepoint indices (like Java substring)
Expand All @@ -88,14 +91,8 @@ std::string Tokenizer::substring(size_t start, size_t end) const {
return "";
}

// Use utfcpp to find byte positions for the codepoint range
auto startIt = path_.begin();
utf8::unchecked::advance(startIt, start);

auto endIt = startIt;
utf8::unchecked::advance(endIt, end - start);

return std::string(startIt, endIt);
// Use pre-computed byte offsets for O(1) lookup
return path_.substr(byte_offsets_[start], byte_offsets_[end] - byte_offsets_[start]);
}

bool Tokenizer::isClosingSlash(size_t position) const {
Expand Down Expand Up @@ -371,23 +368,21 @@ std::unique_ptr<Tokenizer::Token> Tokenizer::next(bool prefix) {
throw JException("S0101", static_cast<int64_t>(position_));
}

// Test for numbers using codepoint-based substring (matches Java logic)
// Test for numbers using byte-level regex on remaining input (matches Java logic)
{
static const std::regex numregex(
"^-?(0|([1-9][0-9]*))(\\.[0-9]+)?([Ee][-+]?[0-9]+)?");
std::string remaining = substring(position_, length_);
std::smatch match;
if (std::regex_search(remaining, match, numregex) &&
// Use byte offsets to get iterators into the original string without copying
auto byteStart = path_.cbegin() + static_cast<std::string::difference_type>(byte_offsets_[position_]);
auto byteEnd = path_.cend();
std::cmatch match;
if (std::regex_search(&*byteStart, &*byteEnd, match, numregex) &&
match.position() == 0) {
std::string numStr = match.str(0);
double num = std::stod(numStr);
if (!std::isnan(num) && std::isfinite(num)) {
// Convert byte length to codepoint length for position
// advancement
size_t byteLen = numStr.length();
size_t codepointLen =
utf8::unchecked::distance(numStr.begin(), numStr.end());
position_ += codepointLen;
// Number literals are ASCII, so byte length == codepoint length
position_ += numStr.length();
return create("number", Utils::convertNumber(num));
}
throw JException("S0102", static_cast<int64_t>(position_));
Expand Down
22 changes: 9 additions & 13 deletions src/jsonata/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,6 @@ std::optional<std::string> Utils::type(const std::any& value) {
return "null";
} else if (isNumber(value)) {
return "number";
} else if (isNullValue(value)) {
return "null";
} else if (isString(value)) {
return "string";
} else if (isBoolean(value)) {
Expand All @@ -204,7 +202,7 @@ bool Utils::isArrayOfStrings(const std::any& v) {

// Check for JList first
if (v.type() == typeid(Utils::JList)) {
auto vec = arrayify(v);
const auto& vec = std::any_cast<const JList&>(v);
for (const auto& item : vec) {
if (!item.has_value() || item.type() != typeid(std::string)) {
return false;
Expand All @@ -216,7 +214,7 @@ bool Utils::isArrayOfStrings(const std::any& v) {
// Also check for std::vector<std::any> for backward compatibility (tests,
// JSON parsing)
if (v.type() == typeid(std::vector<std::any>)) {
auto vec = std::any_cast<std::vector<std::any>>(v);
const auto& vec = std::any_cast<const std::vector<std::any>&>(v);
for (const auto& item : vec) {
if (!item.has_value() || item.type() != typeid(std::string)) {
return false;
Expand All @@ -233,7 +231,7 @@ bool Utils::isArrayOfNumbers(const std::any& v) {

// Check for JList first
if (v.type() == typeid(Utils::JList)) {
auto vec = arrayify(v);
const auto& vec = std::any_cast<const JList&>(v);
for (const auto& item : vec) {
if (!isNumeric(item)) {
return false;
Expand All @@ -245,7 +243,7 @@ bool Utils::isArrayOfNumbers(const std::any& v) {
// Also check for std::vector<std::any> for backward compatibility (tests,
// JSON parsing)
if (v.type() == typeid(std::vector<std::any>)) {
auto vec = std::any_cast<std::vector<std::any>>(v);
const auto& vec = std::any_cast<const std::vector<std::any>&>(v);
for (const auto& item : vec) {
if (!isNumeric(item)) {
return false;
Expand Down Expand Up @@ -312,12 +310,10 @@ Utils::JList Utils::createSequence(const std::any& el) {
}

/// Returns true when `result` holds a JList whose `sequence` flag is set
/// (presumably set by createSequence — confirm against that helper).
///
/// Checks has_value() and the stored type up front instead of relying on a
/// thrown std::bad_any_cast: exceptions for control flow are expensive, and
/// this predicate is called on every evaluation result.
bool Utils::isSequence(const std::any& result) {
    if (!result.has_value() || result.type() != typeid(JList)) {
        return false;
    }
    // Cast by const reference — avoids copying the underlying list, which
    // the old by-value any_cast<JList> did on every call.
    return std::any_cast<const JList&>(result).sequence;
}

std::any Utils::convertNumber(const std::any& n) {
Expand Down Expand Up @@ -353,7 +349,7 @@ Utils::JList Utils::arrayify(const std::any& value) {

// If it's a JList, handle both range and regular cases
if (value.type() == typeid(JList)) {
const auto& jlist = std::any_cast<JList>(value);
const auto& jlist = std::any_cast<const JList&>(value);

if (jlist.isRange()) {
// Materialize the range into a regular JList
Expand All @@ -363,15 +359,15 @@ Utils::JList Utils::arrayify(const std::any& value) {
}
return result;
} else {
// For regular JList, return as-is
// For regular JList, return a copy
return jlist;
}
}

// If it's a std::vector<std::any>, convert to JList for backward
// compatibility
if (value.type() == typeid(std::vector<std::any>)) {
const auto& vec = std::any_cast<std::vector<std::any>>(value);
const auto& vec = std::any_cast<const std::vector<std::any>&>(value);
JList result(vec);
return result;
}
Expand Down
Loading