Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/jsonata/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>

namespace jsonata {

Expand Down Expand Up @@ -57,6 +58,8 @@ class Tokenizer {

// Instance variables
std::string path_;
std::vector<int32_t> codepoints_; // Pre-computed codepoints for O(1) access
std::vector<size_t> byte_offsets_; // Byte offset of each codepoint in path_
size_t position_;
size_t length_;
int64_t depth_;
Expand Down
8 changes: 0 additions & 8 deletions src/jsonata/Functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2685,14 +2685,6 @@ bool Functions::contains(const std::string& str, const std::any& token) {
}
// Check if it's a regex object (stored as map with "type" = "regex")
else {
try {
auto regex = std::any_cast<std::regex>(token);
auto matches = evaluateMatcher(regex, str);
return !matches.empty();
} catch (const std::bad_any_cast&) {
// Not a regex object, fall through
}

// Java line 703: else throw new Error("unknown type to match:
// "+token); For C++, fall back to string conversion as fallback
auto tokenStr = string(token);
Expand Down
45 changes: 20 additions & 25 deletions src/jsonata/Tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,15 @@ std::unordered_map<std::string, std::string> Tokenizer::createEscapes() {

Tokenizer::Tokenizer(const std::string& path)
    : path_(path), position_(0), depth_(0) {
    // Pre-compute the codepoint table and the byte offset of each codepoint
    // so that charAt() and substring() become O(1) instead of re-walking the
    // UTF-8 string with utf8::unchecked::advance on every call.
    //
    // NOTE(review): utf8::unchecked::next assumes path_ is valid UTF-8; a
    // malformed byte sequence here is undefined behavior — confirm that all
    // callers validate/trust their input before constructing a Tokenizer.
    auto it = path_.begin();
    while (it != path_.end()) {
        // Byte offset where the codepoint about to be decoded starts.
        byte_offsets_.push_back(static_cast<size_t>(it - path_.begin()));
        // next() decodes one codepoint and advances `it` past it.
        codepoints_.push_back(static_cast<int32_t>(utf8::unchecked::next(it)));
    }
    // Sentinel offset for end-of-string: lets substring(start, length_)
    // compute a byte range without a special case for end == length_.
    byte_offsets_.push_back(static_cast<size_t>(it - path_.begin()));
    // Length is measured in codepoints (Java String semantics), not bytes.
    length_ = codepoints_.size();
}

std::unique_ptr<Tokenizer::Token> Tokenizer::create(const std::string& type,
Expand All @@ -69,11 +76,7 @@ int32_t Tokenizer::charAt(size_t index) const {
if (index >= length_) {
return 0; // Match Java behavior: out of bounds returns 0/null char
}

// Use utfcpp to advance to the correct codepoint
auto it = path_.begin();
utf8::unchecked::advance(it, index);
return static_cast<int32_t>(utf8::unchecked::next(it));
return codepoints_[index];
}

// Helper method to extract substring by codepoint indices (like Java substring)
Expand All @@ -88,14 +91,8 @@ std::string Tokenizer::substring(size_t start, size_t end) const {
return "";
}

// Use utfcpp to find byte positions for the codepoint range
auto startIt = path_.begin();
utf8::unchecked::advance(startIt, start);

auto endIt = startIt;
utf8::unchecked::advance(endIt, end - start);

return std::string(startIt, endIt);
// Use pre-computed byte offsets for O(1) lookup
return path_.substr(byte_offsets_[start], byte_offsets_[end] - byte_offsets_[start]);
}

bool Tokenizer::isClosingSlash(size_t position) const {
Expand Down Expand Up @@ -371,23 +368,21 @@ std::unique_ptr<Tokenizer::Token> Tokenizer::next(bool prefix) {
throw JException("S0101", static_cast<int64_t>(position_));
}

// Test for numbers using codepoint-based substring (matches Java logic)
// Test for numbers using byte-level regex on remaining input (matches Java logic)
{
static const std::regex numregex(
"^-?(0|([1-9][0-9]*))(\\.[0-9]+)?([Ee][-+]?[0-9]+)?");
std::string remaining = substring(position_, length_);
std::smatch match;
if (std::regex_search(remaining, match, numregex) &&
// Use byte offsets to get iterators into the original string without copying
auto byteStart = path_.cbegin() + static_cast<std::string::difference_type>(byte_offsets_[position_]);
auto byteEnd = path_.cend();
std::cmatch match;
if (std::regex_search(&*byteStart, &*byteEnd, match, numregex) &&
match.position() == 0) {
std::string numStr = match.str(0);
double num = std::stod(numStr);
if (!std::isnan(num) && std::isfinite(num)) {
// Convert byte length to codepoint length for position
// advancement
size_t byteLen = numStr.length();
size_t codepointLen =
utf8::unchecked::distance(numStr.begin(), numStr.end());
position_ += codepointLen;
// Number literals are ASCII, so byte length == codepoint length
position_ += numStr.length();
return create("number", Utils::convertNumber(num));
}
throw JException("S0102", static_cast<int64_t>(position_));
Expand Down
22 changes: 9 additions & 13 deletions src/jsonata/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,6 @@ std::optional<std::string> Utils::type(const std::any& value) {
return "null";
} else if (isNumber(value)) {
return "number";
} else if (isNullValue(value)) {
return "null";
} else if (isString(value)) {
return "string";
} else if (isBoolean(value)) {
Expand All @@ -204,7 +202,7 @@ bool Utils::isArrayOfStrings(const std::any& v) {

// Check for JList first
if (v.type() == typeid(Utils::JList)) {
auto vec = arrayify(v);
const auto& vec = std::any_cast<const JList&>(v);
for (const auto& item : vec) {
if (!item.has_value() || item.type() != typeid(std::string)) {
return false;
Expand All @@ -216,7 +214,7 @@ bool Utils::isArrayOfStrings(const std::any& v) {
// Also check for std::vector<std::any> for backward compatibility (tests,
// JSON parsing)
if (v.type() == typeid(std::vector<std::any>)) {
auto vec = std::any_cast<std::vector<std::any>>(v);
const auto& vec = std::any_cast<const std::vector<std::any>&>(v);
for (const auto& item : vec) {
if (!item.has_value() || item.type() != typeid(std::string)) {
return false;
Expand All @@ -233,7 +231,7 @@ bool Utils::isArrayOfNumbers(const std::any& v) {

// Check for JList first
if (v.type() == typeid(Utils::JList)) {
auto vec = arrayify(v);
const auto& vec = std::any_cast<const JList&>(v);
for (const auto& item : vec) {
if (!isNumeric(item)) {
return false;
Expand All @@ -245,7 +243,7 @@ bool Utils::isArrayOfNumbers(const std::any& v) {
// Also check for std::vector<std::any> for backward compatibility (tests,
// JSON parsing)
if (v.type() == typeid(std::vector<std::any>)) {
auto vec = std::any_cast<std::vector<std::any>>(v);
const auto& vec = std::any_cast<const std::vector<std::any>&>(v);
for (const auto& item : vec) {
if (!isNumeric(item)) {
return false;
Expand Down Expand Up @@ -312,12 +310,10 @@ Utils::JList Utils::createSequence(const std::any& el) {
}

/// Returns true when `result` holds a JList whose `sequence` flag is set
/// (presumably set by createSequence — confirm against that helper).
///
/// Checks has_value() and the stored type up front instead of relying on a
/// thrown std::bad_any_cast: exceptions for control flow are expensive, and
/// this predicate is called on every evaluation result.
bool Utils::isSequence(const std::any& result) {
    if (!result.has_value() || result.type() != typeid(JList)) {
        return false;
    }
    // Cast by const reference — avoids copying the underlying list, which
    // the old by-value any_cast<JList> did on every call.
    return std::any_cast<const JList&>(result).sequence;
}

std::any Utils::convertNumber(const std::any& n) {
Expand Down Expand Up @@ -353,7 +349,7 @@ Utils::JList Utils::arrayify(const std::any& value) {

// If it's a JList, handle both range and regular cases
if (value.type() == typeid(JList)) {
const auto& jlist = std::any_cast<JList>(value);
const auto& jlist = std::any_cast<const JList&>(value);

if (jlist.isRange()) {
// Materialize the range into a regular JList
Expand All @@ -363,15 +359,15 @@ Utils::JList Utils::arrayify(const std::any& value) {
}
return result;
} else {
// For regular JList, return as-is
// For regular JList, return a copy
return jlist;
}
}

// If it's a std::vector<std::any>, convert to JList for backward
// compatibility
if (value.type() == typeid(std::vector<std::any>)) {
const auto& vec = std::any_cast<std::vector<std::any>>(value);
const auto& vec = std::any_cast<const std::vector<std::any>&>(value);
JList result(vec);
return result;
}
Expand Down
Loading