diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b13885b..475671d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -168,6 +168,7 @@ jobs: working-directory: rust run: cargo test + - name: Test (no default features) working-directory: rust run: cargo test --no-default-features diff --git a/README.md b/README.md index 5d8ac88..01fb992 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,24 @@ const std::optional& get_last_error(); Returns the last parse error, if any. +### `lexer::get_last_error_location` + +```cpp +const std::optional& get_last_error_location(); +``` + +Returns the location of the last parse error, if available. Location tracking +is best-effort and may be unavailable. + +### `lexer::error_location` + +```cpp +struct error_location { + uint32_t line; // 1-based + uint32_t column; // 1-based (byte-oriented) +}; +``` + ## C API merve provides a C API (`merve_c.h`) for use from C programs, FFI bindings, or any language that can call C functions. The C API is compiled into the merve library alongside the C++ implementation. @@ -141,11 +159,13 @@ merve provides a C API (`merve_c.h`) for use from C programs, FFI bindings, or a ```c #include "merve_c.h" #include +#include int main(void) { const char* source = "exports.foo = 1;\nexports.bar = 2;\n"; - merve_analysis result = merve_parse_commonjs(source, strlen(source)); + merve_error_loc err_loc = {0, 0}; + merve_analysis result = merve_parse_commonjs(source, strlen(source), &err_loc); if (merve_is_valid(result)) { size_t count = merve_get_exports_count(result); @@ -157,6 +177,9 @@ int main(void) { } } else { printf("Parse error: %d\n", merve_get_last_error()); + if (err_loc.line != 0) { + printf(" at line %u, column %u\n", err_loc.line, err_loc.column); + } } merve_free(result); @@ -180,12 +203,13 @@ Found 2 exports: | `merve_string` | Non-owning string reference (`data` + `length`). Not null-terminated. | | `merve_analysis` | Opaque handle to a parse result. 
Must be freed with `merve_free()`. | | `merve_version_components` | Struct with `major`, `minor`, `revision` fields. | +| `merve_error_loc` | Error location (`line`, `column`). `{0,0}` means unavailable. | #### Functions | Function | Description | |----------|-------------| -| `merve_parse_commonjs(input, length)` | Parse CommonJS source. Returns a handle (NULL only on OOM). | +| `merve_parse_commonjs(input, length, out_err)` | Parse CommonJS source and optionally fill error location. Returns a handle (NULL only on OOM). | | `merve_is_valid(result)` | Check if parsing succeeded. NULL-safe. | | `merve_free(result)` | Free a parse result. NULL-safe. | | `merve_get_exports_count(result)` | Number of named exports found. | @@ -198,6 +222,9 @@ Found 2 exports: | `merve_get_version()` | Version string (e.g. `"1.0.1"`). | | `merve_get_version_components()` | Version as `{major, minor, revision}`. | +On parse failure, `merve_parse_commonjs` writes a non-zero location when +`out_err` is non-NULL and the location is available. + #### Error Constants | Constant | Value | Description | diff --git a/include/merve/parser.h b/include/merve/parser.h index 8d1ff42..6458872 100644 --- a/include/merve/parser.h +++ b/include/merve/parser.h @@ -3,6 +3,7 @@ #include "merve/version.h" +#include #include #include #include @@ -37,6 +38,17 @@ enum lexer_error { TEMPLATE_NEST_OVERFLOW, ///< Template literal nesting too deep }; +/** + * @brief Source location information for a parse error. + * + * - line and column are 1-based. + * - column is byte-oriented. + */ +struct error_location { + uint32_t line; + uint32_t column; +}; + /** * @brief Type alias for export names. * @@ -146,6 +158,18 @@ std::optional parse_commonjs(std::string_view file_contents); */ const std::optional& get_last_error(); +/** + * @brief Get the location of the last failed parse operation. + * + * @return const std::optional& The last error location, or + * std::nullopt if unavailable. 
+ * + * @note This is global state and may be overwritten by subsequent calls + * to parse_commonjs(). + * @note Location tracking is best-effort and may be unavailable. + */ +const std::optional& get_last_error_location(); + } // namespace lexer #endif // MERVE_PARSER_H diff --git a/include/merve_c.h b/include/merve_c.h index af4a9d7..035ce21 100644 --- a/include/merve_c.h +++ b/include/merve_c.h @@ -39,6 +39,19 @@ typedef struct { int revision; } merve_version_components; +/** + * @brief Source location for a parse error. + * + * - line and column are 1-based. + * - column is byte-oriented. + * + * A zeroed location (`{0, 0}`) means the location is unavailable. + */ +typedef struct { + uint32_t line; + uint32_t column; +} merve_error_loc; + /* Error codes corresponding to lexer::lexer_error values. */ #define MERVE_ERROR_TODO 0 #define MERVE_ERROR_UNEXPECTED_PAREN 1 @@ -59,20 +72,32 @@ extern "C" { #endif /** - * Parse CommonJS source code and extract export information. + * Parse CommonJS source code and optionally return error location. * * The source buffer must remain valid while accessing string_view-backed * export names from the returned handle. * + * If @p out_err is non-NULL, it is always written: + * - On success: set to {0, 0}. + * - On parse failure with known location: set to that location. + * - On parse failure without available location: set to {0, 0}. + * * You must call merve_free() on the returned handle when done. * - * @param input Pointer to the JavaScript source (need not be null-terminated). - * NULL is treated as an empty string. - * @param length Length of the input in bytes. + * @param input Pointer to the JavaScript source (need not be + * null-terminated). NULL is treated as an empty string. + * @param length Length of the input in bytes. + * @param out_err Optional output pointer for parse error location. * @return A handle to the parse result, or NULL on out-of-memory. * Use merve_is_valid() to check if parsing succeeded. 
*/ -merve_analysis merve_parse_commonjs(const char* input, size_t length); +#ifdef __cplusplus +merve_analysis merve_parse_commonjs(const char* input, size_t length, + merve_error_loc* out_err = nullptr); +#else +merve_analysis merve_parse_commonjs(const char* input, size_t length, + merve_error_loc* out_err); +#endif /** * Check whether the parse result is valid (parsing succeeded). @@ -165,7 +190,7 @@ const char* merve_get_version(void); merve_version_components merve_get_version_components(void); #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif #endif /* MERVE_C_H */ diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 6f19756..f4781cb 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -77,7 +77,7 @@ checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "merve" -version = "1.1.2" +version = "1.1.3" dependencies = [ "cc", "link_args", diff --git a/rust/README.md b/rust/README.md index e41045e..088bb55 100644 --- a/rust/README.md +++ b/rust/README.md @@ -51,7 +51,7 @@ merve = { version = "...", features = ["libcpp"] } ### `parse_commonjs` ```rust -pub fn parse_commonjs(source: &str) -> Result, LexerError> +pub fn parse_commonjs(source: &str) -> Result, LocatedLexerError> ``` Parse CommonJS source code and extract export information. The returned @@ -100,6 +100,17 @@ Returned when the input contains ESM syntax or malformed constructs: `LexerError` implements `Display` and, with the `std` feature, `std::error::Error`. +### `LocatedLexerError` + +```rust +pub struct LocatedLexerError { + pub kind: LexerError, + pub location: Option, +} +``` + +`ErrorLocation` uses 1-based `line`/`column` (byte-oriented column). + ### Versioning helpers ```rust diff --git a/rust/deps/merve.cpp b/rust/deps/merve.cpp index dbb39b3..2f7e846 100644 --- a/rust/deps/merve.cpp +++ b/rust/deps/merve.cpp @@ -314,6 +314,40 @@ struct StarExportBinding { // Thread-local state for error tracking (safe for concurrent parse calls). 
thread_local std::optional last_error; +thread_local std::optional last_error_location; + +static error_location makeErrorLocation(const char* source, const char* end, const char* at) { + const char* target = at; + if (target < source) target = source; + if (target > end) target = end; + + uint32_t line = 1; + uint32_t column = 1; + const char* cur = source; + + while (cur < target) { + const char ch = *cur++; + if (ch == '\n') { + line++; + column = 1; + continue; + } + if (ch == '\r') { + line++; + column = 1; + if (cur < target && *cur == '\n') { + cur++; + } + continue; + } + column++; + } + + error_location loc{}; + loc.line = line; + loc.column = column; + return loc; +} // Lexer state class class CJSLexer { @@ -334,6 +368,7 @@ class CJSLexer { std::array templateStack_; std::array openTokenPosStack_; + std::array openTokenTypeStack_; std::array openClassPosStack; std::array starExportStack_; StarExportBinding* starExportStack; @@ -485,9 +520,11 @@ class CJSLexer { } // Parsing utilities - void syntaxError(lexer_error code) { + void syntaxError(lexer_error code, const char* at = nullptr) { if (!last_error) { last_error = code; + const char* error_pos = at ? 
at : pos; + last_error_location = makeErrorLocation(source, end, error_pos); } pos = end + 1; } @@ -1490,6 +1527,7 @@ class CJSLexer { char ch = commentWhitespace(); switch (ch) { case '(': + openTokenTypeStack_[openTokenDepth] = '('; openTokenPosStack_[openTokenDepth++] = startPos; return; case '.': @@ -1503,7 +1541,7 @@ class CJSLexer { // It's something like import.metaData, not import.meta return; } - syntaxError(lexer_error::UNEXPECTED_ESM_IMPORT_META); + syntaxError(lexer_error::UNEXPECTED_ESM_IMPORT_META, startPos); } return; default: @@ -1518,17 +1556,18 @@ class CJSLexer { pos--; return; } - syntaxError(lexer_error::UNEXPECTED_ESM_IMPORT); + syntaxError(lexer_error::UNEXPECTED_ESM_IMPORT, startPos); } } void throwIfExportStatement() { + const char* startPos = pos; pos += 6; const char* curPos = pos; char ch = commentWhitespace(); if (pos == curPos && !isPunctuator(ch)) return; - syntaxError(lexer_error::UNEXPECTED_ESM_EXPORT); + syntaxError(lexer_error::UNEXPECTED_ESM_EXPORT, startPos); } public: @@ -1537,7 +1576,7 @@ class CJSLexer { templateStackDepth(0), openTokenDepth(0), templateDepth(0), line(1), lastSlashWasDivision(false), nextBraceIsClass(false), - templateStack_{}, openTokenPosStack_{}, openClassPosStack{}, + templateStack_{}, openTokenPosStack_{}, openTokenTypeStack_{}, openClassPosStack{}, starExportStack_{}, starExportStack(nullptr), STAR_EXPORT_STACK_END(nullptr), exports(out_exports), re_exports(out_re_exports) {} @@ -1602,6 +1641,7 @@ class CJSLexer { pos += 23; if (*pos == '(') { pos++; + openTokenTypeStack_[openTokenDepth] = '('; openTokenPosStack_[openTokenDepth++] = lastTokenPos; if (tryParseRequire(RequireType::Import) && keywordStart(startPos)) tryBacktrackAddStarExportBinding(startPos - 1); @@ -1611,6 +1651,7 @@ class CJSLexer { if (pos + 4 < end && matchesAt(pos, end, "Star")) pos += 4; if (*pos == '(') { + openTokenTypeStack_[openTokenDepth] = '('; openTokenPosStack_[openTokenDepth++] = lastTokenPos; if (*(pos + 1) == 'r') { pos++; 
@@ -1645,6 +1686,7 @@ class CJSLexer { tryParseObjectDefineOrKeys(openTokenDepth == 0); break; case '(': + openTokenTypeStack_[openTokenDepth] = '('; openTokenPosStack_[openTokenDepth++] = lastTokenPos; break; case ')': @@ -1657,6 +1699,7 @@ class CJSLexer { case '{': openClassPosStack[openTokenDepth] = nextBraceIsClass; nextBraceIsClass = false; + openTokenTypeStack_[openTokenDepth] = '{'; openTokenPosStack_[openTokenDepth++] = lastTokenPos; break; case '}': @@ -1719,6 +1762,19 @@ class CJSLexer { lastTokenPos = pos; } + if (!last_error) { + if (templateDepth != std::numeric_limits::max()) { + syntaxError(lexer_error::UNTERMINATED_TEMPLATE_STRING, end); + } else if (openTokenDepth != 0) { + const char open_ch = openTokenTypeStack_[openTokenDepth - 1]; + if (open_ch == '{') { + syntaxError(lexer_error::UNTERMINATED_BRACE, end); + } else { + syntaxError(lexer_error::UNTERMINATED_PAREN, end); + } + } + } + if (templateDepth != std::numeric_limits::max() || openTokenDepth || last_error) { return false; } @@ -1729,6 +1785,7 @@ class CJSLexer { std::optional parse_commonjs(std::string_view file_contents) { last_error.reset(); + last_error_location.reset(); lexer_analysis result; CJSLexer lexer(result.exports, result.re_exports); @@ -1744,6 +1801,10 @@ const std::optional& get_last_error() { return last_error; } +const std::optional& get_last_error_location() { + return last_error_location; +} + } // namespace lexer /* end file parser.cpp */ /* begin file merve_c.cpp */ @@ -1796,6 +1857,19 @@ typedef struct { int revision; } merve_version_components; +/** + * @brief Source location for a parse error. + * + * - line and column are 1-based. + * - column is byte-oriented. + * + * A zeroed location (`{0, 0}`) means the location is unavailable. + */ +typedef struct { + uint32_t line; + uint32_t column; +} merve_error_loc; + /* Error codes corresponding to lexer::lexer_error values. 
*/ #define MERVE_ERROR_TODO 0 #define MERVE_ERROR_UNEXPECTED_PAREN 1 @@ -1816,20 +1890,32 @@ extern "C" { #endif /** - * Parse CommonJS source code and extract export information. + * Parse CommonJS source code and optionally return error location. * * The source buffer must remain valid while accessing string_view-backed * export names from the returned handle. * + * If @p out_err is non-NULL, it is always written: + * - On success: set to {0, 0}. + * - On parse failure with known location: set to that location. + * - On parse failure without available location: set to {0, 0}. + * * You must call merve_free() on the returned handle when done. * - * @param input Pointer to the JavaScript source (need not be null-terminated). - * NULL is treated as an empty string. - * @param length Length of the input in bytes. + * @param input Pointer to the JavaScript source (need not be + * null-terminated). NULL is treated as an empty string. + * @param length Length of the input in bytes. + * @param out_err Optional output pointer for parse error location. * @return A handle to the parse result, or NULL on out-of-memory. * Use merve_is_valid() to check if parsing succeeded. */ -merve_analysis merve_parse_commonjs(const char* input, size_t length); +#ifdef __cplusplus +merve_analysis merve_parse_commonjs(const char* input, size_t length, + merve_error_loc* out_err = nullptr); +#else +merve_analysis merve_parse_commonjs(const char* input, size_t length, + merve_error_loc* out_err); +#endif /** * Check whether the parse result is valid (parsing succeeded). 
@@ -1922,7 +2008,7 @@ const char* merve_get_version(void); merve_version_components merve_get_version_components(void); #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif #endif /* MERVE_C_H */ @@ -1941,9 +2027,25 @@ static merve_string merve_string_create(const char* data, size_t length) { return out; } +static void merve_error_loc_clear(merve_error_loc* out_err) { + if (!out_err) return; + out_err->line = 0; + out_err->column = 0; +} + +static void merve_error_loc_set(merve_error_loc* out_err, + const lexer::error_location& loc) { + if (!out_err) return; + out_err->line = loc.line; + out_err->column = loc.column; +} + extern "C" { -merve_analysis merve_parse_commonjs(const char* input, size_t length) { +merve_analysis merve_parse_commonjs(const char* input, size_t length, + merve_error_loc* out_err) { + merve_error_loc_clear(out_err); + merve_analysis_impl* impl = new (std::nothrow) merve_analysis_impl(); if (!impl) return nullptr; if (input != nullptr) { @@ -1951,6 +2053,15 @@ merve_analysis merve_parse_commonjs(const char* input, size_t length) { } else { impl->result = lexer::parse_commonjs(std::string_view("", 0)); } + + if (!impl->result.has_value() && out_err) { + const std::optional& err_loc = + lexer::get_last_error_location(); + if (err_loc.has_value()) { + merve_error_loc_set(out_err, err_loc.value()); + } + } + return static_cast(impl); } diff --git a/rust/deps/merve.h b/rust/deps/merve.h index d30dec2..e37bcef 100644 --- a/rust/deps/merve.h +++ b/rust/deps/merve.h @@ -29,6 +29,7 @@ enum { #endif // MERVE_VERSION_H /* end file merve/version.h */ +#include #include #include #include @@ -63,6 +64,17 @@ enum lexer_error { TEMPLATE_NEST_OVERFLOW, ///< Template literal nesting too deep }; +/** + * @brief Source location information for a parse error. + * + * - line and column are 1-based. + * - column is byte-oriented. + */ +struct error_location { + uint32_t line; + uint32_t column; +}; + /** * @brief Type alias for export names. 
* @@ -172,6 +184,18 @@ std::optional parse_commonjs(std::string_view file_contents); */ const std::optional& get_last_error(); +/** + * @brief Get the location of the last failed parse operation. + * + * @return const std::optional& The last error location, or + * std::nullopt if unavailable. + * + * @note This is global state and may be overwritten by subsequent calls + * to parse_commonjs(). + * @note Location tracking is best-effort and may be unavailable. + */ +const std::optional& get_last_error_location(); + } // namespace lexer #endif // MERVE_PARSER_H diff --git a/rust/deps/merve_c.h b/rust/deps/merve_c.h index af4a9d7..035ce21 100644 --- a/rust/deps/merve_c.h +++ b/rust/deps/merve_c.h @@ -39,6 +39,19 @@ typedef struct { int revision; } merve_version_components; +/** + * @brief Source location for a parse error. + * + * - line and column are 1-based. + * - column is byte-oriented. + * + * A zeroed location (`{0, 0}`) means the location is unavailable. + */ +typedef struct { + uint32_t line; + uint32_t column; +} merve_error_loc; + /* Error codes corresponding to lexer::lexer_error values. */ #define MERVE_ERROR_TODO 0 #define MERVE_ERROR_UNEXPECTED_PAREN 1 @@ -59,20 +72,32 @@ extern "C" { #endif /** - * Parse CommonJS source code and extract export information. + * Parse CommonJS source code and optionally return error location. * * The source buffer must remain valid while accessing string_view-backed * export names from the returned handle. * + * If @p out_err is non-NULL, it is always written: + * - On success: set to {0, 0}. + * - On parse failure with known location: set to that location. + * - On parse failure without available location: set to {0, 0}. + * * You must call merve_free() on the returned handle when done. * - * @param input Pointer to the JavaScript source (need not be null-terminated). - * NULL is treated as an empty string. - * @param length Length of the input in bytes. 
+ * @param input Pointer to the JavaScript source (need not be + * null-terminated). NULL is treated as an empty string. + * @param length Length of the input in bytes. + * @param out_err Optional output pointer for parse error location. * @return A handle to the parse result, or NULL on out-of-memory. * Use merve_is_valid() to check if parsing succeeded. */ -merve_analysis merve_parse_commonjs(const char* input, size_t length); +#ifdef __cplusplus +merve_analysis merve_parse_commonjs(const char* input, size_t length, + merve_error_loc* out_err = nullptr); +#else +merve_analysis merve_parse_commonjs(const char* input, size_t length, + merve_error_loc* out_err); +#endif /** * Check whether the parse result is valid (parsing succeeded). @@ -165,7 +190,7 @@ const char* merve_get_version(void); merve_version_components merve_get_version_components(void); #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif #endif /* MERVE_C_H */ diff --git a/rust/src/ffi.rs b/rust/src/ffi.rs index a966915..ab7148f 100644 --- a/rust/src/ffi.rs +++ b/rust/src/ffi.rs @@ -25,8 +25,18 @@ pub struct merve_version_components { pub revision: c_int, } +#[repr(C)] +pub struct merve_error_loc { + pub line: u32, + pub column: u32, +} + unsafe extern "C" { - pub fn merve_parse_commonjs(input: *const c_char, length: usize) -> merve_analysis; + pub fn merve_parse_commonjs( + input: *const c_char, + length: usize, + out_err: *mut merve_error_loc, + ) -> merve_analysis; pub fn merve_is_valid(result: merve_analysis) -> bool; pub fn merve_free(result: merve_analysis); pub fn merve_get_exports_count(result: merve_analysis) -> usize; diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 0997598..841fe01 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -112,6 +112,66 @@ impl fmt::Display for LexerError { #[cfg(feature = "std")] impl std::error::Error for LexerError {} +/// 1-based error position. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct ErrorLocation { + /// 1-based line number. + pub line: NonZeroU32, + /// 1-based column number (byte-oriented). + pub column: NonZeroU32, +} + +impl ErrorLocation { + #[inline] + fn from_ffi(loc: ffi::merve_error_loc) -> Option { + Some(Self { + line: NonZeroU32::new(loc.line)?, + column: NonZeroU32::new(loc.column)?, + }) + } +} + +/// Lexer error with optional source location. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct LocatedLexerError { + /// Error kind reported by the lexer. + pub kind: LexerError, + /// Source location, if available. + pub location: Option, +} + +impl LocatedLexerError { + #[inline] + fn from_code_and_loc(code: i32, loc: ffi::merve_error_loc) -> Self { + let kind = if code >= 0 { + LexerError::from_code(code) + } else { + LexerError::Unknown(code) + }; + Self { + kind, + location: ErrorLocation::from_ffi(loc), + } + } +} + +impl fmt::Display for LocatedLexerError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(loc) = self.location { + write!( + f, + "{} at line {}, column {}", + self.kind, loc.line, loc.column + ) + } else { + write!(f, "{}", self.kind) + } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for LocatedLexerError {} + /// A parsed CommonJS analysis result. /// /// The lifetime `'a` is tied to the source string passed to [`parse_commonjs`], @@ -317,7 +377,7 @@ impl ExactSizeIterator for ExportIter<'_, '_> {} /// /// # Errors /// -/// Returns a [`LexerError`] if the input contains ESM syntax or other +/// Returns a [`LocatedLexerError`] if the input contains ESM syntax or other /// unsupported constructs. 
/// /// # Examples @@ -346,30 +406,33 @@ impl ExactSizeIterator for ExportIter<'_, '_> {} /// let _ = leaked; /// } /// ``` -pub fn parse_commonjs(source: &str) -> Result, LexerError> { +pub fn parse_commonjs(source: &str) -> Result, LocatedLexerError> { if source.is_empty() { - return Err(LexerError::EmptySource); + return Err(LocatedLexerError { + kind: LexerError::EmptySource, + location: Some(ErrorLocation { + line: NonZeroU32::new(1).expect("1 is non-zero"), + column: NonZeroU32::new(1).expect("1 is non-zero"), + }), + }); } - let handle = unsafe { ffi::merve_parse_commonjs(source.as_ptr().cast(), source.len()) }; + + let mut loc = ffi::merve_error_loc { line: 0, column: 0 }; + + let handle = + unsafe { ffi::merve_parse_commonjs(source.as_ptr().cast(), source.len(), &mut loc) }; if handle.is_null() { - // NULL means allocation failure; map to a generic error let code = unsafe { ffi::merve_get_last_error() }; - return Err(if code >= 0 { - LexerError::from_code(code) - } else { - LexerError::Unknown(code) - }); + return Err(LocatedLexerError::from_code_and_loc(code, loc)); } + if !unsafe { ffi::merve_is_valid(handle) } { let code = unsafe { ffi::merve_get_last_error() }; - let err = if code >= 0 { - LexerError::from_code(code) - } else { - LexerError::Unknown(code) - }; + let err = LocatedLexerError::from_code_and_loc(code, loc); unsafe { ffi::merve_free(handle) }; return Err(err); } + Ok(Analysis { handle, _source: PhantomData, @@ -455,7 +518,7 @@ mod tests { let result = parse_commonjs(source); assert!(result.is_err()); let err = result.unwrap_err(); - assert_eq!(err, LexerError::UnexpectedEsmImport); + assert_eq!(err.kind, LexerError::UnexpectedEsmImport); } #[test] @@ -464,14 +527,69 @@ mod tests { let result = parse_commonjs(source); assert!(result.is_err()); let err = result.unwrap_err(); - assert_eq!(err, LexerError::UnexpectedEsmExport); + assert_eq!(err.kind, LexerError::UnexpectedEsmExport); } #[test] fn empty_input() { let result = parse_commonjs(""); 
assert!(result.is_err()); - assert_eq!(result.unwrap_err(), LexerError::EmptySource); + let err = result.unwrap_err(); + assert_eq!(err.kind, LexerError::EmptySource); + let loc = err + .location + .expect("empty source location should be present"); + assert_eq!(loc.line, NonZeroU32::new(1).unwrap()); + assert_eq!(loc.column, NonZeroU32::new(1).unwrap()); + } + + #[test] + fn parse_reports_error_position() { + let source = "\n import 'x';"; + let result = parse_commonjs(source); + assert!(result.is_err()); + + let err = result.unwrap_err(); + assert_eq!(err.kind, LexerError::UnexpectedEsmImport); + let loc = err.location.expect("location should be present"); + assert_eq!(loc.line, NonZeroU32::new(2).unwrap()); + assert_eq!(loc.column, NonZeroU32::new(3).unwrap()); + } + + #[test] + fn parse_crlf_position() { + let source = "\r\n import 'x';"; + let result = parse_commonjs(source); + assert!(result.is_err()); + + let err = result.unwrap_err(); + assert_eq!(err.kind, LexerError::UnexpectedEsmImport); + let loc = err.location.expect("location should be present"); + assert_eq!(loc.line, NonZeroU32::new(2).unwrap()); + assert_eq!(loc.column, NonZeroU32::new(3).unwrap()); + } + + #[test] + fn parse_import_meta_and_eof() { + let import_meta = parse_commonjs("\n import.meta.url"); + assert!(import_meta.is_err()); + let import_meta_err = import_meta.unwrap_err(); + assert_eq!(import_meta_err.kind, LexerError::UnexpectedEsmImportMeta); + let import_meta_loc = import_meta_err + .location + .expect("import.meta location should be present"); + assert_eq!(import_meta_loc.line, NonZeroU32::new(2).unwrap()); + assert_eq!(import_meta_loc.column, NonZeroU32::new(3).unwrap()); + + let eof = parse_commonjs("(a + b"); + assert!(eof.is_err()); + let eof_err = eof.unwrap_err(); + assert_eq!(eof_err.kind, LexerError::UnterminatedParen); + let eof_loc = eof_err + .location + .expect("unterminated paren location should be present"); + assert_eq!(eof_loc.line, NonZeroU32::new(1).unwrap()); + 
assert_eq!(eof_loc.column, NonZeroU32::new(7).unwrap()); } #[test] @@ -558,6 +676,22 @@ mod tests { assert!(s.contains("99"), "got: {s}"); } + #[cfg(feature = "std")] + #[test] + fn located_error_display_includes_location() { + let err = LocatedLexerError { + kind: LexerError::UnexpectedEsmImport, + location: Some(ErrorLocation { + line: NonZeroU32::new(2).unwrap(), + column: NonZeroU32::new(4).unwrap(), + }), + }; + let s = format!("{err}"); + assert!(s.contains("line 2"), "got: {s}"); + assert!(s.contains("column 4"), "got: {s}"); + assert!(!s.contains("offset"), "got: {s}"); + } + #[test] fn error_from_code_roundtrip() { for code in 0..=12 { @@ -574,6 +708,13 @@ mod tests { assert_error::(); } + #[cfg(feature = "std")] + #[test] + fn located_error_is_std_error() { + fn assert_error() {} + assert_error::(); + } + #[test] fn bracket_notation_exports() { let source = r#"exports["hello-world"] = 1;"#; diff --git a/src/merve_c.cpp b/src/merve_c.cpp index 1909fa3..f05e460 100644 --- a/src/merve_c.cpp +++ b/src/merve_c.cpp @@ -14,9 +14,25 @@ static merve_string merve_string_create(const char* data, size_t length) { return out; } +static void merve_error_loc_clear(merve_error_loc* out_err) { + if (!out_err) return; + out_err->line = 0; + out_err->column = 0; +} + +static void merve_error_loc_set(merve_error_loc* out_err, + const lexer::error_location& loc) { + if (!out_err) return; + out_err->line = loc.line; + out_err->column = loc.column; +} + extern "C" { -merve_analysis merve_parse_commonjs(const char* input, size_t length) { +merve_analysis merve_parse_commonjs(const char* input, size_t length, + merve_error_loc* out_err) { + merve_error_loc_clear(out_err); + merve_analysis_impl* impl = new (std::nothrow) merve_analysis_impl(); if (!impl) return nullptr; if (input != nullptr) { @@ -24,6 +40,15 @@ merve_analysis merve_parse_commonjs(const char* input, size_t length) { } else { impl->result = lexer::parse_commonjs(std::string_view("", 0)); } + + if 
(!impl->result.has_value() && out_err) { + const std::optional& err_loc = + lexer::get_last_error_location(); + if (err_loc.has_value()) { + merve_error_loc_set(out_err, err_loc.value()); + } + } + return static_cast(impl); } diff --git a/src/parser.cpp b/src/parser.cpp index c977db0..3f87e8f 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -312,6 +312,40 @@ struct StarExportBinding { // Thread-local state for error tracking (safe for concurrent parse calls). thread_local std::optional last_error; +thread_local std::optional last_error_location; + +static error_location makeErrorLocation(const char* source, const char* end, const char* at) { + const char* target = at; + if (target < source) target = source; + if (target > end) target = end; + + uint32_t line = 1; + uint32_t column = 1; + const char* cur = source; + + while (cur < target) { + const char ch = *cur++; + if (ch == '\n') { + line++; + column = 1; + continue; + } + if (ch == '\r') { + line++; + column = 1; + if (cur < target && *cur == '\n') { + cur++; + } + continue; + } + column++; + } + + error_location loc{}; + loc.line = line; + loc.column = column; + return loc; +} // Lexer state class class CJSLexer { @@ -332,6 +366,7 @@ class CJSLexer { std::array templateStack_; std::array openTokenPosStack_; + std::array openTokenTypeStack_; std::array openClassPosStack; std::array starExportStack_; StarExportBinding* starExportStack; @@ -483,9 +518,11 @@ class CJSLexer { } // Parsing utilities - void syntaxError(lexer_error code) { + void syntaxError(lexer_error code, const char* at = nullptr) { if (!last_error) { last_error = code; + const char* error_pos = at ? 
at : pos; + last_error_location = makeErrorLocation(source, end, error_pos); } pos = end + 1; } @@ -1488,6 +1525,7 @@ class CJSLexer { char ch = commentWhitespace(); switch (ch) { case '(': + openTokenTypeStack_[openTokenDepth] = '('; openTokenPosStack_[openTokenDepth++] = startPos; return; case '.': @@ -1501,7 +1539,7 @@ class CJSLexer { // It's something like import.metaData, not import.meta return; } - syntaxError(lexer_error::UNEXPECTED_ESM_IMPORT_META); + syntaxError(lexer_error::UNEXPECTED_ESM_IMPORT_META, startPos); } return; default: @@ -1516,17 +1554,18 @@ class CJSLexer { pos--; return; } - syntaxError(lexer_error::UNEXPECTED_ESM_IMPORT); + syntaxError(lexer_error::UNEXPECTED_ESM_IMPORT, startPos); } } void throwIfExportStatement() { + const char* startPos = pos; pos += 6; const char* curPos = pos; char ch = commentWhitespace(); if (pos == curPos && !isPunctuator(ch)) return; - syntaxError(lexer_error::UNEXPECTED_ESM_EXPORT); + syntaxError(lexer_error::UNEXPECTED_ESM_EXPORT, startPos); } public: @@ -1535,7 +1574,7 @@ class CJSLexer { templateStackDepth(0), openTokenDepth(0), templateDepth(0), line(1), lastSlashWasDivision(false), nextBraceIsClass(false), - templateStack_{}, openTokenPosStack_{}, openClassPosStack{}, + templateStack_{}, openTokenPosStack_{}, openTokenTypeStack_{}, openClassPosStack{}, starExportStack_{}, starExportStack(nullptr), STAR_EXPORT_STACK_END(nullptr), exports(out_exports), re_exports(out_re_exports) {} @@ -1600,6 +1639,7 @@ class CJSLexer { pos += 23; if (*pos == '(') { pos++; + openTokenTypeStack_[openTokenDepth] = '('; openTokenPosStack_[openTokenDepth++] = lastTokenPos; if (tryParseRequire(RequireType::Import) && keywordStart(startPos)) tryBacktrackAddStarExportBinding(startPos - 1); @@ -1609,6 +1649,7 @@ class CJSLexer { if (pos + 4 < end && matchesAt(pos, end, "Star")) pos += 4; if (*pos == '(') { + openTokenTypeStack_[openTokenDepth] = '('; openTokenPosStack_[openTokenDepth++] = lastTokenPos; if (*(pos + 1) == 'r') { pos++; 
@@ -1643,6 +1684,7 @@ class CJSLexer { tryParseObjectDefineOrKeys(openTokenDepth == 0); break; case '(': + openTokenTypeStack_[openTokenDepth] = '('; openTokenPosStack_[openTokenDepth++] = lastTokenPos; break; case ')': @@ -1655,6 +1697,7 @@ class CJSLexer { case '{': openClassPosStack[openTokenDepth] = nextBraceIsClass; nextBraceIsClass = false; + openTokenTypeStack_[openTokenDepth] = '{'; openTokenPosStack_[openTokenDepth++] = lastTokenPos; break; case '}': @@ -1717,6 +1760,19 @@ class CJSLexer { lastTokenPos = pos; } + if (!last_error) { + if (templateDepth != std::numeric_limits::max()) { + syntaxError(lexer_error::UNTERMINATED_TEMPLATE_STRING, end); + } else if (openTokenDepth != 0) { + const char open_ch = openTokenTypeStack_[openTokenDepth - 1]; + if (open_ch == '{') { + syntaxError(lexer_error::UNTERMINATED_BRACE, end); + } else { + syntaxError(lexer_error::UNTERMINATED_PAREN, end); + } + } + } + if (templateDepth != std::numeric_limits::max() || openTokenDepth || last_error) { return false; } @@ -1727,6 +1783,7 @@ class CJSLexer { std::optional<lexer_analysis> parse_commonjs(std::string_view file_contents) { last_error.reset(); + last_error_location.reset(); lexer_analysis result; CJSLexer lexer(result.exports, result.re_exports); @@ -1742,4 +1799,8 @@ const std::optional<lexer_error>& get_last_error() { return last_error; } +const std::optional<error_location>& get_last_error_location() { + return last_error_location; +} + } // namespace lexer diff --git a/tests/c_api_compile_test.c b/tests/c_api_compile_test.c index 77caaa1..e9419de 100644 --- a/tests/c_api_compile_test.c +++ b/tests/c_api_compile_test.c @@ -23,6 +23,15 @@ static void check_types(void) { merve_analysis a = (merve_analysis)0; (void)a; + merve_error_loc loc; + loc.line = 0; + loc.column = 0; + (void)loc; + + merve_analysis (*parse_fn)(const char*, size_t, merve_error_loc*) = + &merve_parse_commonjs; + (void)parse_fn; + /* Verify the error constants are valid integer constant expressions.
*/ int errors[] = { MERVE_ERROR_TODO, diff --git a/tests/c_api_tests.cpp b/tests/c_api_tests.cpp index cdcd0e4..ebc7386 100644 --- a/tests/c_api_tests.cpp +++ b/tests/c_api_tests.cpp @@ -11,6 +11,10 @@ static bool merve_string_eq(merve_string s, const char* expected) { return std::memcmp(s.data, expected, expected_len) == 0; } +static bool merve_error_loc_is_zero(merve_error_loc loc) { + return loc.line == 0 && loc.column == 0; +} + TEST(c_api_tests, version_string) { const char* version = merve_get_version(); ASSERT_NE(version, nullptr); @@ -106,6 +110,102 @@ TEST(c_api_tests, esm_import_error) { merve_free(result); } +TEST(c_api_tests, parse_commonjs_success_clears_error_location) { + const char* source = "exports.foo = 1;"; + merve_error_loc loc{9, 9}; + merve_analysis result = + merve_parse_commonjs(source, std::strlen(source), &loc); + ASSERT_NE(result, nullptr); + ASSERT_TRUE(merve_is_valid(result)); + ASSERT_TRUE(merve_error_loc_is_zero(loc)); + merve_free(result); +} + +TEST(c_api_tests, parse_commonjs_error_location) { + const char* source = "\n import 'x';"; + merve_error_loc loc{123, 456}; + merve_analysis result = + merve_parse_commonjs(source, std::strlen(source), &loc); + ASSERT_NE(result, nullptr); + ASSERT_FALSE(merve_is_valid(result)); + ASSERT_EQ(merve_get_last_error(), MERVE_ERROR_UNEXPECTED_ESM_IMPORT); + + ASSERT_EQ(loc.line, 2u); + ASSERT_EQ(loc.column, 3u); + + merve_free(result); +} + +TEST(c_api_tests, parse_commonjs_error_location_crlf) { + const char* source = "\r\n import 'x';"; + merve_error_loc loc{123, 456}; + merve_analysis result = + merve_parse_commonjs(source, std::strlen(source), &loc); + ASSERT_NE(result, nullptr); + ASSERT_FALSE(merve_is_valid(result)); + ASSERT_EQ(merve_get_last_error(), MERVE_ERROR_UNEXPECTED_ESM_IMPORT); + + ASSERT_EQ(loc.line, 2u); + ASSERT_EQ(loc.column, 3u); + + merve_free(result); +} + +TEST(c_api_tests, parse_commonjs_export_error_location) { + const char* source = "\n export { x };"; + merve_error_loc 
loc{123, 456}; + merve_analysis result = + merve_parse_commonjs(source, std::strlen(source), &loc); + ASSERT_NE(result, nullptr); + ASSERT_FALSE(merve_is_valid(result)); + ASSERT_EQ(merve_get_last_error(), MERVE_ERROR_UNEXPECTED_ESM_EXPORT); + + ASSERT_EQ(loc.line, 2u); + ASSERT_EQ(loc.column, 3u); + + merve_free(result); +} + +TEST(c_api_tests, parse_commonjs_import_meta_error_location) { + const char* source = "\n import.meta.url"; + merve_error_loc loc{123, 456}; + merve_analysis result = + merve_parse_commonjs(source, std::strlen(source), &loc); + ASSERT_NE(result, nullptr); + ASSERT_FALSE(merve_is_valid(result)); + ASSERT_EQ(merve_get_last_error(), MERVE_ERROR_UNEXPECTED_ESM_IMPORT_META); + + ASSERT_EQ(loc.line, 2u); + ASSERT_EQ(loc.column, 3u); + + merve_free(result); +} + +TEST(c_api_tests, parse_commonjs_eof_unterminated_paren_location) { + const char* source = "(a + b"; + merve_error_loc loc{123, 456}; + merve_analysis result = + merve_parse_commonjs(source, std::strlen(source), &loc); + ASSERT_NE(result, nullptr); + ASSERT_FALSE(merve_is_valid(result)); + ASSERT_EQ(merve_get_last_error(), MERVE_ERROR_UNTERMINATED_PAREN); + + ASSERT_EQ(loc.line, 1u); + ASSERT_EQ(loc.column, 7u); + + merve_free(result); +} + +TEST(c_api_tests, parse_commonjs_accepts_null_out_err) { + const char* source = "import 'x';"; + merve_analysis result = + merve_parse_commonjs(source, std::strlen(source), NULL); + ASSERT_NE(result, nullptr); + ASSERT_FALSE(merve_is_valid(result)); + ASSERT_EQ(merve_get_last_error(), MERVE_ERROR_UNEXPECTED_ESM_IMPORT); + merve_free(result); +} + TEST(c_api_tests, esm_export_error) { const char* source = "export { x };"; merve_analysis result = merve_parse_commonjs(source, std::strlen(source)); diff --git a/tests/real_world_tests.cpp b/tests/real_world_tests.cpp index 08e54f4..12a3bf7 100644 --- a/tests/real_world_tests.cpp +++ b/tests/real_world_tests.cpp @@ -980,6 +980,70 @@ TEST(real_world_tests, esm_syntax_error_import_meta) { ASSERT_EQ(err, 
lexer::lexer_error::UNEXPECTED_ESM_IMPORT_META); } +TEST(real_world_tests, eof_unterminated_brace_error) { + auto result = lexer::parse_commonjs("(function test() {"); + ASSERT_FALSE(result.has_value()); + auto err = lexer::get_last_error(); + ASSERT_TRUE(err.has_value()); + ASSERT_EQ(err, lexer::lexer_error::UNTERMINATED_BRACE); +} + +TEST(real_world_tests, eof_unterminated_paren_error) { + auto result = lexer::parse_commonjs("(a + b"); + ASSERT_FALSE(result.has_value()); + auto err = lexer::get_last_error(); + ASSERT_TRUE(err.has_value()); + ASSERT_EQ(err, lexer::lexer_error::UNTERMINATED_PAREN); +} + +TEST(real_world_tests, error_location_state_resets_after_success) { + auto failed = lexer::parse_commonjs("\n import 'x';"); + ASSERT_FALSE(failed.has_value()); + + auto loc_after_error = lexer::get_last_error_location(); + ASSERT_TRUE(loc_after_error.has_value()); + ASSERT_EQ(loc_after_error->line, 2u); + ASSERT_EQ(loc_after_error->column, 3u); + + auto ok = lexer::parse_commonjs("exports.ok = 1;"); + ASSERT_TRUE(ok.has_value()); + ASSERT_FALSE(lexer::get_last_error_location().has_value()); +} + +TEST(real_world_tests, error_location_crlf_line_counting) { + auto failed = lexer::parse_commonjs("\r\n import 'x';"); + ASSERT_FALSE(failed.has_value()); + auto err = lexer::get_last_error(); + ASSERT_TRUE(err.has_value()); + ASSERT_EQ(err, lexer::lexer_error::UNEXPECTED_ESM_IMPORT); + + auto loc = lexer::get_last_error_location(); + ASSERT_TRUE(loc.has_value()); + ASSERT_EQ(loc->line, 2u); + ASSERT_EQ(loc->column, 3u); +} + +TEST(real_world_tests, error_location_import_meta_and_eof) { + auto import_meta = lexer::parse_commonjs("\n import.meta.url"); + ASSERT_FALSE(import_meta.has_value()); + ASSERT_EQ(lexer::get_last_error(), + lexer::lexer_error::UNEXPECTED_ESM_IMPORT_META); + + auto import_meta_loc = lexer::get_last_error_location(); + ASSERT_TRUE(import_meta_loc.has_value()); + ASSERT_EQ(import_meta_loc->line, 2u); + ASSERT_EQ(import_meta_loc->column, 3u); + + auto 
eof_unterminated = lexer::parse_commonjs("(a + b"); + ASSERT_FALSE(eof_unterminated.has_value()); + ASSERT_EQ(lexer::get_last_error(), lexer::lexer_error::UNTERMINATED_PAREN); + + auto eof_loc = lexer::get_last_error_location(); + ASSERT_TRUE(eof_loc.has_value()); + ASSERT_EQ(eof_loc->line, 1u); + ASSERT_EQ(eof_loc->column, 7u); +} + TEST(real_world_tests, unicode_escape_sequences) { // Test various unicode escape sequences in exports auto result = lexer::parse_commonjs("\