diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index aef8160df9..a5a91a5cf0 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -38,6 +38,24 @@ macro_rules! bail { type Result = std::result::Result; +/// Whether a character can start a grammar rule name. +/// +/// This includes ASCII alphabetic characters, underscores, and +/// non-ASCII Unicode symbols such as `⊥` (bottom) and `⊤` (top). +/// ASCII symbols are excluded because characters such as `+`, `|`, +/// `~`, and `^` are grammar syntax. +fn is_name_start(ch: char) -> bool { + ch.is_alphabetic() || ch == '_' || !ch.is_ascii() +} + +/// Whether a character can continue a grammar rule name. +/// +/// Accepts alphanumeric characters, underscores, and non-ASCII +/// characters. +fn is_name_continue(ch: char) -> bool { + ch.is_alphanumeric() || ch == '_' || !ch.is_ascii() +} + pub fn parse_grammar( input: &str, grammar: &mut Grammar, @@ -152,18 +170,11 @@ impl Parser<'_> { } fn parse_name(&mut self) -> Option<String> { - // Names must start with an alphabetic character or - // underscore. let first = self.input[self.index..].chars().next()?; - if !first.is_alphabetic() && first != '_' { + if !is_name_start(first) { return None; } - let name = self.take_while(&|c: char| c.is_alphanumeric() || c == '_'); - if name.is_empty() { - None - } else { - Some(name.to_string()) - } + Some(self.take_while(&is_name_continue).to_string()) } fn parse_expression(&mut self) -> Result> { @@ -231,7 +242,7 @@ impl Parser<'_> { } else if self.input[self.index..] 
.chars() .next() - .map(|ch| ch.is_alphanumeric()) + .map(is_name_start) .unwrap_or(false) { self.parse_nonterminal() diff --git a/tools/mdbook-spec/src/grammar.rs b/tools/mdbook-spec/src/grammar.rs index f0a4e3fe12..12ece5df7a 100644 --- a/tools/mdbook-spec/src/grammar.rs +++ b/tools/mdbook-spec/src/grammar.rs @@ -11,8 +11,13 @@ use std::sync::LazyLock; mod render_markdown; mod render_railroad; -static NAMES_RE: LazyLock<Regex> = - LazyLock::new(|| Regex::new(r"(?m)^(?:@root )?([A-Za-z0-9_]+)(?: \([^)]+\))? ->").unwrap()); +static NAMES_RE: LazyLock<Regex> = LazyLock::new(|| { + // Rule names are runs of ASCII identifier characters and/or + // non-ASCII characters (such as `⊥`), in any mix. This must be + // kept in sync with `is_name_start` and `is_name_continue` in + // `tools/grammar/src/parser.rs`. + Regex::new(r"(?m)^(?:@root )?((?:[A-Za-z0-9_]|[^\x00-\x7F])+)(?: \([^)]+\))? ->").unwrap() +}); #[derive(Debug)] pub struct RenderCtx {