From 22141e59989ca41991f839c02064e70e7c7fb112 Mon Sep 17 00:00:00 2001 From: Ivan Glazunov Date: Tue, 17 Mar 2026 13:06:20 +0300 Subject: [PATCH 1/2] feat: add rdf N-Triples graph load source --- src/formats/mod.rs | 18 +++- src/formats/nt.rs | 235 ++++++++++++++++++++++++++++++++++++++++++ src/graph/inmemory.rs | 30 +++++- 3 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 src/formats/nt.rs diff --git a/src/formats/mod.rs b/src/formats/mod.rs index 4aafb79..9656658 100644 --- a/src/formats/mod.rs +++ b/src/formats/mod.rs @@ -4,18 +4,25 @@ //! //! ```no_run //! use pathrex::graph::{Graph, InMemory, GraphDecomposition}; -//! use pathrex::formats::Csv; +//! use pathrex::formats::{Csv, NTriples}; //! use std::fs::File; //! //! // Build from CSV in one line //! let g = Graph::<InMemory>::try_from( //! Csv::from_reader(File::open("edges.csv").unwrap()).unwrap() //! ).unwrap(); +//! +//! // Build from N-Triples in one line +//! let g2 = Graph::<InMemory>::try_from( +//! NTriples::new(File::open("data.nt").unwrap()) +//! ).unwrap(); //! ``` pub mod csv; +pub mod nt; pub use csv::Csv; +pub use nt::NTriples; use thiserror::Error; @@ -33,4 +40,13 @@ pub enum FormatError { /// An I/O error occurred while reading the data source. #[error("I/O error: {0}")] Io(#[from] std::io::Error), + + /// An error produced by the N-Triples parser. + #[error("N-Triples parse error: {0}")] + NTriples(String), + + /// An RDF literal appeared as a subject or object where a node IRI or + /// blank node was expected. + #[error("RDF literal cannot be used as a graph node (triple skipped)")] + LiteralAsNode, } diff --git a/src/formats/nt.rs b/src/formats/nt.rs new file mode 100644 index 0000000..75866f5 --- /dev/null +++ b/src/formats/nt.rs @@ -0,0 +1,235 @@ +//! N-Triples edge iterator for the formats layer. +//! +//! ```no_run +//! use pathrex::formats::NTriples; +//! use pathrex::formats::FormatError; +//! +//! # let reader = std::io::empty(); +//! let iter = NTriples::new(reader) +//!
.filter_map(|r| match r { +//! Err(FormatError::LiteralAsNode) => None, // skip +//! other => Some(other), +//! }); +//! ``` +//! +//! To load into a graph: +//! +//! ```no_run +//! use pathrex::graph::{Graph, InMemory, GraphDecomposition}; +//! use pathrex::formats::NTriples; +//! use std::fs::File; +//! +//! let graph = Graph::::try_from( +//! NTriples::new(File::open("data.nt").unwrap()) +//! ).unwrap(); +//! ``` + +use std::io::Read; + +use oxrdf::{NamedOrBlankNode, Term}; +use oxttl::NTriplesParser; +use oxttl::ntriples::ReaderNTriplesParser; + +use crate::formats::FormatError; +use crate::graph::Edge; + +/// Controls how predicate IRIs are converted to edge label strings. +#[derive(Debug, Clone, Default)] +pub enum LabelExtraction { + /// Use only the local name: the fragment (`#name`) or last path segment. + /// For example, `http://example.org/ns/knows` → `"knows"`. + /// This is the default. + #[default] + LocalName, + /// Use the full IRI string as the label. + /// For example, `http://example.org/ns/knows` → `"http://example.org/ns/knows"`. + FullIri, +} + +/// An iterator that reads N-Triples and yields `Result`. 
+/// +/// # Example +/// +/// ```no_run +/// use pathrex::formats::nt::NTriples; +/// use std::fs::File; +/// +/// let file = File::open("data.nt").unwrap(); +/// let iter = NTriples::new(file); +/// for result in iter { +/// let edge = result.unwrap(); +/// println!("{} --{}--> {}", edge.source, edge.label, edge.target); +/// } +/// ``` +pub struct NTriples { + inner: ReaderNTriplesParser, + label_extraction: LabelExtraction, +} + +impl NTriples { + pub fn new(reader: R) -> Self { + Self::with_label_extraction(reader, LabelExtraction::LocalName) + } + + pub fn with_label_extraction(reader: R, label_extraction: LabelExtraction) -> Self { + Self { + inner: NTriplesParser::new().for_reader(reader), + label_extraction, + } + } + + fn subject_to_node_id(subject: NamedOrBlankNode) -> String { + match subject { + NamedOrBlankNode::NamedNode(n) => n.into_string(), + NamedOrBlankNode::BlankNode(b) => format!("_:{}", b.as_str()), + } + } + + fn object_to_node_id(object: Term) -> Result { + match object { + Term::NamedNode(n) => Ok(n.into_string()), + Term::BlankNode(b) => Ok(format!("_:{}", b.as_str())), + Term::Literal(_) => Err(FormatError::LiteralAsNode), + } + } + + fn extract_label(iri: &str, strategy: &LabelExtraction) -> String { + match strategy { + LabelExtraction::FullIri => iri.to_owned(), + LabelExtraction::LocalName => { + // Fragment takes priority, then last path segment. + if let Some(pos) = iri.rfind('#') { + iri[pos + 1..].to_owned() + } else if let Some(pos) = iri.rfind('/') { + iri[pos + 1..].to_owned() + } else { + iri.to_owned() + } + } + } + } +} + +impl Iterator for NTriples { + type Item = Result; + + fn next(&mut self) -> Option { + let triple = match self.inner.next()? 
{ + Ok(t) => t, + Err(e) => return Some(Err(FormatError::NTriples(e.to_string()))), + }; + + let source = Self::subject_to_node_id(triple.subject.into()); + let label = Self::extract_label(triple.predicate.as_str(), &self.label_extraction); + let target = match Self::object_to_node_id(triple.object) { + Ok(t) => t, + Err(e) => return Some(Err(e)), + }; + + Some(Ok(Edge { + source, + target, + label, + })) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(nt: &str) -> Vec> { + NTriples::new(nt.as_bytes()).collect() + } + + #[test] + fn test_basic_ntriples() { + let nt = " .\n\ + .\n"; + let edges = parse(nt); + assert_eq!(edges.len(), 2); + + let e0 = edges[0].as_ref().unwrap(); + assert_eq!(e0.source, "http://example.org/Alice"); + assert_eq!(e0.target, "http://example.org/Bob"); + assert_eq!(e0.label, "knows"); + + let e1 = edges[1].as_ref().unwrap(); + assert_eq!(e1.source, "http://example.org/Bob"); + assert_eq!(e1.target, "http://example.org/Charlie"); + assert_eq!(e1.label, "likes"); + } + + #[test] + fn test_full_iri_label_extraction() { + let nt = + " .\n"; + let edges: Vec<_> = + NTriples::with_label_extraction(nt.as_bytes(), LabelExtraction::FullIri).collect(); + + assert_eq!(edges.len(), 1); + assert_eq!(edges[0].as_ref().unwrap().label, "http://example.org/knows"); + } + + #[test] + fn test_blank_node_subject_and_object() { + let nt = "_:b1 _:b2 .\n"; + let edges = parse(nt); + assert_eq!(edges.len(), 1); + + let e = edges[0].as_ref().unwrap(); + assert_eq!(e.source, "_:b1"); + assert_eq!(e.target, "_:b2"); + } + + #[test] + fn test_literal_object_yields_error() { + let nt = " \"Alice\" .\n"; + let edges = parse(nt); + assert_eq!(edges.len(), 1); + assert!( + matches!(edges[0], Err(FormatError::LiteralAsNode)), + "literal object should yield LiteralAsNode error" + ); + } + + #[test] + fn test_caller_can_skip_literal_triples() { + let nt = " .\n\ + \"Alice\" .\n\ + .\n"; + let edges: Vec<_> = NTriples::new(nt.as_bytes()) + .filter_map(|r| 
match r { + Err(FormatError::LiteralAsNode) => None, + other => Some(other), + }) + .collect(); + + assert_eq!(edges.len(), 2, "literal triple should be skipped"); + assert!(edges.iter().all(|r| r.is_ok())); + } + + #[test] + fn test_fragment_iri_local_name() { + let nt = + " .\n"; + let edges = parse(nt); + assert_eq!(edges[0].as_ref().unwrap().label, "knows"); + } + + #[test] + fn test_ntriples_graph_source() { + use crate::graph::{GraphBuilder, GraphDecomposition, InMemoryBuilder}; + + let nt = " .\n\ + .\n"; + let iter = NTriples::new(nt.as_bytes()); + + let graph = InMemoryBuilder::default() + .load(iter) + .expect("load should succeed") + .build() + .expect("build should succeed"); + assert_eq!(graph.num_nodes(), 3); + } +} diff --git a/src/graph/inmemory.rs b/src/graph/inmemory.rs index d31916c..6135023 100644 --- a/src/graph/inmemory.rs +++ b/src/graph/inmemory.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use std::{collections::HashMap, io::Read}; -use crate::formats::Csv; +use crate::formats::{Csv, NTriples}; use crate::{ graph::GraphSource, lagraph_sys::{GrB_Index, GrB_Matrix, GrB_Matrix_free, LAGraph_Kind}, @@ -191,6 +191,15 @@ impl GraphSource for Csv { } } +impl GraphSource for NTriples { + fn apply_to(self, mut builder: InMemoryBuilder) -> Result { + for item in self { + builder.push_edge(item?)?; + } + Ok(builder) + } +} + #[cfg(test)] mod tests { use super::*; @@ -278,4 +287,23 @@ mod tests { assert!(graph.get_graph("knows").is_ok()); assert!(graph.get_graph("likes").is_ok()); } + + #[test] + fn test_with_stream_from_ntriples() { + use crate::formats::nt::NTriples; + + let nt = " .\n\ + .\n\ + .\n"; + + let graph = InMemoryBuilder::new() + .load(NTriples::new(nt.as_bytes())) + .expect("load should succeed") + .build() + .expect("build should succeed"); + + assert_eq!(graph.num_nodes(), 3); + assert!(graph.get_graph("knows").is_ok()); + assert!(graph.get_graph("likes").is_ok()); + } } From 7fbc86953ae18e071d7fbb91509bf32e9257bf81 Mon Sep 17 00:00:00 2001 
From: Ivan Glazunov Date: Thu, 19 Mar 2026 05:44:21 +0300 Subject: [PATCH 2/2] doc: update AGENTS.md --- AGENTS.md | 36 +++++++++++++++++++++++++++++++----- src/graph/mod.rs | 2 +- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index a71b1cb..a1227d7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -26,7 +26,8 @@ pathrex/ │ │ └── inmemory.rs # InMemory marker, InMemoryBuilder, InMemoryGraph │ └── formats/ │ ├── mod.rs # FormatError enum, re-exports -│ └── csv.rs # Csv — CSV → Edge iterator (CsvConfig, ColumnSpec) +│ ├── csv.rs # Csv — CSV → Edge iterator (CsvConfig, ColumnSpec) +│ └── nt.rs # NTriples — N-Triples → Edge iterator (LabelExtraction) ├── tests/ │ └── inmemory_tests.rs # Integration tests for InMemoryBuilder / InMemoryGraph ├── deps/ @@ -187,9 +188,13 @@ into a single graph. ### Format parsers -[`Csv`](src/formats/csv.rs:52) is the only built-in parser. It yields -`Iterator<Item = Result<Edge, FormatError>>` and is directly pluggable into -`GraphBuilder::load()` via its `GraphSource` impl. +Two built-in parsers are available, both yielding +`Iterator<Item = Result<Edge, FormatError>>` and pluggable into +`GraphBuilder::load()` via their `GraphSource` impls. + +#### `Csv` + +[`Csv`](src/formats/csv.rs:52) parses delimiter-separated edge files. Configuration is via [`CsvConfig`](src/formats/csv.rs:17): @@ -204,6 +209,27 @@ Configuration is via [`CsvConfig`](src/formats/csv.rs:17): [`ColumnSpec`](src/formats/csv.rs:11) is either `Index(usize)` or `Name(String)`. Name-based lookup requires `has_header: true`. +#### `NTriples` + +[`NTriples`](src/formats/nt.rs:57) parses [W3C N-Triples](https://www.w3.org/TR/n-triples/) +RDF files using `oxttl`. Each triple `(subject, predicate, object)` becomes an +[`Edge`](src/graph/mod.rs:154) where: + +- `source` — subject IRI or blank-node ID (`_:label`). +- `target` — object IRI or blank-node ID; triples whose object is an RDF + literal yield `Err(FormatError::LiteralAsNode)` (callers may filter these out).
+- `label` — predicate IRI, transformed by [`LabelExtraction`](src/formats/nt.rs:36): + +| Variant | Behaviour | +|---|---| +| `LocalName` (default) | Fragment (`#name`) or last path segment of the predicate IRI | +| `FullIri` | Full predicate IRI string | + +Constructors: + +- [`NTriples::new(reader)`](src/formats/nt.rs:72) — uses `LabelExtraction::LocalName`. +- [`NTriples::with_label_extraction(reader, strategy)`](src/formats/nt.rs:76) — explicit strategy. + ### FFI layer [`lagraph_sys`](src/lagraph_sys.rs) exposes raw C bindings for GraphBLAS and @@ -254,7 +280,7 @@ Tests in `src/graph/mod.rs` use `CountingBuilder` / `CountOutput` / `VecSource` [`src/utils.rs`](src/utils.rs) — these do **not** call into GraphBLAS and run without native libraries. -Tests in `src/formats/csv.rs` are pure Rust and need no native dependencies. +Tests in `src/formats/csv.rs` and `src/formats/nt.rs` are pure Rust and need no native dependencies. Tests in `src/graph/inmemory.rs` and [`tests/inmemory_tests.rs`](tests/inmemory_tests.rs) call real GraphBLAS/LAGraph and require the native libraries to be present. diff --git a/src/graph/mod.rs b/src/graph/mod.rs index 62e1667..d096b6f 100644 --- a/src/graph/mod.rs +++ b/src/graph/mod.rs @@ -109,7 +109,7 @@ impl LagraphGraph { Self::new(matrix, kind) } - pub fn check_graph(&self) -> Result<(), GraphError> { + pub fn check_graph(&self) -> Result<(), GraphError> { la_ok!(LAGraph_CheckGraph(self.inner)) } }