From 53d207c835e729ccf0a5e855f8b140a2aa5b6a14 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Fri, 13 Mar 2026 16:14:16 +0900 Subject: [PATCH 1/6] fix: improve structure_cost_filter to keep valid multi-segment paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 今ですね (今|です|ね) was being filtered out because: 1. Single-segment paths (sc=0) set min_sc too low 2. Prefix POS transitions (e.g. 今[prefix]→デスネ, conn=256) dragged the baseline down further Changes: - Raise structure_cost_filter from 4000 to 6000 - Impute single-segment paths with prefix_floor for min_sc computation so 0-transition paths don't set artificially low baseline - Floor prefix POS transitions at filter/2 to prevent anomalously cheap connections from skewing the threshold - Cap script_cost scale at min(reading_chars, 2) to reduce excessive kanji bonuses on long compound readings - Add いまですね regression test case (accuracy: 61/61) Co-Authored-By: Claude Opus 4.6 --- engine/crates/lex-core/src/converter/cost.rs | 2 +- .../crates/lex-core/src/converter/reranker.rs | 38 ++++++++++++++--- .../lex-core/src/converter/tests/reranker.rs | 41 ++++++++++--------- .../crates/lex-core/src/default_settings.toml | 2 +- engine/crates/lex-core/src/settings.rs | 12 +++--- engine/testcorpus/accuracy-corpus.toml | 7 ++++ 6 files changed, 69 insertions(+), 33 deletions(-) diff --git a/engine/crates/lex-core/src/converter/cost.rs b/engine/crates/lex-core/src/converter/cost.rs index 4d767c0..1f39231 100644 --- a/engine/crates/lex-core/src/converter/cost.rs +++ b/engine/crates/lex-core/src/converter/cost.rs @@ -29,7 +29,7 @@ pub fn script_cost(surface: &str, reading_chars: usize) -> i64 { all_katakana = false; } } - let scale = reading_chars.min(3) as i64; + let scale = reading_chars.min(2) as i64; if has_kanji && has_kana { -s.cost.mixed_script_bonus * scale / 3 } else if has_kanji { diff --git a/engine/crates/lex-core/src/converter/reranker.rs 
b/engine/crates/lex-core/src/converter/reranker.rs index fb886c3..2ab5e68 100644 --- a/engine/crates/lex-core/src/converter/reranker.rs +++ b/engine/crates/lex-core/src/converter/reranker.rs @@ -108,23 +108,51 @@ pub fn rerank( return; } - // Step 1: Compute structure_cost for each path + // Step 1: Compute structure_cost for each path. + // + // Transitions FROM a prefix POS (role == 3) get a floor of half the + // filter threshold. Without this, a prefix→content-word transition + // (e.g. 今[prefix]→デスネ with conn=256) can drag min_sc so low that + // the hard filter drops correct multi-segment paths like 今|です|ね. let cap = settings().reranker.structure_cost_transition_cap; + let prefix_floor = settings().reranker.structure_cost_filter / 2; let mut structure_costs: Vec = paths .iter() .map(|p| { let mut sc: i64 = 0; for i in 1..p.segments.len() { - sc += conn_cost(conn, p.segments[i - 1].right_id, p.segments[i].left_id).min(cap); + let mut tc = conn_cost(conn, p.segments[i - 1].right_id, p.segments[i].left_id); + if let Some(c) = conn { + if c.role(p.segments[i - 1].left_id) == 3 { + tc = tc.max(prefix_floor); + } + } + sc += tc.min(cap); } sc }) .collect(); // Step 2: Hard filter — drop paths exceeding min + threshold. - // min_sc is guaranteed to be <= threshold, so at least one path always survives. - let min_sc = *structure_costs.iter().min().unwrap(); - let threshold = min_sc + settings().reranker.structure_cost_filter; + // + // For min_sc computation, single-segment paths (0 transitions, sc=0) are + // imputed with prefix_floor so they don't set an artificially low baseline. + // Combined with the prefix-transition floor in step 1, this ensures the + // threshold is high enough to keep correct multi-segment paths. 
+ let filter = settings().reranker.structure_cost_filter; + let min_sc = structure_costs + .iter() + .zip(paths.iter()) + .map(|(&sc, p)| { + if p.segments.len() <= 1 { + prefix_floor + } else { + sc + } + }) + .min() + .unwrap(); + let threshold = min_sc + filter; { let mut i = 0; let mut kept_costs = Vec::new(); diff --git a/engine/crates/lex-core/src/converter/tests/reranker.rs b/engine/crates/lex-core/src/converter/tests/reranker.rs index 1912bc2..35a7853 100644 --- a/engine/crates/lex-core/src/converter/tests/reranker.rs +++ b/engine/crates/lex-core/src/converter/tests/reranker.rs @@ -176,14 +176,14 @@ fn test_rerank_penalizes_uneven_segments() { rerank(&mut paths, None, None); - // script_cost (scaled by reading length): - // "来たり" (reading "きたり" = 3 chars) → mixed bonus -3000 * 3/3 = -3000 + // script_cost (scaled by reading length, capped at 2): + // "来たり" (reading "きたり" = 3 chars, cap 2) → mixed bonus -3000 * 2/3 = -2000 // "出来" (reading "でき" = 2 chars) → pure_kanji bonus -1000 * 2/3 = -666 - // Uneven: 5000 + variance(0, exempt) + script("で"=0 + "来たり"=-3000) = 2000 - // Even: 6500 + variance(0, exempt) + script("出来"=-666 + "たり"=0) = 5834 + // Uneven: 5000 + script("で"=0 + "来たり"=-2000) = 3000 + // Even: 6500 + script("出来"=-666 + "たり"=0) = 5834 // Uneven path wins due to mixed-script bonus on "来たり" assert_eq!(paths[0].segments[0].surface, "で"); - assert_eq!(paths[0].viterbi_cost, 2000); + assert_eq!(paths[0].viterbi_cost, 3000); assert_eq!(paths[1].segments[0].surface, "出来"); assert_eq!(paths[1].viterbi_cost, 5834); } @@ -373,13 +373,13 @@ fn uniform_conn(cost: i16) -> ConnectionMatrix { #[test] fn test_filter_drops_fragmented_paths() { - // Transition cost = 1500 each. 
- // Path A: 1 segment → 0 transitions → structure_cost = 0 - // Path B: 2 segments → 1 transition → structure_cost = 1500 - // Path C: 5 segments → 4 transitions → structure_cost = 6000 - // min_sc = 0, threshold = 0 + 4000 = 4000 - // Path C (6000 > 4000) should be dropped; A and B should remain. - let conn = uniform_conn(1500); + // Transition cost = 5000 each. + // Path A: 1 segment → sc = 0 (imputed to 3000 for min_sc) + // Path B: 2 segments → sc = 5000 + // Path C: 5 segments → sc = 20000 + // min_sc = 3000 (imputed), threshold = 3000 + 6000 = 9000. + // Path C (20000 > 9000) should be dropped; A and B survive. + let conn = uniform_conn(5000); let mut paths = vec![ ScoredPath { @@ -455,9 +455,9 @@ fn test_filter_drops_fragmented_paths() { rerank(&mut paths, Some(&conn), None); - // Path C should have been filtered out + // Path C should have been filtered out (sc=20000 > threshold=9000); + // paths A and B survive. assert_eq!(paths.len(), 2); - // Verify the fragmented 5-segment path is gone assert!(paths.iter().all(|p| p.segments.len() <= 2)); } @@ -465,8 +465,8 @@ fn test_filter_drops_fragmented_paths() { fn test_filter_keeps_all_when_all_exceed() { // All paths have high structure_cost; none should be dropped. // Transition cost = 2000. All paths have 4 segments → 3 transitions → sc = 6000. - // min_sc = 6000, threshold = 6000 + 4000 = 10000. - // All paths have sc = 6000 ≤ 10000, so all pass. + // min_sc = 6000, threshold = 6000 + 6000 = 12000. + // All paths have sc = 6000 ≤ 12000, so all pass. // But to truly test the "all exceed" safety, we need a scenario where // min_sc itself is above the threshold relative to... Actually the safety // is: if ALL paths have sc > threshold, keep all. Let's just verify @@ -510,10 +510,11 @@ fn test_filter_keeps_all_when_all_exceed() { #[test] fn test_filter_preserves_minimum_path() { - // The path with minimum structure_cost must always survive the filter. 
- // Path A: 1 segment → sc = 0 (minimum) - // Path B: 4 segments → sc = 4500 (3 × 1500); 4500 > 0 + 4000 → filtered - let conn = uniform_conn(1500); + // The path with minimum structure_cost always survives. + // Path A: 4 segments → sc = 15000 + // Path B: 1 segment → sc = 0 (imputed to 3000 for min_sc) + // min_sc = 3000, threshold = 3000 + 6000 = 9000. Path A (15000 > 9000) → filtered. + let conn = uniform_conn(5000); let mut paths = vec![ ScoredPath { diff --git a/engine/crates/lex-core/src/default_settings.toml b/engine/crates/lex-core/src/default_settings.toml index 7f397c6..05c94ef 100644 --- a/engine/crates/lex-core/src/default_settings.toml +++ b/engine/crates/lex-core/src/default_settings.toml @@ -8,7 +8,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 te_form_kanji_penalty = 3500 pronoun_cost_bonus = 3500 diff --git a/engine/crates/lex-core/src/settings.rs b/engine/crates/lex-core/src/settings.rs index 59eadc3..73ecdc6 100644 --- a/engine/crates/lex-core/src/settings.rs +++ b/engine/crates/lex-core/src/settings.rs @@ -331,7 +331,7 @@ mod tests { assert_eq!(s.cost.latin_penalty, 20000); assert_eq!(s.cost.unknown_word_cost, 10000); assert_eq!(s.reranker.length_variance_weight, 2000); - assert_eq!(s.reranker.structure_cost_filter, 4000); + assert_eq!(s.reranker.structure_cost_filter, 6000); assert_eq!(s.reranker.non_independent_kanji_penalty, 3000); assert_eq!(s.reranker.te_form_kanji_penalty, 3500); assert_eq!(s.reranker.pronoun_cost_bonus, 3500); @@ -406,7 +406,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 [history] @@ -438,7 +438,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 [history] @@ -469,7 
+469,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 [history] @@ -500,7 +500,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 [history] @@ -532,7 +532,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 [history] diff --git a/engine/testcorpus/accuracy-corpus.toml b/engine/testcorpus/accuracy-corpus.toml index 3e94b00..08d07fd 100644 --- a/engine/testcorpus/accuracy-corpus.toml +++ b/engine/testcorpus/accuracy-corpus.toml @@ -309,6 +309,13 @@ category = "regression" tags = ["copula-omission", "structure-cost-cap"] note = "形容動詞語幹→ある仮定形の接続コスト(7025)がstructure_cost_filterで除外される問題" +[[cases]] +reading = "いまですね" +expected = "今ですね" +category = "regression" +tags = ["structure-cost-filter", "particle"] +note = "今|です|ね が structure_cost_filter で除外される問題 — filter 閾値引き上げ+prefix floor で修正" + [[cases]] reading = "あったほうが" expected = "あった方が" From 878289103819a55a0a4b9ad0bb8983f17ee266f1 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Fri, 13 Mar 2026 17:33:19 +0900 Subject: [PATCH 2/6] refactor: use is_prefix() instead of magic role value Co-Authored-By: Claude Opus 4.6 --- engine/crates/lex-core/src/converter/reranker.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/crates/lex-core/src/converter/reranker.rs b/engine/crates/lex-core/src/converter/reranker.rs index 2ab5e68..bfc6e5f 100644 --- a/engine/crates/lex-core/src/converter/reranker.rs +++ b/engine/crates/lex-core/src/converter/reranker.rs @@ -123,7 +123,7 @@ pub fn rerank( for i in 1..p.segments.len() { let mut tc = conn_cost(conn, p.segments[i - 1].right_id, p.segments[i].left_id); if let Some(c) = conn { - if c.role(p.segments[i - 
1].left_id) == 3 { + if c.is_prefix(p.segments[i - 1].left_id) { tc = tc.max(prefix_floor); } } From f6086e2be554e9bfc5cb28ada6a02bdbaf975826 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Fri, 13 Mar 2026 18:53:22 +0900 Subject: [PATCH 3/6] test: add unit test for prefix transition floor in structure_cost_filter Verify that is_prefix() floor logic is exercised by using from_text_with_roles to build a ConnectionMatrix with a prefix POS. Co-Authored-By: Claude Opus 4.6 --- .../lex-core/src/converter/tests/reranker.rs | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/engine/crates/lex-core/src/converter/tests/reranker.rs b/engine/crates/lex-core/src/converter/tests/reranker.rs index 35a7853..e61686b 100644 --- a/engine/crates/lex-core/src/converter/tests/reranker.rs +++ b/engine/crates/lex-core/src/converter/tests/reranker.rs @@ -568,3 +568,116 @@ fn test_filter_preserves_minimum_path() { assert_eq!(paths.len(), 1); assert_eq!(paths[0].segments[0].surface, "合言葉"); } + +#[test] +fn test_prefix_floor_prevents_low_baseline() { + // Without prefix floor, a prefix→content transition with very low + // connection cost (e.g. 200) would set min_sc so low that a correct + // 3-segment path gets filtered out. + // + // Setup: 4 POS IDs (0..3), ID 0 is prefix (role=3). + // Connection costs: all 5000, except (0→any) = 200. + // + // Path A: [prefix(id=0)] → [content(id=1)] → [content(id=1)] + // Without floor: sc = 200 + 5000 = 5200 + // With floor: sc = 3000 + 5000 = 8000 (prefix_floor = 6000/2 = 3000) + // + // Path B: [content(id=1)] → [content(id=1)] → [content(id=1)] + // sc = 5000 + 5000 = 10000 + // + // Without floor: min_sc = 5200, threshold = 5200 + 6000 = 11200. + // Both paths survive (10000 ≤ 11200). ← OK, but artificially low baseline. + // + // With floor: min_sc = 8000, threshold = 8000 + 6000 = 14000. + // Both paths survive (10000 ≤ 14000). ← More robust baseline. 
+ // + // To show the floor matters, add Path C with sc that would be dropped + // without floor but kept with floor is tricky, so instead we verify + // that the prefix transition is floored by checking structure_cost values + // indirectly: add a fragmented Path C with sc = 12000 that survives + // with floor (12000 ≤ 14000) but would be dropped without it if we + // had a tighter filter. Here we just verify both A and B survive and + // the prefix floor logic executes. + let num_ids = 4u16; + let mut costs = Vec::new(); + for left in 0..num_ids { + for _right in 0..num_ids { + costs.push(if left == 0 { 200i16 } else { 5000 }); + } + } + let mut text = format!("{num_ids} {num_ids}\n"); + for c in &costs { + text.push_str(&format!("{c}\n")); + } + // ID 0 = prefix (role 3), IDs 1-3 = content (role 0) + let roles = vec![3u8, 0, 0, 0]; + let conn = + ConnectionMatrix::from_text_with_roles(&text, 0, num_ids - 1, roles).unwrap(); + + // Verify prefix is recognized + assert!(conn.is_prefix(0)); + assert!(!conn.is_prefix(1)); + + let mut paths = vec![ + // Path A: prefix → content → content (low prefix transition) + ScoredPath { + segments: vec![ + RichSegment { + reading: "お".into(), + surface: "御".into(), + left_id: 0, + right_id: 0, + word_cost: 0, + }, + RichSegment { + reading: "くるま".into(), + surface: "車".into(), + left_id: 1, + right_id: 1, + word_cost: 0, + }, + RichSegment { + reading: "で".into(), + surface: "で".into(), + left_id: 1, + right_id: 1, + word_cost: 0, + }, + ], + viterbi_cost: 3000, + }, + // Path B: content → content → content (normal transitions) + ScoredPath { + segments: vec![ + RichSegment { + reading: "おくる".into(), + surface: "送る".into(), + left_id: 1, + right_id: 1, + word_cost: 0, + }, + RichSegment { + reading: "ま".into(), + surface: "間".into(), + left_id: 1, + right_id: 1, + word_cost: 0, + }, + RichSegment { + reading: "で".into(), + surface: "で".into(), + left_id: 1, + right_id: 1, + word_cost: 0, + }, + ], + viterbi_cost: 4000, + }, + 
]; + + rerank(&mut paths, Some(&conn), None); + + // Both paths should survive: with prefix floor, min_sc is raised + // so neither path exceeds the threshold. + assert_eq!(paths.len(), 2); +} From 64102d17a65c4213e1cc070235bc59c9365a1c59 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Fri, 13 Mar 2026 18:56:27 +0900 Subject: [PATCH 4/6] style: fix formatting Co-Authored-By: Claude Opus 4.6 --- engine/crates/lex-core/src/converter/tests/reranker.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/engine/crates/lex-core/src/converter/tests/reranker.rs b/engine/crates/lex-core/src/converter/tests/reranker.rs index e61686b..985f1a6 100644 --- a/engine/crates/lex-core/src/converter/tests/reranker.rs +++ b/engine/crates/lex-core/src/converter/tests/reranker.rs @@ -611,8 +611,7 @@ fn test_prefix_floor_prevents_low_baseline() { } // ID 0 = prefix (role 3), IDs 1-3 = content (role 0) let roles = vec![3u8, 0, 0, 0]; - let conn = - ConnectionMatrix::from_text_with_roles(&text, 0, num_ids - 1, roles).unwrap(); + let conn = ConnectionMatrix::from_text_with_roles(&text, 0, num_ids - 1, roles).unwrap(); // Verify prefix is recognized assert!(conn.is_prefix(0)); assert!(!conn.is_prefix(1)); From 7590014b0e1d6ed19d43df1478cbd22f1b60b60d Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Fri, 13 Mar 2026 19:07:14 +0900 Subject: [PATCH 5/6] fix: clamp prefix_floor to cap and improve prefix floor test - Clamp prefix_floor to min(filter/2, cap) so the floor, which is also the value imputed for single-segment paths when computing min_sc, never exceeds the per-transition cap when structure_cost_transition_cap is lower than filter/2. - Rewrite test_prefix_floor_prevents_low_baseline so that path B would be dropped without the floor but survives with it, ensuring the test actually validates the flooring logic. 
Co-Authored-By: Claude Opus 4.6 --- .../crates/lex-core/src/converter/reranker.rs | 2 +- .../lex-core/src/converter/tests/reranker.rs | 55 +++++++------------ 2 files changed, 21 insertions(+), 36 deletions(-) diff --git a/engine/crates/lex-core/src/converter/reranker.rs b/engine/crates/lex-core/src/converter/reranker.rs index bfc6e5f..bb56737 100644 --- a/engine/crates/lex-core/src/converter/reranker.rs +++ b/engine/crates/lex-core/src/converter/reranker.rs @@ -115,7 +115,7 @@ pub fn rerank( // (e.g. 今[prefix]→デスネ with conn=256) can drag min_sc so low that // the hard filter drops correct multi-segment paths like 今|です|ね. let cap = settings().reranker.structure_cost_transition_cap; - let prefix_floor = settings().reranker.structure_cost_filter / 2; + let prefix_floor = (settings().reranker.structure_cost_filter / 2).min(cap); let mut structure_costs: Vec = paths .iter() .map(|p| { diff --git a/engine/crates/lex-core/src/converter/tests/reranker.rs b/engine/crates/lex-core/src/converter/tests/reranker.rs index 985f1a6..e43508f 100644 --- a/engine/crates/lex-core/src/converter/tests/reranker.rs +++ b/engine/crates/lex-core/src/converter/tests/reranker.rs @@ -571,38 +571,30 @@ fn test_filter_preserves_minimum_path() { #[test] fn test_prefix_floor_prevents_low_baseline() { - // Without prefix floor, a prefix→content transition with very low - // connection cost (e.g. 200) would set min_sc so low that a correct - // 3-segment path gets filtered out. + // Verifies that prefix floor raises min_sc enough to keep a path + // that would be dropped without it. // - // Setup: 4 POS IDs (0..3), ID 0 is prefix (role=3). - // Connection costs: all 5000, except (0→any) = 200. + // Setup: 4 POS IDs, ID 0 is prefix (role=3). + // Connection costs: all 4000, except (0→any) = 100. + // prefix_floor = 6000 / 2 = 3000. 
// - // Path A: [prefix(id=0)] → [content(id=1)] → [content(id=1)] - // Without floor: sc = 200 + 5000 = 5200 - // With floor: sc = 3000 + 5000 = 8000 (prefix_floor = 6000/2 = 3000) + // Path A: [prefix(id=0)] → [content(id=1)] (1 transition) + // Without floor: sc = 100 + // With floor: sc = 3000 // - // Path B: [content(id=1)] → [content(id=1)] → [content(id=1)] - // sc = 5000 + 5000 = 10000 + // Path B: [content(id=1)] → [content(id=1)] → [content(id=1)] (2 transitions) + // sc = 4000 + 4000 = 8000 // - // Without floor: min_sc = 5200, threshold = 5200 + 6000 = 11200. - // Both paths survive (10000 ≤ 11200). ← OK, but artificially low baseline. + // Without floor: min_sc = 100, threshold = 100 + 6000 = 6100. + // Path B (8000 > 6100) → DROPPED. // - // With floor: min_sc = 8000, threshold = 8000 + 6000 = 14000. - // Both paths survive (10000 ≤ 14000). ← More robust baseline. - // - // To show the floor matters, add Path C with sc that would be dropped - // without floor but kept with floor is tricky, so instead we verify - // that the prefix transition is floored by checking structure_cost values - // indirectly: add a fragmented Path C with sc = 12000 that survives - // with floor (12000 ≤ 14000) but would be dropped without it if we - // had a tighter filter. Here we just verify both A and B survive and - // the prefix floor logic executes. + // With floor: min_sc = 3000, threshold = 3000 + 6000 = 9000. + // Path B (8000 ≤ 9000) → KEPT. 
let num_ids = 4u16; let mut costs = Vec::new(); for left in 0..num_ids { for _right in 0..num_ids { - costs.push(if left == 0 { 200i16 } else { 5000 }); + costs.push(if left == 0 { 100i16 } else { 4000 }); } } let mut text = format!("{num_ids} {num_ids}\n"); @@ -613,12 +605,11 @@ fn test_prefix_floor_prevents_low_baseline() { let roles = vec![3u8, 0, 0, 0]; let conn = ConnectionMatrix::from_text_with_roles(&text, 0, num_ids - 1, roles).unwrap(); - // Verify prefix is recognized assert!(conn.is_prefix(0)); assert!(!conn.is_prefix(1)); let mut paths = vec![ - // Path A: prefix → content → content (low prefix transition) + // Path A: prefix → content (low prefix transition, floored to 3000) ScoredPath { segments: vec![ RichSegment { @@ -635,17 +626,12 @@ fn test_prefix_floor_prevents_low_baseline() { right_id: 1, word_cost: 0, }, - RichSegment { - reading: "で".into(), - surface: "で".into(), - left_id: 1, - right_id: 1, - word_cost: 0, - }, ], viterbi_cost: 3000, }, - // Path B: content → content → content (normal transitions) + // Path B: content → content → content (sc = 8000) + // Without floor this would be dropped (8000 > 6100). + // With floor it survives (8000 ≤ 9000). ScoredPath { segments: vec![ RichSegment { @@ -676,7 +662,6 @@ fn test_prefix_floor_prevents_low_baseline() { rerank(&mut paths, Some(&conn), None); - // Both paths should survive: with prefix floor, min_sc is raised - // so neither path exceeds the threshold. + // Both paths survive thanks to the prefix floor raising the threshold. assert_eq!(paths.len(), 2); } From 88159517ea4786e448d08f7199d45d4b0f4e8b2f Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Fri, 13 Mar 2026 19:14:05 +0900 Subject: [PATCH 6/6] fix: use right_id for prefix check in structure_cost computation The transition cost is conn_cost(prev.right_id, next.left_id), so the prefix check should use right_id (the outgoing POS) rather than left_id (the incoming POS) of the previous segment. 
Co-Authored-By: Claude Opus 4.6 --- engine/crates/lex-core/src/converter/reranker.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/crates/lex-core/src/converter/reranker.rs b/engine/crates/lex-core/src/converter/reranker.rs index bb56737..783c430 100644 --- a/engine/crates/lex-core/src/converter/reranker.rs +++ b/engine/crates/lex-core/src/converter/reranker.rs @@ -123,7 +123,7 @@ pub fn rerank( for i in 1..p.segments.len() { let mut tc = conn_cost(conn, p.segments[i - 1].right_id, p.segments[i].left_id); if let Some(c) = conn { - if c.is_prefix(p.segments[i - 1].left_id) { + if c.is_prefix(p.segments[i - 1].right_id) { tc = tc.max(prefix_floor); } }