diff --git a/engine/crates/lex-core/src/converter/cost.rs b/engine/crates/lex-core/src/converter/cost.rs index 4d767c0..1f39231 100644 --- a/engine/crates/lex-core/src/converter/cost.rs +++ b/engine/crates/lex-core/src/converter/cost.rs @@ -29,7 +29,7 @@ pub fn script_cost(surface: &str, reading_chars: usize) -> i64 { all_katakana = false; } } - let scale = reading_chars.min(3) as i64; + let scale = reading_chars.min(2) as i64; if has_kanji && has_kana { -s.cost.mixed_script_bonus * scale / 3 } else if has_kanji { diff --git a/engine/crates/lex-core/src/converter/reranker.rs b/engine/crates/lex-core/src/converter/reranker.rs index fb886c3..783c430 100644 --- a/engine/crates/lex-core/src/converter/reranker.rs +++ b/engine/crates/lex-core/src/converter/reranker.rs @@ -108,23 +108,51 @@ pub fn rerank( return; } - // Step 1: Compute structure_cost for each path + // Step 1: Compute structure_cost for each path. + // + // Transitions FROM a prefix POS (role == 3) get a floor of half the + // filter threshold. Without this, a prefix→content-word transition + // (e.g. 今[prefix]→デスネ with conn=256) can drag min_sc so low that + // the hard filter drops correct multi-segment paths like 今|です|ね. let cap = settings().reranker.structure_cost_transition_cap; + let prefix_floor = (settings().reranker.structure_cost_filter / 2).min(cap); let mut structure_costs: Vec = paths .iter() .map(|p| { let mut sc: i64 = 0; for i in 1..p.segments.len() { - sc += conn_cost(conn, p.segments[i - 1].right_id, p.segments[i].left_id).min(cap); + let mut tc = conn_cost(conn, p.segments[i - 1].right_id, p.segments[i].left_id); + if let Some(c) = conn { + if c.is_prefix(p.segments[i - 1].right_id) { + tc = tc.max(prefix_floor); + } + } + sc += tc.min(cap); } sc }) .collect(); // Step 2: Hard filter — drop paths exceeding min + threshold. - // min_sc is guaranteed to be <= threshold, so at least one path always survives. - let min_sc = *structure_costs.iter().min().unwrap(); - let threshold = min_sc + settings().reranker.structure_cost_filter; + // + // For min_sc computation, single-segment paths (0 transitions, sc=0) are + // imputed with prefix_floor so they don't set an artificially low baseline. + // Combined with the prefix-transition floor in step 1, this ensures the + // threshold is high enough to keep correct multi-segment paths. + let filter = settings().reranker.structure_cost_filter; + let min_sc = structure_costs + .iter() + .zip(paths.iter()) + .map(|(&sc, p)| { + if p.segments.len() <= 1 { + prefix_floor + } else { + sc + } + }) + .min() + .unwrap(); + let threshold = min_sc + filter; { let mut i = 0; let mut kept_costs = Vec::new(); diff --git a/engine/crates/lex-core/src/converter/tests/reranker.rs b/engine/crates/lex-core/src/converter/tests/reranker.rs index 1912bc2..e43508f 100644 --- a/engine/crates/lex-core/src/converter/tests/reranker.rs +++ b/engine/crates/lex-core/src/converter/tests/reranker.rs @@ -176,14 +176,14 @@ fn test_rerank_penalizes_uneven_segments() { rerank(&mut paths, None, None); - // script_cost (scaled by reading length): - // "来たり" (reading "きたり" = 3 chars) → mixed bonus -3000 * 3/3 = -3000 + // script_cost (scaled by reading length, capped at 2): + // "来たり" (reading "きたり" = 3 chars, cap 2) → mixed bonus -3000 * 2/3 = -2000 // "出来" (reading "でき" = 2 chars) → pure_kanji bonus -1000 * 2/3 = -666 - // Uneven: 5000 + variance(0, exempt) + script("で"=0 + "来たり"=-3000) = 2000 - // Even: 6500 + variance(0, exempt) + script("出来"=-666 + "たり"=0) = 5834 + // Uneven: 5000 + script("で"=0 + "来たり"=-2000) = 3000 + // Even: 6500 + script("出来"=-666 + "たり"=0) = 5834 // Uneven path wins due to mixed-script bonus on "来たり" assert_eq!(paths[0].segments[0].surface, "で"); - assert_eq!(paths[0].viterbi_cost, 2000); + assert_eq!(paths[0].viterbi_cost, 3000); assert_eq!(paths[1].segments[0].surface, "出来"); assert_eq!(paths[1].viterbi_cost, 5834); } @@ -373,13 +373,13 @@ fn uniform_conn(cost: i16) -> ConnectionMatrix { #[test] fn test_filter_drops_fragmented_paths() { - // Transition cost = 1500 each. - // Path A: 1 segment → 0 transitions → structure_cost = 0 - // Path B: 2 segments → 1 transition → structure_cost = 1500 - // Path C: 5 segments → 4 transitions → structure_cost = 6000 - // min_sc = 0, threshold = 0 + 4000 = 4000 - // Path C (6000 > 4000) should be dropped; A and B should remain. - let conn = uniform_conn(1500); + // Transition cost = 5000 each. + // Path A: 1 segment → sc = 0 (imputed to 3000 for min_sc) + // Path B: 2 segments → sc = 5000 + // Path C: 5 segments → sc = 20000 + // min_sc = 3000 (imputed), threshold = 3000 + 6000 = 9000. + // Path C (20000 > 9000) should be dropped; A and B survive. + let conn = uniform_conn(5000); let mut paths = vec![ ScoredPath { @@ -455,9 +455,9 @@ fn test_filter_drops_fragmented_paths() { rerank(&mut paths, Some(&conn), None); - // Path C should have been filtered out + // Path C should have been filtered out (sc=20000 > threshold=9000); + // paths A and B survive. assert_eq!(paths.len(), 2); - // Verify the fragmented 5-segment path is gone assert!(paths.iter().all(|p| p.segments.len() <= 2)); } @@ -465,8 +465,8 @@ fn test_filter_drops_fragmented_paths() { fn test_filter_keeps_all_when_all_exceed() { // All paths have high structure_cost; none should be dropped. // Transition cost = 2000. All paths have 4 segments → 3 transitions → sc = 6000. - // min_sc = 6000, threshold = 6000 + 4000 = 10000. - // All paths have sc = 6000 ≤ 10000, so all pass. + // min_sc = 6000, threshold = 6000 + 6000 = 12000. + // All paths have sc = 6000 ≤ 12000, so all pass. // But to truly test the "all exceed" safety, we need a scenario where // min_sc itself is above the threshold relative to... Actually the safety // is: if ALL paths have sc > threshold, keep all. Let's just verify @@ -510,10 +510,11 @@ fn test_filter_keeps_all_when_all_exceed() { #[test] fn test_filter_preserves_minimum_path() { - // The path with minimum structure_cost must always survive the filter. - // Path A: 1 segment → sc = 0 (minimum) - // Path B: 4 segments → sc = 4500 (3 × 1500); 4500 > 0 + 4000 → filtered - let conn = uniform_conn(1500); + // The path with minimum structure_cost always survives. + // Path A: 4 segments → sc = 15000 + // Path B: 1 segment → sc = 0 (imputed to 3000 for min_sc) + // min_sc = 3000, threshold = 3000 + 6000 = 9000. Path A (15000 > 9000) → filtered. + let conn = uniform_conn(5000); let mut paths = vec![ ScoredPath { @@ -567,3 +568,100 @@ fn test_filter_preserves_minimum_path() { assert_eq!(paths.len(), 1); assert_eq!(paths[0].segments[0].surface, "合言葉"); } + +#[test] +fn test_prefix_floor_prevents_low_baseline() { + // Verifies that prefix floor raises min_sc enough to keep a path + // that would be dropped without it. + // + // Setup: 4 POS IDs, ID 0 is prefix (role=3). + // Connection costs: all 4000, except (0→any) = 100. + // prefix_floor = 6000 / 2 = 3000. + // + // Path A: [prefix(id=0)] → [content(id=1)] (1 transition) + // Without floor: sc = 100 + // With floor: sc = 3000 + // + // Path B: [content(id=1)] → [content(id=1)] → [content(id=1)] (2 transitions) + // sc = 4000 + 4000 = 8000 + // + // Without floor: min_sc = 100, threshold = 100 + 6000 = 6100. + // Path B (8000 > 6100) → DROPPED. + // + // With floor: min_sc = 3000, threshold = 3000 + 6000 = 9000. + // Path B (8000 ≤ 9000) → KEPT. + let num_ids = 4u16; + let mut costs = Vec::new(); + for left in 0..num_ids { + for _right in 0..num_ids { + costs.push(if left == 0 { 100i16 } else { 4000 }); + } + } + let mut text = format!("{num_ids} {num_ids}\n"); + for c in &costs { + text.push_str(&format!("{c}\n")); + } + // ID 0 = prefix (role 3), IDs 1-3 = content (role 0) + let roles = vec![3u8, 0, 0, 0]; + let conn = ConnectionMatrix::from_text_with_roles(&text, 0, num_ids - 1, roles).unwrap(); + + assert!(conn.is_prefix(0)); + assert!(!conn.is_prefix(1)); + + let mut paths = vec![ + // Path A: prefix → content (low prefix transition, floored to 3000) + ScoredPath { + segments: vec![ + RichSegment { + reading: "お".into(), + surface: "御".into(), + left_id: 0, + right_id: 0, + word_cost: 0, + }, + RichSegment { + reading: "くるま".into(), + surface: "車".into(), + left_id: 1, + right_id: 1, + word_cost: 0, + }, + ], + viterbi_cost: 3000, + }, + // Path B: content → content → content (sc = 8000) + // Without floor this would be dropped (8000 > 6100). + // With floor it survives (8000 ≤ 9000). + ScoredPath { + segments: vec![ + RichSegment { + reading: "おくる".into(), + surface: "送る".into(), + left_id: 1, + right_id: 1, + word_cost: 0, + }, + RichSegment { + reading: "ま".into(), + surface: "間".into(), + left_id: 1, + right_id: 1, + word_cost: 0, + }, + RichSegment { + reading: "で".into(), + surface: "で".into(), + left_id: 1, + right_id: 1, + word_cost: 0, + }, + ], + viterbi_cost: 4000, + }, + ]; + + rerank(&mut paths, Some(&conn), None); + + // Both paths survive thanks to the prefix floor raising the threshold. + assert_eq!(paths.len(), 2); +} diff --git a/engine/crates/lex-core/src/default_settings.toml b/engine/crates/lex-core/src/default_settings.toml index 7f397c6..05c94ef 100644 --- a/engine/crates/lex-core/src/default_settings.toml +++ b/engine/crates/lex-core/src/default_settings.toml @@ -8,7 +8,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 te_form_kanji_penalty = 3500 pronoun_cost_bonus = 3500 diff --git a/engine/crates/lex-core/src/settings.rs b/engine/crates/lex-core/src/settings.rs index 59eadc3..73ecdc6 100644 --- a/engine/crates/lex-core/src/settings.rs +++ b/engine/crates/lex-core/src/settings.rs @@ -331,7 +331,7 @@ mod tests { assert_eq!(s.cost.latin_penalty, 20000); assert_eq!(s.cost.unknown_word_cost, 10000); assert_eq!(s.reranker.length_variance_weight, 2000); - assert_eq!(s.reranker.structure_cost_filter, 4000); + assert_eq!(s.reranker.structure_cost_filter, 6000); assert_eq!(s.reranker.non_independent_kanji_penalty, 3000); assert_eq!(s.reranker.te_form_kanji_penalty, 3500); assert_eq!(s.reranker.pronoun_cost_bonus, 3500); @@ -406,7 +406,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 [history] @@ -438,7 +438,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 [history] @@ -469,7 +469,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 [history] @@ -500,7 +500,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 [history] @@ -532,7 +532,7 @@ unknown_word_cost = 10000 [reranker] length_variance_weight = 2000 -structure_cost_filter = 4000 +structure_cost_filter = 6000 non_independent_kanji_penalty = 3000 [history] diff --git a/engine/testcorpus/accuracy-corpus.toml b/engine/testcorpus/accuracy-corpus.toml index 3e94b00..08d07fd 100644 --- a/engine/testcorpus/accuracy-corpus.toml +++ b/engine/testcorpus/accuracy-corpus.toml @@ -309,6 +309,13 @@ category = "regression" tags = ["copula-omission", "structure-cost-cap"] note = "形容動詞語幹→ある仮定形の接続コスト(7025)がstructure_cost_filterで除外される問題" +[[cases]] +reading = "いまですね" +expected = "今ですね" +category = "regression" +tags = ["structure-cost-filter", "particle"] +note = "今|です|ね が structure_cost_filter で除外される問題 — filter 閾値引き上げ+prefix floor で修正" + [[cases]] reading = "あったほうが" expected = "あった方が"