1use crate::sejong::SejongConverter;
28use crate::tokenizer::{Token, Tokenizer};
29use std::collections::HashMap;
30use std::fs::File;
31use std::io::{BufRead, BufReader};
32use std::path::Path;
33use thiserror::Error;
34
/// Errors produced while loading or parsing evaluation data.
#[derive(Error, Debug)]
pub enum EvaluateError {
    /// An underlying I/O operation failed; created automatically from `std::io::Error`.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// The input text did not have the expected format (bad token, bad TSV line, bad count).
    #[error("Parse error: {0}")]
    Parse(String),

    /// The input was well-formed but unusable (e.g. no tokens, inconsistent counts, empty dataset).
    #[error("Data error: {0}")]
    Data(String),
}
50
/// Result alias used throughout this module, with [`EvaluateError`] as the error type.
pub type Result<T> = std::result::Result<T, EvaluateError>;
53
54#[derive(Debug, Clone, PartialEq, Eq)]
56pub struct GoldToken {
57 pub surface: String,
59 pub pos: String,
61}
62
63impl GoldToken {
64 #[must_use]
71 pub const fn new(surface: String, pos: String) -> Self {
72 Self { surface, pos }
73 }
74
75 pub fn parse(s: &str) -> Result<Self> {
85 let parts: Vec<&str> = s.split('/').collect();
86 if parts.len() != 2 {
87 return Err(EvaluateError::Parse(format!(
88 "Invalid token format: {s} (expected surface/pos)"
89 )));
90 }
91
92 Ok(Self {
93 surface: SejongConverter::normalize_jamo(parts[0]),
94 pos: parts[1].to_string(),
95 })
96 }
97}
98
99#[derive(Debug, Clone)]
101pub struct GoldSentence {
102 pub text: String,
104 pub tokens: Vec<GoldToken>,
106 pub eojeol_counts: Option<Vec<usize>>,
109}
110
111impl GoldSentence {
112 #[must_use]
119 pub const fn new(text: String, tokens: Vec<GoldToken>) -> Self {
120 Self {
121 text,
122 tokens,
123 eojeol_counts: None,
124 }
125 }
126
127 pub fn parse_tsv_line(line: &str) -> Result<Self> {
145 let parts: Vec<&str> = line.split('\t').collect();
146 if parts.len() < 2 || parts.len() > 3 {
147 return Err(EvaluateError::Parse(format!(
148 "Invalid TSV line: {line} (expected 2 or 3 tab-separated columns)"
149 )));
150 }
151
152 let text = parts[0].trim().to_string();
153 let tokens_str = parts[1].trim();
154
155 let tokens = tokens_str
156 .split_whitespace()
157 .map(GoldToken::parse)
158 .collect::<Result<Vec<_>>>()?;
159
160 if tokens.is_empty() {
161 return Err(EvaluateError::Data(format!(
162 "Empty gold tokens for text: {text}"
163 )));
164 }
165
166 let eojeol_counts = if parts.len() == 3 {
167 let counts: Vec<usize> = parts[2]
168 .trim()
169 .split(',')
170 .map(|s| {
171 s.trim().parse::<usize>().map_err(|e| {
172 EvaluateError::Parse(format!("Invalid eojeol count '{s}': {e}"))
173 })
174 })
175 .collect::<Result<Vec<_>>>()?;
176
177 let sum: usize = counts.iter().sum();
178 if sum != tokens.len() {
179 return Err(EvaluateError::Data(format!(
180 "eojeol_counts sum ({sum}) does not match tokens len ({}) for text: {text}",
181 tokens.len()
182 )));
183 }
184 Some(counts)
185 } else {
186 None
187 };
188
189 Ok(Self {
190 text,
191 tokens,
192 eojeol_counts,
193 })
194 }
195}
196
197#[derive(Debug, Clone)]
199pub struct TestDataset {
200 pub sentences: Vec<GoldSentence>,
202}
203
204impl TestDataset {
205 #[must_use]
207 pub const fn new() -> Self {
208 Self {
209 sentences: Vec::new(),
210 }
211 }
212
213 pub fn from_tsv<P: AsRef<Path>>(path: P) -> Result<Self> {
229 let file = File::open(path)?;
230 let reader = BufReader::new(file);
231
232 let mut sentences = Vec::new();
233
234 for (line_num, line) in reader.lines().enumerate() {
235 let line = line?;
236 let trimmed = line.trim();
237
238 if trimmed.is_empty() || trimmed.starts_with('#') {
240 continue;
241 }
242
243 let sentence = GoldSentence::parse_tsv_line(trimmed)
244 .map_err(|e| EvaluateError::Parse(format!("Line {}: {}", line_num + 1, e)))?;
245
246 sentences.push(sentence);
247 }
248
249 if sentences.is_empty() {
250 return Err(EvaluateError::Data("Empty dataset".to_string()));
251 }
252
253 Ok(Self { sentences })
254 }
255
256 pub fn add_sentence(&mut self, sentence: GoldSentence) {
262 self.sentences.push(sentence);
263 }
264
265 #[must_use]
267 pub fn len(&self) -> usize {
268 self.sentences.len()
269 }
270
271 #[must_use]
273 pub fn is_empty(&self) -> bool {
274 self.sentences.is_empty()
275 }
276}
277
278impl Default for TestDataset {
279 fn default() -> Self {
280 Self::new()
281 }
282}
283
/// Aggregated morpheme-level evaluation metrics for a whole dataset.
#[derive(Debug, Clone)]
pub struct EvaluationResult {
    /// Number of evaluated sentences.
    pub total_sentences: usize,
    /// Total number of gold tokens.
    pub total_gold_tokens: usize,
    /// Total number of predicted tokens.
    pub total_pred_tokens: usize,

    /// Tokens counted as correct.
    pub true_positives: usize,
    /// Predicted tokens not counted as correct.
    pub false_positives: usize,
    /// Gold tokens not counted as correct.
    pub false_negatives: usize,

    /// Sentences whose every token was correct.
    pub exact_match_sentences: usize,

    /// Fraction in `[0, 1]`; printed as a percentage by `format_report`.
    pub token_accuracy: f64,
    /// Fraction in `[0, 1]`; printed as a percentage by `format_report`.
    pub sentence_accuracy: f64,
    /// Fraction in `[0, 1]`; printed as a percentage by `format_report`.
    pub pos_accuracy: f64,
    /// Precision, printed with three decimals.
    pub precision: f64,
    /// Recall, printed with three decimals.
    pub recall: f64,
    /// F1 score, printed with three decimals.
    pub f1_score: f64,

    /// Per-POS statistics keyed by tag.
    pub pos_stats: HashMap<String, PosStats>,
}

/// Counters and accuracy for a single POS tag.
#[derive(Debug, Clone, Default)]
pub struct PosStats {
    /// Number of gold tokens carrying this tag.
    pub gold_count: usize,
    /// Counter maintained by the evaluator for predictions associated with this tag.
    pub pred_count: usize,
    /// Number of tokens with this tag counted as correct.
    pub correct: usize,
    /// Fraction in `[0, 1]`; printed as a percentage by `format_report`.
    pub accuracy: f64,
}

impl EvaluationResult {
    /// Creates a result with every counter and metric set to zero.
    #[must_use]
    pub fn new() -> Self {
        Self {
            total_sentences: 0,
            total_gold_tokens: 0,
            total_pred_tokens: 0,
            true_positives: 0,
            false_positives: 0,
            false_negatives: 0,
            exact_match_sentences: 0,
            token_accuracy: 0.0,
            sentence_accuracy: 0.0,
            pos_accuracy: 0.0,
            precision: 0.0,
            recall: 0.0,
            f1_score: 0.0,
            pos_stats: HashMap::new(),
        }
    }

    /// Renders a human-readable, multi-line report of the metrics.
    ///
    /// The per-POS section lists at most 15 tags, ordered by descending gold count and
    /// then by tag name so the output is deterministic. With zero sentences the
    /// exact-match percentage is reported as `0.0%` rather than `NaN%`.
    #[must_use]
    #[allow(clippy::cast_precision_loss, clippy::unwrap_used)]
    pub fn format_report(&self) -> String {
        use std::fmt::Write;

        /// Maximum number of per-POS rows shown in the report.
        const MAX_POS_ROWS: usize = 15;

        // Writing into a `String` cannot fail, so the `unwrap()` calls below never panic.
        let mut report = String::new();

        report.push_str("=== 정확도 평가 결과 ===\n");
        writeln!(report, "테스트 문장: {}", self.total_sentences).unwrap();
        writeln!(
            report,
            "Token Accuracy: {:.1}%",
            self.token_accuracy * 100.0
        )
        .unwrap();
        writeln!(
            report,
            "Sentence Accuracy: {:.1}%",
            self.sentence_accuracy * 100.0
        )
        .unwrap();
        writeln!(report, "POS Accuracy: {:.1}%", self.pos_accuracy * 100.0).unwrap();
        writeln!(report, "Precision: {:.3}", self.precision).unwrap();
        writeln!(report, "Recall: {:.3}", self.recall).unwrap();
        writeln!(report, "F1 Score: {:.3}", self.f1_score).unwrap();
        report.push('\n');

        // Guard the division: 0 / 0 would otherwise render as "NaN%".
        let exact_match_pct = if self.total_sentences > 0 {
            (self.exact_match_sentences as f64 / self.total_sentences as f64) * 100.0
        } else {
            0.0
        };

        report.push_str("토큰 통계:\n");
        writeln!(report, " 정답 토큰: {}", self.total_gold_tokens).unwrap();
        writeln!(report, " 예측 토큰: {}", self.total_pred_tokens).unwrap();
        writeln!(
            report,
            " 완전 일치 문장: {} / {} ({:.1}%)",
            self.exact_match_sentences, self.total_sentences, exact_match_pct
        )
        .unwrap();
        report.push('\n');

        let mut pos_sorted: Vec<_> = self.pos_stats.iter().collect();
        // Most frequent tags first; ties broken by tag name because `HashMap`
        // iteration order is unspecified.
        pos_sorted.sort_by(|a, b| {
            b.1.gold_count
                .cmp(&a.1.gold_count)
                .then_with(|| a.0.cmp(b.0))
        });

        if !pos_sorted.is_empty() {
            report.push_str("품사별 정확도:\n");
            for (pos, stats) in pos_sorted.iter().take(MAX_POS_ROWS) {
                writeln!(
                    report,
                    " {pos:<6} ({}개): {:.1}%",
                    stats.gold_count,
                    stats.accuracy * 100.0
                )
                .unwrap();
            }

            if pos_sorted.len() > MAX_POS_ROWS {
                writeln!(
                    report,
                    " ... 외 {}개 품사",
                    pos_sorted.len() - MAX_POS_ROWS
                )
                .unwrap();
            }
        }

        report
    }
}

impl Default for EvaluationResult {
    /// Same as [`EvaluationResult::new`].
    fn default() -> Self {
        Self::new()
    }
}
431
432#[must_use]
443pub fn evaluate_tokens(
444 gold_tokens: &[GoldToken],
445 pred_tokens: &[Token],
446) -> (usize, usize, usize, usize) {
447 let min_len = gold_tokens.len().min(pred_tokens.len());
448
449 let mut true_positives = 0;
450 let mut pos_match = 0;
451
452 for i in 0..min_len {
454 let gold = &gold_tokens[i];
455 let pred = &pred_tokens[i];
456
457 if gold.surface == pred.surface && gold.pos == pred.pos {
458 true_positives += 1;
459 pos_match += 1;
460 } else if gold.surface == pred.surface {
461 pos_match += 1;
462 }
463 }
464
465 let false_positives = pred_tokens.len().saturating_sub(true_positives);
466 let false_negatives = gold_tokens.len().saturating_sub(true_positives);
467
468 (true_positives, false_positives, false_negatives, pos_match)
469}
470
471#[must_use]
475pub fn evaluate_tokens_aligned(
476 gold_tokens: &[GoldToken],
477 pred_tokens: &[Token],
478) -> (usize, usize, usize, usize) {
479 evaluate_tokens_aligned_with_pos_match(gold_tokens, pred_tokens, pos_eq_strict)
480}
481
482#[must_use]
487pub fn evaluate_tokens_aligned_with_pos_match(
488 gold_tokens: &[GoldToken],
489 pred_tokens: &[Token],
490 pos_eq: PosMatchFn,
491) -> (usize, usize, usize, usize) {
492 evaluate_tokens_aligned_with_match(gold_tokens, pred_tokens, pos_eq, surface_eq_strict)
493}
494
495#[must_use]
504pub fn evaluate_tokens_aligned_with_match(
505 gold_tokens: &[GoldToken],
506 pred_tokens: &[Token],
507 pos_eq: PosMatchFn,
508 surface_eq: SurfaceMatchFn,
509) -> (usize, usize, usize, usize) {
510 let mut true_positives = 0;
511 let mut pos_match = 0;
512
513 let mut gold_idx = 0;
514 let mut pred_idx = 0;
515
516 while gold_idx < gold_tokens.len() && pred_idx < pred_tokens.len() {
517 let gold = &gold_tokens[gold_idx];
518 let pred = &pred_tokens[pred_idx];
519
520 if surface_eq(&gold.surface, &pred.surface) {
521 pos_match += 1;
522 if pos_eq(&gold.pos, &pred.pos) {
523 true_positives += 1;
524 }
525 gold_idx += 1;
526 pred_idx += 1;
527 } else {
528 let mut found = false;
529 for look_ahead in 1..=3 {
530 if pred_idx + look_ahead < pred_tokens.len()
531 && surface_eq(&gold.surface, &pred_tokens[pred_idx + look_ahead].surface)
532 {
533 pred_idx += look_ahead;
534 found = true;
535 break;
536 }
537 }
538
539 if !found {
540 for look_ahead in 1..=3 {
541 if gold_idx + look_ahead < gold_tokens.len()
542 && surface_eq(&gold_tokens[gold_idx + look_ahead].surface, &pred.surface)
543 {
544 gold_idx += look_ahead;
545 found = true;
546 break;
547 }
548 }
549 }
550
551 if !found {
552 gold_idx += 1;
553 pred_idx += 1;
554 }
555 }
556 }
557
558 let false_positives = pred_tokens.len().saturating_sub(true_positives);
559 let false_negatives = gold_tokens.len().saturating_sub(true_positives);
560
561 (true_positives, false_positives, false_negatives, pos_match)
562}
563
564#[must_use]
575#[allow(clippy::cast_precision_loss)]
576pub fn evaluate_dataset(tokenizer: &mut Tokenizer, dataset: &TestDataset) -> EvaluationResult {
577 let mut result = EvaluationResult::new();
578 result.total_sentences = dataset.len();
579
580 for gold_sentence in &dataset.sentences {
581 let pred_tokens = tokenizer.tokenize(&gold_sentence.text);
582
583 result.total_gold_tokens += gold_sentence.tokens.len();
584 result.total_pred_tokens += pred_tokens.len();
585
586 let (tp, fp, fn_, _pos_match) = evaluate_tokens(&gold_sentence.tokens, &pred_tokens);
587
588 result.true_positives += tp;
589 result.false_positives += fp;
590 result.false_negatives += fn_;
591
592 if gold_sentence.tokens.len() == pred_tokens.len() && tp == gold_sentence.tokens.len() {
594 result.exact_match_sentences += 1;
595 }
596
597 for (i, gold_token) in gold_sentence.tokens.iter().enumerate() {
599 let pos_stat = result.pos_stats.entry(gold_token.pos.clone()).or_default();
600
601 pos_stat.gold_count += 1;
602
603 if i < pred_tokens.len() {
604 let pred_token = &pred_tokens[i];
605 if gold_token.surface == pred_token.surface {
606 pos_stat.pred_count += 1;
607 if gold_token.pos == pred_token.pos {
608 pos_stat.correct += 1;
609 }
610 }
611 }
612 }
613 }
614
615 let total_tokens = result.total_gold_tokens;
617 if total_tokens > 0 {
618 result.token_accuracy = result.true_positives as f64 / total_tokens as f64;
619 }
620
621 if result.total_sentences > 0 {
622 result.sentence_accuracy =
623 result.exact_match_sentences as f64 / result.total_sentences as f64;
624 }
625
626 let total_pred = result.total_pred_tokens;
627 if total_pred > 0 {
628 result.precision = result.true_positives as f64 / total_pred as f64;
629 }
630
631 if total_tokens > 0 {
632 result.recall = result.true_positives as f64 / total_tokens as f64;
633 }
634
635 if result.precision + result.recall > 0.0 {
636 result.f1_score =
637 2.0 * (result.precision * result.recall) / (result.precision + result.recall);
638 }
639
640 let mut total_pos_correct = 0;
642 let mut total_pos_gold = 0;
643
644 for pos_stat in result.pos_stats.values_mut() {
645 if pos_stat.gold_count > 0 {
646 pos_stat.accuracy = pos_stat.correct as f64 / pos_stat.gold_count as f64;
647 }
648 total_pos_correct += pos_stat.correct;
649 total_pos_gold += pos_stat.gold_count;
650 }
651
652 if total_pos_gold > 0 {
653 result.pos_accuracy = total_pos_correct as f64 / total_pos_gold as f64;
654 }
655
656 result
657}
658
659#[must_use]
664pub fn evaluate_dataset_sejong(
665 tokenizer: &mut Tokenizer,
666 dataset: &TestDataset,
667) -> EvaluationResult {
668 evaluate_dataset_sejong_with_pos_match(tokenizer, dataset, pos_eq_strict)
669}
670
671#[must_use]
675pub fn evaluate_dataset_sejong_lenient(
676 tokenizer: &mut Tokenizer,
677 dataset: &TestDataset,
678) -> EvaluationResult {
679 evaluate_dataset_sejong_with_pos_match(tokenizer, dataset, pos_tags_equivalent)
680}
681
682pub fn evaluate_dataset_sejong_with_pos_match(
686 tokenizer: &mut Tokenizer,
687 dataset: &TestDataset,
688 pos_eq: PosMatchFn,
689) -> EvaluationResult {
690 evaluate_dataset_sejong_with_match(tokenizer, dataset, pos_eq, surface_eq_strict)
691}
692
693#[allow(clippy::cast_precision_loss)]
698pub fn evaluate_dataset_sejong_with_match(
699 tokenizer: &mut Tokenizer,
700 dataset: &TestDataset,
701 pos_eq: PosMatchFn,
702 surface_eq: SurfaceMatchFn,
703) -> EvaluationResult {
704 let converter = SejongConverter::new();
705 let mut result = EvaluationResult::new();
706 result.total_sentences = dataset.len();
707
708 for gold_sentence in &dataset.sentences {
709 let pred_tokens = tokenizer.tokenize(&gold_sentence.text);
710 let sejong_tokens = converter.convert_tokens(&pred_tokens);
711
712 let converted_pred: Vec<Token> = sejong_tokens
713 .iter()
714 .map(|st| Token {
715 surface: SejongConverter::normalize_jamo(&st.surface),
716 pos: st.pos.clone(),
717 start_pos: st.start_pos,
718 end_pos: st.end_pos,
719 start_byte: 0,
720 end_byte: 0,
721 reading: None,
722 lemma: None,
723 cost: 0,
724 features: String::new(),
725 normalized: None,
726 })
727 .collect();
728
729 result.total_gold_tokens += gold_sentence.tokens.len();
730 result.total_pred_tokens += converted_pred.len();
731
732 let (tp, fp, fn_, _pos_match) = evaluate_tokens_aligned_with_match(
733 &gold_sentence.tokens,
734 &converted_pred,
735 pos_eq,
736 surface_eq,
737 );
738
739 result.true_positives += tp;
740 result.false_positives += fp;
741 result.false_negatives += fn_;
742
743 if gold_sentence.tokens.len() == converted_pred.len() && tp == gold_sentence.tokens.len() {
744 result.exact_match_sentences += 1;
745 }
746
747 for (i, gold_token) in gold_sentence.tokens.iter().enumerate() {
748 let pos_stat = result
749 .pos_stats
750 .entry(gold_token.pos.clone())
751 .or_insert_with(|| PosStats {
752 gold_count: 0,
753 pred_count: 0,
754 correct: 0,
755 accuracy: 0.0,
756 });
757 pos_stat.gold_count += 1;
758
759 if i < converted_pred.len() {
760 let pred_token = &converted_pred[i];
761 if surface_eq(&gold_token.surface, &pred_token.surface) {
762 pos_stat.pred_count += 1;
763 if pos_eq(&gold_token.pos, &pred_token.pos) {
764 pos_stat.correct += 1;
765 }
766 }
767 }
768 }
769 }
770
771 let total_tokens = result.total_gold_tokens;
772 if total_tokens > 0 {
773 result.token_accuracy = result.true_positives as f64 / total_tokens as f64;
774 }
775
776 if result.total_sentences > 0 {
777 result.sentence_accuracy =
778 result.exact_match_sentences as f64 / result.total_sentences as f64;
779 }
780
781 let total_pred = result.total_pred_tokens;
782 if total_pred > 0 {
783 result.precision = result.true_positives as f64 / total_pred as f64;
784 }
785
786 if total_tokens > 0 {
787 result.recall = result.true_positives as f64 / total_tokens as f64;
788 }
789
790 if result.precision + result.recall > 0.0 {
791 result.f1_score =
792 2.0 * (result.precision * result.recall) / (result.precision + result.recall);
793 }
794
795 let mut total_pos_correct = 0;
796 let mut total_pos_gold = 0;
797
798 for pos_stat in result.pos_stats.values_mut() {
799 if pos_stat.gold_count > 0 {
800 pos_stat.accuracy = pos_stat.correct as f64 / pos_stat.gold_count as f64;
801 }
802 total_pos_correct += pos_stat.correct;
803 total_pos_gold += pos_stat.gold_count;
804 }
805
806 if total_pos_gold > 0 {
807 result.pos_accuracy = total_pos_correct as f64 / total_pos_gold as f64;
808 }
809
810 result
811}
812
/// Groups of POS tags that `pos_tags_equivalent` treats as interchangeable.
///
/// Two different tags are considered equivalent only when one group contains both.
pub const TAG_EQUIVALENCE_GROUPS: &[&[&str]] = &[
    &["SP", "SC"],
    &["SS", "SY", "SSO", "SSC"],
    &["MM", "MMD", "MMN", "MMA"],
    &["SL", "NNP"],
];
833
/// Groups of POS tags that `pos_tags_equivalent_practical` treats as interchangeable.
///
/// Contains every group of `TAG_EQUIVALENCE_GROUPS` plus the `NNB`/`NNG` and `VA`/`VV`
/// pairs.
pub const TAG_EQUIVALENCE_GROUPS_PRACTICAL: &[&[&str]] = &[
    &["SP", "SC"],
    &["SS", "SY", "SSO", "SSC"],
    &["MM", "MMD", "MMN", "MMA"],
    &["SL", "NNP"],
    &["NNB", "NNG"],
    &["VA", "VV"],
];
857
858#[must_use]
862pub fn pos_tags_equivalent(a: &str, b: &str) -> bool {
863 pos_tags_equivalent_in(a, b, TAG_EQUIVALENCE_GROUPS)
864}
865
866#[must_use]
870pub fn pos_tags_equivalent_practical(a: &str, b: &str) -> bool {
871 pos_tags_equivalent_in(a, b, TAG_EQUIVALENCE_GROUPS_PRACTICAL)
872}
873
/// Returns `true` when `a == b`, or when some group in `groups` contains both tags.
#[must_use]
fn pos_tags_equivalent_in(a: &str, b: &str, groups: &[&[&str]]) -> bool {
    a == b
        || groups
            .iter()
            .any(|members| members.contains(&a) && members.contains(&b))
}
884
/// Signature of a POS tag comparator; the evaluators call it as `(gold_pos, predicted_pos)`.
pub type PosMatchFn = fn(&str, &str) -> bool;
890
/// Strict POS comparison: `true` only when both tags are the exact same string.
#[must_use]
pub fn pos_eq_strict(a: &str, b: &str) -> bool {
    a.eq(b)
}
896
/// Signature of a surface-form comparator; the evaluators call it as
/// `(gold_surface, predicted_surface)`.
pub type SurfaceMatchFn = fn(&str, &str) -> bool;
903
/// Strict surface comparison: `true` only when both strings are exactly equal.
#[must_use]
pub fn surface_eq_strict(a: &str, b: &str) -> bool {
    a.eq(b)
}
909
910#[must_use]
919pub fn surface_eq_canonical(a: &str, b: &str) -> bool {
920 if a == b {
921 return true;
922 }
923 canonical_form(a) == canonical_form(b)
924}
925
926#[must_use]
938pub fn surface_eq_canonical_lenient(a: &str, b: &str) -> bool {
939 if a == b {
940 return true;
941 }
942 let a_can = canonical_form(a);
943 let b_can = canonical_form(b);
944 if a_can == b_can {
945 return true;
946 }
947 normalize_endings(&a_can) == normalize_endings(&b_can)
948}
949
950fn canonical_form(s: &str) -> String {
952 use mecab_ko_hangul::{compose_str, decompose_str};
953 compose_str(&decompose_str(s))
954}
955
/// Rewrites a few spelling variants of verb endings to one common form.
///
/// Three passes are applied in order:
/// 1. A character directly after `하` is rewritten: `았` becomes `였`, `아` and `어`
///    become `여`. The preceding character is taken from the input, not the output.
/// 2. Every `이습니다` becomes `입니다`.
/// 3. Every pattern listed in `R_IRREGULAR_PATTERNS` is replaced, in table order.
fn normalize_endings(s: &str) -> String {
    let mut normalized = String::with_capacity(s.len());
    let mut previous: Option<char> = None;

    for current in s.chars() {
        let after_ha = previous == Some('하');
        let rewritten = match current {
            '았' if after_ha => '였',
            '아' | '어' if after_ha => '여',
            unchanged => unchanged,
        };
        normalized.push(rewritten);
        previous = Some(current);
    }

    if normalized.contains("이습니다") {
        normalized = normalized.replace("이습니다", "입니다");
    }

    R_IRREGULAR_PATTERNS
        .iter()
        .fold(normalized, |text, &(from, to)| {
            if text.contains(from) {
                text.replace(from, to)
            } else {
                text
            }
        })
}

/// `(uncontracted, contracted)` spelling pairs applied by `normalize_endings`.
const R_IRREGULAR_PATTERNS: &[(&str, &str)] = &[
    ("따르아", "따라"),
    ("모르아", "몰라"),
    ("다르아", "달라"),
    ("부르어", "불러"),
    ("흐르어", "흘러"),
    ("오르아", "올라"),
    ("자르아", "잘라"),
    ("누르어", "눌러"),
    ("고르아", "골라"),
];
1019
1020#[derive(Debug, Clone)]
1028pub struct DualMetricResult {
1029 pub morpheme: EvaluationResult,
1031 pub eojeol_correct: usize,
1033 pub eojeol_total: usize,
1035 pub eojeol_accuracy: f64,
1037}
1038
1039impl DualMetricResult {
1040 #[must_use]
1042 pub fn format_report(&self) -> String {
1043 use std::fmt::Write;
1044 let mut report = self.morpheme.format_report();
1045 report.push('\n');
1046 report.push_str("=== 어절 레벨 (Eojeol-level) ===\n");
1047 if self.eojeol_total > 0 {
1048 writeln!(
1049 report,
1050 "Eojeol Accuracy: {:.1}% ({} / {})",
1051 self.eojeol_accuracy * 100.0,
1052 self.eojeol_correct,
1053 self.eojeol_total
1054 )
1055 .unwrap();
1056 } else {
1057 report.push_str("어절 정보 없음 (legacy 2-column TSV)\n");
1058 }
1059 report
1060 }
1061}
1062
1063#[must_use]
1067pub fn evaluate_dataset_dual(
1068 tokenizer: &mut Tokenizer,
1069 dataset: &TestDataset,
1070) -> DualMetricResult {
1071 evaluate_dataset_dual_with_pos_match(tokenizer, dataset, pos_eq_strict)
1072}
1073
1074#[must_use]
1080pub fn evaluate_dataset_dual_lenient(
1081 tokenizer: &mut Tokenizer,
1082 dataset: &TestDataset,
1083) -> DualMetricResult {
1084 evaluate_dataset_dual_with_pos_match(tokenizer, dataset, pos_tags_equivalent)
1085}
1086
1087#[must_use]
1091pub fn evaluate_dataset_dual_with_pos_match(
1092 tokenizer: &mut Tokenizer,
1093 dataset: &TestDataset,
1094 pos_eq: PosMatchFn,
1095) -> DualMetricResult {
1096 evaluate_dataset_dual_with_match(tokenizer, dataset, pos_eq, surface_eq_strict)
1097}
1098
1099#[allow(clippy::cast_precision_loss)]
1117pub fn evaluate_dataset_dual_per_eojeol_with_match(
1118 tokenizer: &mut Tokenizer,
1119 dataset: &TestDataset,
1120 pos_eq: PosMatchFn,
1121 surface_eq: SurfaceMatchFn,
1122) -> DualMetricResult {
1123 let morpheme = evaluate_dataset_sejong_with_match(tokenizer, dataset, pos_eq, surface_eq);
1124
1125 let converter = SejongConverter::new();
1126 let mut eojeol_correct: usize = 0;
1127 let mut eojeol_total: usize = 0;
1128
1129 for gold_sentence in &dataset.sentences {
1130 let Some(eojeol_counts) = &gold_sentence.eojeol_counts else {
1131 continue;
1132 };
1133 let eojeols: Vec<&str> = gold_sentence.text.split_whitespace().collect();
1134 if eojeols.len() != eojeol_counts.len() {
1135 continue;
1136 }
1137
1138 let mut gold_idx: usize = 0;
1139 for (eo_i, &count_g) in eojeol_counts.iter().enumerate() {
1140 eojeol_total += 1;
1141 if gold_idx + count_g > gold_sentence.tokens.len() {
1142 gold_idx = gold_sentence.tokens.len();
1143 continue;
1144 }
1145 let gold_slice = &gold_sentence.tokens[gold_idx..gold_idx + count_g];
1146 gold_idx += count_g;
1147
1148 let pred_raw = tokenizer.tokenize(eojeols[eo_i]);
1149 let pred_sejong = converter.convert_tokens(&pred_raw);
1150 let pred_morphs: Vec<(String, String)> = pred_sejong
1151 .iter()
1152 .map(|t| (SejongConverter::normalize_jamo(&t.surface), t.pos.clone()))
1153 .collect();
1154
1155 let gold_concat: String = gold_slice.iter().map(|t| t.surface.as_str()).collect();
1157 let pred_concat: String = pred_morphs.iter().map(|(s, _)| s.as_str()).collect();
1158 if !surface_eq(&gold_concat, &pred_concat) {
1159 continue;
1160 }
1161
1162 if gold_slice.len() != pred_morphs.len() {
1164 continue;
1165 }
1166 let all_match = gold_slice
1167 .iter()
1168 .zip(pred_morphs.iter())
1169 .all(|(g, (ps, pp))| surface_eq(&g.surface, ps) && pos_eq(&g.pos, pp));
1170 if all_match {
1171 eojeol_correct += 1;
1172 }
1173 }
1174 }
1175
1176 let eojeol_accuracy = if eojeol_total > 0 {
1177 eojeol_correct as f64 / eojeol_total as f64
1178 } else {
1179 0.0
1180 };
1181
1182 DualMetricResult {
1183 morpheme,
1184 eojeol_correct,
1185 eojeol_total,
1186 eojeol_accuracy,
1187 }
1188}
1189
1190#[must_use]
1192pub fn evaluate_dataset_dual_per_eojeol(
1193 tokenizer: &mut Tokenizer,
1194 dataset: &TestDataset,
1195) -> DualMetricResult {
1196 evaluate_dataset_dual_per_eojeol_with_match(
1197 tokenizer,
1198 dataset,
1199 pos_eq_strict,
1200 surface_eq_strict,
1201 )
1202}
1203
/// Eojeol-level accuracy that considers surface forms only.
#[derive(Debug, Clone)]
pub struct EojeolSurfaceResult {
    /// Number of eojeols counted as correct.
    pub correct: usize,
    /// Number of eojeols evaluated.
    pub total: usize,
    /// Fraction in `[0, 1]`; printed as a percentage by `format_report`.
    pub accuracy: f64,
}

impl EojeolSurfaceResult {
    /// Renders a one-line summary, or a note about missing eojeol information when
    /// nothing was evaluated.
    #[must_use]
    pub fn format_report(&self) -> String {
        if self.total == 0 {
            return String::from("어절 정보 없음 (legacy 2-column TSV)");
        }

        let percent = self.accuracy * 100.0;
        format!(
            "Eojeol Surface-only Accuracy: {:.1}% ({} / {})",
            percent, self.correct, self.total
        )
    }
}
1238
1239#[allow(clippy::cast_precision_loss)]
1257pub fn evaluate_dataset_eojeol_surface_only_with_match(
1258 tokenizer: &mut Tokenizer,
1259 dataset: &TestDataset,
1260 surface_eq: SurfaceMatchFn,
1261) -> EojeolSurfaceResult {
1262 let converter = SejongConverter::new();
1263 let mut correct: usize = 0;
1264 let mut total: usize = 0;
1265
1266 for gold_sentence in &dataset.sentences {
1267 let Some(eojeol_counts) = &gold_sentence.eojeol_counts else {
1268 continue;
1269 };
1270 let eojeols: Vec<&str> = gold_sentence.text.split_whitespace().collect();
1271 if eojeols.len() != eojeol_counts.len() {
1272 continue;
1273 }
1274
1275 let mut gold_idx: usize = 0;
1276 for (eo_i, &count_g) in eojeol_counts.iter().enumerate() {
1277 total += 1;
1278 if gold_idx + count_g > gold_sentence.tokens.len() {
1279 gold_idx = gold_sentence.tokens.len();
1280 continue;
1281 }
1282 let gold_slice = &gold_sentence.tokens[gold_idx..gold_idx + count_g];
1283 gold_idx += count_g;
1284
1285 let gold_concat: String = gold_slice.iter().map(|t| t.surface.as_str()).collect();
1286
1287 let pred_raw = tokenizer.tokenize(eojeols[eo_i]);
1288 let pred_sejong = converter.convert_tokens(&pred_raw);
1289 let pred_concat: String = pred_sejong
1290 .iter()
1291 .map(|t| SejongConverter::normalize_jamo(&t.surface))
1292 .collect();
1293
1294 if surface_eq(&gold_concat, &pred_concat) {
1295 correct += 1;
1296 }
1297 }
1298 }
1299
1300 let accuracy = if total > 0 {
1301 correct as f64 / total as f64
1302 } else {
1303 0.0
1304 };
1305
1306 EojeolSurfaceResult {
1307 correct,
1308 total,
1309 accuracy,
1310 }
1311}
1312
1313#[must_use]
1317pub fn evaluate_dataset_eojeol_surface_only(
1318 tokenizer: &mut Tokenizer,
1319 dataset: &TestDataset,
1320) -> EojeolSurfaceResult {
1321 evaluate_dataset_eojeol_surface_only_with_match(tokenizer, dataset, surface_eq_strict)
1322}
1323
1324#[allow(clippy::cast_precision_loss)]
1337pub fn evaluate_dataset_dual_with_match(
1338 tokenizer: &mut Tokenizer,
1339 dataset: &TestDataset,
1340 pos_eq: PosMatchFn,
1341 surface_eq: SurfaceMatchFn,
1342) -> DualMetricResult {
1343 let morpheme = evaluate_dataset_sejong_with_match(tokenizer, dataset, pos_eq, surface_eq);
1345
1346 let converter = SejongConverter::new();
1348 let mut eojeol_correct: usize = 0;
1349 let mut eojeol_total: usize = 0;
1350
1351 for gold_sentence in &dataset.sentences {
1352 let Some(counts) = &gold_sentence.eojeol_counts else {
1353 continue;
1354 };
1355
1356 let pred_raw = tokenizer.tokenize(&gold_sentence.text);
1357 let pred_sejong = converter.convert_tokens(&pred_raw);
1358
1359 let pred_morphs: Vec<(String, String)> = pred_sejong
1360 .iter()
1361 .map(|t| {
1362 (
1363 SejongConverter::normalize_jamo(&t.surface),
1364 t.pos.clone(),
1365 )
1366 })
1367 .collect();
1368
1369 let mut gold_idx = 0;
1370 let mut pred_idx = 0;
1371
1372 for &count in counts {
1373 eojeol_total += 1;
1374
1375 let gold_end = gold_idx + count;
1376 let pred_end = pred_idx + count;
1377
1378 if gold_end > gold_sentence.tokens.len() || pred_end > pred_morphs.len() {
1379 gold_idx = gold_end.min(gold_sentence.tokens.len());
1380 pred_idx = pred_end.min(pred_morphs.len());
1381 continue;
1382 }
1383
1384 let gold_slice = &gold_sentence.tokens[gold_idx..gold_end];
1385 let pred_slice = &pred_morphs[pred_idx..pred_end];
1386
1387 let matches = gold_slice
1388 .iter()
1389 .zip(pred_slice.iter())
1390 .all(|(g, (p_surf, p_pos))| {
1391 surface_eq(&g.surface, p_surf) && pos_eq(&g.pos, p_pos)
1392 });
1393
1394 if matches {
1395 eojeol_correct += 1;
1396 }
1397
1398 gold_idx = gold_end;
1399 pred_idx = pred_end;
1400 }
1401 }
1402
1403 let eojeol_accuracy = if eojeol_total > 0 {
1404 eojeol_correct as f64 / eojeol_total as f64
1405 } else {
1406 0.0
1407 };
1408
1409 DualMetricResult {
1410 morpheme,
1411 eojeol_correct,
1412 eojeol_total,
1413 eojeol_accuracy,
1414 }
1415}
1416
1417#[cfg(test)]
1418mod tests {
1419 use super::*;
1420
1421 #[test]
1422 fn test_pos_tags_equivalent_strict_match() {
1423 assert!(pos_tags_equivalent("NNG", "NNG"));
1424 assert!(pos_tags_equivalent("VV", "VV"));
1425 }
1426
1427 #[test]
1428 fn test_pos_tags_equivalent_groups() {
1429 assert!(pos_tags_equivalent("SP", "SC"));
1431 assert!(pos_tags_equivalent("SC", "SP"));
1432 assert!(pos_tags_equivalent("SS", "SY"));
1434 assert!(pos_tags_equivalent("SS", "SSO"));
1435 assert!(pos_tags_equivalent("SSC", "SY"));
1436 assert!(pos_tags_equivalent("MM", "MMD"));
1438 assert!(pos_tags_equivalent("MMA", "MMN"));
1439 }
1440
1441 #[test]
1442 fn test_pos_tags_equivalent_distinct() {
1443 assert!(!pos_tags_equivalent("NNG", "NNP")); assert!(!pos_tags_equivalent("NNG", "NNB")); assert!(!pos_tags_equivalent("VV", "VA"));
1447 assert!(!pos_tags_equivalent("EC", "EF"));
1448 assert!(!pos_tags_equivalent("SP", "SS"));
1450 assert!(!pos_tags_equivalent("MM", "SP"));
1451 }
1452
1453 #[test]
1454 fn test_pos_tags_equivalent_sl_nnp_added_in_sprint126() {
1455 assert!(pos_tags_equivalent("SL", "NNP"));
1457 assert!(pos_tags_equivalent("NNP", "SL"));
1458 }
1459
1460 #[test]
1461 fn test_pos_tags_equivalent_practical_includes_nnb_nng() {
1462 assert!(pos_tags_equivalent_practical("NNB", "NNG"));
1464 assert!(pos_tags_equivalent_practical("NNG", "NNB"));
1465 assert!(!pos_tags_equivalent("NNB", "NNG"));
1467 }
1468
1469 #[test]
1470 fn test_pos_tags_equivalent_practical_includes_va_vv() {
1471 assert!(pos_tags_equivalent_practical("VA", "VV"));
1473 assert!(pos_tags_equivalent_practical("VV", "VA"));
1474 assert!(!pos_tags_equivalent("VA", "VV"));
1476 }
1477
1478 #[test]
1479 fn test_surface_eq_strict_basic() {
1480 assert!(surface_eq_strict("한", "한"));
1481 assert!(!surface_eq_strict("한", "하ㄴ"));
1482 }
1483
1484 #[test]
1485 fn test_surface_eq_canonical_jamo_syllable_mix() {
1486 assert!(surface_eq_canonical("한", "하ㄴ"));
1488 assert!(surface_eq_canonical("함께", "하ㅁ께"));
1489 assert!(surface_eq_canonical("역할", "역하ㄹ"));
1490 }
1491
1492 #[test]
1493 fn test_surface_eq_canonical_pure_strict_match() {
1494 assert!(surface_eq_canonical("한", "한"));
1496 assert!(surface_eq_canonical("ㄱㅏ", "ㄱㅏ"));
1497 }
1498
1499 #[test]
1500 fn test_surface_eq_canonical_distinct_words() {
1501 assert!(!surface_eq_canonical("한", "둘"));
1503 assert!(!surface_eq_canonical("것이", "게"));
1504 }
1505
1506 #[test]
1507 fn test_surface_eq_canonical_lenient_endings() {
1508 assert!(surface_eq_canonical_lenient("인정하였다", "인정하았다"));
1510 assert!(surface_eq_canonical_lenient("등장하여", "등장하어"));
1512 assert!(surface_eq_canonical_lenient("통하여", "통하어"));
1513 assert!(surface_eq_canonical_lenient("함께", "하ㅁ께"));
1515 }
1516
1517 #[test]
1518 fn test_surface_eq_canonical_lenient_does_not_overcorrect() {
1519 assert!(!surface_eq_canonical_lenient("먹었다", "먹였다"));
1521 }
1522
1523 #[test]
1524 fn test_surface_eq_canonical_lenient_haa_to_haye() {
1525 assert!(surface_eq_canonical_lenient("편하아요", "편하어요"));
1527 assert!(surface_eq_canonical_lenient("가능하아요", "가능하어요"));
1528 assert!(surface_eq_canonical_lenient("말하아", "말하어"));
1530 }
1531
1532 #[test]
1533 fn test_surface_eq_canonical_lenient_imnida() {
1534 assert!(surface_eq_canonical_lenient("것입니다", "것이습니다"));
1536 assert!(surface_eq_canonical_lenient("숙소입니다", "숙소이습니다"));
1537 assert!(surface_eq_canonical_lenient("입니다", "이습니다"));
1538 assert!(surface_eq_canonical_lenient("것이ㅂ니다", "것이습니다"));
1540 }
1541
1542 #[test]
1543 fn test_surface_eq_canonical_lenient_r_irregular() {
1544 assert!(surface_eq_canonical_lenient("따라", "따르아"));
1546 assert!(surface_eq_canonical_lenient("따라서", "따르아서"));
1547 assert!(surface_eq_canonical_lenient("몰라요", "모르아요"));
1548 assert!(surface_eq_canonical_lenient("달라", "다르아"));
1549 assert!(surface_eq_canonical_lenient("불러", "부르어"));
1550 assert!(surface_eq_canonical_lenient("흘러", "흐르어"));
1551 assert!(surface_eq_canonical_lenient("올라", "오르아"));
1552 assert!(surface_eq_canonical_lenient("잘라", "자르아"));
1553 assert!(surface_eq_canonical_lenient("눌러", "누르어"));
1554 assert!(surface_eq_canonical_lenient("골라", "고르아"));
1555 }
1556
1557 #[test]
1558 fn test_surface_eq_canonical_lenient_r_irregular_does_not_overcorrect() {
1559 assert!(!surface_eq_canonical_lenient("푸르러", "푸르어"));
1562 assert!(!surface_eq_canonical_lenient("길러", "기르어"));
1564 }
1565
1566 #[test]
1567 fn test_surface_eq_canonical_lenient_imnida_overcorrect() {
1568 assert!(!surface_eq_canonical_lenient("이것입니다", "그것입니다"));
1573 assert!(!surface_eq_canonical_lenient("입니다", "다닙니다"));
1575 }
1576
// Checks which tag pairs the practical equivalence accepts. Per the test name
// the accepted pairs are presumably those of the conservative rule set
// (NOTE(review): confirm); NNG vs NNP must still be told apart.
#[test]
fn test_pos_tags_equivalent_practical_inherits_conservative() {
    let cases = [
        ("SP", "SC", true),
        ("SS", "SSO", true),
        ("MM", "MMD", true),
        ("SL", "NNP", true),
        ("NNG", "NNP", false),
    ];

    for (first, second, expected) in cases {
        assert_eq!(
            pos_tags_equivalent_practical(first, second),
            expected,
            "unexpected equivalence verdict for ({first}, {second})"
        );
    }
}
1587
// Strict comparison accepts only identical tags.
#[test]
fn test_pos_eq_strict() {
    let identical = pos_eq_strict("NNG", "NNG");
    assert!(identical);

    let noun_subtypes = pos_eq_strict("NNG", "NNP");
    assert!(!noun_subtypes);

    let symbol_tags = pos_eq_strict("SP", "SC");
    assert!(!symbol_tags);
}
1594
// With all counters at zero, the formatted report is expected to mention
// "legacy".
#[test]
fn test_eojeol_surface_result_format_empty() {
    let report = EojeolSurfaceResult {
        correct: 0,
        total: 0,
        accuracy: 0.0,
    }
    .format_report();

    assert!(report.contains("legacy"));
}
1604
// A populated result must surface the percentage and both raw counters in
// its formatted report.
#[test]
fn test_eojeol_surface_result_format_populated() {
    let report = EojeolSurfaceResult {
        correct: 875,
        total: 1000,
        accuracy: 0.875,
    }
    .format_report();

    for fragment in ["87.5%", "875", "1000"] {
        assert!(
            report.contains(fragment),
            "report is missing {fragment:?}"
        );
    }
}
1617
// `GoldToken::parse` accepts exactly one `surface/pos` pair; input with no
// slash or with more than one slash is an error.
#[test]
fn test_gold_token_parse() {
    let GoldToken { surface, pos } = GoldToken::parse("나/NP").unwrap();
    assert_eq!(surface, "나");
    assert_eq!(pos, "NP");

    for malformed in ["invalid", "too/many/parts"] {
        assert!(
            GoldToken::parse(malformed).is_err(),
            "expected {malformed:?} to be rejected"
        );
    }
}
1627
// A two-column TSV line yields the raw text plus one gold token per
// whitespace-separated `surface/pos` entry.
#[test]
fn test_gold_sentence_parse() {
    let line = "나는 학생이다\t나/NP 는/JX 학생/NNG 이/VCP 다/EF";
    let parsed = GoldSentence::parse_tsv_line(line).unwrap();

    assert_eq!(parsed.text, "나는 학생이다");
    assert_eq!(parsed.tokens.len(), 5);

    let first = &parsed.tokens[0];
    assert_eq!(first.surface, "나");
    assert_eq!(first.pos, "NP");
}
1638
// Prediction reproduces both gold tokens exactly: 2 true positives and no
// false positives or false negatives.
#[test]
fn test_evaluate_tokens_perfect_match() {
    // Only surface, POS and the offsets vary between the predicted tokens;
    // every other field is the same neutral value.
    let predicted = |surface: &str, pos: &str, start_pos, end_pos, start_byte, end_byte| Token {
        surface: surface.to_string(),
        pos: pos.to_string(),
        start_pos,
        end_pos,
        start_byte,
        end_byte,
        reading: None,
        lemma: None,
        cost: 0,
        features: String::new(),
        normalized: None,
    };

    let gold: Vec<GoldToken> = [("나", "NP"), ("는", "JX")]
        .iter()
        .map(|&(surface, pos)| GoldToken::new(surface.to_string(), pos.to_string()))
        .collect();

    let pred = vec![
        predicted("나", "NP", 0, 1, 0, 3),
        predicted("는", "JX", 1, 2, 3, 6),
    ];

    let (true_pos, false_pos, false_neg, _) = evaluate_tokens(&gold, &pred);
    assert_eq!(true_pos, 2);
    assert_eq!(false_pos, 0);
    assert_eq!(false_neg, 0);
}
1680
// Prediction contains only the first of two gold tokens: 1 true positive,
// no false positive, and 1 false negative for the missing token.
#[test]
fn test_evaluate_tokens_mismatch() {
    let predicted = |surface: &str, pos: &str, start_pos, end_pos, start_byte, end_byte| Token {
        surface: surface.to_string(),
        pos: pos.to_string(),
        start_pos,
        end_pos,
        start_byte,
        end_byte,
        reading: None,
        lemma: None,
        cost: 0,
        features: String::new(),
        normalized: None,
    };

    let gold: Vec<GoldToken> = [("나", "NP"), ("는", "JX")]
        .iter()
        .map(|&(surface, pos)| GoldToken::new(surface.to_string(), pos.to_string()))
        .collect();

    let pred = vec![predicted("나", "NP", 0, 1, 0, 3)];

    let (true_pos, false_pos, false_neg, _) = evaluate_tokens(&gold, &pred);
    assert_eq!(true_pos, 1);
    assert_eq!(false_pos, 0);
    assert_eq!(false_neg, 1);
}
1707
// Fills an `EvaluationResult` by hand and checks that the formatted report
// shows the sentence count, the token accuracy as a percentage and the F1
// score rounded to three decimals.
#[test]
fn test_evaluation_result_format() {
    let report = {
        let mut stats = EvaluationResult::new();

        // Raw counters.
        stats.total_sentences = 10;
        stats.exact_match_sentences = 7;
        stats.total_gold_tokens = 50;
        stats.total_pred_tokens = 48;
        stats.true_positives = 45;
        stats.false_positives = 3;
        stats.false_negatives = 5;

        // Derived ratios, set directly rather than computed.
        stats.token_accuracy = 0.9;
        stats.sentence_accuracy = 0.7;
        stats.pos_accuracy = 0.92;
        stats.precision = 0.9375;
        stats.recall = 0.9;
        stats.f1_score = 0.9184;

        stats.format_report()
    };

    for fragment in ["테스트 문장: 10", "Token Accuracy: 90.0%", "F1 Score: 0.918"] {
        assert!(
            report.contains(fragment),
            "report is missing {fragment:?}"
        );
    }
}
1730
// Writes a small TSV fixture to a temporary file and loads it back. The
// fixture contains a `#` comment line and a blank line ahead of two data
// rows; the loader is expected to return only the two data rows.
#[test]
#[cfg(feature = "test-utils")]
fn test_dataset_from_tsv() {
    use std::io::Write;

    let mut file = tempfile::NamedTempFile::new().unwrap();
    writeln!(file, "# 주석").unwrap();
    // Bare `writeln!` emits just the newline, i.e. the blank line under test.
    writeln!(file).unwrap();
    writeln!(file, "나는 학생\t나/NP 는/JX 학생/NNG").unwrap();
    writeln!(file, "오늘 날씨\t오늘/NNG 날씨/NNG").unwrap();
    // Make sure everything is on disk before the loader opens the path.
    file.flush().unwrap();

    let dataset = TestDataset::from_tsv(file.path()).unwrap();
    assert_eq!(dataset.len(), 2);

    let first = &dataset.sentences[0];
    assert_eq!(first.text, "나는 학생");
    assert_eq!(first.tokens.len(), 3);

    let second = &dataset.sentences[1];
    assert_eq!(second.text, "오늘 날씨");
    assert_eq!(second.tokens.len(), 2);
}
1750}