1use crate::pos_tag::PosTag;
25use crate::tokenizer::{Token, Tokenizer};
26use crate::Result;
27use std::collections::HashSet;
28
/// Built-in compound-noun dictionary.
///
/// Each entry pairs a compound surface form with its ordered parts, where
/// every part is `(surface, POS tag)`. `NoriTokenizer::try_dict_decompose`
/// looks entries up by exact match on the token surface; lookup does not
/// depend on entry order.
///
/// Entries with a single part (for example `"아파트"`) list the word as its
/// own only component, so they yield no dictionary-driven split.
const COMPOUND_DICT: &[(&str, &[(&str, &str)])] = &[
    // Technology terms
    ("형태소분석기", &[("형태소", "NNG"), ("분석기", "NNG")]),
    ("형태소분석", &[("형태소", "NNG"), ("분석", "NNG")]),
    ("자연어처리", &[("자연어", "NNG"), ("처리", "NNG")]),
    ("인공지능", &[("인공", "NNG"), ("지능", "NNG")]),
    ("기계학습", &[("기계", "NNG"), ("학습", "NNG")]),
    ("딥러닝", &[("딥", "NNG"), ("러닝", "NNG")]),
    ("데이터베이스", &[("데이터", "NNG"), ("베이스", "NNG")]),
    ("운영체제", &[("운영", "NNG"), ("체제", "NNG")]),
    // NOTE(review): these parts do not concatenate back to the surface
    // ("프로그램" + "밍" vs "프로그래밍"); only their lengths line up.
    ("프로그래밍", &[("프로그램", "NNG"), ("밍", "XSN")]),
    ("소프트웨어", &[("소프트", "NNG"), ("웨어", "NNG")]),
    ("하드웨어", &[("하드", "NNG"), ("웨어", "NNG")]),
    // Nation, government and public institutions
    ("대한민국", &[("대한", "NNP"), ("민국", "NNG")]),
    ("국립국어원", &[("국립", "NNG"), ("국어원", "NNP")]),
    ("대통령", &[("대", "XPN"), ("통령", "NNG")]),
    ("국무총리", &[("국무", "NNG"), ("총리", "NNG")]),
    ("대법원", &[("대", "XPN"), ("법원", "NNG")]),
    ("헌법재판소", &[("헌법", "NNG"), ("재판소", "NNG")]),
    ("국회의원", &[("국회", "NNG"), ("의원", "NNG")]),
    (
        "지방자치단체",
        &[("지방", "NNG"), ("자치", "NNG"), ("단체", "NNG")],
    ),
    // Education
    ("대학교", &[("대학", "NNG"), ("교", "NNG")]),
    ("초등학교", &[("초등", "NNG"), ("학교", "NNG")]),
    ("중학교", &[("중", "XPN"), ("학교", "NNG")]),
    ("고등학교", &[("고등", "NNG"), ("학교", "NNG")]),
    ("운동장", &[("운동", "NNG"), ("장", "NNG")]),
    ("도서관", &[("도서", "NNG"), ("관", "NNG")]),
    ("교과서", &[("교과", "NNG"), ("서", "NNG")]),
    // Everyday places and transport (several are single-part entries)
    ("아파트", &[("아파트", "NNG")]),
    ("백화점", &[("백화", "NNG"), ("점", "NNG")]),
    ("주차장", &[("주차", "NNG"), ("장", "NNG")]),
    ("병원", &[("병원", "NNG")]),
    ("약국", &[("약국", "NNG")]),
    ("편의점", &[("편의", "NNG"), ("점", "NNG")]),
    ("공항", &[("공항", "NNG")]),
    ("지하철", &[("지하", "NNG"), ("철", "NNG")]),
    ("버스정류장", &[("버스", "NNG"), ("정류장", "NNG")]),
    // Finance
    ("주식시장", &[("주식", "NNG"), ("시장", "NNG")]),
    ("부동산", &[("부동", "NNG"), ("산", "NNG")]),
    ("신용카드", &[("신용", "NNG"), ("카드", "NNG")]),
    ("은행계좌", &[("은행", "NNG"), ("계좌", "NNG")]),
    // Environment and energy
    ("지구온난화", &[("지구", "NNG"), ("온난화", "NNG")]),
    ("환경오염", &[("환경", "NNG"), ("오염", "NNG")]),
    ("태양광", &[("태양", "NNG"), ("광", "NNG")]),
    ("풍력발전", &[("풍력", "NNG"), ("발전", "NNG")]),
    // Healthcare
    ("건강보험", &[("건강", "NNG"), ("보험", "NNG")]),
    ("의료기관", &[("의료", "NNG"), ("기관", "NNG")]),
    ("응급실", &[("응급", "NNG"), ("실", "NNG")]),
    ("수술실", &[("수술", "NNG"), ("실", "NNG")]),
];
92
/// Noun prefixes recognised by `NoriTokenizer::try_extract_prefix`.
///
/// Each entry is `(prefix surface, POS tag)`; every prefix is tagged `XPN`.
/// Every prefix appears exactly once (the table previously listed `"반"`
/// twice).
const PREFIXES: &[(&str, &str)] = &[
    ("신", "XPN"),
    ("구", "XPN"),
    ("총", "XPN"),
    ("부", "XPN"),
    ("대", "XPN"),
    ("소", "XPN"),
    ("중", "XPN"),
    ("고", "XPN"),
    ("저", "XPN"),
    ("최", "XPN"),
    ("초", "XPN"),
    ("준", "XPN"),
    ("범", "XPN"),
    ("반", "XPN"),
    ("비", "XPN"),
    ("미", "XPN"),
    ("재", "XPN"),
    ("전", "XPN"),
    ("후", "XPN"),
    ("무", "XPN"),
    ("유", "XPN"),
    ("친", "XPN"),
];
120
/// Noun suffixes recognised by `NoriTokenizer::try_extract_suffix`.
///
/// Each entry is `(suffix surface, POS tag)`; every suffix is tagged `XSN`.
#[rustfmt::skip]
const SUFFIXES: &[(&str, &str)] = &[
    ("들", "XSN"), ("님", "XSN"), ("씨", "XSN"), ("꾼", "XSN"),
    ("쟁이", "XSN"), ("치", "XSN"), ("가", "XSN"), ("자", "XSN"),
    ("사", "XSN"), ("원", "XSN"), ("인", "XSN"), ("생", "XSN"),
    ("장", "XSN"), ("실", "XSN"), ("관", "XSN"), ("소", "XSN"),
    ("점", "XSN"), ("기", "XSN"), ("화", "XSN"), ("적", "XSN"),
    ("성", "XSN"), ("율", "XSN"), ("도", "XSN"), ("비", "XSN"),
    ("권", "XSN"), ("론", "XSN"), ("학", "XSN"), ("계", "XSN"),
];
153
/// Controls how compound nouns are emitted by the tokenizer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DecompoundMode {
    /// Compounds are never split; only the original token is emitted.
    None,

    /// When a split is found, the parts replace the original token.
    Discard,

    /// The original token is emitted, followed by any parts found.
    Mixed,
}

impl DecompoundMode {
    /// Parses a mode name, ignoring letter case.
    ///
    /// Returns `None` (the `Option` variant) when `s` is not one of
    /// `"none"`, `"discard"` or `"mixed"`.
    #[must_use]
    pub fn parse(s: &str) -> Option<Self> {
        let wanted = s.to_lowercase();
        [Self::None, Self::Discard, Self::Mixed]
            .iter()
            .copied()
            .find(|mode| mode.as_str() == wanted)
    }

    /// Alias for [`DecompoundMode::parse`].
    #[must_use]
    #[allow(clippy::should_implement_trait)]
    pub fn from_str(s: &str) -> Option<Self> {
        Self::parse(s)
    }

    /// Returns the lowercase name of this mode.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match *self {
            Self::Mixed => "mixed",
            Self::Discard => "discard",
            Self::None => "none",
        }
    }
}
207
/// A single token in the Nori-style output.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NoriToken {
    /// Surface form of the token.
    pub surface: String,
    /// Part-of-speech tag as a string.
    pub pos_tag: String,
    /// Start position, counted in characters (not bytes) of the input text.
    pub start_offset: usize,
    /// End position, counted in characters (not bytes) of the input text.
    pub end_offset: usize,
    /// Lemma copied from the underlying token; `None` for generated parts.
    pub lemma: Option<String>,
    /// Reading copied from the underlying token; `None` for generated parts.
    pub reading: Option<String>,
    /// Whether the token was recognised (see [`WordType`]).
    pub word_type: WordType,
    /// `true` when the token is a part produced by splitting a compound.
    pub is_decompound: bool,
}
230
/// Origin classification of a token.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WordType {
    /// The token's POS tag was recognised.
    Known,
    /// The token's POS tag was not recognised.
    Unknown,
    /// Reported as `"USER"`; not produced by the code in this file.
    User,
}

impl WordType {
    /// Returns the uppercase label for this word type.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match *self {
            Self::User => "USER",
            Self::Unknown => "UNKNOWN",
            Self::Known => "KNOWN",
        }
    }
}
253
/// Tokenizer that converts the output of the underlying [`Tokenizer`] into
/// [`NoriToken`]s, optionally splitting compound nouns.
pub struct NoriTokenizer {
    /// Underlying morphological tokenizer.
    tokenizer: Tokenizer,
    /// How compound nouns (`NNG`/`NNP`) are emitted.
    decompound_mode: DecompoundMode,
    /// When `true`, tokens with an unknown POS tag are emitted one character
    /// at a time.
    output_unknown_unigrams: bool,
}
265
266impl NoriTokenizer {
267 pub fn new(decompound_mode: DecompoundMode, output_unknown_unigrams: bool) -> Result<Self> {
286 Ok(Self {
287 tokenizer: Tokenizer::new()?,
288 decompound_mode,
289 output_unknown_unigrams,
290 })
291 }
292
293 pub fn with_dict(
299 dict_path: &str,
300 decompound_mode: DecompoundMode,
301 output_unknown_unigrams: bool,
302 ) -> Result<Self> {
303 Ok(Self {
304 tokenizer: Tokenizer::with_dict(dict_path)?,
305 decompound_mode,
306 output_unknown_unigrams,
307 })
308 }
309
310 pub fn tokenize(&mut self, text: &str) -> Result<Vec<NoriToken>> {
327 let mecab_tokens = self.tokenizer.tokenize(text);
328 let mut nori_tokens = Vec::new();
329
330 for token in &mecab_tokens {
331 let nori_token = self.convert_token(token, text);
332 nori_tokens.extend(nori_token);
333 }
334
335 Ok(nori_tokens)
336 }
337
338 fn convert_token(&self, token: &Token, text: &str) -> Vec<NoriToken> {
340 let pos_tag = token.pos.parse::<PosTag>().unwrap_or(PosTag::Unknown);
341 let nori_tag = pos_tag.to_nori_compat();
342
343 let mut tokens = vec![NoriToken {
345 surface: token.surface.clone(),
346 pos_tag: nori_tag.as_str().to_string(),
347 start_offset: char_offset(text, token.start_byte),
348 end_offset: char_offset(text, token.end_byte),
349 lemma: token.lemma.clone(),
350 reading: token.reading.clone(),
351 word_type: if pos_tag == PosTag::Unknown {
352 WordType::Unknown
353 } else {
354 WordType::Known
355 },
356 is_decompound: false,
357 }];
358
359 if self.should_decompound(pos_tag) {
361 let decompounded = Self::decompound_token_enhanced(token, text);
362 tokens = self.apply_decompound_mode(tokens, decompounded);
363 }
364
365 if self.output_unknown_unigrams && pos_tag == PosTag::Unknown {
367 tokens = Self::split_unknown_to_unigrams(token, text);
368 }
369
370 tokens
371 }
372
373 fn should_decompound(&self, pos_tag: PosTag) -> bool {
375 self.decompound_mode != DecompoundMode::None && matches!(pos_tag, PosTag::NNG | PosTag::NNP)
376 }
377
378 fn decompound_token_enhanced(token: &Token, text: &str) -> Vec<NoriToken> {
386 if let Some(tokens) = Self::try_dict_decompose(token, text) {
388 return tokens;
389 }
390
391 if let Some(tokens) = Self::try_extract_suffix(token, text) {
393 return tokens;
394 }
395
396 if let Some(tokens) = Self::try_extract_prefix(token, text) {
398 return tokens;
399 }
400
401 Self::decompound_token(token, text)
403 }
404
405 fn try_dict_decompose(token: &Token, text: &str) -> Option<Vec<NoriToken>> {
409 let surface = &token.surface;
410
411 for (compound, parts) in COMPOUND_DICT {
413 if *compound == surface {
414 if parts.len() <= 1 {
416 return None;
417 }
418
419 let mut result = Vec::with_capacity(parts.len());
420 let mut byte_offset = token.start_byte;
421
422 for (part_surface, part_pos) in *parts {
423 let part_bytes = part_surface.len();
424 result.push(NoriToken {
425 surface: (*part_surface).to_string(),
426 pos_tag: (*part_pos).to_string(),
427 start_offset: char_offset(text, byte_offset),
428 end_offset: char_offset(text, byte_offset + part_bytes),
429 lemma: None,
430 reading: None,
431 word_type: WordType::Known,
432 is_decompound: true,
433 });
434 byte_offset += part_bytes;
435 }
436
437 return Some(result);
438 }
439 }
440
441 None
442 }
443
444 fn try_extract_suffix(token: &Token, text: &str) -> Option<Vec<NoriToken>> {
449 let surface = &token.surface;
450 let chars: Vec<char> = surface.chars().collect();
451
452 if chars.len() < 2 {
453 return None;
454 }
455
456 let mut sorted_suffixes: Vec<_> = SUFFIXES.iter().collect();
458 sorted_suffixes.sort_by_key(|b| std::cmp::Reverse(b.0.len()));
459
460 for (suffix, suffix_tag) in sorted_suffixes {
461 let suffix_chars: Vec<char> = suffix.chars().collect();
462 if chars.len() > suffix_chars.len()
463 && chars[chars.len() - suffix_chars.len()..] == suffix_chars[..]
464 {
465 let stem_len = chars.len() - suffix_chars.len();
467 let stem: String = chars[..stem_len].iter().collect();
468 let stem_bytes = stem.len();
469
470 if stem_len >= 1 {
472 let result = vec![
474 NoriToken {
475 surface: stem,
476 pos_tag: token.pos.clone(),
477 start_offset: char_offset(text, token.start_byte),
478 end_offset: char_offset(text, token.start_byte + stem_bytes),
479 lemma: None,
480 reading: None,
481 word_type: WordType::Known,
482 is_decompound: true,
483 },
484 NoriToken {
485 surface: (*suffix).to_string(),
486 pos_tag: (*suffix_tag).to_string(),
487 start_offset: char_offset(text, token.start_byte + stem_bytes),
488 end_offset: char_offset(text, token.end_byte),
489 lemma: None,
490 reading: None,
491 word_type: WordType::Known,
492 is_decompound: true,
493 },
494 ];
495
496 return Some(result);
497 }
498 }
499 }
500
501 None
502 }
503
504 fn try_extract_prefix(token: &Token, text: &str) -> Option<Vec<NoriToken>> {
509 let surface = &token.surface;
510 let chars: Vec<char> = surface.chars().collect();
511
512 if chars.len() < 2 {
513 return None;
514 }
515
516 let mut sorted_prefixes: Vec<_> = PREFIXES.iter().collect();
518 sorted_prefixes.sort_by_key(|b| std::cmp::Reverse(b.0.len()));
519
520 for (prefix, prefix_tag) in sorted_prefixes {
521 let prefix_chars: Vec<char> = prefix.chars().collect();
522 if chars.len() > prefix_chars.len() && chars[..prefix_chars.len()] == prefix_chars[..] {
523 let rest: String = chars[prefix_chars.len()..].iter().collect();
525 let prefix_bytes = prefix.len();
526 let rest_len = chars.len() - prefix_chars.len();
527
528 if rest_len >= 2 {
530 let result = vec![
532 NoriToken {
533 surface: (*prefix).to_string(),
534 pos_tag: (*prefix_tag).to_string(),
535 start_offset: char_offset(text, token.start_byte),
536 end_offset: char_offset(text, token.start_byte + prefix_bytes),
537 lemma: None,
538 reading: None,
539 word_type: WordType::Known,
540 is_decompound: true,
541 },
542 NoriToken {
543 surface: rest,
544 pos_tag: token.pos.clone(),
545 start_offset: char_offset(text, token.start_byte + prefix_bytes),
546 end_offset: char_offset(text, token.end_byte),
547 lemma: None,
548 reading: None,
549 word_type: WordType::Known,
550 is_decompound: true,
551 },
552 ];
553
554 return Some(result);
555 }
556 }
557 }
558
559 None
560 }
561
562 fn decompound_token(token: &Token, text: &str) -> Vec<NoriToken> {
582 use mecab_ko_hangul::{has_jongseong, is_hangul_syllable};
583
584 let surface = &token.surface;
585 let chars: Vec<char> = surface.chars().collect();
586
587 if chars.len() < 3 {
589 return Vec::new();
590 }
591
592 if !chars.iter().all(|&c| is_hangul_syllable(c)) {
594 return Vec::new();
595 }
596
597 let mut split_positions = Vec::new();
599
600 for i in 1..chars.len() {
601 if i >= chars.len() - 1 {
603 continue;
604 }
605
606 let prev_char = chars[i - 1];
607 let curr_char = chars[i];
608
609 let prev_has_jong = has_jongseong(prev_char) == Some(true);
610 let curr_has_jong = has_jongseong(curr_char) == Some(true);
611
612 let is_boundary = if !prev_has_jong && curr_has_jong {
617 true
619 } else if prev_has_jong && !curr_has_jong {
620 true
622 } else if prev_has_jong && curr_has_jong && i >= 2 {
623 has_jongseong(chars[i - 2]) == Some(true)
625 } else {
626 false
627 };
628
629 if is_boundary {
630 if i >= 1 && chars.len() - i >= 1 {
632 split_positions.push(i);
633 }
634 }
635 }
636
637 if split_positions.is_empty() {
639 let mid = chars.len() / 2;
640 if mid >= 1 && chars.len() - mid >= 1 {
641 split_positions.push(mid);
642 }
643 }
644
645 if split_positions.len() > 2 {
647 let first = split_positions[0];
649 let last = split_positions[split_positions.len() - 1];
650 split_positions = vec![first, last];
651 }
652
653 if split_positions.is_empty() {
654 return Vec::new();
655 }
656
657 let mut result = Vec::new();
659 let mut start_idx = 0;
660 let mut byte_offset = token.start_byte;
661
662 for &split_pos in &split_positions {
663 if split_pos <= start_idx {
664 continue;
665 }
666
667 let part: String = chars[start_idx..split_pos].iter().collect();
668 let part_len_bytes = part.len();
669
670 if !part.is_empty() && split_pos - start_idx >= 1 {
672 result.push(NoriToken {
673 surface: part,
674 pos_tag: token.pos.clone(),
675 start_offset: char_offset(text, byte_offset),
676 end_offset: char_offset(text, byte_offset + part_len_bytes),
677 lemma: None,
678 reading: None,
679 word_type: WordType::Known,
680 is_decompound: true,
681 });
682 }
683
684 byte_offset += part_len_bytes;
685 start_idx = split_pos;
686 }
687
688 if start_idx < chars.len() {
690 let part: String = chars[start_idx..].iter().collect();
691 let part_len_bytes = part.len();
692
693 if !part.is_empty() {
695 result.push(NoriToken {
696 surface: part,
697 pos_tag: token.pos.clone(),
698 start_offset: char_offset(text, byte_offset),
699 end_offset: char_offset(text, byte_offset + part_len_bytes),
700 lemma: None,
701 reading: None,
702 word_type: WordType::Known,
703 is_decompound: true,
704 });
705 }
706 }
707
708 result
709 }
710
711 fn apply_decompound_mode(
713 &self,
714 original: Vec<NoriToken>,
715 decompounded: Vec<NoriToken>,
716 ) -> Vec<NoriToken> {
717 match self.decompound_mode {
718 DecompoundMode::None => original,
719 DecompoundMode::Discard => {
720 if decompounded.is_empty() {
721 original
722 } else {
723 decompounded
724 }
725 }
726 DecompoundMode::Mixed => {
727 let mut result = original;
728 result.extend(decompounded);
729 result
730 }
731 }
732 }
733
734 fn split_unknown_to_unigrams(token: &Token, text: &str) -> Vec<NoriToken> {
736 let chars: Vec<char> = token.surface.chars().collect();
737 let mut tokens = Vec::new();
738 let mut char_pos = token.start_byte;
739
740 for ch in chars {
741 let surface = ch.to_string();
742 let char_len = ch.len_utf8();
743
744 tokens.push(NoriToken {
745 surface,
746 pos_tag: "UNKNOWN".to_string(),
747 start_offset: char_offset(text, char_pos),
748 end_offset: char_offset(text, char_pos + char_len),
749 lemma: None,
750 reading: None,
751 word_type: WordType::Unknown,
752 is_decompound: false,
753 });
754
755 char_pos += char_len;
756 }
757
758 tokens
759 }
760}
761
/// Analyzer that tokenizes text with a [`NoriTokenizer`] and then drops
/// tokens whose POS tag is in the stop-tag set.
pub struct NoriAnalyzer {
    /// Tokenizer producing the unfiltered token stream.
    tokenizer: NoriTokenizer,
    /// POS tags whose tokens are removed from the output.
    stoptags: HashSet<String>,
    /// User dictionary value passed to `new`; stored but not read by any
    /// code in this file.
    _user_dictionary: Option<String>,
}
773
774impl NoriAnalyzer {
775 pub fn new(
802 user_dictionary: Option<String>,
803 decompound_mode: DecompoundMode,
804 stoptags: Vec<String>,
805 output_unknown_unigrams: bool,
806 ) -> Result<Self> {
807 Ok(Self {
808 tokenizer: NoriTokenizer::new(decompound_mode, output_unknown_unigrams)?,
809 stoptags: stoptags.into_iter().collect(),
810 _user_dictionary: user_dictionary,
811 })
812 }
813
814 pub fn default_with_decompound(decompound_mode: DecompoundMode) -> Result<Self> {
828 Self::new(
829 None,
830 decompound_mode,
831 vec!["J".to_string(), "E".to_string()],
832 false,
833 )
834 }
835
836 pub fn analyze(&mut self, text: &str) -> Result<Vec<NoriToken>> {
851 let tokens = self.tokenizer.tokenize(text)?;
852 Ok(self.filter_stoptags(tokens))
853 }
854
855 fn filter_stoptags(&self, tokens: Vec<NoriToken>) -> Vec<NoriToken> {
857 if self.stoptags.is_empty() {
858 return tokens;
859 }
860
861 tokens
862 .into_iter()
863 .filter(|token| !self.stoptags.contains(&token.pos_tag))
864 .collect()
865 }
866
867 pub fn add_stoptag(&mut self, tag: String) {
869 self.stoptags.insert(tag);
870 }
871
872 pub fn remove_stoptag(&mut self, tag: &str) -> bool {
874 self.stoptags.remove(tag)
875 }
876
877 #[must_use]
879 pub fn stoptags(&self) -> Vec<&str> {
880 self.stoptags.iter().map(String::as_str).collect()
881 }
882}
883
884#[must_use]
896pub fn mecab_to_nori_tag(mecab_tag: &str) -> String {
897 mecab_tag.parse::<PosTag>().map_or_else(
898 |_| mecab_tag.to_string(),
899 |tag| tag.to_nori_compat().as_str().to_string(),
900 )
901}
902
/// Converts a Nori POS tag into a representative MeCab tag.
///
/// `"J"` maps to `"JX"` and `"E"` maps to `"EF"`; every other tag is
/// returned unchanged.
#[must_use]
pub fn nori_to_mecab_tag(nori_tag: &str) -> String {
    let mecab_tag = match nori_tag {
        "J" => "JX",
        "E" => "EF",
        other => other,
    };
    mecab_tag.to_string()
}
927
/// Converts a byte offset into `text` to a character offset.
///
/// Offsets past the end of `text` are clamped to its length. An offset that
/// falls inside a multi-byte character is rounded down to the start of that
/// character instead of panicking, so the partial character is not counted.
fn char_offset(text: &str, byte_offset: usize) -> usize {
    let mut end = byte_offset.min(text.len());
    // Index 0 is always a char boundary, so this loop terminates.
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    text[..end].chars().count()
}
932
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Builds a `Token` that starts at the beginning of the text and spans
    /// all of `surface`, with the position fields derived from the surface
    /// and a feature string that begins with `pos`.
    fn token_at_start(surface: &str, pos: &str) -> Token {
        Token {
            surface: surface.to_string(),
            pos: pos.to_string(),
            start_pos: 0,
            end_pos: surface.chars().count(),
            start_byte: 0,
            end_byte: surface.len(),
            reading: None,
            lemma: None,
            cost: 0,
            features: format!("{pos},*,*,*,*,*,*,*"),
            normalized: None,
        }
    }

    #[test]
    fn test_decompound_mode_from_str() {
        assert_eq!(DecompoundMode::parse("none"), Some(DecompoundMode::None));
        assert_eq!(
            DecompoundMode::parse("discard"),
            Some(DecompoundMode::Discard)
        );
        assert_eq!(DecompoundMode::parse("mixed"), Some(DecompoundMode::Mixed));
        assert_eq!(DecompoundMode::parse("NONE"), Some(DecompoundMode::None));
        assert_eq!(DecompoundMode::parse("invalid"), None);

        // `from_str` is an alias of `parse`.
        assert_eq!(
            DecompoundMode::from_str("mixed"),
            Some(DecompoundMode::Mixed)
        );
        assert_eq!(DecompoundMode::from_str("invalid"), None);
    }

    #[test]
    fn test_decompound_mode_as_str() {
        assert_eq!(DecompoundMode::None.as_str(), "none");
        assert_eq!(DecompoundMode::Discard.as_str(), "discard");
        assert_eq!(DecompoundMode::Mixed.as_str(), "mixed");
    }

    #[test]
    fn test_word_type_as_str() {
        assert_eq!(WordType::Known.as_str(), "KNOWN");
        assert_eq!(WordType::Unknown.as_str(), "UNKNOWN");
        assert_eq!(WordType::User.as_str(), "USER");
    }

    #[test]
    fn test_mecab_to_nori_tag() {
        // Particles collapse to "J".
        assert_eq!(mecab_to_nori_tag("JKS"), "J");
        assert_eq!(mecab_to_nori_tag("JKO"), "J");
        assert_eq!(mecab_to_nori_tag("JX"), "J");

        // Endings collapse to "E".
        assert_eq!(mecab_to_nori_tag("EF"), "E");
        assert_eq!(mecab_to_nori_tag("EC"), "E");
        assert_eq!(mecab_to_nori_tag("ETM"), "E");

        // Other tags are unchanged.
        assert_eq!(mecab_to_nori_tag("NNG"), "NNG");
        assert_eq!(mecab_to_nori_tag("VV"), "VV");
        assert_eq!(mecab_to_nori_tag("MAG"), "MAG");
    }

    #[test]
    fn test_nori_to_mecab_tag() {
        assert_eq!(nori_to_mecab_tag("J"), "JX");
        assert_eq!(nori_to_mecab_tag("E"), "EF");
        assert_eq!(nori_to_mecab_tag("NNG"), "NNG");
        assert_eq!(nori_to_mecab_tag("VV"), "VV");
    }

    #[test]
    fn test_char_offset() {
        let text = "안녕하세요";
        assert_eq!(char_offset(text, 0), 0);
        assert_eq!(char_offset(text, 3), 1);
        assert_eq!(char_offset(text, 6), 2);
        // Offsets past the end are clamped.
        assert_eq!(char_offset(text, 100), 5);
    }

    #[test]
    fn test_nori_tokenizer_creation() {
        let tokenizer = NoriTokenizer::new(DecompoundMode::None, false);
        assert!(tokenizer.is_ok());

        let tokenizer = NoriTokenizer::new(DecompoundMode::Mixed, true);
        assert!(tokenizer.is_ok());
    }

    #[test]
    fn test_nori_analyzer_creation() {
        let analyzer = NoriAnalyzer::new(
            None,
            DecompoundMode::None,
            vec!["J".to_string(), "E".to_string()],
            false,
        );
        assert!(analyzer.is_ok());
    }

    #[test]
    fn test_nori_analyzer_default() {
        let analyzer = NoriAnalyzer::default_with_decompound(DecompoundMode::Mixed);
        assert!(analyzer.is_ok());

        let analyzer = analyzer.unwrap();
        let stoptags = analyzer.stoptags();
        assert_eq!(stoptags.len(), 2);
        assert!(stoptags.contains(&"J"));
        assert!(stoptags.contains(&"E"));
    }

    #[test]
    fn test_nori_analyzer_stoptag_management() {
        let mut analyzer = NoriAnalyzer::default_with_decompound(DecompoundMode::None).unwrap();

        assert_eq!(analyzer.stoptags().len(), 2);

        analyzer.add_stoptag("SF".to_string());
        assert_eq!(analyzer.stoptags().len(), 3);
        assert!(analyzer.stoptags().contains(&"SF"));

        assert!(analyzer.remove_stoptag("SF"));
        assert_eq!(analyzer.stoptags().len(), 2);
        assert!(!analyzer.stoptags().contains(&"SF"));

        assert!(!analyzer.remove_stoptag("NONEXISTENT"));
    }

    #[test]
    fn test_pos_tag_nori_mapping() {
        assert_eq!(PosTag::JKS.to_nori_compat().as_str(), "J");
        assert_eq!(PosTag::JKO.to_nori_compat().as_str(), "J");
        assert_eq!(PosTag::JX.to_nori_compat().as_str(), "J");

        assert_eq!(PosTag::EF.to_nori_compat().as_str(), "E");
        assert_eq!(PosTag::EC.to_nori_compat().as_str(), "E");
        assert_eq!(PosTag::ETM.to_nori_compat().as_str(), "E");

        assert_eq!(PosTag::NNG.to_nori_compat().as_str(), "NNG");
        assert_eq!(PosTag::VV.to_nori_compat().as_str(), "VV");
    }

    #[test]
    fn test_tokenizer_basic_functionality() {
        let mut tokenizer = NoriTokenizer::new(DecompoundMode::None, false).unwrap();
        let result = tokenizer.tokenize("안녕");
        assert!(result.is_ok());

        let tokens = result.unwrap();
        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_analyzer_basic_functionality() {
        let mut analyzer = NoriAnalyzer::default_with_decompound(DecompoundMode::None).unwrap();
        let result = analyzer.analyze("테스트");
        assert!(result.is_ok());
    }

    #[test]
    fn test_decompound_token_basic() {
        let token = token_at_start("형태소분석", "NNG");

        let result = NoriTokenizer::decompound_token(&token, "형태소분석");

        assert!(!result.is_empty(), "Should decompose compound noun");

        for part in &result {
            assert!(
                part.is_decompound,
                "All parts should be marked as decompound"
            );
            assert_eq!(part.pos_tag, "NNG");
            assert_eq!(part.word_type, WordType::Known);
        }
    }

    #[test]
    fn test_decompound_token_short_word() {
        let token = token_at_start("사과", "NNG");

        let result = NoriTokenizer::decompound_token(&token, "사과");

        assert!(result.is_empty(), "Short words should not be decomposed");
    }

    #[test]
    fn test_decompound_token_non_hangul() {
        let token = token_at_start("ABC", "NNG");

        let result = NoriTokenizer::decompound_token(&token, "ABC");

        assert!(
            result.is_empty(),
            "Non-Hangul words should not be decomposed"
        );
    }

    #[test]
    fn test_decompound_token_mixed_jongseong() {
        let token = token_at_start("학교운동장", "NNG");

        let result = NoriTokenizer::decompound_token(&token, "학교운동장");

        if !result.is_empty() {
            for part in &result {
                assert!(part.is_decompound);
                assert!(!part.surface.is_empty());
                assert_eq!(part.pos_tag, "NNG");
            }
        }
    }

    #[test]
    fn test_decompound_modes_with_compound() {
        let test_token = token_at_start("형태소분석", "NNG");
        let pos_tag = test_token.pos.parse::<PosTag>().unwrap();

        let tokenizer = NoriTokenizer::new(DecompoundMode::None, false).unwrap();
        assert!(!tokenizer.should_decompound(pos_tag));

        let tokenizer = NoriTokenizer::new(DecompoundMode::Discard, false).unwrap();
        assert!(tokenizer.should_decompound(pos_tag));

        let tokenizer = NoriTokenizer::new(DecompoundMode::Mixed, false).unwrap();
        assert!(tokenizer.should_decompound(pos_tag));
    }

    #[test]
    fn test_compound_noun_patterns() {
        let token = token_at_start("대한민국", "NNG");
        let result = NoriTokenizer::decompound_token(&token, "대한민국");
        assert!(!result.is_empty(), "Should decompose 대한민국");

        let token = token_at_start("국립국어원", "NNG");
        let result = NoriTokenizer::decompound_token(&token, "국립국어원");
        assert!(!result.is_empty(), "Should decompose 국립국어원");
    }

    #[test]
    fn test_decompound_offset_accuracy() {
        let token = token_at_start("형태소분석", "NNG");

        let result = NoriTokenizer::decompound_token(&token, "형태소분석");

        if !result.is_empty() {
            let mut prev_end = 0;
            for part in &result {
                assert!(
                    part.start_offset >= prev_end,
                    "Offsets should not overlap: {} >= {}",
                    part.start_offset,
                    prev_end
                );
                assert!(
                    part.end_offset > part.start_offset,
                    "End should be after start: {} > {}",
                    part.end_offset,
                    part.start_offset
                );
                prev_end = part.end_offset;
            }

            assert_eq!(
                result.last().unwrap().end_offset,
                5,
                "Last token should end at original token end"
            );
        }
    }

    #[test]
    fn test_decompound_min_syllable_constraint() {
        for word in ["한글", "사과", "바나나"] {
            let len = word.chars().count();
            let token = token_at_start(word, "NNG");

            let result = NoriTokenizer::decompound_token(&token, word);

            if len < 3 {
                assert!(
                    result.is_empty(),
                    "Words with {len} syllables should not decompose: {word}"
                );
            }
        }
    }

    #[test]
    fn test_decompound_preserves_wordtype() {
        let token = token_at_start("형태소분석", "NNG");

        let result = NoriTokenizer::decompound_token(&token, "형태소분석");

        for part in result {
            assert_eq!(part.word_type, WordType::Known);
            assert!(part.is_decompound);
        }
    }

    #[test]
    fn test_mixed_mode_returns_both() {
        let mut tokenizer = NoriTokenizer::new(DecompoundMode::Mixed, false).unwrap();

        // Only success is asserted here; the token contents are not checked.
        let text = "형태소";
        let result = tokenizer.tokenize(text);
        assert!(result.is_ok());
    }

    #[test]
    fn test_discard_mode_returns_only_parts() {
        let mut tokenizer = NoriTokenizer::new(DecompoundMode::Discard, false).unwrap();

        // Only success is asserted here; the token contents are not checked.
        let text = "형태소";
        let result = tokenizer.tokenize(text);
        assert!(result.is_ok());
    }

    #[test]
    fn test_dict_decompose_basic() {
        let token = token_at_start("형태소분석기", "NNG");

        let result = NoriTokenizer::try_dict_decompose(&token, "형태소분석기");

        assert!(result.is_some(), "Should find compound in dictionary");
        let parts = result.unwrap();
        assert_eq!(parts.len(), 2);
        assert_eq!(parts[0].surface, "형태소");
        assert_eq!(parts[1].surface, "분석기");
    }

    #[test]
    fn test_dict_decompose_대한민국() {
        let token = token_at_start("대한민국", "NNP");

        let result = NoriTokenizer::try_dict_decompose(&token, "대한민국");

        assert!(result.is_some(), "Should find 대한민국 in dictionary");
        let parts = result.unwrap();
        assert_eq!(parts.len(), 2);
        assert_eq!(parts[0].surface, "대한");
        assert_eq!(parts[0].pos_tag, "NNP");
        assert_eq!(parts[1].surface, "민국");
    }

    #[test]
    fn test_enhanced_suffix_extraction() {
        let token = token_at_start("현대화", "NNG");

        let result = NoriTokenizer::try_extract_suffix(&token, "현대화");

        assert!(result.is_some(), "Should extract suffix 화");
        let parts = result.unwrap();
        assert_eq!(parts.len(), 2);
        assert_eq!(parts[0].surface, "현대");
        assert_eq!(parts[1].surface, "화");
        assert_eq!(parts[1].pos_tag, "XSN");
    }

    #[test]
    fn test_enhanced_prefix_extraction() {
        let token = token_at_start("초고속", "NNG");

        let result = NoriTokenizer::try_extract_prefix(&token, "초고속");

        assert!(result.is_some(), "Should extract prefix 초");
        let parts = result.unwrap();
        assert_eq!(parts.len(), 2);
        assert_eq!(parts[0].surface, "초");
        assert_eq!(parts[0].pos_tag, "XPN");
        assert_eq!(parts[1].surface, "고속");
    }

    #[test]
    fn test_decompound_enhanced_priority() {
        let token = token_at_start("형태소분석", "NNG");

        let result = NoriTokenizer::decompound_token_enhanced(&token, "형태소분석");

        // The dictionary split wins over the heuristics.
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].surface, "형태소");
        assert_eq!(result[1].surface, "분석");
    }

    #[test]
    fn test_multiple_suffix_entries() {
        assert!(SUFFIXES.len() > 10, "Should have many suffix entries");

        assert!(
            SUFFIXES.iter().any(|(s, _)| *s == "화"),
            "Should contain 화"
        );
        assert!(
            SUFFIXES.iter().any(|(s, _)| *s == "적"),
            "Should contain 적"
        );
        assert!(
            SUFFIXES.iter().any(|(s, _)| *s == "쟁이"),
            "Should contain 쟁이"
        );
    }

    #[test]
    fn test_multiple_prefix_entries() {
        assert!(PREFIXES.len() > 10, "Should have many prefix entries");

        assert!(
            PREFIXES.iter().any(|(p, _)| *p == "초"),
            "Should contain 초"
        );
        assert!(
            PREFIXES.iter().any(|(p, _)| *p == "최"),
            "Should contain 최"
        );
        assert!(
            PREFIXES.iter().any(|(p, _)| *p == "친"),
            "Should contain 친"
        );
    }
}