1use std::borrow::Cow;
38use std::path::Path;
39
40use mecab_ko_dict::{SystemDictionary, UserDictionary};
41
42use crate::error::Result;
43use crate::lattice::{Lattice, Node, NodeBuilder, NodeType};
44use crate::normalizer::{NormalizationConfig, Normalizer};
45use crate::pool::{PoolManager, PoolStats};
46use crate::pos_tag::PosTag;
47use crate::unknown::UnknownHandler;
48use crate::viterbi::{SpacePenalty, ViterbiSearcher};
49
/// A single morpheme produced by the tokenizer.
///
/// A token couples the surface text with its part-of-speech information and
/// with its location in the analysed input, expressed both in characters and
/// in bytes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Token {
    /// Surface form of the morpheme.
    pub surface: String,

    /// Part-of-speech tag. For tokens built from a lattice node this is the
    /// first comma-separated field of `features`.
    pub pos: String,

    /// Start offset of the token, counted in characters.
    pub start_pos: usize,

    /// End offset of the token (exclusive), counted in characters.
    pub end_pos: usize,

    /// Start offset of the token, counted in bytes.
    pub start_byte: usize,

    /// End offset of the token (exclusive), counted in bytes.
    pub end_byte: usize,

    /// Reading taken from the fourth feature field; `None` when that field is
    /// missing, empty or `*`.
    pub reading: Option<String>,

    /// Lemma of the morpheme. When parsed from a feature string it currently
    /// carries the same value as `reading`.
    pub lemma: Option<String>,

    /// Accumulated cost of the lattice node this token came from; `0` for
    /// tokens created with `Token::new`.
    pub cost: i32,

    /// Raw, comma-separated feature string of the underlying dictionary entry;
    /// empty for tokens created with `Token::new`.
    pub features: String,

    /// Normalized form of `surface`. Filled in by
    /// `Tokenizer::tokenize_with_normalization` when a normalizer is
    /// configured; `None` otherwise.
    pub normalized: Option<String>,
}
88
89impl Token {
90 #[must_use]
92 pub const fn new(
93 surface: String,
94 pos: String,
95 start_pos: usize,
96 end_pos: usize,
97 start_byte: usize,
98 end_byte: usize,
99 ) -> Self {
100 Self {
101 surface,
102 pos,
103 start_pos,
104 end_pos,
105 start_byte,
106 end_byte,
107 reading: None,
108 lemma: None,
109 cost: 0,
110 features: String::new(),
111 normalized: None,
112 }
113 }
114
115 #[must_use]
121 pub fn from_node(node: &Node) -> Self {
122 let features = node.feature.to_string();
123 let (pos, reading, lemma) = parse_features(&features);
124
125 Self {
126 surface: node.surface.to_string(),
127 pos: pos.to_string(),
128 start_pos: node.start_pos,
129 end_pos: node.end_pos,
130 start_byte: node.start_byte,
131 end_byte: node.end_byte,
132 reading,
133 lemma,
134 cost: node.total_cost,
135 features,
136 normalized: None,
137 }
138 }
139
140 #[inline]
142 #[must_use]
143 pub const fn char_len(&self) -> usize {
144 self.end_pos - self.start_pos
145 }
146
147 #[inline]
149 #[must_use]
150 pub const fn byte_len(&self) -> usize {
151 self.end_byte - self.start_byte
152 }
153
154 #[must_use]
156 pub fn pos_tag(&self) -> Option<PosTag> {
157 self.pos.parse().ok()
158 }
159}
160
/// Splits a comma-separated feature string into `(pos, reading, lemma)`.
///
/// * `pos` is the first field; an empty first field (including an entirely
///   empty feature string) is reported as `"*"`.
/// * `reading` is the fourth field; it is `None` when the field is missing,
///   empty or `"*"`.
/// * `lemma` currently mirrors `reading`.
///
/// The POS tag is returned borrowed from `features`; only the reading and
/// lemma allocate.
fn parse_features(features: &str) -> (Cow<'_, str>, Option<String>, Option<String>) {
    // Only the first four fields are needed, so stop splitting after them.
    let mut fields = features.splitn(5, ',');

    // `splitn` always yields at least one (possibly empty) item, so the
    // fallback has to be keyed on emptiness rather than on `None`.
    let pos = fields
        .next()
        .filter(|field| !field.is_empty())
        .unwrap_or("*");

    // After consuming field 0, `nth(2)` skips fields 1 and 2 and yields field 3.
    let reading = fields
        .nth(2)
        .filter(|field| !field.is_empty() && *field != "*")
        .map(str::to_owned);

    let lemma = reading.clone();

    (Cow::Borrowed(pos), reading, lemma)
}
184
185pub struct Tokenizer {
196 dictionary: SystemDictionary,
198
199 unknown_handler: UnknownHandler,
201
202 viterbi_searcher: ViterbiSearcher,
204
205 lattice: Lattice,
207
208 normalizer: Option<Normalizer>,
210
211 enable_normalization: bool,
213
214 pool_manager: PoolManager,
216}
217
218impl Tokenizer {
219 pub fn new() -> Result<Self> {
237 let dictionary = SystemDictionary::load_default()?;
238 let unknown_handler = UnknownHandler::korean_default();
239 let viterbi_searcher = ViterbiSearcher::new();
240
241 let lattice = Lattice::new("");
243
244 Ok(Self {
245 dictionary,
246 unknown_handler,
247 viterbi_searcher,
248 lattice,
249 normalizer: None,
250 enable_normalization: false,
251 pool_manager: PoolManager::new(),
252 })
253 }
254
255 pub fn with_dict<P: AsRef<Path>>(dict_path: P) -> Result<Self> {
266 let dictionary = SystemDictionary::load(dict_path)?;
267 let unknown_handler = UnknownHandler::korean_default();
268 let viterbi_searcher = ViterbiSearcher::new();
269
270 let lattice = Lattice::new("");
271
272 Ok(Self {
273 dictionary,
274 unknown_handler,
275 viterbi_searcher,
276 lattice,
277 normalizer: None,
278 enable_normalization: false,
279 pool_manager: PoolManager::new(),
280 })
281 }
282
283 #[must_use]
302 pub fn with_user_dict(mut self, user_dict: UserDictionary) -> Self {
303 self.dictionary.set_user_dictionary(user_dict);
304 self
305 }
306
307 pub fn set_user_dict(&mut self, user_dict: UserDictionary) {
329 self.dictionary.set_user_dictionary(user_dict);
330 }
331
332 #[cfg(feature = "hot-reload-v2")]
341 pub fn set_hot_reload(
342 &mut self,
343 hr: std::sync::Arc<mecab_ko_dict::hot_reload_v2::HotReloadDictV2>,
344 ) {
345 self.dictionary.set_hot_reload(hr);
346 }
347
348 #[must_use]
354 pub fn with_space_penalty(mut self, penalty: SpacePenalty) -> Self {
355 self.viterbi_searcher = ViterbiSearcher::new().with_space_penalty(penalty);
356 self
357 }
358
359 pub fn tokenize(&mut self, text: &str) -> Vec<Token> {
382 if text.is_empty() {
383 return Vec::new();
384 }
385
386 self.lattice.reset(text);
388
389 self.build_lattice();
391
392 let path = self
394 .viterbi_searcher
395 .search(&mut self.lattice, self.dictionary.matrix());
396
397 path.iter()
399 .filter_map(|&node_id| self.lattice.node(node_id))
400 .map(|node| {
401 let mut token = Token::from_node(node);
402 let orig_start = self.lattice.original_byte_pos(node.start_pos);
403 token.start_byte = orig_start;
404 token.end_byte = orig_start + node.surface.len();
405 token
406 })
407 .collect()
408 }
409
410 fn build_lattice(&mut self) {
415 let char_len = self.lattice.char_len();
416
417 for pos in 0..char_len {
419 let has_dict_entry = self.add_dict_nodes(pos);
421
422 self.unknown_handler
424 .add_unknown_nodes(&mut self.lattice, pos, has_dict_entry);
425 }
426 }
427
428 fn add_dict_nodes(&mut self, start_pos: usize) -> bool {
440 let char_len = self.lattice.char_len();
445 let search_text: &str = self.lattice.substring(start_pos, char_len);
446
447 if search_text.is_empty() {
448 return false;
449 }
450
451 let dict_entries: Vec<_> = self
456 .dictionary
457 .common_prefix_search(search_text)
458 .unwrap_or_default();
459
460 let user_entries: Vec<_> = self
465 .dictionary
466 .user_dictionary()
467 .map(|ud| ud.common_prefix_search(search_text))
468 .unwrap_or_default();
469
470 let mut found = false;
472
473 for (entry, byte_len) in dict_entries {
474 let end_pos = self
477 .lattice
478 .char_pos_from_start_and_byte_len(start_pos, byte_len);
479
480 self.lattice.add_node(
481 NodeBuilder::new(&entry.surface, start_pos, end_pos)
482 .left_id(entry.left_id)
483 .right_id(entry.right_id)
484 .word_cost(i32::from(entry.cost))
485 .node_type(NodeType::Known)
486 .feature(&entry.feature),
487 );
488
489 found = true;
490 }
491
492 for user_entry in user_entries {
493 let surface_char_len = user_entry.surface.chars().count();
494 let end_pos = start_pos + surface_char_len;
495
496 self.lattice.add_node(
497 NodeBuilder::new(&user_entry.surface, start_pos, end_pos)
498 .left_id(user_entry.left_id)
499 .right_id(user_entry.right_id)
500 .word_cost(i32::from(user_entry.cost))
501 .node_type(NodeType::User)
502 .feature(&user_entry.feature),
503 );
504
505 found = true;
506 }
507
508 found
509 }
510
511 pub fn tokenize_to_lattice(&mut self, text: &str) -> &Lattice {
523 if !text.is_empty() {
524 self.lattice.reset(text);
525 self.build_lattice();
526 }
527 &self.lattice
528 }
529
530 pub fn wakati(&mut self, text: &str) -> Vec<String> {
561 self.tokenize(text).into_iter().map(|t| t.surface).collect()
562 }
563
564 pub fn nouns(&mut self, text: &str) -> Vec<String> {
574 self.tokenize(text)
575 .into_iter()
576 .filter(|t| t.pos.starts_with("NN"))
577 .map(|t| t.surface)
578 .collect()
579 }
580
581 pub fn morphs(&mut self, text: &str) -> Vec<String> {
594 self.wakati(text)
595 }
596
597 pub fn pos(&mut self, text: &str) -> Vec<(String, String)> {
620 self.tokenize(text)
621 .into_iter()
622 .map(|t| (t.surface, t.pos))
623 .collect()
624 }
625
626 #[must_use]
631 pub const fn dictionary(&self) -> &SystemDictionary {
632 &self.dictionary
633 }
634
635 #[must_use]
640 pub fn lattice_stats(&self) -> crate::lattice::LatticeStats {
641 self.lattice.stats()
642 }
643
644 #[must_use]
648 pub fn pool_stats(&self) -> PoolStats {
649 self.pool_manager.stats()
650 }
651
652 #[must_use]
656 pub fn memory_stats(&self) -> crate::memory::MemoryStats {
657 crate::memory::MemoryStats {
658 dictionary_bytes: 0, lattice_bytes: self.lattice.memory_usage(),
660 pool_bytes: self.pool_manager.total_memory_usage(),
661 cache_bytes: 0,
662 interner_bytes: 0,
663 token_bytes: 0,
664 }
665 }
666
667 pub fn clear_pools(&self) {
672 self.pool_manager.clear_all();
673 }
674
675 pub fn set_normalization(
686 &mut self,
687 enable: bool,
688 config: Option<NormalizationConfig>,
689 ) -> Result<()> {
690 self.enable_normalization = enable;
691
692 if enable {
693 let normalizer_config = config.unwrap_or_default();
694 self.normalizer = Some(Normalizer::new(normalizer_config)?);
695 } else {
696 self.normalizer = None;
697 }
698
699 Ok(())
700 }
701
702 #[must_use]
704 pub const fn normalizer(&self) -> Option<&Normalizer> {
705 self.normalizer.as_ref()
706 }
707
708 #[must_use]
710 pub const fn is_normalization_enabled(&self) -> bool {
711 self.enable_normalization
712 }
713
714 pub fn tokenize_with_normalization(&mut self, text: &str) -> Vec<Token> {
726 let mut tokens = self.tokenize(text);
727
728 if let Some(normalizer) = &self.normalizer {
730 for token in &mut tokens {
731 token.normalized = Some(normalizer.normalize(&token.surface));
732 }
733 }
734
735 tokens
736 }
737
738 #[must_use]
750 pub fn get_word_variants(&self, word: &str) -> (String, Vec<String>) {
751 self.normalizer.as_ref().map_or_else(
752 || (word.to_string(), Vec::new()),
753 |normalizer| {
754 let standard = normalizer.normalize(word);
755 let variants = normalizer.get_variants(&standard);
756 (standard, variants)
757 },
758 )
759 }
760}
761
#[cfg(test)]
// `expect` is acceptable in test fixtures: a failure there is a test bug.
#[allow(clippy::expect_used)]
mod tests {
    use super::*;
    use mecab_ko_dict::{matrix::DenseMatrix, trie::TrieBuilder, DictEntry};

    /// Builds a tokenizer over a tiny in-memory dictionary containing the
    /// morphemes of "아버지가방에들어가신다".
    fn create_test_tokenizer() -> Tokenizer {
        // Trie keys paired with the index of the matching dictionary entry.
        let mut trie_entries = vec![
            ("아버지", 0u32),
            ("가", 1),
            ("방", 2),
            ("에", 3),
            ("들어가", 4),
            ("신다", 5),
        ];
        let trie_bytes = TrieBuilder::build_unsorted(&mut trie_entries).expect("should build trie");
        let trie = mecab_ko_dict::TrieBackend::Owned(mecab_ko_dict::Trie::from_vec(trie_bytes));

        let matrix = DenseMatrix::new(10, 10, 100);
        let matrix = mecab_ko_dict::matrix::ConnectionMatrix::Dense(matrix);

        // Entries are listed in the same order as the trie values above.
        let entries = vec![
            DictEntry::new("아버지", 1, 1, 1000, "NNG,*,T,아버지,*,*,*,*"),
            DictEntry::new("가", 5, 5, 500, "JKS,*,F,가,*,*,*,*"),
            DictEntry::new("방", 2, 2, 2000, "NNG,*,T,방,*,*,*,*"),
            DictEntry::new("에", 6, 6, 400, "JKB,*,F,에,*,*,*,*"),
            DictEntry::new("들어가", 3, 3, 1500, "VV,*,F,들어가다,*,*,*,*"),
            DictEntry::new("신다", 4, 4, 1800, "VV+EP,*,F,신다,*,*,*,*"),
        ];

        let dictionary = SystemDictionary::new_test(
            std::path::PathBuf::from("./test_dic"),
            trie,
            matrix,
            entries,
        );

        Tokenizer {
            dictionary,
            unknown_handler: UnknownHandler::korean_default(),
            viterbi_searcher: ViterbiSearcher::new(),
            lattice: Lattice::new(""),
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        }
    }

    #[test]
    fn test_token_creation() {
        let token = Token::new("안녕".to_string(), "NNG".to_string(), 0, 2, 0, 6);

        assert_eq!(token.surface, "안녕");
        assert_eq!(token.pos, "NNG");
        assert_eq!(token.start_pos, 0);
        assert_eq!(token.end_pos, 2);
        assert_eq!(token.start_byte, 0);
        assert_eq!(token.end_byte, 6);
        assert_eq!(token.char_len(), 2);
        assert_eq!(token.byte_len(), 6);

        // Fields not covered by the constructor arguments start out empty.
        assert_eq!(token.reading, None);
        assert_eq!(token.lemma, None);
        assert_eq!(token.cost, 0);
        assert!(token.features.is_empty());
        assert_eq!(token.normalized, None);
    }

    #[test]
    fn test_parse_features() {
        let features = "NNG,*,T,안녕,*,*,*,*";
        let (pos, reading, lemma) = parse_features(features);

        assert_eq!(pos, "NNG");
        assert_eq!(reading, Some("안녕".to_string()));
        assert_eq!(lemma, Some("안녕".to_string()));
    }

    #[test]
    fn test_parse_features_no_reading() {
        let features = "JKS,*,F,*,*,*,*,*";
        let (pos, reading, lemma) = parse_features(features);

        assert_eq!(pos, "JKS");
        assert_eq!(reading, None);
        assert_eq!(lemma, None);
    }

    #[test]
    fn test_tokenize_simple() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지");

        assert!(!tokens.is_empty());
        assert_eq!(tokens[0].surface, "아버지");
        assert_eq!(tokens[0].pos, "NNG");
    }

    #[test]
    fn test_tokenize_with_particle() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].surface, "아버지");
        assert_eq!(tokens[0].pos, "NNG");
        assert_eq!(tokens[1].surface, "가");
        assert_eq!(tokens[1].pos, "JKS");
    }

    #[test]
    fn test_tokenize_complex() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가방에들어가신다");

        assert!(!tokens.is_empty());
        assert_eq!(tokens[0].surface, "아버지");
    }

    #[test]
    fn test_tokenize_empty() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("");

        assert!(tokens.is_empty());
    }

    #[test]
    fn test_tokenize_with_spaces() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지 가방");

        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_wakati() {
        let mut tokenizer = create_test_tokenizer();
        let surfaces = tokenizer.wakati("아버지가");

        assert_eq!(surfaces.len(), 2);
        assert_eq!(surfaces[0], "아버지");
        assert_eq!(surfaces[1], "가");
    }

    #[test]
    fn test_nouns() {
        let mut tokenizer = create_test_tokenizer();
        let nouns = tokenizer.nouns("아버지가방에");

        assert!(nouns.contains(&"아버지".to_string()));
        assert!(nouns.contains(&"방".to_string()));
        // "가" is a particle (JKS), so it must not be reported as a noun.
        assert!(!nouns.contains(&"가".to_string()));
    }

    #[test]
    fn test_pos() {
        let mut tokenizer = create_test_tokenizer();
        let pos_tags = tokenizer.pos("아버지가");

        assert_eq!(pos_tags.len(), 2);
        assert_eq!(pos_tags[0], ("아버지".to_string(), "NNG".to_string()));
        assert_eq!(pos_tags[1], ("가".to_string(), "JKS".to_string()));
    }

    #[test]
    fn test_tokenize_to_lattice() {
        let mut tokenizer = create_test_tokenizer();
        let lattice = tokenizer.tokenize_to_lattice("아버지가");

        assert!(lattice.node_count() > 2);

        let stats = lattice.stats();
        assert!(stats.total_nodes > 2);
    }

    #[test]
    fn test_lattice_stats() {
        let mut tokenizer = create_test_tokenizer();
        tokenizer.tokenize("아버지가");

        let stats = tokenizer.lattice_stats();
        assert!(stats.total_nodes > 0);
        assert!(stats.char_length > 0);
    }

    #[test]
    fn test_token_positions() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가");

        // Positions are character offsets: "아버지" spans 0..3, "가" spans 3..4.
        assert_eq!(tokens[0].start_pos, 0);
        assert_eq!(tokens[0].end_pos, 3);

        assert_eq!(tokens[1].start_pos, 3);
        assert_eq!(tokens[1].end_pos, 4);
    }

    #[test]
    fn test_multiple_tokenize_calls() {
        let mut tokenizer = create_test_tokenizer();

        let tokens1 = tokenizer.tokenize("아버지");
        assert!(!tokens1.is_empty());

        let tokens2 = tokenizer.tokenize("가방");
        assert!(!tokens2.is_empty());

        // The second call must not leak results from the first one.
        assert_ne!(tokens1[0].surface, tokens2[0].surface);
    }

    #[test]
    fn test_token_from_node() {
        use crate::lattice::Node;
        use std::borrow::Cow;

        let node = Node {
            id: 1,
            surface: Cow::Borrowed("테스트"),
            start_pos: 0,
            end_pos: 3,
            start_byte: 0,
            end_byte: 9,
            left_id: 1,
            right_id: 1,
            word_cost: 1000,
            total_cost: 1500,
            prev_node_id: 0,
            node_type: NodeType::Known,
            feature: Cow::Borrowed("NNG,*,T,테스트,*,*,*,*"),
            has_space_before: false,
        };

        let token = Token::from_node(&node);

        assert_eq!(token.surface, "테스트");
        assert_eq!(token.pos, "NNG");
        assert_eq!(token.start_pos, 0);
        assert_eq!(token.end_pos, 3);
        assert_eq!(token.start_byte, 0);
        assert_eq!(token.end_byte, 9);
        assert_eq!(token.reading, Some("테스트".to_string()));
        assert_eq!(token.lemma, Some("테스트".to_string()));
        // The token cost is the node's accumulated cost, not its word cost.
        assert_eq!(token.cost, 1500);
        assert_eq!(token.features, "NNG,*,T,테스트,*,*,*,*");
        assert_eq!(token.normalized, None);
    }

    #[test]
    fn test_with_user_dict() {
        let mut tokenizer = create_test_tokenizer();

        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);

        tokenizer.set_user_dict(user_dict);

        assert!(tokenizer.dictionary().user_dictionary().is_some());
    }
}