1use std::collections::HashMap;
30use std::io::BufRead;
31
32use mecab_ko_hangul::{classify_char, CharType};
33
34use crate::error::{Error, Result};
35use crate::lattice::{Lattice, NodeBuilder, NodeType};
36
/// Numeric identifier of a character category.
pub type CategoryId = u8;

/// Fallback category: `CharCategoryMap::get_category` returns it when a
/// character matches no range override and no character-type mapping.
pub const DEFAULT_CATEGORY: CategoryId = 0;
/// Category the built-in Korean table assigns to whitespace.
pub const SPACE_CATEGORY: CategoryId = 1;
/// Category the built-in Korean table assigns to Hangul syllables and jamo.
pub const HANGUL_CATEGORY: CategoryId = 2;
/// Category the built-in Korean table assigns to Hanja.
pub const HANJA_CATEGORY: CategoryId = 3;
/// Category the built-in Korean table assigns to alphabetic characters
/// (and to Katakana / Hiragana).
pub const ALPHA_CATEGORY: CategoryId = 4;
/// Category the built-in Korean table assigns to digits.
pub const NUMERIC_CATEGORY: CategoryId = 5;
/// Category the built-in Korean table assigns to punctuation.
pub const SYMBOL_CATEGORY: CategoryId = 6;
54
55#[derive(Debug, Clone)]
59pub struct CharCategoryDef {
60 pub name: String,
62 pub id: CategoryId,
64 pub invoke: bool,
66 pub group: bool,
68 pub length: usize,
70}
71
72impl CharCategoryDef {
73 #[must_use]
75 pub fn new(name: &str, id: CategoryId, invoke: bool, group: bool, length: usize) -> Self {
76 Self {
77 name: name.to_string(),
78 id,
79 invoke,
80 group,
81 length,
82 }
83 }
84}
85
/// One unknown-word template: the ids, cost and tags given to candidates
/// generated for a character category.
#[derive(Debug, Clone)]
pub struct UnknownDef {
    /// Character category this template applies to.
    pub category_id: CategoryId,
    /// Left id copied onto generated candidates / lattice nodes.
    pub left_id: u16,
    /// Right id copied onto generated candidates / lattice nodes.
    pub right_id: u16,
    /// Base word cost, before any pattern-based adjustment.
    pub cost: i16,
    /// Part-of-speech tag, e.g. `"NNG"`.
    pub pos: String,
    /// Comma-separated feature string, e.g. `"NNG,*,*,*,*,*,*,*"`.
    pub feature: String,
}
104
/// Surface-form pattern assigned to an unknown-word candidate by
/// `UnknownHandler::detect_pattern`; it drives cost adjustment and POS
/// estimation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WordPattern {
    /// None of the other patterns applies.
    Plain,
    /// ALPHA-category text of two or more characters whose first character is
    /// uppercase and which has no later uppercase character.
    ProperNoun,
    /// ALPHA-category text with an uppercase character after the first one.
    CamelCase,
    /// Text containing both HANGUL-category and ALPHA-category characters.
    HangulAlphaMix,
    /// Text containing a NUMERIC-category character together with a HANGUL-
    /// or ALPHA-category character.
    NumberUnit,
    /// Text containing at least one character accepted by `is_emoji`.
    Emoji,
}
123
124impl UnknownDef {
125 #[must_use]
127 pub fn new(
128 category_id: CategoryId,
129 left_id: u16,
130 right_id: u16,
131 cost: i16,
132 pos: &str,
133 feature: &str,
134 ) -> Self {
135 Self {
136 category_id,
137 left_id,
138 right_id,
139 cost,
140 pos: pos.to_string(),
141 feature: feature.to_string(),
142 }
143 }
144}
145
146#[derive(Debug, Clone)]
150pub struct CharCategoryMap {
151 categories: Vec<CharCategoryDef>,
153 name_to_id: HashMap<String, CategoryId>,
155 type_to_category: HashMap<CharType, CategoryId>,
157 range_overrides: Vec<(u32, u32, CategoryId)>,
159}
160
161impl Default for CharCategoryMap {
162 fn default() -> Self {
163 Self::korean_default()
164 }
165}
166
167impl CharCategoryMap {
168 #[must_use]
170 pub fn new() -> Self {
171 Self {
172 categories: Vec::new(),
173 name_to_id: HashMap::new(),
174 type_to_category: HashMap::new(),
175 range_overrides: Vec::new(),
176 }
177 }
178
179 #[must_use]
183 pub fn korean_default() -> Self {
184 let mut map = Self::new();
185
186 let defaults = [
189 ("DEFAULT", DEFAULT_CATEGORY, false, true, 0),
190 ("SPACE", SPACE_CATEGORY, false, true, 0),
191 ("HANGUL", HANGUL_CATEGORY, false, true, 2), ("HANJA", HANJA_CATEGORY, false, false, 1),
193 ("ALPHA", ALPHA_CATEGORY, true, true, 0), ("NUMERIC", NUMERIC_CATEGORY, true, true, 0), ("SYMBOL", SYMBOL_CATEGORY, true, true, 0),
196 ];
197
198 for (name, id, invoke, group, length) in defaults {
199 map.add_category(CharCategoryDef::new(name, id, invoke, group, length));
200 }
201
202 map.type_to_category
204 .insert(CharType::HangulSyllable, HANGUL_CATEGORY);
205 map.type_to_category
206 .insert(CharType::HangulJamo, HANGUL_CATEGORY);
207 map.type_to_category.insert(CharType::Hanja, HANJA_CATEGORY);
208 map.type_to_category
209 .insert(CharType::Katakana, ALPHA_CATEGORY);
210 map.type_to_category
211 .insert(CharType::Hiragana, ALPHA_CATEGORY);
212 map.type_to_category
213 .insert(CharType::Alphabet, ALPHA_CATEGORY);
214 map.type_to_category
215 .insert(CharType::Digit, NUMERIC_CATEGORY);
216 map.type_to_category
217 .insert(CharType::Whitespace, SPACE_CATEGORY);
218 map.type_to_category
219 .insert(CharType::Punctuation, SYMBOL_CATEGORY);
220 map.type_to_category
221 .insert(CharType::Other, DEFAULT_CATEGORY);
222
223 map
224 }
225
226 pub fn add_category(&mut self, def: CharCategoryDef) {
228 self.name_to_id.insert(def.name.clone(), def.id);
229 self.categories.push(def);
230 }
231
232 pub fn add_range(&mut self, start: u32, end: u32, category_id: CategoryId) {
234 self.range_overrides.push((start, end, category_id));
235 }
236
237 #[must_use]
239 pub fn get_category(&self, c: char) -> CategoryId {
240 let code = c as u32;
241
242 for &(start, end, cat_id) in &self.range_overrides {
244 if code >= start && code <= end {
245 return cat_id;
246 }
247 }
248
249 let char_type = classify_char(c);
251 self.type_to_category
252 .get(&char_type)
253 .copied()
254 .unwrap_or(DEFAULT_CATEGORY)
255 }
256
257 #[must_use]
259 pub fn get_category_def(&self, id: CategoryId) -> Option<&CharCategoryDef> {
260 self.categories.iter().find(|c| c.id == id)
261 }
262
263 #[must_use]
265 pub fn get_id_by_name(&self, name: &str) -> Option<CategoryId> {
266 self.name_to_id.get(name).copied()
267 }
268
269 pub fn from_char_def<R: BufRead>(reader: R) -> Result<Self> {
282 let mut map = Self::new();
283 let mut next_id: CategoryId = 0;
284
285 for line in reader.lines() {
286 let line = line.map_err(|e| Error::Init(e.to_string()))?;
287 let line = line.trim();
288
289 if line.is_empty() || line.starts_with('#') {
291 continue;
292 }
293
294 if !line.starts_with("0x") && !line.chars().next().is_some_and(|c| c.is_ascii_digit()) {
296 let parts: Vec<&str> = line.split_whitespace().collect();
297 if parts.len() >= 4 {
298 let name = parts[0];
299 let invoke = parts[1] == "1";
300 let group = parts[2] == "1";
301 let length: usize = parts[3].parse().unwrap_or(0);
302
303 map.add_category(CharCategoryDef::new(name, next_id, invoke, group, length));
304 next_id += 1;
305 }
306 }
307 else if line.starts_with("0x") {
309 let parts: Vec<&str> = line.split_whitespace().collect();
310 if parts.len() >= 2 {
311 let range_part = parts[0];
312 let category_name = parts[1];
313
314 if let Some(cat_id) = map.get_id_by_name(category_name) {
315 if let Some((start, end)) = parse_unicode_range(range_part) {
317 map.add_range(start, end, cat_id);
318 }
319 }
320 }
321 }
322 }
323
324 Ok(map)
325 }
326}
327
328fn parse_unicode_range(s: &str) -> Option<(u32, u32)> {
333 if let Some((start_str, end_str)) = s.split_once("..") {
334 let start = parse_hex(start_str)?;
335 let end = parse_hex(end_str)?;
336 Some((start, end))
337 } else {
338 let value = parse_hex(s)?;
339 Some((value, value))
340 }
341}
342
/// Parses a hexadecimal number with an optional single `0x` / `0X` prefix.
///
/// Returns `None` for an empty digit string, for anything that is not purely
/// ASCII hex digits after the prefix (so repeated prefixes such as `0x0x41`
/// and signs such as `0x+41` are rejected), and for values that do not fit
/// in a `u32`.
fn parse_hex(s: &str) -> Option<u32> {
    let digits = s
        .strip_prefix("0x")
        .or_else(|| s.strip_prefix("0X"))
        .unwrap_or(s);

    // `from_str_radix` alone would accept a leading `+`; insist on digits only.
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }

    u32::from_str_radix(digits, 16).ok()
}
348
/// Returns `true` when `c` lies in one of the code-point ranges this module
/// treats as emoji.
///
/// This is a coarse block-level test, not a full implementation of the
/// Unicode `Emoji` property.
#[must_use]
const fn is_emoji(c: char) -> bool {
    matches!(
        c as u32,
        // Miscellaneous Symbols and Dingbats.
        0x2600..=0x27BF
            // Regional indicator symbols (the letters that form flag emoji).
            | 0x1F1E6..=0x1F1FF
            // Miscellaneous Symbols and Pictographs through
            // Supplemental Symbols and Pictographs.
            | 0x1F300..=0x1F9FF
            // Symbols and Pictographs Extended-A (newer emoji).
            | 0x1FA70..=0x1FAFF
    )
}
361
362#[derive(Debug, Clone, Default)]
366pub struct UnknownDictionary {
367 entries: HashMap<CategoryId, Vec<UnknownDef>>,
369}
370
371impl UnknownDictionary {
372 #[must_use]
374 pub fn new() -> Self {
375 Self {
376 entries: HashMap::new(),
377 }
378 }
379
380 #[must_use]
382 pub fn korean_default() -> Self {
383 let mut dict = Self::new();
384
385 let defaults = [
388 (DEFAULT_CATEGORY, 1800, 3562, 7000, "SY", "SY,*,*,*,*,*,*,*"),
389 (SPACE_CATEGORY, 1799, 3559, 0, "SP", "SP,*,*,*,*,*,*,*"),
390 (HANGUL_CATEGORY, 1800, 3565, 5000, "NNG", "NNG,*,*,*,*,*,*,*"),
392 (HANJA_CATEGORY, 1800, 3560, 6000, "SH", "SH,*,*,*,*,*,*,*"),
393 (ALPHA_CATEGORY, 1800, 3558, 4000, "SL", "SL,*,*,*,*,*,*,*"),
394 (NUMERIC_CATEGORY, 1800, 3561, 3000, "SN", "SN,*,*,*,*,*,*,*"),
395 (SYMBOL_CATEGORY, 1800, 3562, 7000, "SY", "SY,*,*,*,*,*,*,*"),
396 ];
397
398 for (cat_id, left_id, right_id, cost, pos, feature) in defaults {
399 dict.add_entry(UnknownDef::new(
400 cat_id, left_id, right_id, cost, pos, feature,
401 ));
402 }
403
404 dict
405 }
406
407 pub fn add_entry(&mut self, def: UnknownDef) {
409 self.entries.entry(def.category_id).or_default().push(def);
410 }
411
412 #[must_use]
414 pub fn get_entries(&self, category_id: CategoryId) -> &[UnknownDef] {
415 self.entries
416 .get(&category_id)
417 .map_or(&[], std::vec::Vec::as_slice)
418 }
419
420 pub fn from_unk_def<R: BufRead>(reader: R, category_map: &CharCategoryMap) -> Result<Self> {
432 let mut dict = Self::new();
433
434 for line in reader.lines() {
435 let line = line.map_err(|e| Error::Init(e.to_string()))?;
436 let line = line.trim();
437
438 if line.is_empty() || line.starts_with('#') {
439 continue;
440 }
441
442 let parts: Vec<&str> = line.split(',').collect();
443 if parts.len() >= 5 {
444 let category_name = parts[0];
445 let left_id: u16 = parts[1].parse().unwrap_or(0);
446 let right_id: u16 = parts[2].parse().unwrap_or(0);
447 let cost: i16 = parts[3].parse().unwrap_or(0);
448 let pos = parts[4];
449 let feature = line; if let Some(cat_id) = category_map.get_id_by_name(category_name) {
452 dict.add_entry(UnknownDef::new(
453 cat_id, left_id, right_id, cost, pos, feature,
454 ));
455 }
456 }
457 }
458
459 Ok(dict)
460 }
461}
462
/// One unknown-word candidate produced by
/// `UnknownHandler::generate_candidates`.
#[derive(Debug, Clone)]
pub struct UnknownCandidate {
    /// Candidate text.
    pub surface: String,
    /// Start position, counted in characters of the analyzed text.
    pub start_pos: usize,
    /// End position (exclusive), counted in characters: `start_pos` plus the
    /// candidate's character length.
    pub end_pos: usize,
    /// Left id copied from the unknown-word template.
    pub left_id: u16,
    /// Right id copied from the unknown-word template.
    pub right_id: u16,
    /// Template cost after pattern-based adjustment.
    pub cost: i16,
    /// Estimated part-of-speech tag.
    pub pos: String,
    /// Category of the candidate's first character.
    pub category_id: CategoryId,
    /// Surface pattern detected for `surface`.
    pub pattern: WordPattern,
}
485
486#[derive(Debug, Clone)]
490pub struct UnknownHandler {
491 pub category_map: CharCategoryMap,
493 pub unknown_dict: UnknownDictionary,
495}
496
497impl Default for UnknownHandler {
498 fn default() -> Self {
499 Self::korean_default()
500 }
501}
502
503impl UnknownHandler {
504 #[must_use]
506 pub const fn new(category_map: CharCategoryMap, unknown_dict: UnknownDictionary) -> Self {
507 Self {
508 category_map,
509 unknown_dict,
510 }
511 }
512
513 #[must_use]
515 pub fn korean_default() -> Self {
516 Self::new(
517 CharCategoryMap::korean_default(),
518 UnknownDictionary::korean_default(),
519 )
520 }
521
522 #[must_use]
526 fn detect_pattern(&self, surface: &str) -> WordPattern {
527 let chars: Vec<char> = surface.chars().collect();
528 if chars.is_empty() {
529 return WordPattern::Plain;
530 }
531
532 if chars.iter().any(|&c| is_emoji(c)) {
534 return WordPattern::Emoji;
535 }
536
537 let has_hangul = chars.iter().any(|&c| {
539 let cat = self.category_map.get_category(c);
540 cat == HANGUL_CATEGORY
541 });
542 let has_alpha = chars.iter().any(|&c| {
543 let cat = self.category_map.get_category(c);
544 cat == ALPHA_CATEGORY
545 });
546
547 if has_hangul && has_alpha {
548 return WordPattern::HangulAlphaMix;
549 }
550
551 let has_digit = chars.iter().any(|&c| {
553 let cat = self.category_map.get_category(c);
554 cat == NUMERIC_CATEGORY
555 });
556
557 if has_digit && (has_hangul || has_alpha) {
558 return WordPattern::NumberUnit;
559 }
560
561 if has_alpha && !has_hangul {
563 if chars.len() > 1 {
565 let mut has_internal_uppercase = false;
566 for (i, &c) in chars.iter().enumerate() {
567 if i > 0 && c.is_uppercase() {
568 has_internal_uppercase = true;
569 break;
570 }
571 }
572 if has_internal_uppercase {
573 return WordPattern::CamelCase;
574 }
575 }
576
577 if chars[0].is_uppercase() && chars.len() > 1 {
579 return WordPattern::ProperNoun;
580 }
581 }
582
583 WordPattern::Plain
584 }
585
586 #[must_use]
590 #[allow(clippy::unused_self)]
591 fn adjust_cost_by_pattern(&self, base_cost: i16, pattern: WordPattern, length: usize) -> i16 {
592 let mut cost = i32::from(base_cost);
593
594 match pattern {
596 WordPattern::Plain => {
597 if length > 6 {
600 #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
601 let penalty = ((length - 6) * 80) as i32; cost += penalty;
603 }
604 }
605 WordPattern::ProperNoun => {
606 cost -= 600; }
609 WordPattern::CamelCase => {
610 cost -= 400; }
613 WordPattern::HangulAlphaMix => {
614 cost -= 100; }
618 WordPattern::NumberUnit => {
619 cost -= 300; }
622 WordPattern::Emoji => {
623 cost += 1500; }
626 }
627
628 #[allow(clippy::cast_possible_truncation)]
630 {
631 cost.clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16
632 }
633 }
634
635 #[must_use]
639 #[allow(clippy::unused_self)]
640 fn estimate_pos(
641 &self,
642 pattern: WordPattern,
643 category_id: CategoryId,
644 base_pos: &str,
645 ) -> String {
646 match pattern {
647 WordPattern::ProperNoun | WordPattern::CamelCase if category_id == ALPHA_CATEGORY => {
649 return "NNP".to_string();
650 }
651 WordPattern::HangulAlphaMix if category_id == HANGUL_CATEGORY => {
653 return "NNG".to_string();
654 }
655 _ => {}
656 }
657
658 base_pos.to_string()
659 }
660
661 #[must_use]
673 pub fn generate_candidates(
674 &self,
675 text: &str,
676 start_pos: usize,
677 has_dict_entry: bool,
678 has_space_before: impl Fn(usize) -> bool,
679 ) -> Vec<UnknownCandidate> {
680 let start_byte = text
689 .char_indices()
690 .nth(start_pos)
691 .map_or(text.len(), |(b, _)| b);
692
693 let suffix = &text[start_byte..];
694 let Some(first_char) = suffix.chars().next() else {
695 return Vec::new();
696 };
697 let category_id = self.category_map.get_category(first_char);
698 let Some(category_def) = self.category_map.get_category_def(category_id) else {
699 return Vec::new();
700 };
701
702 if !category_def.invoke && has_dict_entry {
704 return Vec::new();
705 }
706
707 let unknown_defs = self.unknown_dict.get_entries(category_id);
708 if unknown_defs.is_empty() {
709 return Vec::new();
710 }
711
712 let mut candidates = Vec::new();
713
714 if category_def.group {
715 let mut char_count = 0usize;
718 let mut byte_end = 0usize;
719
720 for c in suffix.chars() {
721 if self.category_map.get_category(c) != category_id {
722 break;
723 }
724 if char_count > 0 && has_space_before(start_pos + char_count) {
726 break;
727 }
728 byte_end += c.len_utf8();
729 char_count += 1;
730 }
731
732 let group_char_count = char_count; let max_len = if category_def.length > 0 {
734 category_def.length.min(group_char_count)
735 } else {
736 group_char_count
737 };
738
739 let mut byte_offset = 0usize;
741 let mut char_iter = suffix.chars();
742 for len in 1..=max_len {
743 if let Some(c) = char_iter.next() {
744 byte_offset += c.len_utf8();
745 } else {
746 break;
747 }
748 let end_pos = start_pos + len;
749 let surface = &suffix[..byte_offset];
750
751 let pattern = self.detect_pattern(surface);
753
754 for def in unknown_defs {
755 let adjusted_cost = self.adjust_cost_by_pattern(def.cost, pattern, len);
757
758 let estimated_pos = self.estimate_pos(pattern, category_id, &def.pos);
760
761 candidates.push(UnknownCandidate {
762 surface: surface.to_string(),
763 start_pos,
764 end_pos,
765 left_id: def.left_id,
766 right_id: def.right_id,
767 cost: adjusted_cost,
768 pos: estimated_pos,
769 category_id,
770 pattern,
771 });
772 }
773 }
774 let _ = byte_end; } else {
776 let char_total = suffix.chars().count();
778 let max_len = if category_def.length > 0 {
779 category_def.length.min(char_total)
780 } else {
781 1
782 };
783
784 let mut byte_offset = 0usize;
785 let mut char_iter = suffix.chars();
786 for len in 1..=max_len {
787 if let Some(c) = char_iter.next() {
788 byte_offset += c.len_utf8();
789 } else {
790 break;
791 }
792 let end_pos = start_pos + len;
793 let surface = &suffix[..byte_offset];
794
795 let pattern = self.detect_pattern(surface);
797
798 for def in unknown_defs {
799 let adjusted_cost = self.adjust_cost_by_pattern(def.cost, pattern, len);
801
802 let estimated_pos = self.estimate_pos(pattern, category_id, &def.pos);
804
805 candidates.push(UnknownCandidate {
806 surface: surface.to_string(),
807 start_pos,
808 end_pos,
809 left_id: def.left_id,
810 right_id: def.right_id,
811 cost: adjusted_cost,
812 pos: estimated_pos,
813 category_id,
814 pattern,
815 });
816 }
817 }
818 }
819
820 candidates
821 }
822
823 #[cfg(test)]
828 fn find_group_end(&self, chars: &[char], start_pos: usize, category_id: CategoryId) -> usize {
829 let mut pos = start_pos;
830 while pos < chars.len() {
831 if self.category_map.get_category(chars[pos]) != category_id {
832 break;
833 }
834 pos += 1;
835 }
836 pos
837 }
838
839 pub fn add_unknown_nodes(
851 &self,
852 lattice: &mut Lattice,
853 start_pos: usize,
854 has_dict_entry: bool,
855 ) -> usize {
856 let text = lattice.text();
857 let candidates = self.generate_candidates(text, start_pos, has_dict_entry, |pos| {
858 lattice.has_space_at(pos)
859 });
860 let mut count = 0;
861
862 for candidate in candidates {
863 lattice.add_node(
864 NodeBuilder::new(&candidate.surface, candidate.start_pos, candidate.end_pos)
865 .left_id(candidate.left_id)
866 .right_id(candidate.right_id)
867 .word_cost(i32::from(candidate.cost))
868 .node_type(NodeType::Unknown)
869 .feature(&candidate.pos),
870 );
871 count += 1;
872 }
873
874 count
875 }
876}
877
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::needless_collect)]
mod tests {
    use super::*;

    impl UnknownHandler {
        /// Test shortcut: candidate generation with no spaces anywhere.
        fn generate_candidates_no_space(
            &self,
            text: &str,
            start_pos: usize,
            has_dict_entry: bool,
        ) -> Vec<UnknownCandidate> {
            self.generate_candidates(text, start_pos, has_dict_entry, |_| false)
        }
    }

    #[test]
    fn test_category_map_default() {
        let map = CharCategoryMap::korean_default();

        assert_eq!(map.get_category('가'), HANGUL_CATEGORY);
        assert_eq!(map.get_category('A'), ALPHA_CATEGORY);
        assert_eq!(map.get_category('1'), NUMERIC_CATEGORY);
        assert_eq!(map.get_category(' '), SPACE_CATEGORY);
        assert_eq!(map.get_category('.'), SYMBOL_CATEGORY);
        assert_eq!(map.get_category('韓'), HANJA_CATEGORY);
    }

    #[test]
    fn test_category_def() {
        let map = CharCategoryMap::korean_default();

        let hangul_def = map.get_category_def(HANGUL_CATEGORY).unwrap();
        assert_eq!(hangul_def.name, "HANGUL");
        assert!(!hangul_def.invoke);
        assert!(hangul_def.group);
        assert_eq!(hangul_def.length, 2);

        let alpha_def = map.get_category_def(ALPHA_CATEGORY).unwrap();
        assert!(alpha_def.invoke);
    }

    #[test]
    fn test_unknown_dict_default() {
        let dict = UnknownDictionary::korean_default();

        let hangul_entries = dict.get_entries(HANGUL_CATEGORY);
        assert!(!hangul_entries.is_empty());
        assert_eq!(hangul_entries[0].pos, "NNG");

        let alpha_entries = dict.get_entries(ALPHA_CATEGORY);
        assert!(!alpha_entries.is_empty());
        assert_eq!(alpha_entries[0].pos, "SL");
    }

    #[test]
    fn test_generate_candidates_hangul() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates_no_space("가나다라", 0, false);

        assert!(!candidates.is_empty());
        let surfaces: Vec<_> = candidates.iter().map(|c| c.surface.as_str()).collect();
        assert!(surfaces.contains(&"가"));
        assert!(surfaces.contains(&"가나"));
    }

    #[test]
    fn test_generate_candidates_alpha() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates_no_space("ABC", 0, false);

        let surfaces: Vec<_> = candidates.iter().map(|c| c.surface.as_str()).collect();
        assert!(surfaces.contains(&"A"));
        assert!(surfaces.contains(&"AB"));
        assert!(surfaces.contains(&"ABC"));
    }

    #[test]
    fn test_generate_candidates_with_dict_entry() {
        let handler = UnknownHandler::korean_default();

        // HANGUL has `invoke` off: a dictionary hit suppresses candidates.
        let candidates = handler.generate_candidates_no_space("가나다", 0, true);
        assert!(candidates.is_empty());

        // ALPHA has `invoke` on: candidates are generated regardless.
        let candidates = handler.generate_candidates_no_space("ABC", 0, true);
        assert!(!candidates.is_empty());
    }

    #[test]
    fn test_generate_candidates_mixed() {
        let handler = UnknownHandler::korean_default();

        let text = "가ABC";

        let candidates = handler.generate_candidates_no_space(text, 0, false);
        assert!(candidates.iter().all(|c| c.category_id == HANGUL_CATEGORY));

        let candidates = handler.generate_candidates_no_space(text, 1, false);
        assert!(candidates.iter().all(|c| c.category_id == ALPHA_CATEGORY));
    }

    #[test]
    fn test_find_group_end() {
        let handler = UnknownHandler::korean_default();
        let chars: Vec<char> = "가나다ABC".chars().collect();

        let end = handler.find_group_end(&chars, 0, HANGUL_CATEGORY);
        assert_eq!(end, 3);

        let end = handler.find_group_end(&chars, 3, ALPHA_CATEGORY);
        assert_eq!(end, 6);
    }

    #[test]
    fn test_add_unknown_nodes() {
        let handler = UnknownHandler::korean_default();
        let mut lattice = Lattice::new("테스트ABC");

        let count = handler.add_unknown_nodes(&mut lattice, 0, false);
        assert!(count > 0);

        let nodes_at_0: Vec<_> = lattice.nodes_starting_at(0).collect();
        assert!(!nodes_at_0.is_empty());
    }

    #[test]
    fn test_pattern_detection_proper_noun() {
        let handler = UnknownHandler::korean_default();

        let pattern = handler.detect_pattern("Apple");
        assert_eq!(pattern, WordPattern::ProperNoun);

        let pattern = handler.detect_pattern("Google");
        assert_eq!(pattern, WordPattern::ProperNoun);
    }

    #[test]
    fn test_pattern_detection_camel_case() {
        let handler = UnknownHandler::korean_default();

        let pattern = handler.detect_pattern("iPhone");
        assert_eq!(pattern, WordPattern::CamelCase);

        let pattern = handler.detect_pattern("HelloWorld");
        assert_eq!(pattern, WordPattern::CamelCase);

        let pattern = handler.detect_pattern("iPad");
        assert_eq!(pattern, WordPattern::CamelCase);
    }

    #[test]
    fn test_pattern_detection_hangul_alpha_mix() {
        let handler = UnknownHandler::korean_default();

        // Hangul only: no mix.
        let pattern = handler.detect_pattern("카카오톡");
        assert_eq!(pattern, WordPattern::Plain);

        let pattern = handler.detect_pattern("API키");
        assert_eq!(pattern, WordPattern::HangulAlphaMix);
    }

    #[test]
    fn test_pattern_detection_number_unit() {
        let handler = UnknownHandler::korean_default();

        let pattern = handler.detect_pattern("15kg");
        assert_eq!(pattern, WordPattern::NumberUnit);

        let pattern = handler.detect_pattern("3개");
        assert_eq!(pattern, WordPattern::NumberUnit);

        let pattern = handler.detect_pattern("100원");
        assert_eq!(pattern, WordPattern::NumberUnit);
    }

    #[test]
    fn test_pattern_detection_emoji() {
        let handler = UnknownHandler::korean_default();

        let pattern = handler.detect_pattern("😀");
        assert_eq!(pattern, WordPattern::Emoji);

        // An emoji anywhere in the text wins over every other pattern.
        let pattern = handler.detect_pattern("안녕😊");
        assert_eq!(pattern, WordPattern::Emoji);
    }

    #[test]
    fn test_pattern_detection_plain() {
        let handler = UnknownHandler::korean_default();

        let pattern = handler.detect_pattern("hello");
        assert_eq!(pattern, WordPattern::Plain);

        // Letters plus digits count as NumberUnit regardless of their order.
        let pattern = handler.detect_pattern("test123");
        assert_eq!(pattern, WordPattern::NumberUnit);
    }

    #[test]
    fn test_cost_adjustment_by_pattern() {
        let handler = UnknownHandler::korean_default();

        let base_cost = 4000i16;
        let adjusted = handler.adjust_cost_by_pattern(base_cost, WordPattern::ProperNoun, 5);
        assert!(adjusted < base_cost);

        let adjusted = handler.adjust_cost_by_pattern(base_cost, WordPattern::CamelCase, 5);
        assert!(adjusted < base_cost);

        let adjusted = handler.adjust_cost_by_pattern(base_cost, WordPattern::Emoji, 1);
        assert!(adjusted > base_cost);
    }

    #[test]
    fn test_cost_adjustment_by_length() {
        let handler = UnknownHandler::korean_default();
        let base_cost = 5000i16;

        let cost_short = handler.adjust_cost_by_pattern(base_cost, WordPattern::Plain, 3);

        let cost_long = handler.adjust_cost_by_pattern(base_cost, WordPattern::Plain, 10);

        assert!(cost_long > cost_short);
    }

    #[test]
    fn test_pos_estimation_proper_noun() {
        let handler = UnknownHandler::korean_default();

        let pos = handler.estimate_pos(WordPattern::ProperNoun, ALPHA_CATEGORY, "SL");
        assert_eq!(pos, "NNP");

        let pos = handler.estimate_pos(WordPattern::CamelCase, ALPHA_CATEGORY, "SL");
        assert_eq!(pos, "NNP");
    }

    #[test]
    fn test_pos_estimation_hangul_alpha_mix() {
        let handler = UnknownHandler::korean_default();

        let pos = handler.estimate_pos(WordPattern::HangulAlphaMix, HANGUL_CATEGORY, "NNG");
        assert_eq!(pos, "NNG");
    }

    #[test]
    fn test_generate_candidates_with_patterns() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates_no_space("Apple", 0, false);
        assert!(!candidates.is_empty());

        let has_proper_noun = candidates
            .iter()
            .any(|c| c.pattern == WordPattern::ProperNoun);
        assert!(has_proper_noun);

        let proper_noun_candidates: Vec<_> = candidates
            .iter()
            .filter(|c| c.pattern == WordPattern::ProperNoun)
            .collect();
        assert!(proper_noun_candidates.iter().any(|c| c.pos == "NNP"));
    }

    #[test]
    fn test_generate_candidates_abbreviation() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates_no_space("API", 0, false);
        assert!(!candidates.is_empty());

        // ALPHA groups without a length cap, so both the single letter and
        // the whole abbreviation must be offered.
        let surfaces: Vec<_> = candidates.iter().map(|c| c.surface.as_str()).collect();
        assert!(surfaces.contains(&"A"));
        assert!(surfaces.contains(&"API"));
    }

    #[test]
    fn test_generate_candidates_camel_case() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates_no_space("iPhone", 0, false);
        assert!(!candidates.is_empty());

        let has_camel = candidates
            .iter()
            .any(|c| c.pattern == WordPattern::CamelCase);
        assert!(has_camel);
    }

    #[test]
    fn test_unknown_korean_word() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates_no_space("테스트", 0, false);
        assert!(!candidates.is_empty());

        assert!(candidates.iter().all(|c| c.category_id == HANGUL_CATEGORY));
    }

    #[test]
    fn test_is_emoji() {
        assert!(is_emoji('😀'));
        assert!(is_emoji('😊'));
        assert!(is_emoji('🚀'));
        assert!(is_emoji('❤'));

        assert!(!is_emoji('a'));
        assert!(!is_emoji('가'));
        assert!(!is_emoji('1'));
    }

    #[test]
    fn test_parse_unicode_range() {
        assert_eq!(
            parse_unicode_range("0xAC00..0xD7A3"),
            Some((0xAC00, 0xD7A3))
        );
        assert_eq!(parse_unicode_range("0xAC00"), Some((0xAC00, 0xAC00)));
        assert_eq!(parse_unicode_range("0x0020"), Some((0x0020, 0x0020)));
    }

    #[test]
    fn test_char_def_parsing() {
        let char_def = r"
# Comment line
DEFAULT 0 1 0
SPACE 0 1 0
HANGUL 0 1 2
ALPHA 1 1 0

0xAC00..0xD7A3 HANGUL
0x0041..0x005A ALPHA
";

        let map = CharCategoryMap::from_char_def(char_def.as_bytes()).unwrap();

        assert!(map.get_id_by_name("DEFAULT").is_some());
        assert!(map.get_id_by_name("HANGUL").is_some());
        assert!(map.get_id_by_name("ALPHA").is_some());

        assert_eq!(
            map.get_category('가'),
            map.get_id_by_name("HANGUL").unwrap()
        );
        assert_eq!(map.get_category('A'), map.get_id_by_name("ALPHA").unwrap());
    }

    #[test]
    fn test_unk_def_parsing() {
        let char_def = "DEFAULT 0 1 0\nHANGUL 0 1 2\n";
        let map = CharCategoryMap::from_char_def(char_def.as_bytes()).unwrap();

        let unk_def = r"
DEFAULT,1800,3562,7000,SY,*,*,*,*,*,*,*
HANGUL,1800,3565,5000,UNKNOWN,*,*,*,*,*,*,*
";

        let dict = UnknownDictionary::from_unk_def(unk_def.as_bytes(), &map).unwrap();

        let hangul_id = map.get_id_by_name("HANGUL").unwrap();
        let entries = dict.get_entries(hangul_id);
        assert!(!entries.is_empty());
        assert_eq!(entries[0].pos, "UNKNOWN");
        assert_eq!(entries[0].cost, 5000);
    }
}