1use std::io::{BufRead, BufReader, Read as _, Write as _};
31use std::path::{Path, PathBuf};
32use std::sync::Arc;
33
34use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
35
36use crate::entry_store::{EagerStore, EntryStore, LazyStore, LazyStoreV3};
37use crate::error::{DictError, Result};
38use crate::lazy_entries::LazyEntries;
39use crate::lazy_entries_v3::{detect_entries_format, EntriesFormat, LazyEntriesV3};
40use crate::matrix::{ConnectionMatrix, Matrix};
41use crate::trie::TrieBackend;
42use crate::user_dict::UserDictionary;
43use crate::{Dictionary, Entry};
44
45#[cfg(feature = "hot-reload-v2")]
46use crate::hot_reload_v2::HotReloadDictV2;
47
/// Directories probed, in order, by `DictionaryLoader::find_dicdir` when the
/// `MECAB_DICDIR` environment variable does not name an existing directory.
const DEFAULT_DICDIR_PATHS: &[&str] = &[
    "/opt/homebrew/lib/mecab/dic/mecab-ko-dic",
    "/usr/local/lib/mecab/dic/mecab-ko-dic",
    "/usr/lib/mecab/dic/mecab-ko-dic",
    "/opt/mecab/dic/mecab-ko-dic",
    "./dic/mecab-ko-dic",
];

/// File name of the trie inside a dictionary directory (a `.zst` sibling is
/// also accepted by the loaders).
const TRIE_FILE: &str = "sys.dic";
/// File name of the binary connection matrix (a `.zst` sibling and
/// `matrix.def` are also accepted by the loaders).
const MATRIX_FILE: &str = "matrix.bin";
/// File name of the binary entry table.
const ENTRIES_BIN_FILE: &str = "entries.bin";
/// File name of the CSV entry table, used when no binary table exists.
const ENTRIES_CSV_FILE: &str = "entries.csv";

/// Magic bytes written at the start of an `entries.bin` file by
/// `SystemDictionary::save_entries_bin`.
const ENTRIES_MAGIC: &[u8; 4] = b"MKED";
/// Format version written right after [`ENTRIES_MAGIC`]; the reader rejects
/// any other value.
const ENTRIES_VERSION: u32 = 1;
67
/// A loaded system dictionary: trie index, connection matrix and entry table,
/// optionally extended with a user dictionary and (behind the
/// `hot-reload-v2` feature) a hot-reloadable dictionary.
pub struct SystemDictionary {
    /// Directory the dictionary was loaded from; exposed by `dicdir()`.
    dicdir: PathBuf,
    /// Trie mapping surfaces to the index of their first entry.
    trie: TrieBackend,
    /// Connection-cost matrix queried by `get_connection_cost`.
    matrix: ConnectionMatrix,
    /// Entry table, either fully in memory or lazily loaded.
    entry_store: Arc<dyn EntryStore>,
    /// Optional user dictionary consulted by `lookup_combined`.
    user_dict: Option<Arc<UserDictionary>>,
    /// Optional hot-reload dictionary consulted by `common_prefix_search`
    /// and `lookup_combined`.
    #[cfg(feature = "hot-reload-v2")]
    hot_reload: Option<Arc<HotReloadDictV2>>,
}
87
/// One dictionary record as stored in the entry table.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DictEntry {
    /// Surface form (the text that is matched).
    pub surface: String,
    /// Left context id.
    pub left_id: u16,
    /// Right context id.
    pub right_id: u16,
    /// Word cost.
    pub cost: i16,
    /// Feature string (comma-separated in the fixtures used by the tests).
    pub feature: String,
}
104
105impl DictEntry {
106 pub fn new(
108 surface: impl Into<String>,
109 left_id: u16,
110 right_id: u16,
111 cost: i16,
112 feature: impl Into<String>,
113 ) -> Self {
114 Self {
115 surface: surface.into(),
116 left_id,
117 right_id,
118 cost,
119 feature: feature.into(),
120 }
121 }
122
123 #[must_use]
125 pub fn to_entry(&self) -> Entry {
126 Entry {
127 surface: self.surface.clone(),
128 left_id: self.left_id,
129 right_id: self.right_id,
130 cost: self.cost,
131 feature: self.feature.clone(),
132 }
133 }
134}
135
136impl From<Entry> for DictEntry {
137 fn from(entry: Entry) -> Self {
138 Self {
139 surface: entry.surface,
140 left_id: entry.left_id,
141 right_id: entry.right_id,
142 cost: entry.cost,
143 feature: entry.feature,
144 }
145 }
146}
147
/// Switches controlling how `SystemDictionary::load_with_options` reads the
/// dictionary files.
#[derive(Debug, Clone, Copy)]
pub struct LoadOptions {
    /// Load the trie through the memory-mapped constructor.
    pub use_mmap_trie: bool,
    /// Load the connection matrix through the memory-mapped constructor.
    pub use_mmap_matrix: bool,
    /// Prefer a lazily loaded entry table when the file format allows it.
    pub use_lazy_entries: bool,
    /// Cache size handed to the lazy entry table; `None` leaves it untouched.
    pub lazy_cache_size: Option<usize>,
}

impl Default for LoadOptions {
    /// Lazy entries with a 10000-item cache, but no memory mapping.
    fn default() -> Self {
        Self {
            use_mmap_trie: false,
            use_mmap_matrix: false,
            ..Self::memory_optimized()
        }
    }
}

impl LoadOptions {
    /// Preset that memory-maps the trie and matrix and loads entries lazily
    /// with a 10000-item cache.
    #[must_use]
    pub const fn memory_optimized() -> Self {
        let lazy_cache_size = Some(10000);
        Self {
            lazy_cache_size,
            use_lazy_entries: true,
            use_mmap_matrix: true,
            use_mmap_trie: true,
        }
    }

    /// Preset that disables memory mapping and lazy entries.
    #[must_use]
    pub const fn speed_optimized() -> Self {
        Self {
            lazy_cache_size: None,
            use_lazy_entries: false,
            use_mmap_matrix: false,
            use_mmap_trie: false,
        }
    }

    /// Alias for [`LoadOptions::speed_optimized`].
    #[must_use]
    pub const fn eager() -> Self {
        Self::speed_optimized()
    }
}
215
216impl SystemDictionary {
217 pub fn load_default() -> Result<Self> {
228 let dicdir = DictionaryLoader::find_dicdir()?;
229 Self::load(dicdir)
230 }
231
232 pub fn load_memory_optimized() -> Result<Self> {
241 let dicdir = DictionaryLoader::find_dicdir()?;
242 Self::load_with_options(dicdir, LoadOptions::memory_optimized())
243 }
244
245 pub fn load_with_options<P: AsRef<Path>>(dicdir: P, options: LoadOptions) -> Result<Self> {
252 let dicdir = dicdir.as_ref().to_path_buf();
253
254 let trie_path = dicdir.join(TRIE_FILE);
256 let trie = if trie_path.exists() {
257 if options.use_mmap_trie {
258 TrieBackend::from_mmap_file(&trie_path)?
259 } else {
260 TrieBackend::from_file(&trie_path)?
261 }
262 } else {
263 let compressed_path = dicdir.join(format!("{TRIE_FILE}.zst"));
265 if compressed_path.exists() {
266 TrieBackend::from_compressed_file(&compressed_path)?
267 } else {
268 return Err(DictError::Format(format!(
269 "Trie file not found: {}",
270 trie_path.display()
271 )));
272 }
273 };
274
275 let matrix_path = dicdir.join(MATRIX_FILE);
277 let matrix = if matrix_path.exists() {
278 if options.use_mmap_matrix {
279 ConnectionMatrix::from_mmap_file(&matrix_path)?
280 } else {
281 ConnectionMatrix::from_bin_file(&matrix_path)?
282 }
283 } else {
284 let compressed_path = dicdir.join(format!("{MATRIX_FILE}.zst"));
286 if compressed_path.exists() {
287 ConnectionMatrix::from_compressed_file(&compressed_path)?
288 } else {
289 let def_path = dicdir.join("matrix.def");
291 if def_path.exists() {
292 ConnectionMatrix::from_def_file(&def_path)?
293 } else {
294 return Err(DictError::Format(format!(
295 "Matrix file not found: {}",
296 matrix_path.display()
297 )));
298 }
299 }
300 };
301
302 let entry_store: Arc<dyn EntryStore> = if options.use_lazy_entries {
304 let entries_path = dicdir.join(ENTRIES_BIN_FILE);
305 if entries_path.exists() {
306 match detect_entries_format(&entries_path) {
307 Ok(EntriesFormat::V3) => {
308 if let Ok(lazy) = LazyEntriesV3::from_file(&entries_path) {
309 if let Some(cache_size) = options.lazy_cache_size {
310 lazy.set_cache_size(cache_size);
311 }
312 Arc::new(LazyStoreV3::new(lazy))
313 } else {
314 let entries = Self::load_entries(&dicdir)?;
315 Arc::new(EagerStore::new(entries))
316 }
317 }
318 Ok(EntriesFormat::V2) => {
319 if let Ok(lazy) = LazyEntries::from_file(&entries_path) {
320 if let Some(cache_size) = options.lazy_cache_size {
321 lazy.set_cache_size(cache_size);
322 }
323 Arc::new(LazyStore::new(lazy))
324 } else {
325 let entries = Self::load_entries(&dicdir)?;
326 Arc::new(EagerStore::new(entries))
327 }
328 }
329 Ok(EntriesFormat::V1) | Err(_) => {
330 let entries = Self::load_entries(&dicdir)?;
331 Arc::new(EagerStore::new(entries))
332 }
333 }
334 } else {
335 let entries = Self::load_entries(&dicdir)?;
336 Arc::new(EagerStore::new(entries))
337 }
338 } else {
339 let entries = Self::load_entries(&dicdir)?;
340 Arc::new(EagerStore::new(entries))
341 };
342
343 Ok(Self {
344 dicdir,
345 trie,
346 matrix,
347 entry_store,
348 user_dict: None,
349 #[cfg(feature = "hot-reload-v2")]
350 hot_reload: None,
351 })
352 }
353
354 pub fn load<P: AsRef<Path>>(dicdir: P) -> Result<Self> {
365 let dicdir = dicdir.as_ref().to_path_buf();
366
367 let trie_path = dicdir.join(TRIE_FILE);
369 let trie = if trie_path.exists() {
370 TrieBackend::from_file(&trie_path)?
371 } else {
372 let compressed_path = dicdir.join(format!("{TRIE_FILE}.zst"));
374 if compressed_path.exists() {
375 TrieBackend::from_compressed_file(&compressed_path)?
376 } else {
377 return Err(DictError::Format(format!(
378 "Trie file not found: {}",
379 trie_path.display()
380 )));
381 }
382 };
383
384 let matrix_path = dicdir.join(MATRIX_FILE);
386 let matrix = if matrix_path.exists() {
387 ConnectionMatrix::from_bin_file(&matrix_path)?
388 } else {
389 let compressed_path = dicdir.join(format!("{MATRIX_FILE}.zst"));
391 if compressed_path.exists() {
392 ConnectionMatrix::from_compressed_file(&compressed_path)?
393 } else {
394 let def_path = dicdir.join("matrix.def");
396 if def_path.exists() {
397 ConnectionMatrix::from_def_file(&def_path)?
398 } else {
399 return Err(DictError::Format(format!(
400 "Matrix file not found: {}",
401 matrix_path.display()
402 )));
403 }
404 }
405 };
406
407 let entries = Self::load_entries(&dicdir)?;
409 let entry_store: Arc<dyn EntryStore> = Arc::new(EagerStore::new(entries));
410
411 Ok(Self {
412 dicdir,
413 trie,
414 matrix,
415 entry_store,
416 user_dict: None,
417 #[cfg(feature = "hot-reload-v2")]
418 hot_reload: None,
419 })
420 }
421
422 fn load_entries(dicdir: &Path) -> Result<Vec<DictEntry>> {
428 let bin_path = dicdir.join(ENTRIES_BIN_FILE);
430 if bin_path.exists() {
431 return Self::load_entries_bin(&bin_path);
432 }
433
434 let csv_path = dicdir.join(ENTRIES_CSV_FILE);
436 if csv_path.exists() {
437 return Self::load_entries_csv(&csv_path);
438 }
439
440 Ok(Vec::new())
442 }
443
444 fn load_entries_csv(path: &Path) -> Result<Vec<DictEntry>> {
448 let file = std::fs::File::open(path).map_err(DictError::Io)?;
449 let reader = BufReader::new(file);
450 let mut entries = Vec::new();
451
452 for (line_num, line_result) in reader.lines().enumerate() {
453 let line = line_result.map_err(DictError::Io)?;
454 let line = line.trim();
455 if line.is_empty() || line.starts_with('#') {
456 continue;
457 }
458
459 let mut fields = line.splitn(5, ',');
461 let surface = fields
462 .next()
463 .ok_or_else(|| {
464 DictError::Format(format!("line {}: missing surface", line_num + 1))
465 })?
466 .to_string();
467 let left_id: u16 = fields
468 .next()
469 .ok_or_else(|| {
470 DictError::Format(format!("line {}: missing left_id", line_num + 1))
471 })?
472 .parse()
473 .map_err(|_| {
474 DictError::Format(format!("line {}: invalid left_id", line_num + 1))
475 })?;
476 let right_id: u16 = fields
477 .next()
478 .ok_or_else(|| {
479 DictError::Format(format!("line {}: missing right_id", line_num + 1))
480 })?
481 .parse()
482 .map_err(|_| {
483 DictError::Format(format!("line {}: invalid right_id", line_num + 1))
484 })?;
485 let cost: i16 = fields
486 .next()
487 .ok_or_else(|| DictError::Format(format!("line {}: missing cost", line_num + 1)))?
488 .parse()
489 .map_err(|_| DictError::Format(format!("line {}: invalid cost", line_num + 1)))?;
490 let feature = fields.next().unwrap_or("").to_string();
491
492 entries.push(DictEntry {
493 surface,
494 left_id,
495 right_id,
496 cost,
497 feature,
498 });
499 }
500
501 Ok(entries)
502 }
503
504 fn load_entries_bin(path: &Path) -> Result<Vec<DictEntry>> {
509 let data = std::fs::read(path).map_err(DictError::Io)?;
510 let mut cursor = std::io::Cursor::new(&data);
511
512 let mut magic = [0u8; 4];
514 cursor
515 .read_exact(&mut magic)
516 .map_err(|e| DictError::Format(format!("entries.bin magic: {e}")))?;
517
518 if &magic == b"MKE2" {
520 return Self::load_entries_bin_v2(path);
521 }
522
523 if &magic != ENTRIES_MAGIC {
525 return Err(DictError::Format(
526 "entries.bin: invalid magic number (expected MKED or MKE2)".into(),
527 ));
528 }
529
530 let version = cursor
532 .read_u32::<LittleEndian>()
533 .map_err(|e| DictError::Format(format!("entries.bin version: {e}")))?;
534 if version != ENTRIES_VERSION {
535 return Err(DictError::Format(format!(
536 "entries.bin: unsupported version {version}"
537 )));
538 }
539
540 let count = cursor
542 .read_u32::<LittleEndian>()
543 .map_err(|e| DictError::Format(format!("entries.bin count: {e}")))?;
544
545 let mut entries = Vec::with_capacity(count as usize);
546 for i in 0..count {
547 let left_id = cursor
548 .read_u16::<LittleEndian>()
549 .map_err(|e| DictError::Format(format!("entries.bin entry {i} left_id: {e}")))?;
550 let right_id = cursor
551 .read_u16::<LittleEndian>()
552 .map_err(|e| DictError::Format(format!("entries.bin entry {i} right_id: {e}")))?;
553 let cost = cursor
554 .read_i16::<LittleEndian>()
555 .map_err(|e| DictError::Format(format!("entries.bin entry {i} cost: {e}")))?;
556 let surface_len = cursor
557 .read_u16::<LittleEndian>()
558 .map_err(|e| DictError::Format(format!("entries.bin entry {i} surface_len: {e}")))?
559 as usize;
560 let feature_len = cursor
561 .read_u16::<LittleEndian>()
562 .map_err(|e| DictError::Format(format!("entries.bin entry {i} feature_len: {e}")))?
563 as usize;
564
565 let mut surface_bytes = vec![0u8; surface_len];
566 cursor
567 .read_exact(&mut surface_bytes)
568 .map_err(|e| DictError::Format(format!("entries.bin entry {i} surface: {e}")))?;
569 let surface = String::from_utf8(surface_bytes).map_err(|e| {
570 DictError::Format(format!("entries.bin entry {i} surface utf8: {e}"))
571 })?;
572
573 let mut feature_bytes = vec![0u8; feature_len];
574 cursor
575 .read_exact(&mut feature_bytes)
576 .map_err(|e| DictError::Format(format!("entries.bin entry {i} feature: {e}")))?;
577 let feature = String::from_utf8(feature_bytes).map_err(|e| {
578 DictError::Format(format!("entries.bin entry {i} feature utf8: {e}"))
579 })?;
580
581 entries.push(DictEntry {
582 surface,
583 left_id,
584 right_id,
585 cost,
586 feature,
587 });
588 }
589
590 Ok(entries)
591 }
592
593 fn load_entries_bin_v2(path: &Path) -> Result<Vec<DictEntry>> {
597 let lazy = LazyEntries::from_file(path)?;
598 let count = lazy.len();
599 let mut entries = Vec::with_capacity(count);
600
601 for i in 0..count {
602 let entry = lazy.get(i as u32)?;
603 entries.push((*entry).clone());
604 }
605
606 Ok(entries)
607 }
608
609 pub fn save_entries_bin(entries: &[DictEntry], path: &Path) -> Result<()> {
615 let mut file = std::fs::File::create(path).map_err(DictError::Io)?;
616
617 file.write_all(ENTRIES_MAGIC).map_err(DictError::Io)?;
618 file.write_u32::<LittleEndian>(ENTRIES_VERSION)
619 .map_err(DictError::Io)?;
620
621 let count = u32::try_from(entries.len())
622 .map_err(|_| DictError::Format("too many entries".into()))?;
623 file.write_u32::<LittleEndian>(count)
624 .map_err(DictError::Io)?;
625
626 for entry in entries {
627 file.write_u16::<LittleEndian>(entry.left_id)
628 .map_err(DictError::Io)?;
629 file.write_u16::<LittleEndian>(entry.right_id)
630 .map_err(DictError::Io)?;
631 file.write_i16::<LittleEndian>(entry.cost)
632 .map_err(DictError::Io)?;
633
634 let surface_bytes = entry.surface.as_bytes();
635 let surface_len = u16::try_from(surface_bytes.len())
636 .map_err(|_| DictError::Format("surface too long".into()))?;
637 file.write_u16::<LittleEndian>(surface_len)
638 .map_err(DictError::Io)?;
639
640 let feature_bytes = entry.feature.as_bytes();
641 let feature_len = u16::try_from(feature_bytes.len())
642 .map_err(|_| DictError::Format("feature too long".into()))?;
643 file.write_u16::<LittleEndian>(feature_len)
644 .map_err(DictError::Io)?;
645
646 file.write_all(surface_bytes).map_err(DictError::Io)?;
647 file.write_all(feature_bytes).map_err(DictError::Io)?;
648 }
649
650 Ok(())
651 }
652
653 pub fn save_entries_csv(entries: &[DictEntry], path: &Path) -> Result<()> {
659 let mut file = std::fs::File::create(path).map_err(DictError::Io)?;
660
661 for entry in entries {
662 writeln!(
663 file,
664 "{},{},{},{},{}",
665 entry.surface, entry.left_id, entry.right_id, entry.cost, entry.feature
666 )
667 .map_err(DictError::Io)?;
668 }
669
670 Ok(())
671 }
672
673 fn get_entries_at(&self, first_index: u32, surface: &str) -> Result<Vec<Arc<DictEntry>>> {
682 self.entry_store.get_entries_at(first_index, surface)
683 }
684
685 #[must_use]
691 pub fn with_user_dictionary(mut self, user_dict: UserDictionary) -> Self {
692 self.user_dict = Some(Arc::new(user_dict));
693 self
694 }
695
696 pub fn set_user_dictionary(&mut self, user_dict: UserDictionary) {
698 self.user_dict = Some(Arc::new(user_dict));
699 }
700
701 #[must_use]
703 pub fn dicdir(&self) -> &Path {
704 &self.dicdir
705 }
706
707 #[must_use]
709 pub const fn trie(&self) -> &TrieBackend {
710 &self.trie
711 }
712
713 #[must_use]
715 pub const fn matrix(&self) -> &ConnectionMatrix {
716 &self.matrix
717 }
718
719 #[must_use]
721 pub fn entry_count(&self) -> usize {
722 self.entry_store.len()
723 }
724
725 #[must_use]
727 pub fn entry_store(&self) -> &Arc<dyn EntryStore> {
728 &self.entry_store
729 }
730
731 #[must_use]
733 pub fn user_dictionary(&self) -> Option<&UserDictionary> {
734 self.user_dict.as_deref()
735 }
736
/// Builder-style variant of [`Self::set_hot_reload`].
#[cfg(feature = "hot-reload-v2")]
#[must_use]
pub fn with_hot_reload(mut self, hr: Arc<HotReloadDictV2>) -> Self {
    self.set_hot_reload(hr);
    self
}
748
/// Attaches `hr`, replacing any previously attached hot-reload dictionary.
#[cfg(feature = "hot-reload-v2")]
pub fn set_hot_reload(&mut self, hr: Arc<HotReloadDictV2>) {
    let attached = Some(hr);
    self.hot_reload = attached;
}
758
/// The attached hot-reload dictionary, if any.
#[cfg(feature = "hot-reload-v2")]
#[must_use]
pub const fn hot_reload(&self) -> Option<&Arc<HotReloadDictV2>> {
    match &self.hot_reload {
        Some(hr) => Some(hr),
        None => None,
    }
}
765
766 pub fn get_entry(&self, index: u32) -> Result<Arc<DictEntry>> {
777 self.entry_store.get(index)
778 }
779
780 pub fn common_prefix_search(&self, text: &str) -> Result<Vec<(Arc<DictEntry>, usize)>> {
797 let mut results = Vec::new();
798 for (index, byte_len) in self.trie.common_prefix_search(text) {
799 let surface = &text[..byte_len];
800 let entries = self.get_entries_at(index, surface)?;
801 for entry in entries {
802 results.push((entry, byte_len));
803 }
804 }
805
806 #[cfg(feature = "hot-reload-v2")]
808 if let Some(hr) = &self.hot_reload {
809 let snapshot = hr.load();
810 let domain_entries = snapshot.domain_stack.common_prefix_search(text);
811 for user_entry in domain_entries {
812 let byte_len = user_entry.surface.len();
813 let dict_entry = Arc::new(DictEntry::new(
814 &user_entry.surface,
815 user_entry.left_id,
816 user_entry.right_id,
817 user_entry.cost,
818 &user_entry.feature,
819 ));
820 results.push((dict_entry, byte_len));
821 }
822 }
823
824 Ok(results)
825 }
826
827 pub fn common_prefix_search_at(
838 &self,
839 text: &str,
840 start_byte: usize,
841 ) -> Result<Vec<(Arc<DictEntry>, usize)>> {
842 let mut results = Vec::new();
843 for (index, end_byte) in self.trie.common_prefix_search_at(text, start_byte) {
844 let byte_len = end_byte - start_byte;
845 let surface = &text[start_byte..end_byte];
846 let entries = self.get_entries_at(index, surface)?;
847 for entry in entries {
848 results.push((entry, byte_len));
849 }
850 }
851 Ok(results)
852 }
853
854 #[must_use]
860 pub fn lookup_combined(&self, surface: &str) -> Vec<Entry> {
861 let mut results = self.lookup(surface);
862
863 if let Some(user_dict) = &self.user_dict {
865 let user_entries = user_dict.lookup(surface);
866 results.extend(user_entries.iter().map(|e| e.to_entry()));
867 }
868
869 #[cfg(feature = "hot-reload-v2")]
871 if let Some(hr) = &self.hot_reload {
872 let snapshot = hr.load();
873 let domain_entries = snapshot.domain_stack.lookup(surface);
874 results.extend(domain_entries.iter().map(|ue| Entry {
875 surface: ue.surface.clone(),
876 left_id: ue.left_id,
877 right_id: ue.right_id,
878 cost: ue.cost,
879 feature: ue.feature.clone(),
880 }));
881 }
882
883 results
884 }
885
886 #[doc(hidden)]
888 #[must_use]
889 pub fn new_test(
890 dicdir: PathBuf,
891 trie: TrieBackend,
892 matrix: ConnectionMatrix,
893 entries: Vec<DictEntry>,
894 ) -> Self {
895 Self {
896 dicdir,
897 trie,
898 matrix,
899 entry_store: Arc::new(EagerStore::new(entries)),
900 user_dict: None,
901 #[cfg(feature = "hot-reload-v2")]
902 hot_reload: None,
903 }
904 }
905}
906
907impl Dictionary for SystemDictionary {
908 fn lookup(&self, surface: &str) -> Vec<Entry> {
909 if let Some(index) = self.trie.exact_match(surface) {
911 if let Ok(entries) = self.get_entries_at(index, surface) {
912 if !entries.is_empty() {
913 return entries.iter().map(|e| e.to_entry()).collect();
914 }
915 }
916 }
917
918 Vec::new()
919 }
920
921 fn get_connection_cost(&self, left_id: u16, right_id: u16) -> i16 {
922 i16::try_from(self.matrix.get(right_id, left_id)).unwrap_or(i16::MAX)
923 }
924}
925
926pub struct DictionaryLoader;
930
931impl DictionaryLoader {
932 pub fn find_dicdir() -> Result<PathBuf> {
942 if let Ok(dicdir) = std::env::var("MECAB_DICDIR") {
944 let path = PathBuf::from(dicdir);
945 if path.is_dir() {
946 return Ok(path);
947 }
948 }
949
950 for &path_str in DEFAULT_DICDIR_PATHS {
952 let path = PathBuf::from(path_str);
953 if path.is_dir() {
954 return Ok(path);
955 }
956 }
957
958 {
967 let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
968 let test_dict = manifest_dir.join("../../test-fixtures/mini-dict");
969 if test_dict.is_dir() {
970 eprintln!(
971 "[mecab-ko WARNING] No system dictionary found; falling back to sparse \
972 test dictionary at '{}'. Most Korean words will NOT be tokenized. \
973 Set MECAB_DICDIR to a full mecab-ko-dic installation path.",
974 test_dict.display()
975 );
976 return Ok(test_dict);
977 }
978 }
979
980 Err(DictError::Format(
981 "Dictionary directory not found. Set MECAB_DICDIR environment variable or \
982 install mecab-ko-dic to one of: /usr/local/lib/mecab/dic/mecab-ko-dic, \
983 /usr/lib/mecab/dic/mecab-ko-dic, /opt/mecab/dic/mecab-ko-dic, \
984 ./dic/mecab-ko-dic"
985 .to_string(),
986 ))
987 }
988
989 pub fn load_system<P: AsRef<Path>>(dicdir: P) -> Result<SystemDictionary> {
995 SystemDictionary::load(dicdir)
996 }
997
998 pub fn load_default() -> Result<SystemDictionary> {
1004 SystemDictionary::load_default()
1005 }
1006
1007 pub fn validate_dicdir<P: AsRef<Path>>(dicdir: P) -> Result<()> {
1017 let dicdir = dicdir.as_ref();
1018
1019 if !dicdir.is_dir() {
1020 return Err(DictError::Format(format!(
1021 "Dictionary directory does not exist: {}",
1022 dicdir.display()
1023 )));
1024 }
1025
1026 let has_trie =
1028 dicdir.join(TRIE_FILE).exists() || dicdir.join(format!("{TRIE_FILE}.zst")).exists();
1029
1030 let has_matrix = dicdir.join(MATRIX_FILE).exists() || dicdir.join("matrix.def").exists();
1031
1032 if !has_trie {
1033 return Err(DictError::Format(format!(
1034 "Trie file not found in {}",
1035 dicdir.display()
1036 )));
1037 }
1038
1039 if !has_matrix {
1040 return Err(DictError::Format(format!(
1041 "Matrix file not found in {}",
1042 dicdir.display()
1043 )));
1044 }
1045
1046 Ok(())
1047 }
1048}
1049
#[cfg(test)]
#[allow(
    clippy::expect_used,
    clippy::unwrap_used,
    clippy::items_after_statements
)]
mod tests {
    use super::*;
    use crate::matrix::DenseMatrix;
    use crate::trie::{Trie, TrieBuilder};

    /// Builds an in-memory dictionary with five surfaces, one entry each,
    /// and a 10x10 dense matrix created with the value 100.
    fn create_test_dictionary() -> SystemDictionary {
        let trie_input = vec![
            ("가", 0u32),
            ("가다", 1),
            ("가방", 2),
            ("나", 3),
            ("나다", 4),
        ];
        let trie_bytes = TrieBuilder::build(&trie_input).expect("should build trie");

        let dict_entries = vec![
            DictEntry::new("가", 1, 1, 100, "NNG,*,T,가,*,*,*,*"),
            DictEntry::new("가다", 2, 2, 200, "VV,*,F,가다,*,*,*,*"),
            DictEntry::new("가방", 3, 3, 300, "NNG,*,T,가방,*,*,*,*"),
            DictEntry::new("나", 4, 4, 400, "NP,*,F,나,*,*,*,*"),
            DictEntry::new("나다", 5, 5, 500, "VV,*,F,나다,*,*,*,*"),
        ];

        SystemDictionary {
            dicdir: PathBuf::from("./test_dic"),
            trie: TrieBackend::Owned(Trie::from_vec(trie_bytes)),
            matrix: ConnectionMatrix::Dense(DenseMatrix::new(10, 10, 100)),
            entry_store: Arc::new(EagerStore::new(dict_entries)),
            user_dict: None,
            #[cfg(feature = "hot-reload-v2")]
            hot_reload: None,
        }
    }

    /// Collects the surfaces of a prefix-search result.
    fn surfaces_of(results: &[(Arc<DictEntry>, usize)]) -> Vec<&str> {
        results.iter().map(|(e, _)| e.surface.as_str()).collect()
    }

    #[test]
    fn test_dict_entry_creation() {
        let created = DictEntry::new("안녕", 1, 1, 100, "NNG,*,T,안녕,*,*,*,*");
        assert_eq!(created.surface, "안녕");
        assert_eq!(created.left_id, 1);
        assert_eq!(created.right_id, 1);
        assert_eq!(created.cost, 100);
    }

    #[test]
    fn test_dict_entry_to_entry() {
        let converted = DictEntry::new("테스트", 5, 5, 200, "NNG,*,T,테스트,*,*,*,*").to_entry();

        assert_eq!(converted.surface, "테스트");
        assert_eq!(converted.left_id, 5);
        assert_eq!(converted.cost, 200);
    }

    #[test]
    fn test_system_dictionary_lookup() {
        let dict = create_test_dictionary();

        // Surfaces present in the dictionary return exactly one entry.
        for known in ["가", "가다"] {
            let found = dict.lookup(known);
            assert_eq!(found.len(), 1);
            assert_eq!(found[0].surface, known);
        }

        // An unknown surface yields nothing.
        assert!(dict.lookup("없음").is_empty());
    }

    #[test]
    fn test_system_dictionary_get_connection_cost() {
        let dict = create_test_dictionary();
        assert_eq!(dict.get_connection_cost(1, 2), 100);
    }

    #[test]
    fn test_common_prefix_search() {
        let dict = create_test_dictionary();

        let results = dict
            .common_prefix_search("가방에")
            .expect("search should work");
        assert_eq!(results.len(), 2);

        let surfaces = surfaces_of(&results);
        assert!(surfaces.contains(&"가"));
        assert!(surfaces.contains(&"가방"));
    }

    #[test]
    fn test_common_prefix_search_at() {
        let dict = create_test_dictionary();

        let text = "나가다";
        // Start searching right after the first syllable.
        let start = "나".len();
        let results = dict
            .common_prefix_search_at(text, start)
            .expect("search should work");
        assert_eq!(results.len(), 2);

        let surfaces = surfaces_of(&results);
        assert!(surfaces.contains(&"가"));
        assert!(surfaces.contains(&"가다"));
    }

    #[test]
    fn test_with_user_dictionary() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);
        user_dict.add_entry("머신러닝", "NNG", Some(-1000), None);

        let mut dict = create_test_dictionary();
        dict.set_user_dictionary(user_dict);

        let found = dict.lookup_combined("딥러닝");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].surface, "딥러닝");
    }

    #[test]
    fn test_lookup_combined_system_and_user() {
        // The same surface exists in both the system and the user dictionary.
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("가", "JKS", Some(-500), None);

        let mut dict = create_test_dictionary();
        dict.set_user_dictionary(user_dict);

        assert_eq!(dict.lookup_combined("가").len(), 2);
    }

    #[test]
    fn test_get_entry() {
        let dict = create_test_dictionary();

        let first = dict.get_entry(0);
        assert!(first.is_ok());
        assert_eq!(first.unwrap().surface, "가");

        let out_of_range = dict.get_entry(100);
        assert!(out_of_range.is_err());
    }

    #[test]
    fn test_dicdir() {
        let dict = create_test_dictionary();
        assert_eq!(dict.dicdir(), Path::new("./test_dic"));
    }

    #[test]
    fn test_trie_reference() {
        let dict = create_test_dictionary();
        assert!(dict.trie().exact_match("가").is_some());
    }

    #[test]
    fn test_matrix_reference() {
        let dict = create_test_dictionary();
        let matrix = dict.matrix();
        assert_eq!(matrix.left_size(), 10);
        assert_eq!(matrix.right_size(), 10);
    }

    #[test]
    fn test_entry_count() {
        assert_eq!(create_test_dictionary().entry_count(), 5);
    }

    #[test]
    fn test_dictionary_loader_find_dicdir() {
        // Either outcome is valid, depending on the machine running the test.
        match DictionaryLoader::find_dicdir() {
            Ok(path) => {
                assert!(path.is_dir());
            }
            Err(e) => {
                assert!(e.to_string().contains("Dictionary directory not found"));
            }
        }
    }

    #[test]
    fn test_dict_entry_from_entry() {
        let source = Entry {
            surface: "테스트".to_string(),
            left_id: 10,
            right_id: 20,
            cost: 300,
            feature: "NNG,*,T,테스트,*,*,*,*".to_string(),
        };

        let dict_entry = DictEntry::from(source);
        assert_eq!(dict_entry.surface, "테스트");
        assert_eq!(dict_entry.left_id, 10);
        assert_eq!(dict_entry.right_id, 20);
        assert_eq!(dict_entry.cost, 300);
    }

    #[test]
    fn test_entries_bin_roundtrip() {
        let written = vec![
            DictEntry::new("안녕", 1, 1, 100, "NNG,*,T,안녕,*,*,*,*"),
            DictEntry::new("하세요", 2, 2, 50, "VV,*,F,하세요,*,*,*,*"),
            DictEntry::new("감사", 3, 3, 80, "NNG,*,F,감사,*,*,*,*"),
        ];

        let temp = tempfile::NamedTempFile::new().expect("create temp file");
        let path = temp.path();

        SystemDictionary::save_entries_bin(&written, path).expect("save should work");
        let loaded = SystemDictionary::load_entries_bin(path).expect("load should work");

        assert_eq!(loaded.len(), 3);
        assert_eq!(loaded[0].surface, "안녕");
        assert_eq!(loaded[0].left_id, 1);
        assert_eq!(loaded[0].cost, 100);
        assert_eq!(loaded[0].feature, "NNG,*,T,안녕,*,*,*,*");
        assert_eq!(loaded[1].surface, "하세요");
        assert_eq!(loaded[2].surface, "감사");
    }

    #[test]
    fn test_entries_csv_roundtrip() {
        let written = vec![
            DictEntry::new("형태소", 10, 20, 150, "NNG,*,F,형태소,*,*,*,*"),
            DictEntry::new("분석", 11, 21, 200, "NNG,*,T,분석,*,*,*,*"),
        ];

        let temp = tempfile::NamedTempFile::new().expect("create temp file");
        let path = temp.path();

        SystemDictionary::save_entries_csv(&written, path).expect("save should work");
        let loaded = SystemDictionary::load_entries_csv(path).expect("load should work");

        assert_eq!(loaded.len(), 2);
        assert_eq!(loaded[0].surface, "형태소");
        assert_eq!(loaded[0].left_id, 10);
        assert_eq!(loaded[0].right_id, 20);
        assert_eq!(loaded[0].cost, 150);
        assert_eq!(loaded[1].surface, "분석");
    }

    #[test]
    fn test_get_entries_at_multi() {
        use crate::Dictionary;

        // "가" owns two consecutive entries (indices 0 and 1); "나" starts at 2.
        let trie_input = vec![("가", 0u32), ("나", 2u32)];
        let trie_bytes = TrieBuilder::build(&trie_input).expect("build trie");

        let dict_entries = vec![
            DictEntry::new("가", 1, 1, 100, "VV,*,F,가,*,*,*,*"),
            DictEntry::new("가", 2, 2, 50, "JKS,*,F,가,*,*,*,*"),
            DictEntry::new("나", 3, 3, 200, "NP,*,F,나,*,*,*,*"),
        ];

        let dict = SystemDictionary {
            dicdir: PathBuf::from("./test"),
            trie: TrieBackend::Owned(Trie::from_vec(trie_bytes)),
            matrix: ConnectionMatrix::Dense(DenseMatrix::new(5, 5, 100)),
            entry_store: Arc::new(EagerStore::new(dict_entries)),
            user_dict: None,
            #[cfg(feature = "hot-reload-v2")]
            hot_reload: None,
        };

        let grouped = dict.get_entries_at(0, "가").expect("should get entries");
        assert_eq!(grouped.len(), 2);
        assert_eq!(grouped[0].feature, "VV,*,F,가,*,*,*,*");
        assert_eq!(grouped[1].feature, "JKS,*,F,가,*,*,*,*");

        assert_eq!(dict.lookup("가").len(), 2);
    }
}