1use crate::error::{DictError, Result};
6use crate::matrix::{DenseMatrix, Matrix};
7use crate::trie::TrieBackend;
8use crate::{Dictionary, Entry};
9use std::fs::File;
10use std::path::{Path, PathBuf};
11
/// Options that control how the dictionary files in a directory are loaded.
#[derive(Debug, Clone, Copy)]
pub struct LoaderConfig {
    /// When `true`, an uncompressed `sys.dic` is opened with
    /// `TrieBackend::from_mmap_file`; when `false`, `TrieBackend::from_file`
    /// is used instead.
    pub use_mmap: bool,
    /// When `true` (and the `zstd` feature is enabled), `.zst` variants of the
    /// dictionary files are preferred over the uncompressed files if present.
    pub auto_decompress: bool,
    /// When `true`, `DictionaryLoader::load` returns a `LazyDictionary`, which
    /// loads the files on the first query rather than up front.
    pub lazy_load: bool,
}
22
/// Defaults: `use_mmap` and `auto_decompress` enabled, `lazy_load` disabled.
impl Default for LoaderConfig {
    fn default() -> Self {
        Self {
            use_mmap: true,
            auto_decompress: true,
            lazy_load: false,
        }
    }
}
32
/// A dictionary loaded eagerly from a directory of dictionary files.
pub struct MmapDictionary {
    /// Trie loaded from `sys.dic` (or `sys.dic.zst`); `lookup` uses the value
    /// returned by its `exact_match` as an index into `entries`.
    trie: TrieBackend,
    /// Connection-cost matrix loaded from `matrix.bin` (or `matrix.bin.zst`).
    matrix: DenseMatrix,
    /// Directory the dictionary was loaded from.
    dict_dir: PathBuf,
    /// Entry table loaded from `entries.bin`, `entries.bin.zst` or
    /// `entries.csv`; empty when none of those files exist.
    entries: Vec<Entry>,
}
46
47impl MmapDictionary {
/// Loads a dictionary from the directory `path` using
/// `LoaderConfig::default()`.
///
/// # Errors
///
/// Returns whatever error `load_with_config` produces.
pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
    Self::load_with_config(path, LoaderConfig::default())
}
66
67 pub fn load_with_config<P: AsRef<Path>>(path: P, config: LoaderConfig) -> Result<Self> {
73 let dict_dir = path.as_ref().to_path_buf();
74
75 let trie = Self::load_trie_backend(&dict_dir, config)?;
77
78 let matrix = Self::load_matrix(&dict_dir, config)?;
80
81 let entries = Self::load_entries(&dict_dir, config)?;
83
84 Ok(Self {
85 trie,
86 matrix,
87 dict_dir,
88 entries,
89 })
90 }
91
/// Loads the trie from `sys.dic.zst` or `sys.dic` inside `dict_dir`.
///
/// The compressed file is used when `config.auto_decompress` is set and the
/// file exists. Otherwise `sys.dic` is opened through
/// `TrieBackend::from_mmap_file` or `TrieBackend::from_file`, depending on
/// `config.use_mmap`.
///
/// # Errors
///
/// Returns `DictError::Format` when no usable file exists, or the error of
/// the selected `TrieBackend` constructor.
#[cfg(feature = "zstd")]
fn load_trie_backend(dict_dir: &Path, config: LoaderConfig) -> Result<TrieBackend> {
    if config.auto_decompress {
        let zst_path = dict_dir.join("sys.dic.zst");
        if zst_path.exists() {
            return TrieBackend::from_compressed_file(&zst_path);
        }
    }

    let plain_path = dict_dir.join("sys.dic");
    match (plain_path.exists(), config.use_mmap) {
        (true, true) => TrieBackend::from_mmap_file(&plain_path),
        (true, false) => TrieBackend::from_file(&plain_path),
        (false, _) => Err(DictError::Format(
            "sys.dic or sys.dic.zst not found".to_string(),
        )),
    }
}
112
113 #[cfg(not(feature = "zstd"))]
115 fn load_trie_backend(dict_dir: &Path, config: LoaderConfig) -> Result<TrieBackend> {
116 let uncompressed_path = dict_dir.join("sys.dic");
117
118 if uncompressed_path.exists() {
119 if config.use_mmap {
120 TrieBackend::from_mmap_file(&uncompressed_path)
121 } else {
122 TrieBackend::from_file(&uncompressed_path)
123 }
124 } else {
125 Err(DictError::Format(
126 "sys.dic not found (zstd feature disabled, compressed files not supported)"
127 .to_string(),
128 ))
129 }
130 }
131
/// Loads the connection matrix from `matrix.bin.zst` or `matrix.bin`
/// inside `dict_dir`.
///
/// The compressed file is used when `config.auto_decompress` is set and the
/// file exists; otherwise `matrix.bin` is read.
///
/// # Errors
///
/// Returns `DictError::Format` when no usable file exists, or the error of
/// the selected `DenseMatrix` constructor.
#[cfg(feature = "zstd")]
fn load_matrix(dict_dir: &Path, config: LoaderConfig) -> Result<DenseMatrix> {
    if config.auto_decompress {
        let zst_path = dict_dir.join("matrix.bin.zst");
        if zst_path.exists() {
            return DenseMatrix::from_compressed_file(&zst_path);
        }
    }

    let plain_path = dict_dir.join("matrix.bin");
    if plain_path.exists() {
        return DenseMatrix::from_bin_file(&plain_path);
    }

    Err(DictError::Format(
        "matrix.bin or matrix.bin.zst not found".to_string(),
    ))
}
148
149 #[cfg(not(feature = "zstd"))]
151 fn load_matrix(dict_dir: &Path, _config: LoaderConfig) -> Result<DenseMatrix> {
152 let uncompressed_path = dict_dir.join("matrix.bin");
153
154 if uncompressed_path.exists() {
155 DenseMatrix::from_bin_file(&uncompressed_path)
156 } else {
157 Err(DictError::Format(
158 "matrix.bin not found (zstd feature disabled, compressed files not supported)"
159 .to_string(),
160 ))
161 }
162 }
163
/// Loads the entry table from `dict_dir`, trying in order
/// `entries.bin.zst` (only when `config.auto_decompress` is set),
/// `entries.bin`, then `entries.csv`.
///
/// A binary file that turns out to be in SystemDictionary format (the
/// parser reports this with an error mentioning `MKED`/`MKE2`) is skipped
/// and the next source is tried. This now also applies to the compressed
/// file, which previously surfaced the "Falling back to CSV." error without
/// actually falling back.
///
/// Returns an empty table when none of the sources is usable.
///
/// # Errors
///
/// Returns any other error reported while reading or parsing a source.
#[cfg(feature = "zstd")]
fn load_entries(dict_dir: &Path, config: LoaderConfig) -> Result<Vec<Entry>> {
    /// True when `outcome` is the "SystemDictionary format" error, i.e. the
    /// signal to move on to the next source rather than fail.
    fn is_foreign_format<T>(outcome: &Result<T>) -> bool {
        matches!(
            outcome,
            Err(DictError::Format(msg)) if msg.contains("MKED") || msg.contains("MKE2")
        )
    }

    let compressed_bin_path = dict_dir.join("entries.bin.zst");
    if config.auto_decompress && compressed_bin_path.exists() {
        let outcome = Self::load_entries_from_compressed_bin(&compressed_bin_path);
        if !is_foreign_format(&outcome) {
            return outcome;
        }
    }

    let bin_path = dict_dir.join("entries.bin");
    if bin_path.exists() {
        let outcome = Self::load_entries_from_bin(&bin_path);
        if !is_foreign_format(&outcome) {
            return outcome;
        }
    }

    let csv_path = dict_dir.join("entries.csv");
    if csv_path.exists() {
        return Self::load_entries_from_csv(&csv_path);
    }

    // No entry source at all: callers get an empty table.
    Ok(Vec::new())
}
226
227 #[cfg(not(feature = "zstd"))]
228 fn load_entries(dict_dir: &Path, _config: LoaderConfig) -> Result<Vec<Entry>> {
229 let bin_path = dict_dir.join("entries.bin");
231 let csv_path = dict_dir.join("entries.csv");
232
233 if bin_path.exists() {
235 match Self::load_entries_from_bin(&bin_path) {
236 Ok(entries) => return Ok(entries),
237 Err(DictError::Format(ref msg)) if msg.contains("MKED") || msg.contains("MKE2") => {
238 }
240 Err(e) => return Err(e),
241 }
242 }
243
244 if csv_path.exists() {
246 return Self::load_entries_from_csv(&csv_path);
247 }
248
249 Ok(Vec::new())
251 }
252
253 fn load_entries_from_bin(path: &Path) -> Result<Vec<Entry>> {
255 use std::io::Read;
256
257 let mut file = File::open(path)?;
258 let mut buffer = Vec::new();
259 file.read_to_end(&mut buffer)?;
260
261 Self::parse_entries_binary(&buffer)
262 }
263
/// Decompresses the zstd file at `path` fully into memory and parses the
/// result as a binary entry table.
///
/// # Errors
///
/// Returns the I/O error from opening or decoding the file, or the parse
/// error from `parse_entries_binary`.
#[cfg(feature = "zstd")]
fn load_entries_from_compressed_bin(path: &Path) -> Result<Vec<Entry>> {
    use std::io::Read;

    let mut decompressed = Vec::new();
    let mut reader = zstd::Decoder::new(File::open(path)?)?;
    reader.read_to_end(&mut decompressed)?;

    Self::parse_entries_binary(&decompressed)
}
276
277 #[cfg(not(feature = "zstd"))]
279 #[allow(dead_code)]
280 fn load_entries_from_compressed_bin(_path: &Path) -> Result<Vec<Entry>> {
281 Err(DictError::Format(
282 "zstd feature is not enabled. Use uncompressed files or enable the 'zstd' feature."
283 .to_string(),
284 ))
285 }
286
287 fn parse_entries_binary(data: &[u8]) -> Result<Vec<Entry>> {
292 use std::io::{Cursor, Read};
293
294 if data.len() >= 4 {
296 let magic = &data[0..4];
297 if magic == b"MKED" || magic == b"MKE2" {
298 return Err(DictError::Format(
299 "entries.bin is in SystemDictionary format (MKED/MKE2). \
300 MmapDictionary uses a different format. \
301 Falling back to CSV."
302 .to_string(),
303 ));
304 }
305 }
306
307 let mut cursor = Cursor::new(data);
308 let mut count_bytes = [0u8; 4];
309 cursor.read_exact(&mut count_bytes).map_err(|_| {
310 DictError::Format("Failed to read entry count from binary file".to_string())
311 })?;
312
313 let count = u32::from_le_bytes(count_bytes) as usize;
314 let mut entries = Vec::with_capacity(count);
315
316 for _ in 0..count {
317 let mut buf = [0u8; 2];
319 cursor.read_exact(&mut buf).map_err(|_| {
320 DictError::Format("Failed to read left_id from binary file".to_string())
321 })?;
322 let left_id = u16::from_le_bytes(buf);
323
324 cursor.read_exact(&mut buf).map_err(|_| {
326 DictError::Format("Failed to read right_id from binary file".to_string())
327 })?;
328 let right_id = u16::from_le_bytes(buf);
329
330 cursor.read_exact(&mut buf).map_err(|_| {
332 DictError::Format("Failed to read cost from binary file".to_string())
333 })?;
334 let cost = i16::from_le_bytes(buf);
335
336 cursor.read_exact(&mut buf).map_err(|_| {
338 DictError::Format("Failed to read surface length from binary file".to_string())
339 })?;
340 let surface_len = u16::from_le_bytes(buf) as usize;
341 let mut surface_bytes = vec![0u8; surface_len];
342 cursor.read_exact(&mut surface_bytes).map_err(|_| {
343 DictError::Format("Failed to read surface from binary file".to_string())
344 })?;
345 let surface = String::from_utf8(surface_bytes)
346 .map_err(|_| DictError::Format("Invalid UTF-8 in surface field".to_string()))?;
347
348 cursor.read_exact(&mut buf).map_err(|_| {
350 DictError::Format("Failed to read feature length from binary file".to_string())
351 })?;
352 let feature_len = u16::from_le_bytes(buf) as usize;
353 let mut feature_bytes = vec![0u8; feature_len];
354 cursor.read_exact(&mut feature_bytes).map_err(|_| {
355 DictError::Format("Failed to read feature from binary file".to_string())
356 })?;
357 let feature = String::from_utf8(feature_bytes)
358 .map_err(|_| DictError::Format("Invalid UTF-8 in feature field".to_string()))?;
359
360 entries.push(Entry {
361 surface,
362 left_id,
363 right_id,
364 cost,
365 feature,
366 });
367 }
368
369 Ok(entries)
370 }
371
372 fn load_entries_from_csv(path: &Path) -> Result<Vec<Entry>> {
374 use std::io::{BufRead, BufReader};
375
376 let file = File::open(path)?;
377 let reader = BufReader::new(file);
378 let mut entries = Vec::new();
379
380 for (line_num, line_result) in reader.lines().enumerate() {
381 let line = line_result?;
382
383 if line.trim().is_empty() || line.starts_with('#') {
385 continue;
386 }
387
388 let entry = Self::parse_csv_line(&line)
389 .map_err(|e| DictError::Format(format!("Failed to parse line {line_num}: {e}")))?;
390
391 entries.push(entry);
392 }
393
394 Ok(entries)
395 }
396
397 fn parse_csv_line(line: &str) -> Result<Entry> {
402 let parts: Vec<&str> = line.split(',').collect();
403
404 if parts.len() < 5 {
405 return Err(DictError::Format(format!(
406 "Invalid CSV line: expected at least 5 fields, got {}",
407 parts.len()
408 )));
409 }
410
411 let surface = parts[0].to_string();
412
413 let left_id = parts[1]
414 .parse::<u16>()
415 .map_err(|_| DictError::Format(format!("Invalid left_id: {}", parts[1])))?;
416
417 let right_id = parts[2]
418 .parse::<u16>()
419 .map_err(|_| DictError::Format(format!("Invalid right_id: {}", parts[2])))?;
420
421 let cost = parts[3]
422 .parse::<i16>()
423 .map_err(|_| DictError::Format(format!("Invalid cost: {}", parts[3])))?;
424
425 let feature = parts[4..].join(",");
427
428 Ok(Entry {
429 surface,
430 left_id,
431 right_id,
432 cost,
433 feature,
434 })
435 }
436
/// Returns a reference to the loaded trie.
#[must_use]
pub const fn trie(&self) -> &TrieBackend {
    &self.trie
}
442
/// Returns a reference to the loaded connection matrix.
#[must_use]
pub const fn matrix(&self) -> &DenseMatrix {
    &self.matrix
}
448
/// Returns the directory this dictionary was loaded from.
#[must_use]
pub fn dict_dir(&self) -> &Path {
    &self.dict_dir
}
454
/// Returns every loaded entry, in load order.
#[must_use]
pub fn entries(&self) -> &[Entry] {
    &self.entries
}
460
/// Returns the entry stored at position `index` of the entry table, or
/// `None` when `index` is past the end of the table.
#[must_use]
pub fn get_entry(&self, index: u32) -> Option<&Entry> {
    self.entries.get(index as usize)
}
470}
471
472impl Dictionary for MmapDictionary {
473 fn lookup(&self, surface: &str) -> Vec<Entry> {
474 self.trie
476 .exact_match(surface)
477 .map_or_else(Vec::new, |index| {
478 self.entries.get(index as usize).map_or_else(
479 || {
480 vec![Entry {
483 surface: surface.to_string(),
484 left_id: 0,
485 right_id: 0,
486 cost: 0,
487 feature: "UNK,*,*,*,*,*,*,*".to_string(),
488 }]
489 },
490 |entry| vec![entry.clone()],
491 )
492 })
493 }
494
495 fn get_connection_cost(&self, left_id: u16, right_id: u16) -> i16 {
496 let cost = self.matrix.get(left_id, right_id);
499 cost.clamp(i32::from(i16::MIN), i32::from(i16::MAX))
500 .try_into()
501 .unwrap_or(i16::MAX)
502 }
503}
504
/// A dictionary that postpones loading its files until the first query.
pub struct LazyDictionary {
    /// Directory to load the dictionary from.
    dict_path: PathBuf,
    /// Configuration passed to `MmapDictionary::load_with_config`.
    config: LoaderConfig,
    /// `None` until the dictionary has been loaded successfully; the mutex
    /// guards both the load and every later access.
    dict: std::sync::Mutex<Option<MmapDictionary>>,
}
513
514impl LazyDictionary {
/// Creates a lazy dictionary for the directory `path` using
/// `LoaderConfig::default()`. No file is read here.
pub fn new<P: AsRef<Path>>(path: P) -> Self {
    Self::new_with_config(path, LoaderConfig::default())
}
519
520 pub fn new_with_config<P: AsRef<Path>>(path: P, config: LoaderConfig) -> Self {
522 Self {
523 dict_path: path.as_ref().to_path_buf(),
524 config,
525 dict: std::sync::Mutex::new(None),
526 }
527 }
528
529 fn ensure_loaded(&self) -> Result<()> {
531 let mut dict = self.dict.lock().map_err(|_| {
532 DictError::Format("Failed to acquire lock for lazy dictionary".to_string())
533 })?;
534
535 if dict.is_some() {
536 return Ok(());
537 }
538
539 let loaded_dict = MmapDictionary::load_with_config(&self.dict_path, self.config)?;
540 *dict = Some(loaded_dict);
541 drop(dict);
542
543 Ok(())
544 }
545}
546
547impl Dictionary for LazyDictionary {
548 fn lookup(&self, surface: &str) -> Vec<Entry> {
549 if self.ensure_loaded().is_err() {
550 return Vec::new();
551 }
552
553 let Ok(dict) = self.dict.lock() else {
554 return Vec::new();
555 };
556
557 dict.as_ref().map_or_else(Vec::new, |d| d.lookup(surface))
558 }
559
560 fn get_connection_cost(&self, left_id: u16, right_id: u16) -> i16 {
561 if self.ensure_loaded().is_err() {
562 return 0;
563 }
564
565 let Ok(dict) = self.dict.lock() else {
566 return 0;
567 };
568
569 dict.as_ref()
570 .map_or(0, |d| d.get_connection_cost(left_id, right_id))
571 }
572}
573
/// Builder that collects a dictionary directory and a `LoaderConfig`, then
/// produces a boxed `Dictionary`.
pub struct DictionaryLoader {
    /// Directory to load the dictionary from.
    path: PathBuf,
    /// Options accumulated through the builder methods.
    config: LoaderConfig,
}
579
580impl DictionaryLoader {
581 pub fn new<P: AsRef<Path>>(path: P) -> Self {
583 Self {
584 path: path.as_ref().to_path_buf(),
585 config: LoaderConfig::default(),
586 }
587 }
588
/// Sets `LoaderConfig::use_mmap` and returns the updated builder.
#[must_use]
pub const fn use_mmap(mut self, use_mmap: bool) -> Self {
    self.config.use_mmap = use_mmap;
    self
}
595
/// Sets `LoaderConfig::auto_decompress` and returns the updated builder.
#[must_use]
pub const fn auto_decompress(mut self, auto: bool) -> Self {
    self.config.auto_decompress = auto;
    self
}
602
/// Sets `LoaderConfig::lazy_load` and returns the updated builder.
#[must_use]
pub const fn lazy_load(mut self, lazy: bool) -> Self {
    self.config.lazy_load = lazy;
    self
}
609
610 pub fn load(self) -> Result<Box<dyn Dictionary>> {
616 if self.config.lazy_load {
617 Ok(Box::new(LazyDictionary::new_with_config(
618 self.path,
619 self.config,
620 )))
621 } else {
622 Ok(Box::new(MmapDictionary::load_with_config(
623 self.path,
624 self.config,
625 )?))
626 }
627 }
628}
629
/// Unit tests that build small dictionaries in temporary directories and
/// exercise the loaders and lookups against them.
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;
    use crate::trie::TrieBuilder;

    /// Writes a three-word dictionary (`sys.dic`, `matrix.bin`,
    /// `entries.csv`) into a fresh temporary directory and returns it.
    fn create_test_dict() -> tempfile::TempDir {
        let temp_dir = tempfile::TempDir::new().expect("create temp dir");

        // Trie values are the row indices of the matching CSV lines below.
        let trie_entries = vec![("가", 0u32), ("가다", 1u32), ("가방", 2u32)];
        let trie_bytes = TrieBuilder::build(&trie_entries).expect("build trie");
        std::fs::write(temp_dir.path().join("sys.dic"), trie_bytes).expect("write trie");

        // Presumably a 10x10 matrix filled with cost 100 (the connection-cost
        // test below reads 100 back) — confirm against `DenseMatrix::new`.
        let matrix = DenseMatrix::new(10, 10, 100);
        matrix
            .to_bin_file(temp_dir.path().join("matrix.bin"))
            .expect("write matrix");

        let entries_csv = "가,1,1,100,NNG,*,T,가,*,*,*,*\n\
                           가다,2,2,200,VV,*,F,가다,*,*,*,*\n\
                           가방,3,3,300,NNG,*,T,가방,*,*,*,*\n";
        std::fs::write(temp_dir.path().join("entries.csv"), entries_csv).expect("write entries");

        temp_dir
    }

    /// The loaded trie finds the words that were written and rejects others.
    #[test]
    fn test_mmap_dictionary_load() {
        let temp_dir = create_test_dict();
        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");

        assert!(dict.trie().exact_match("가").is_some());
        assert!(dict.trie().exact_match("가다").is_some());
        assert!(dict.trie().exact_match("없음").is_none());
    }

    /// `lookup` returns the CSV row for a known word and nothing for an
    /// unknown one.
    #[test]
    fn test_dictionary_lookup() {
        let temp_dir = create_test_dict();
        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");

        let entries = dict.lookup("가");
        assert!(!entries.is_empty());
        assert_eq!(entries[0].surface, "가");
        assert_eq!(entries[0].left_id, 1);
        assert_eq!(entries[0].right_id, 1);
        assert_eq!(entries[0].cost, 100);
        assert!(entries[0].feature.starts_with("NNG"));

        let no_entries = dict.lookup("없음");
        assert!(no_entries.is_empty());
    }

    /// The connection cost read back equals the value the matrix was
    /// created with.
    #[test]
    fn test_connection_cost() {
        let temp_dir = create_test_dict();
        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");

        let cost = dict.get_connection_cost(0, 0);
        assert_eq!(cost, 100);
    }

    /// The builder produces a working dictionary with explicit options.
    #[test]
    fn test_loader_builder() {
        let temp_dir = create_test_dict();

        let dict = DictionaryLoader::new(temp_dir.path())
            .use_mmap(true)
            .auto_decompress(true)
            .load()
            .expect("load failed");

        let entries = dict.lookup("가");
        assert!(!entries.is_empty());
    }

    /// A lazy dictionary answers lookups, both on the first query (which
    /// triggers the load) and on a later one.
    #[test]
    fn test_lazy_dictionary() {
        let temp_dir = create_test_dict();

        let dict = LazyDictionary::new(temp_dir.path());

        let entries = dict.lookup("가");
        assert!(!entries.is_empty());

        let entries2 = dict.lookup("가다");
        assert!(!entries2.is_empty());
    }

    /// Loading from an empty directory fails.
    #[test]
    fn test_missing_dictionary() {
        let temp_dir = tempfile::TempDir::new().expect("create temp dir");
        let result = MmapDictionary::load(temp_dir.path());
        assert!(result.is_err());
    }

    /// `get_entry` returns rows by index and `None` past the end.
    #[test]
    fn test_get_entry_by_index() {
        let temp_dir = create_test_dict();
        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");

        let entry = dict.get_entry(0);
        assert!(entry.is_some());
        assert_eq!(entry.unwrap().surface, "가");

        let entry = dict.get_entry(1);
        assert!(entry.is_some());
        assert_eq!(entry.unwrap().surface, "가다");

        // Only three entries exist, so index 100 is out of range.
        let entry = dict.get_entry(100);
        assert!(entry.is_none());
    }

    /// `entries` exposes all rows in file order.
    #[test]
    fn test_entries_accessor() {
        let temp_dir = create_test_dict();
        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");

        let entries = dict.entries();
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].surface, "가");
        assert_eq!(entries[1].surface, "가다");
        assert_eq!(entries[2].surface, "가방");
    }

    /// Numeric CSV columns end up in the matching `Entry` fields.
    #[test]
    fn test_csv_parsing() {
        let temp_dir = tempfile::TempDir::new().expect("create temp dir");

        let entries_csv = "안녕,10,20,500,NNG,*,T,안녕,*,*,*,*\n\
                           하세요,15,25,600,VV+EC,*,F,하세요,*,*,*,*\n";
        std::fs::write(temp_dir.path().join("entries.csv"), entries_csv).expect("write entries");

        let trie_entries = vec![("안녕", 0u32), ("하세요", 1u32)];
        let trie_bytes = TrieBuilder::build(&trie_entries).expect("build trie");
        std::fs::write(temp_dir.path().join("sys.dic"), trie_bytes).expect("write trie");

        let matrix = DenseMatrix::new(30, 30, 100);
        matrix
            .to_bin_file(temp_dir.path().join("matrix.bin"))
            .expect("write matrix");

        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");
        let entries = dict.lookup("안녕");
        assert!(!entries.is_empty());
        assert_eq!(entries[0].surface, "안녕");
        assert_eq!(entries[0].left_id, 10);
        assert_eq!(entries[0].right_id, 20);
        assert_eq!(entries[0].cost, 500);
    }

    /// Without any entries file the table is empty, and a trie hit yields
    /// the `UNK` placeholder entry.
    #[test]
    fn test_dict_without_entries() {
        let temp_dir = tempfile::TempDir::new().expect("create temp dir");

        let trie_entries = vec![("테스트", 0u32)];
        let trie_bytes = TrieBuilder::build(&trie_entries).expect("build trie");
        std::fs::write(temp_dir.path().join("sys.dic"), trie_bytes).expect("write trie");

        let matrix = DenseMatrix::new(2, 2, 100);
        matrix
            .to_bin_file(temp_dir.path().join("matrix.bin"))
            .expect("write matrix");

        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");
        assert_eq!(dict.entries().len(), 0);

        let entries = dict.lookup("테스트");
        assert!(!entries.is_empty());
        assert_eq!(entries[0].surface, "테스트");
        assert_eq!(entries[0].feature, "UNK,*,*,*,*,*,*,*");
    }
}