1use std::io::{Read, Seek, SeekFrom};
31use std::num::NonZeroUsize;
32use std::path::{Path, PathBuf};
33use std::sync::{Arc, RwLock};
34
35use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
36use memmap2::Mmap;
37
38use crate::dictionary::DictEntry;
39use crate::error::{DictError, Result};
40
/// Four-byte magic that opens every V3 entries file.
pub const ENTRIES_V3_MAGIC: &[u8; 4] = b"MKE3";
/// Format version stored in the header right after the magic.
pub const ENTRIES_V3_VERSION: u32 = 3;
/// Size of the fixed file header in bytes:
/// magic (4) + version (4) + count (4) + flags (2) + reserved (2) + index offset (8).
pub const HEADER_V3_SIZE: usize = 4 + 4 + 4 + 2 + 2 + 8;

/// Header flag bit written by `save_entries_v3`; each record stores its
/// feature length as a `u32`.
pub const FEATURE_U32: u16 = 0x0001;
50
/// Number of decoded entries the LRU cache holds unless resized.
const DEFAULT_CACHE_SIZE: usize = 10_000;

/// [`DEFAULT_CACHE_SIZE`] as a `NonZeroUsize`, checked at compile time so no
/// runtime `unwrap` is needed when the cache is constructed.
const DEFAULT_CACHE_SIZE_NZ: NonZeroUsize = {
    if let Some(capacity) = NonZeroUsize::new(DEFAULT_CACHE_SIZE) {
        capacity
    } else {
        // Evaluated during constant evaluation: a zero default fails the build.
        panic!("DEFAULT_CACHE_SIZE must be > 0")
    }
};
60
/// On-disk entries-file formats that `detect_entries_format` can tell apart
/// by their leading magic bytes.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EntriesFormat {
    /// File starting with the magic `MKED`.
    V1,
    /// File starting with the magic `MKE2`.
    V2,
    /// File starting with the magic `MKE3`.
    V3,
}
72
73pub fn detect_entries_format<P: AsRef<Path>>(path: P) -> Result<EntriesFormat> {
80 use std::io::Read as _;
81 let mut file = std::fs::File::open(path.as_ref()).map_err(DictError::Io)?;
82 let mut magic = [0u8; 4];
83 file.read_exact(&mut magic)
84 .map_err(|e| DictError::Format(format!("cannot read magic: {e}")))?;
85 match &magic {
86 b"MKE3" => Ok(EntriesFormat::V3),
87 b"MKE2" => Ok(EntriesFormat::V2),
88 b"MKED" => Ok(EntriesFormat::V1),
89 _ => Err(DictError::Format(format!(
90 "unknown magic bytes: {magic:?}"
91 ))),
92 }
93}
94
95pub struct LazyEntriesV3 {
97 #[allow(dead_code)]
98 path: PathBuf,
99 mmap: Mmap,
100 count: u32,
101 index_offset: u64,
102 flags: u16,
103 cache: RwLock<lru::LruCache<u32, Arc<DictEntry>>>,
104}
105
106impl LazyEntriesV3 {
107 #[allow(unsafe_code)]
114 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
115 let path = path.as_ref().to_path_buf();
116 let file = std::fs::File::open(&path).map_err(DictError::Io)?;
117 let mmap = unsafe { Mmap::map(&file).map_err(DictError::Io)? };
120
121 if mmap.len() < HEADER_V3_SIZE {
122 return Err(DictError::Format("MKE3: file too small".into()));
123 }
124
125 let mut cur = std::io::Cursor::new(&mmap[..]);
126
127 let mut magic = [0u8; 4];
128 cur.read_exact(&mut magic)
129 .map_err(|e| DictError::Format(format!("MKE3: cannot read magic: {e}")))?;
130 if &magic != ENTRIES_V3_MAGIC {
131 return Err(DictError::Format(
132 "MKE3: invalid magic (expected MKE3)".into(),
133 ));
134 }
135
136 let version = cur
137 .read_u32::<LittleEndian>()
138 .map_err(|e| DictError::Format(format!("MKE3: cannot read version: {e}")))?;
139 if version != ENTRIES_V3_VERSION {
140 return Err(DictError::Format(format!(
141 "MKE3: unsupported version {version}"
142 )));
143 }
144
145 let count = cur
146 .read_u32::<LittleEndian>()
147 .map_err(|e| DictError::Format(format!("MKE3: cannot read count: {e}")))?;
148
149 let flags = cur
150 .read_u16::<LittleEndian>()
151 .map_err(|e| DictError::Format(format!("MKE3: cannot read flags: {e}")))?;
152
153 cur.read_u16::<LittleEndian>()
155 .map_err(|e| DictError::Format(format!("MKE3: cannot read reserved: {e}")))?;
156
157 let index_offset = cur
158 .read_u64::<LittleEndian>()
159 .map_err(|e| DictError::Format(format!("MKE3: cannot read index_offset: {e}")))?;
160
161 let expected_index_end = index_offset + u64::from(count) * 8;
162 if expected_index_end > mmap.len() as u64 {
163 return Err(DictError::Format(format!(
164 "MKE3: index table extends beyond file (offset={index_offset}, count={count}, file_len={})",
165 mmap.len()
166 )));
167 }
168
169 Ok(Self {
170 path,
171 mmap,
172 count,
173 index_offset,
174 flags,
175 cache: RwLock::new(lru::LruCache::new(DEFAULT_CACHE_SIZE_NZ)),
176 })
177 }
178
179 #[must_use]
181 pub const fn len(&self) -> usize {
182 self.count as usize
183 }
184
185 #[must_use]
187 pub const fn is_empty(&self) -> bool {
188 self.count == 0
189 }
190
191 #[must_use]
193 pub fn cached_count(&self) -> usize {
194 self.cache.read().map(|c| c.len()).unwrap_or(0)
195 }
196
197 pub fn set_cache_size(&self, size: usize) {
199 if let Ok(mut cache) = self.cache.write() {
200 cache.resize(NonZeroUsize::new(size).unwrap_or(NonZeroUsize::new(1).unwrap()));
201 }
202 }
203
204 pub fn clear_cache(&self) {
206 if let Ok(mut cache) = self.cache.write() {
207 cache.clear();
208 }
209 }
210
211 #[must_use]
213 pub const fn flags(&self) -> u16 {
214 self.flags
215 }
216
217 pub fn get(&self, index: u32) -> Result<Arc<DictEntry>> {
227 {
228 let cache = self
229 .cache
230 .read()
231 .map_err(|_| DictError::Format("MKE3: cache lock poisoned".into()))?;
232 if let Some(entry) = cache.peek(&index) {
233 return Ok(Arc::clone(entry));
234 }
235 }
236
237 let entry = Arc::new(self.load_entry_from_mmap(index)?);
238 {
239 let mut cache = self
240 .cache
241 .write()
242 .map_err(|_| DictError::Format("MKE3: cache lock poisoned".into()))?;
243 if let Some(existing) = cache.get(&index) {
244 return Ok(Arc::clone(existing));
245 }
246 cache.put(index, Arc::clone(&entry));
247 }
248 Ok(entry)
249 }
250
251 pub fn get_entries_at(&self, first_index: u32, surface: &str) -> Result<Vec<Arc<DictEntry>>> {
260 let mut results = Vec::new();
261 let mut index = first_index;
262 while index < self.count {
263 let entry = self.get(index)?;
264 if entry.surface == surface {
265 results.push(entry);
266 index += 1;
267 } else {
268 break;
269 }
270 }
271 Ok(results)
272 }
273
274 fn entry_offset(&self, index: u32) -> Result<u64> {
275 if index >= self.count {
276 return Err(DictError::Format(format!(
277 "MKE3: index {index} out of bounds (count={})",
278 self.count
279 )));
280 }
281 let table_pos = self.index_offset + u64::from(index) * 8;
282 let mmap_len = u64::try_from(self.mmap.len())
283 .map_err(|_| DictError::Format("MKE3: mmap length overflow".into()))?;
284 if table_pos + 8 > mmap_len {
285 return Err(DictError::Format(format!(
286 "MKE3: index table overflow at position {table_pos}"
287 )));
288 }
289 let pos = usize::try_from(table_pos)
290 .map_err(|_| DictError::Format("MKE3: table position overflow".into()))?;
291 let mut cur = std::io::Cursor::new(&self.mmap[pos..]);
292 cur.read_u64::<LittleEndian>()
293 .map_err(|e| DictError::Format(format!("MKE3: cannot read entry offset: {e}")))
294 }
295
296 fn load_entry_from_mmap(&self, index: u32) -> Result<DictEntry> {
297 let offset = self.entry_offset(index)?;
298 let offset_usize = usize::try_from(offset)
299 .map_err(|_| DictError::Format("MKE3: offset overflow".into()))?;
300 if offset_usize >= self.mmap.len() {
301 return Err(DictError::Format(format!(
302 "MKE3: entry {index} offset {offset} out of mmap bounds"
303 )));
304 }
305
306 let mut cur = std::io::Cursor::new(&self.mmap[offset_usize..]);
307
308 let left_id = cur
309 .read_u16::<LittleEndian>()
310 .map_err(|e| DictError::Format(format!("MKE3 entry {index} left_id: {e}")))?;
311 let right_id = cur
312 .read_u16::<LittleEndian>()
313 .map_err(|e| DictError::Format(format!("MKE3 entry {index} right_id: {e}")))?;
314 let cost = cur
315 .read_i16::<LittleEndian>()
316 .map_err(|e| DictError::Format(format!("MKE3 entry {index} cost: {e}")))?;
317 let surface_len = cur
318 .read_u16::<LittleEndian>()
319 .map_err(|e| DictError::Format(format!("MKE3 entry {index} surface_len: {e}")))?
320 as usize;
321 let feature_len = cur
322 .read_u32::<LittleEndian>()
323 .map_err(|e| DictError::Format(format!("MKE3 entry {index} feature_len: {e}")))?
324 as usize;
325
326 let record_header = 2 + 2 + 2 + 2 + 4;
327 let remaining = self.mmap.len().saturating_sub(offset_usize + record_header);
328 if surface_len + feature_len > remaining {
329 return Err(DictError::Format(format!(
330 "MKE3 entry {index}: surface_len({surface_len}) + feature_len({feature_len}) exceeds remaining bytes({remaining})"
331 )));
332 }
333
334 let mut surface_bytes = vec![0u8; surface_len];
335 cur.read_exact(&mut surface_bytes)
336 .map_err(|e| DictError::Format(format!("MKE3 entry {index} surface: {e}")))?;
337 let surface = String::from_utf8(surface_bytes)
338 .map_err(|e| DictError::Format(format!("MKE3 entry {index} surface utf8: {e}")))?;
339
340 let mut feature_bytes = vec![0u8; feature_len];
341 cur.read_exact(&mut feature_bytes)
342 .map_err(|e| DictError::Format(format!("MKE3 entry {index} feature: {e}")))?;
343 let feature = String::from_utf8(feature_bytes)
344 .map_err(|e| DictError::Format(format!("MKE3 entry {index} feature utf8: {e}")))?;
345
346 Ok(DictEntry {
347 surface,
348 left_id,
349 right_id,
350 cost,
351 feature,
352 })
353 }
354}
355
356pub fn save_entries_v3<P: AsRef<Path>>(entries: &[DictEntry], path: P) -> Result<()> {
363 use std::io::Write;
364
365 let path = path.as_ref();
366 let mut file = std::fs::File::create(path).map_err(DictError::Io)?;
367
368 let count = u32::try_from(entries.len())
369 .map_err(|_| DictError::Format("MKE3: too many entries".into()))?;
370
371 file.write_all(ENTRIES_V3_MAGIC).map_err(DictError::Io)?;
373 file.write_u32::<LittleEndian>(ENTRIES_V3_VERSION)
374 .map_err(DictError::Io)?;
375 file.write_u32::<LittleEndian>(count)
376 .map_err(DictError::Io)?;
377 file.write_u16::<LittleEndian>(FEATURE_U32)
378 .map_err(DictError::Io)?;
379 file.write_u16::<LittleEndian>(0) .map_err(DictError::Io)?;
381 file.write_u64::<LittleEndian>(0) .map_err(DictError::Io)?;
383
384 let mut offsets: Vec<u64> = Vec::with_capacity(entries.len());
385
386 for entry in entries {
387 let offset = file.stream_position().map_err(DictError::Io)?;
388 offsets.push(offset);
389
390 file.write_u16::<LittleEndian>(entry.left_id)
391 .map_err(DictError::Io)?;
392 file.write_u16::<LittleEndian>(entry.right_id)
393 .map_err(DictError::Io)?;
394 file.write_i16::<LittleEndian>(entry.cost)
395 .map_err(DictError::Io)?;
396
397 let surface_bytes = entry.surface.as_bytes();
398 let surface_len = u16::try_from(surface_bytes.len())
399 .map_err(|_| DictError::Format("MKE3: surface too long".into()))?;
400 file.write_u16::<LittleEndian>(surface_len)
401 .map_err(DictError::Io)?;
402
403 let feature_bytes = entry.feature.as_bytes();
404 let feature_len = u32::try_from(feature_bytes.len())
405 .map_err(|_| DictError::Format("MKE3: feature too long".into()))?;
406 file.write_u32::<LittleEndian>(feature_len)
407 .map_err(DictError::Io)?;
408
409 file.write_all(surface_bytes).map_err(DictError::Io)?;
410 file.write_all(feature_bytes).map_err(DictError::Io)?;
411 }
412
413 let index_offset = file.stream_position().map_err(DictError::Io)?;
415 for offset in offsets {
416 file.write_u64::<LittleEndian>(offset)
417 .map_err(DictError::Io)?;
418 }
419
420 file.seek(SeekFrom::Start(16)).map_err(DictError::Io)?;
422 file.write_u64::<LittleEndian>(index_offset)
423 .map_err(DictError::Io)?;
424
425 Ok(())
426}
427
428pub fn migrate_v2_to_v3<P: AsRef<Path>, Q: AsRef<Path>>(
437 v2_path: P,
438 v3_path: Q,
439) -> Result<usize> {
440 use crate::lazy_entries::LazyEntries;
441
442 let v2 = LazyEntries::from_file(v2_path)?;
443 let count = v2.len();
444 let entries = v2.load_all()?;
445 save_entries_v3(&entries, v3_path)?;
446 Ok(count)
447}
448
#[cfg(test)]
mod tests {
    // Tests are allowed to panic on setup failures.
    #![allow(clippy::expect_used, clippy::unwrap_used)]

    use super::*;
    use tempfile::tempdir;

    /// Five entries with distinct surfaces, including one negative cost.
    fn sample_entries() -> Vec<DictEntry> {
        let mut entries = Vec::with_capacity(5);
        entries.push(DictEntry::new("안녕", 1, 1, 100, "NNG,*,T,안녕,*,*,*,*"));
        entries.push(DictEntry::new("하세요", 2, 2, 50, "VV,*,F,하세요,*,*,*,*"));
        entries.push(DictEntry::new("감사", 3, 3, 80, "NNG,*,F,감사,*,*,*,*"));
        entries.push(DictEntry::new("합니다", 4, 4, -10, "XSV,*,F,합니다,*,*,*,*"));
        entries.push(DictEntry::new("가", 5, 5, 200, "JKS,*,F,가,*,*,*,*"));
        entries
    }

    /// Every field of every entry survives a save/load cycle, and reading one
    /// past the end is an error.
    #[test]
    fn test_v3_roundtrip() {
        let dir = tempdir().expect("tempdir");
        let path = dir.path().join("entries_v3.bin");
        let entries = sample_entries();

        save_entries_v3(&entries, &path).expect("save");
        let lazy = LazyEntriesV3::from_file(&path).expect("load");

        assert_eq!(lazy.len(), 5);
        assert!(!lazy.is_empty());
        assert_eq!(lazy.flags() & FEATURE_U32, FEATURE_U32);

        for (i, expected) in (0u32..).zip(&entries) {
            let got = lazy.get(i).expect("get");
            assert_eq!(got.surface, expected.surface, "surface[{i}]");
            assert_eq!(got.left_id, expected.left_id, "left_id[{i}]");
            assert_eq!(got.right_id, expected.right_id, "right_id[{i}]");
            assert_eq!(got.cost, expected.cost, "cost[{i}]");
            assert_eq!(got.feature, expected.feature, "feature[{i}]");
        }

        assert!(lazy.get(5).is_err());
    }

    /// A feature longer than `u16::MAX` bytes round-trips.
    #[test]
    fn test_v3_large_feature() {
        let dir = tempdir().expect("tempdir");
        let path = dir.path().join("large_feature.bin");

        let large_feature = "X".repeat(70_000);
        let entry = DictEntry::new("테스트", 10, 10, 0, &large_feature);
        save_entries_v3(&[entry], &path).expect("save large feature");

        let lazy = LazyEntriesV3::from_file(&path).expect("load");
        assert_eq!(lazy.len(), 1);

        let got = lazy.get(0).expect("get");
        assert_eq!(got.surface, "테스트");
        assert_eq!(got.feature.len(), 70_000);
        assert!(got.feature.chars().all(|c| c == 'X'));
    }

    /// Each known magic maps to its format; an unknown magic is an error.
    #[test]
    fn test_detect_format() {
        use std::io::Write;

        let dir = tempdir().expect("tempdir");

        let v3_path = dir.path().join("v3.bin");
        save_entries_v3(&sample_entries(), &v3_path).expect("save v3");
        assert_eq!(
            detect_entries_format(&v3_path).expect("detect v3"),
            EntriesFormat::V3
        );

        // The temporary `File` is closed at the end of each statement.
        let v2_path = dir.path().join("v2.bin");
        std::fs::File::create(&v2_path)
            .expect("create v2 file")
            .write_all(b"MKE2")
            .expect("write magic");
        assert_eq!(
            detect_entries_format(&v2_path).expect("detect v2"),
            EntriesFormat::V2
        );

        let v1_path = dir.path().join("v1.bin");
        std::fs::File::create(&v1_path)
            .expect("create v1 file")
            .write_all(b"MKED")
            .expect("write magic");
        assert_eq!(
            detect_entries_format(&v1_path).expect("detect v1"),
            EntriesFormat::V1
        );

        let unk_path = dir.path().join("unk.bin");
        std::fs::File::create(&unk_path)
            .expect("create unk file")
            .write_all(b"????")
            .expect("write magic");
        assert!(detect_entries_format(&unk_path).is_err());
    }

    /// Only the consecutive run with a matching surface is returned.
    #[test]
    fn test_get_entries_at() {
        let dir = tempdir().expect("tempdir");
        let path = dir.path().join("entries_v3.bin");

        let entries = vec![
            DictEntry::new("가", 1, 1, 100, "NNG"),
            DictEntry::new("가", 2, 2, 50, "JKS"),
            DictEntry::new("나", 3, 3, 200, "NP"),
        ];
        save_entries_v3(&entries, &path).expect("save");
        let lazy = LazyEntriesV3::from_file(&path).expect("load");

        let run = lazy.get_entries_at(0, "가").expect("get_entries_at");
        assert_eq!(run.len(), 2);
        assert_eq!(run[0].feature, "NNG");
        assert_eq!(run[1].feature, "JKS");

        let single = lazy.get_entries_at(2, "나").expect("get_entries_at");
        assert_eq!(single.len(), 1);
        assert_eq!(single[0].surface, "나");

        let none = lazy.get_entries_at(0, "다").expect("get_entries_at");
        assert!(none.is_empty());
    }

    /// The cache grows on misses, stays flat on hits, and honours clear and resize.
    #[test]
    fn test_v3_cache() {
        let dir = tempdir().expect("tempdir");
        let path = dir.path().join("cache_test.bin");
        save_entries_v3(&sample_entries(), &path).expect("save");

        let lazy = LazyEntriesV3::from_file(&path).expect("load");
        assert_eq!(lazy.cached_count(), 0);

        lazy.get(0).expect("get 0");
        assert_eq!(lazy.cached_count(), 1);

        lazy.get(0).expect("get 0 again");
        assert_eq!(lazy.cached_count(), 1, "no duplicate on repeated get");

        lazy.get(1).expect("get 1");
        assert_eq!(lazy.cached_count(), 2);

        lazy.clear_cache();
        assert_eq!(lazy.cached_count(), 0);

        // With capacity 1 the second insert evicts the first.
        lazy.set_cache_size(1);
        lazy.get(0).expect("get 0");
        lazy.get(1).expect("get 1");
        assert_eq!(lazy.cached_count(), 1);
    }

    /// A V2 file migrates to a V3 file that is detected as V3 and yields the
    /// same entries.
    #[test]
    fn test_migrate_v2_to_v3() {
        use crate::lazy_entries::LazyEntries;

        let dir = tempdir().expect("tempdir");
        let v2_path = dir.path().join("entries_v2.bin");
        let v3_path = dir.path().join("entries_v3.bin");

        let entries = vec![
            DictEntry::new("가", 1, 1, 100, "NNG"),
            DictEntry::new("가", 2, 2, 50, "JKS"),
            DictEntry::new("나", 3, 3, 200, "NP"),
        ];
        LazyEntries::save_entries(&entries, &v2_path).expect("save v2");

        let count = migrate_v2_to_v3(&v2_path, &v3_path).expect("migrate");
        assert_eq!(count, 3);

        assert_eq!(
            detect_entries_format(&v3_path).expect("detect"),
            EntriesFormat::V3
        );

        let v3 = LazyEntriesV3::from_file(&v3_path).expect("load v3");
        assert_eq!(v3.len(), 3);

        let first = v3.get(0).expect("get 0");
        assert_eq!(first.surface, "가");
        assert_eq!(first.left_id, 1);
        assert_eq!(first.feature, "NNG");

        let last = v3.get(2).expect("get 2");
        assert_eq!(last.surface, "나");
        assert_eq!(last.feature, "NP");
    }
}