1use std::io::{Read, Seek, SeekFrom};
27use std::num::NonZeroUsize;
28use std::path::{Path, PathBuf};
29use std::sync::{Arc, RwLock};
30
31use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
32use memmap2::Mmap;
33
34use crate::dictionary::DictEntry;
35use crate::error::{DictError, Result};
36
/// Magic bytes that open every v2 `entries.bin` file.
const LAZY_ENTRIES_MAGIC: &[u8; 4] = b"MKE2";

/// The only on-disk format version this module reads and writes.
const LAZY_ENTRIES_VERSION: u32 = 2;

/// Size of the fixed header in bytes:
/// magic (4) + version (4) + entry count (4) + index offset (8).
const HEADER_SIZE: usize = 20;

/// Number of decoded entries the cache holds by default.
const DEFAULT_CACHE_SIZE: usize = 10_000;

/// [`DEFAULT_CACHE_SIZE`] as a `NonZeroUsize`, checked at compile time so no
/// runtime `unwrap` is needed when the cache is constructed.
const DEFAULT_CACHE_SIZE_NZ: NonZeroUsize = {
    let checked = NonZeroUsize::new(DEFAULT_CACHE_SIZE);
    if let Some(capacity) = checked {
        capacity
    } else {
        panic!("DEFAULT_CACHE_SIZE must be > 0")
    }
};
55
/// Read-only view over a v2 `entries.bin` file that decodes entries on demand.
///
/// The file is memory-mapped once; individual entries are parsed out of the
/// mapping when requested and the decoded values are kept in an in-memory
/// cache keyed by entry index.
pub struct LazyEntries {
    /// Path the file was opened from. Stored at construction but not read by
    /// any method in this file, hence the lint allowance.
    #[allow(dead_code)]
    path: PathBuf,
    /// Memory mapping of the whole file (header, entry records, index table).
    mmap: Mmap,
    /// Number of entries, as declared in the file header.
    count: u32,
    /// Byte offset of the index table (one little-endian `u64` per entry),
    /// as declared in the file header.
    index_offset: u64,
    /// Cache of decoded entries, keyed by entry index and shared via `Arc`.
    cache: RwLock<lru::LruCache<u32, Arc<DictEntry>>>,
}
73
74
75impl LazyEntries {
76 #[allow(unsafe_code)]
88 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
89 let path = path.as_ref().to_path_buf();
90 let file = std::fs::File::open(&path).map_err(DictError::Io)?;
91 let mmap = unsafe { Mmap::map(&file).map_err(DictError::Io)? };
94
95 if mmap.len() < HEADER_SIZE {
97 return Err(DictError::Format("entries.bin v2: file too small".into()));
98 }
99
100 let mut cursor = std::io::Cursor::new(&mmap[..]);
101
102 let mut magic = [0u8; 4];
104 cursor
105 .read_exact(&mut magic)
106 .map_err(|e| DictError::Format(format!("entries.bin v2: failed to read magic: {e}")))?;
107 if &magic != LAZY_ENTRIES_MAGIC {
108 return Err(DictError::Format(
109 "entries.bin v2: invalid magic number (expected MKE2)".into(),
110 ));
111 }
112
113 let version = cursor.read_u32::<LittleEndian>().map_err(|e| {
115 DictError::Format(format!("entries.bin v2: failed to read version: {e}"))
116 })?;
117 if version != LAZY_ENTRIES_VERSION {
118 return Err(DictError::Format(format!(
119 "entries.bin v2: unsupported version {version}"
120 )));
121 }
122
123 let count = cursor
125 .read_u32::<LittleEndian>()
126 .map_err(|e| DictError::Format(format!("entries.bin v2: failed to read count: {e}")))?;
127
128 let index_offset = cursor.read_u64::<LittleEndian>().map_err(|e| {
130 DictError::Format(format!("entries.bin v2: failed to read index_offset: {e}"))
131 })?;
132
133 Ok(Self {
134 path,
135 mmap,
136 count,
137 index_offset,
138 cache: RwLock::new(lru::LruCache::new(DEFAULT_CACHE_SIZE_NZ)),
139 })
140 }
141
142 #[must_use]
144 pub const fn len(&self) -> usize {
145 self.count as usize
146 }
147
148 #[must_use]
150 pub const fn is_empty(&self) -> bool {
151 self.count == 0
152 }
153
154 #[must_use]
156 pub fn cached_count(&self) -> usize {
157 self.cache.read().map(|c| c.len()).unwrap_or(0)
158 }
159
160 pub fn set_cache_size(&self, size: usize) {
162 if let Ok(mut cache) = self.cache.write() {
163 cache.resize(NonZeroUsize::new(size).unwrap_or(NonZeroUsize::new(1).unwrap()));
164 }
165 }
166
167 pub fn clear_cache(&self) {
169 if let Ok(mut cache) = self.cache.write() {
170 cache.clear();
171 }
172 }
173
174 fn get_entry_offset(&self, index: u32) -> Result<u64> {
176 if index >= self.count {
177 return Err(DictError::Format(format!(
178 "entry index out of bounds: {index} >= {}",
179 self.count
180 )));
181 }
182
183 let index_pos = self.index_offset + (u64::from(index) * 8);
185 let mmap_len = u64::try_from(self.mmap.len())
186 .map_err(|_| DictError::Format("mmap length overflow".into()))?;
187 if index_pos + 8 > mmap_len {
188 return Err(DictError::Format(format!(
189 "index table overflow at position {index_pos}"
190 )));
191 }
192
193 let pos = usize::try_from(index_pos)
194 .map_err(|_| DictError::Format("index position overflow".into()))?;
195 let mut cursor = std::io::Cursor::new(&self.mmap[pos..]);
196 let offset = cursor
197 .read_u64::<LittleEndian>()
198 .map_err(|e| DictError::Format(format!("failed to read entry offset: {e}")))?;
199
200 Ok(offset)
201 }
202
203 pub fn get(&self, index: u32) -> Result<Arc<DictEntry>> {
212 {
214 let cache = self
215 .cache
216 .read()
217 .map_err(|_| DictError::Format("cache lock poisoned".into()))?;
218 if let Some(entry) = cache.peek(&index) {
219 return Ok(Arc::clone(entry));
220 }
221 }
222
223 let entry = self.load_entry_from_disk(index)?;
225
226 let arc_entry = Arc::new(entry);
228 self.cache
229 .write()
230 .map_err(|_| DictError::Format("cache lock poisoned".into()))?
231 .put(index, Arc::clone(&arc_entry));
232 Ok(arc_entry)
233 }
234
235 fn load_entry_from_disk(&self, index: u32) -> Result<DictEntry> {
237 let offset = self.get_entry_offset(index)?;
238
239 let offset_usize =
240 usize::try_from(offset).map_err(|_| DictError::Format("offset overflow".into()))?;
241
242 if offset_usize >= self.mmap.len() {
243 return Err(DictError::Format(format!(
244 "entry offset out of bounds: {offset}"
245 )));
246 }
247
248 let mut cursor = std::io::Cursor::new(&self.mmap[offset_usize..]);
249
250 let left_id = cursor
251 .read_u16::<LittleEndian>()
252 .map_err(|e| DictError::Format(format!("entry {index} left_id: {e}")))?;
253 let right_id = cursor
254 .read_u16::<LittleEndian>()
255 .map_err(|e| DictError::Format(format!("entry {index} right_id: {e}")))?;
256 let cost = cursor
257 .read_i16::<LittleEndian>()
258 .map_err(|e| DictError::Format(format!("entry {index} cost: {e}")))?;
259 let surface_len = cursor
260 .read_u16::<LittleEndian>()
261 .map_err(|e| DictError::Format(format!("entry {index} surface_len: {e}")))?
262 as usize;
263 let feature_len = cursor
264 .read_u16::<LittleEndian>()
265 .map_err(|e| DictError::Format(format!("entry {index} feature_len: {e}")))?
266 as usize;
267
268 let mut surface_bytes = vec![0u8; surface_len];
269 cursor
270 .read_exact(&mut surface_bytes)
271 .map_err(|e| DictError::Format(format!("entry {index} surface: {e}")))?;
272 let surface = String::from_utf8(surface_bytes)
273 .map_err(|e| DictError::Format(format!("entry {index} surface utf8: {e}")))?;
274
275 let mut feature_bytes = vec![0u8; feature_len];
276 cursor
277 .read_exact(&mut feature_bytes)
278 .map_err(|e| DictError::Format(format!("entry {index} feature: {e}")))?;
279 let feature = String::from_utf8(feature_bytes)
280 .map_err(|e| DictError::Format(format!("entry {index} feature utf8: {e}")))?;
281
282 Ok(DictEntry {
283 surface,
284 left_id,
285 right_id,
286 cost,
287 feature,
288 })
289 }
290
291 pub fn get_entries_at(&self, first_index: u32, surface: &str) -> Result<Vec<Arc<DictEntry>>> {
298 let mut results = Vec::new();
299 let mut index = first_index;
300
301 while index < self.count {
302 let entry = self.get(index)?;
303 if entry.surface == surface {
304 results.push(entry);
305 index += 1;
306 } else {
307 break;
308 }
309 }
310
311 Ok(results)
312 }
313
314 pub fn load_all(&self) -> Result<Vec<DictEntry>> {
322 let mut entries = Vec::with_capacity(self.count as usize);
323 for i in 0..self.count {
324 let entry = self.load_entry_from_disk(i)?;
325 entries.push(entry);
326 }
327 Ok(entries)
328 }
329
330 pub fn save_entries<P: AsRef<Path>>(entries: &[DictEntry], path: P) -> Result<()> {
336 use std::io::Write;
337
338 let path = path.as_ref();
339 let mut file = std::fs::File::create(path).map_err(DictError::Io)?;
340
341 let count = u32::try_from(entries.len())
342 .map_err(|_| DictError::Format("too many entries".into()))?;
343
344 file.write_all(LAZY_ENTRIES_MAGIC).map_err(DictError::Io)?;
346 file.write_u32::<LittleEndian>(LAZY_ENTRIES_VERSION)
347 .map_err(DictError::Io)?;
348 file.write_u32::<LittleEndian>(count)
349 .map_err(DictError::Io)?;
350 file.write_u64::<LittleEndian>(0) .map_err(DictError::Io)?;
352
353 let mut offsets = Vec::with_capacity(entries.len());
355
356 for entry in entries {
357 let offset = file.stream_position().map_err(DictError::Io)?;
358 offsets.push(offset);
359
360 file.write_u16::<LittleEndian>(entry.left_id)
361 .map_err(DictError::Io)?;
362 file.write_u16::<LittleEndian>(entry.right_id)
363 .map_err(DictError::Io)?;
364 file.write_i16::<LittleEndian>(entry.cost)
365 .map_err(DictError::Io)?;
366
367 let surface_bytes = entry.surface.as_bytes();
368 let surface_len = u16::try_from(surface_bytes.len())
369 .map_err(|_| DictError::Format("surface too long".into()))?;
370 file.write_u16::<LittleEndian>(surface_len)
371 .map_err(DictError::Io)?;
372
373 let feature_bytes = entry.feature.as_bytes();
374 let feature_len = u16::try_from(feature_bytes.len())
375 .map_err(|_| DictError::Format("feature too long".into()))?;
376 file.write_u16::<LittleEndian>(feature_len)
377 .map_err(DictError::Io)?;
378
379 file.write_all(surface_bytes).map_err(DictError::Io)?;
380 file.write_all(feature_bytes).map_err(DictError::Io)?;
381 }
382
383 let index_offset = file.stream_position().map_err(DictError::Io)?;
385
386 for offset in offsets {
387 file.write_u64::<LittleEndian>(offset)
388 .map_err(DictError::Io)?;
389 }
390
391 file.seek(SeekFrom::Start(12)).map_err(DictError::Io)?;
393 file.write_u64::<LittleEndian>(index_offset)
394 .map_err(DictError::Io)?;
395
396 Ok(())
397 }
398}
399
400pub fn migrate_entries_v1_to_v2<P: AsRef<Path>>(v1_path: P, v2_path: P) -> Result<()> {
410 use byteorder::ReadBytesExt;
411
412 let data = std::fs::read(v1_path.as_ref()).map_err(DictError::Io)?;
413 let mut cursor = std::io::Cursor::new(&data);
414
415 let mut magic = [0u8; 4];
417 cursor
418 .read_exact(&mut magic)
419 .map_err(|e| DictError::Format(format!("v1 magic: {e}")))?;
420 if &magic != b"MKED" {
421 return Err(DictError::Format(
422 "entries.bin v1: invalid magic number".into(),
423 ));
424 }
425
426 let version = cursor
428 .read_u32::<LittleEndian>()
429 .map_err(|e| DictError::Format(format!("v1 version: {e}")))?;
430 if version != 1 {
431 return Err(DictError::Format(format!(
432 "entries.bin v1: unsupported version {version}"
433 )));
434 }
435
436 let count = cursor
438 .read_u32::<LittleEndian>()
439 .map_err(|e| DictError::Format(format!("v1 count: {e}")))?;
440
441 let mut entries = Vec::with_capacity(count as usize);
442 for i in 0..count {
443 let left_id = cursor
444 .read_u16::<LittleEndian>()
445 .map_err(|e| DictError::Format(format!("v1 entry {i} left_id: {e}")))?;
446 let right_id = cursor
447 .read_u16::<LittleEndian>()
448 .map_err(|e| DictError::Format(format!("v1 entry {i} right_id: {e}")))?;
449 let cost = cursor
450 .read_i16::<LittleEndian>()
451 .map_err(|e| DictError::Format(format!("v1 entry {i} cost: {e}")))?;
452 let surface_len = cursor
453 .read_u16::<LittleEndian>()
454 .map_err(|e| DictError::Format(format!("v1 entry {i} surface_len: {e}")))?
455 as usize;
456 let feature_len = cursor
457 .read_u16::<LittleEndian>()
458 .map_err(|e| DictError::Format(format!("v1 entry {i} feature_len: {e}")))?
459 as usize;
460
461 let mut surface_bytes = vec![0u8; surface_len];
462 cursor
463 .read_exact(&mut surface_bytes)
464 .map_err(|e| DictError::Format(format!("v1 entry {i} surface: {e}")))?;
465 let surface = String::from_utf8(surface_bytes)
466 .map_err(|e| DictError::Format(format!("v1 entry {i} surface utf8: {e}")))?;
467
468 let mut feature_bytes = vec![0u8; feature_len];
469 cursor
470 .read_exact(&mut feature_bytes)
471 .map_err(|e| DictError::Format(format!("v1 entry {i} feature: {e}")))?;
472 let feature = String::from_utf8(feature_bytes)
473 .map_err(|e| DictError::Format(format!("v1 entry {i} feature utf8: {e}")))?;
474
475 entries.push(DictEntry {
476 surface,
477 left_id,
478 right_id,
479 cost,
480 feature,
481 });
482 }
483
484 LazyEntries::save_entries(&entries, v2_path)?;
486
487 Ok(())
488}
489
#[cfg(test)]
mod tests {
    #![allow(clippy::expect_used, clippy::unwrap_used)]

    use super::*;
    use tempfile::tempdir;

    /// Creates a scratch directory and the path of an `entries.bin` inside it.
    ///
    /// The directory guard is returned so the caller keeps it alive (and
    /// declared before any `LazyEntries`) for the duration of the test.
    fn scratch_entries_path() -> (tempfile::TempDir, PathBuf) {
        let dir = tempdir().expect("create temp dir");
        let path = dir.path().join("entries.bin");
        (dir, path)
    }

    /// Saving and re-opening a file yields the same entries by index.
    #[test]
    fn test_lazy_entries_roundtrip() {
        let (_dir, path) = scratch_entries_path();
        let fixture = vec![
            DictEntry::new("안녕", 1, 1, 100, "NNG,*,T,안녕,*,*,*,*"),
            DictEntry::new("하세요", 2, 2, 50, "VV,*,F,하세요,*,*,*,*"),
            DictEntry::new("감사", 3, 3, 80, "NNG,*,F,감사,*,*,*,*"),
        ];

        LazyEntries::save_entries(&fixture, &path).expect("save should work");
        let lazy = LazyEntries::from_file(&path).expect("load should work");

        assert_eq!(lazy.len(), 3);
        assert!(!lazy.is_empty());

        let first = lazy.get(0).expect("get entry 0");
        assert_eq!(first.surface, "안녕");
        assert_eq!(first.left_id, 1);
        assert_eq!(first.cost, 100);

        let second = lazy.get(1).expect("get entry 1");
        assert_eq!(second.surface, "하세요");

        let third = lazy.get(2).expect("get entry 2");
        assert_eq!(third.surface, "감사");

        // An index past the end must be rejected.
        assert!(lazy.get(100).is_err());
    }

    /// The cache grows on misses, stays put on hits, and can be cleared.
    #[test]
    fn test_lazy_entries_cache() {
        let (_dir, path) = scratch_entries_path();
        let fixture = vec![
            DictEntry::new("가", 1, 1, 100, "NNG"),
            DictEntry::new("나", 2, 2, 200, "NNG"),
        ];
        LazyEntries::save_entries(&fixture, &path).expect("save");

        let lazy = LazyEntries::from_file(&path).expect("load");
        assert_eq!(lazy.cached_count(), 0);

        // First access is a miss and populates the cache.
        let _ = lazy.get(0).expect("get 0");
        assert_eq!(lazy.cached_count(), 1);

        // Second access to the same index is a hit.
        let _ = lazy.get(0).expect("get 0 again");
        assert_eq!(lazy.cached_count(), 1);

        let _ = lazy.get(1).expect("get 1");
        assert_eq!(lazy.cached_count(), 2);

        lazy.clear_cache();
        assert_eq!(lazy.cached_count(), 0);
    }

    /// Consecutive entries with the same surface are returned together.
    #[test]
    fn test_get_entries_at() {
        let (_dir, path) = scratch_entries_path();
        let fixture = vec![
            DictEntry::new("가", 1, 1, 100, "VV"),
            DictEntry::new("가", 2, 2, 50, "JKS"),
            DictEntry::new("나", 3, 3, 200, "NP"),
        ];
        LazyEntries::save_entries(&fixture, &path).expect("save");

        let lazy = LazyEntries::from_file(&path).expect("load");

        let run = lazy.get_entries_at(0, "가").expect("get_entries_at");
        assert_eq!(run.len(), 2);
        assert_eq!(run[0].feature, "VV");
        assert_eq!(run[1].feature, "JKS");

        let run = lazy.get_entries_at(2, "나").expect("get_entries_at");
        assert_eq!(run.len(), 1);
    }

    /// `load_all` returns every entry in index order.
    #[test]
    fn test_load_all() {
        let (_dir, path) = scratch_entries_path();
        let fixture = vec![
            DictEntry::new("가", 1, 1, 100, "NNG"),
            DictEntry::new("나", 2, 2, 200, "NNG"),
            DictEntry::new("다", 3, 3, 300, "NNG"),
        ];
        LazyEntries::save_entries(&fixture, &path).expect("save");

        let lazy = LazyEntries::from_file(&path).expect("load");
        let loaded = lazy.load_all().expect("load_all");

        assert_eq!(loaded.len(), 3);
        let surfaces: Vec<&str> = loaded.iter().map(|e| e.surface.as_str()).collect();
        assert_eq!(surfaces, ["가", "나", "다"]);
    }

    /// A capacity-2 cache evicts the oldest insertion when a third key arrives.
    #[test]
    fn test_lru_cache_eviction() {
        let capacity = NonZeroUsize::new(2).unwrap();
        let mut cache = lru::LruCache::<u32, Arc<DictEntry>>::new(capacity);

        cache.put(0, Arc::new(DictEntry::new("가", 1, 1, 100, "")));
        cache.put(1, Arc::new(DictEntry::new("나", 2, 2, 200, "")));
        assert_eq!(cache.len(), 2);

        cache.put(2, Arc::new(DictEntry::new("다", 3, 3, 300, "")));
        assert_eq!(cache.len(), 2);
        assert!(cache.peek(&0).is_none());
        assert!(cache.peek(&1).is_some());
        assert!(cache.peek(&2).is_some());
    }
}