Skip to main content

mecab_ko_dict/
lazy_entries.rs

1//! # Lazy Entry Loading
2//!
3//! 메모리 효율을 위한 지연 로딩 엔트리 시스템입니다.
4//! 엔트리를 필요할 때만 디스크에서 읽어옵니다.
5//!
6//! ## entries.bin v2 포맷
7//!
8//! ```text
9//! [Header]
10//!   magic: [u8; 4] = "MKE2"
11//!   version: u32 = 2
12//!   count: u32
13//!   index_offset: u64 (인덱스 테이블 시작 위치)
14//!
15//! [Entry Data] (압축 가능)
16//!   entry_0: [left_id:u16][right_id:u16][cost:i16][surface_len:u16][feature_len:u16][surface][feature]
17//!   entry_1: ...
18//!   ...
19//!
20//! [Index Table]
21//!   offset_0: u64 (entry_0의 파일 내 위치)
22//!   offset_1: u64
23//!   ...
24//! ```
25
26use std::io::{Read, Seek, SeekFrom};
27use std::num::NonZeroUsize;
28use std::path::{Path, PathBuf};
29use std::sync::{Arc, RwLock};
30
31use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
32use memmap2::Mmap;
33
34use crate::dictionary::DictEntry;
35use crate::error::{DictError, Result};
36
/// Magic number that opens every entries.bin v2 file.
const LAZY_ENTRIES_MAGIC: &[u8; 4] = b"MKE2";
/// Format version stored in the entries.bin v2 header.
const LAZY_ENTRIES_VERSION: u32 = 2;
/// Header size in bytes: magic (4) + version (4) + count (4) + index_offset (8).
const HEADER_SIZE: usize = 4 + 4 + 4 + 8;

/// Default capacity of the entry cache (number of hot entries kept decoded).
const DEFAULT_CACHE_SIZE: usize = 10_000;

/// `DEFAULT_CACHE_SIZE` as a `NonZeroUsize`.
///
/// The conversion is evaluated in a const context, so changing the default to
/// zero fails the build instead of panicking at run time.
const DEFAULT_CACHE_SIZE_NZ: NonZeroUsize = {
    if let Some(capacity) = NonZeroUsize::new(DEFAULT_CACHE_SIZE) {
        capacity
    } else {
        panic!("DEFAULT_CACHE_SIZE must be > 0")
    }
};
55
56/// 지연 로딩 엔트리 저장소
57///
58/// 엔트리를 필요할 때만 디스크에서 읽어옵니다.
59/// 자주 사용되는 엔트리는 LRU 캐시에 저장됩니다.
60pub struct LazyEntries {
61    /// 파일 경로 (디버깅용)
62    #[allow(dead_code)]
63    path: PathBuf,
64    /// 메모리 맵 (읽기 전용)
65    mmap: Mmap,
66    /// 엔트리 수
67    count: u32,
68    /// 인덱스 테이블 오프셋
69    index_offset: u64,
70    /// LRU 캐시 (인덱스 -> 엔트리)
71    cache: RwLock<lru::LruCache<u32, Arc<DictEntry>>>,
72}
73
74
75impl LazyEntries {
76    /// entries.bin v2 파일에서 로드
77    ///
78    /// # Errors
79    ///
80    /// - 파일 형식이 잘못된 경우
81    /// - 파일 읽기 실패한 경우
82    ///
83    /// # Safety
84    ///
85    /// Uses memory-mapped I/O which requires unsafe.
86    /// The safety is ensured by memmap2 crate's implementation.
87    #[allow(unsafe_code)]
88    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
89        let path = path.as_ref().to_path_buf();
90        let file = std::fs::File::open(&path).map_err(DictError::Io)?;
91        // SAFETY: memmap2::Mmap handles the memory mapping safely.
92        // The file is opened read-only and the mmap is immutable.
93        let mmap = unsafe { Mmap::map(&file).map_err(DictError::Io)? };
94
95        // 헤더 검증
96        if mmap.len() < HEADER_SIZE {
97            return Err(DictError::Format("entries.bin v2: file too small".into()));
98        }
99
100        let mut cursor = std::io::Cursor::new(&mmap[..]);
101
102        // 매직 넘버 검증
103        let mut magic = [0u8; 4];
104        cursor
105            .read_exact(&mut magic)
106            .map_err(|e| DictError::Format(format!("entries.bin v2: failed to read magic: {e}")))?;
107        if &magic != LAZY_ENTRIES_MAGIC {
108            return Err(DictError::Format(
109                "entries.bin v2: invalid magic number (expected MKE2)".into(),
110            ));
111        }
112
113        // 버전 검증
114        let version = cursor.read_u32::<LittleEndian>().map_err(|e| {
115            DictError::Format(format!("entries.bin v2: failed to read version: {e}"))
116        })?;
117        if version != LAZY_ENTRIES_VERSION {
118            return Err(DictError::Format(format!(
119                "entries.bin v2: unsupported version {version}"
120            )));
121        }
122
123        // 엔트리 수
124        let count = cursor
125            .read_u32::<LittleEndian>()
126            .map_err(|e| DictError::Format(format!("entries.bin v2: failed to read count: {e}")))?;
127
128        // 인덱스 테이블 오프셋
129        let index_offset = cursor.read_u64::<LittleEndian>().map_err(|e| {
130            DictError::Format(format!("entries.bin v2: failed to read index_offset: {e}"))
131        })?;
132
133        Ok(Self {
134            path,
135            mmap,
136            count,
137            index_offset,
138            cache: RwLock::new(lru::LruCache::new(DEFAULT_CACHE_SIZE_NZ)),
139        })
140    }
141
142    /// 엔트리 수 반환
143    #[must_use]
144    pub const fn len(&self) -> usize {
145        self.count as usize
146    }
147
148    /// 비어있는지 확인
149    #[must_use]
150    pub const fn is_empty(&self) -> bool {
151        self.count == 0
152    }
153
154    /// 캐시된 엔트리 수 반환
155    #[must_use]
156    pub fn cached_count(&self) -> usize {
157        self.cache.read().map(|c| c.len()).unwrap_or(0)
158    }
159
160    /// 캐시 크기 설정
161    pub fn set_cache_size(&self, size: usize) {
162        if let Ok(mut cache) = self.cache.write() {
163            cache.resize(NonZeroUsize::new(size).unwrap_or(NonZeroUsize::new(1).unwrap()));
164        }
165    }
166
167    /// 캐시 초기화
168    pub fn clear_cache(&self) {
169        if let Ok(mut cache) = self.cache.write() {
170            cache.clear();
171        }
172    }
173
174    /// 인덱스로 엔트리 오프셋 조회
175    fn get_entry_offset(&self, index: u32) -> Result<u64> {
176        if index >= self.count {
177            return Err(DictError::Format(format!(
178                "entry index out of bounds: {index} >= {}",
179                self.count
180            )));
181        }
182
183        // 인덱스 테이블에서 오프셋 읽기
184        let index_pos = self.index_offset + (u64::from(index) * 8);
185        let mmap_len = u64::try_from(self.mmap.len())
186            .map_err(|_| DictError::Format("mmap length overflow".into()))?;
187        if index_pos + 8 > mmap_len {
188            return Err(DictError::Format(format!(
189                "index table overflow at position {index_pos}"
190            )));
191        }
192
193        let pos = usize::try_from(index_pos)
194            .map_err(|_| DictError::Format("index position overflow".into()))?;
195        let mut cursor = std::io::Cursor::new(&self.mmap[pos..]);
196        let offset = cursor
197            .read_u64::<LittleEndian>()
198            .map_err(|e| DictError::Format(format!("failed to read entry offset: {e}")))?;
199
200        Ok(offset)
201    }
202
203    /// 인덱스로 엔트리 로드
204    ///
205    /// 캐시에 있으면 캐시에서 반환, 없으면 디스크에서 읽어 캐시에 저장
206    ///
207    /// # Errors
208    ///
209    /// - 인덱스가 범위를 벗어난 경우
210    /// - 엔트리 읽기 실패한 경우
211    pub fn get(&self, index: u32) -> Result<Arc<DictEntry>> {
212        // 1. 캐시 확인 (read lock — LRU 순서 업데이트 없이 peek)
213        {
214            let cache = self
215                .cache
216                .read()
217                .map_err(|_| DictError::Format("cache lock poisoned".into()))?;
218            if let Some(entry) = cache.peek(&index) {
219                return Ok(Arc::clone(entry));
220            }
221        }
222
223        // 2. 디스크에서 읽기
224        let entry = self.load_entry_from_disk(index)?;
225
226        // 3. 캐시에 저장 (write lock — guard dropped immediately after put)
227        let arc_entry = Arc::new(entry);
228        self.cache
229            .write()
230            .map_err(|_| DictError::Format("cache lock poisoned".into()))?
231            .put(index, Arc::clone(&arc_entry));
232        Ok(arc_entry)
233    }
234
235    /// 디스크에서 엔트리 로드
236    fn load_entry_from_disk(&self, index: u32) -> Result<DictEntry> {
237        let offset = self.get_entry_offset(index)?;
238
239        let offset_usize =
240            usize::try_from(offset).map_err(|_| DictError::Format("offset overflow".into()))?;
241
242        if offset_usize >= self.mmap.len() {
243            return Err(DictError::Format(format!(
244                "entry offset out of bounds: {offset}"
245            )));
246        }
247
248        let mut cursor = std::io::Cursor::new(&self.mmap[offset_usize..]);
249
250        let left_id = cursor
251            .read_u16::<LittleEndian>()
252            .map_err(|e| DictError::Format(format!("entry {index} left_id: {e}")))?;
253        let right_id = cursor
254            .read_u16::<LittleEndian>()
255            .map_err(|e| DictError::Format(format!("entry {index} right_id: {e}")))?;
256        let cost = cursor
257            .read_i16::<LittleEndian>()
258            .map_err(|e| DictError::Format(format!("entry {index} cost: {e}")))?;
259        let surface_len = cursor
260            .read_u16::<LittleEndian>()
261            .map_err(|e| DictError::Format(format!("entry {index} surface_len: {e}")))?
262            as usize;
263        let feature_len = cursor
264            .read_u16::<LittleEndian>()
265            .map_err(|e| DictError::Format(format!("entry {index} feature_len: {e}")))?
266            as usize;
267
268        let mut surface_bytes = vec![0u8; surface_len];
269        cursor
270            .read_exact(&mut surface_bytes)
271            .map_err(|e| DictError::Format(format!("entry {index} surface: {e}")))?;
272        let surface = String::from_utf8(surface_bytes)
273            .map_err(|e| DictError::Format(format!("entry {index} surface utf8: {e}")))?;
274
275        let mut feature_bytes = vec![0u8; feature_len];
276        cursor
277            .read_exact(&mut feature_bytes)
278            .map_err(|e| DictError::Format(format!("entry {index} feature: {e}")))?;
279        let feature = String::from_utf8(feature_bytes)
280            .map_err(|e| DictError::Format(format!("entry {index} feature utf8: {e}")))?;
281
282        Ok(DictEntry {
283            surface,
284            left_id,
285            right_id,
286            cost,
287            feature,
288        })
289    }
290
291    /// 인덱스에서 시작하여 같은 surface를 가진 연속된 엔트리 반환
292    ///
293    /// # Errors
294    ///
295    /// - 인덱스가 범위를 벗어난 경우
296    /// - 엔트리 읽기 실패한 경우
297    pub fn get_entries_at(&self, first_index: u32, surface: &str) -> Result<Vec<Arc<DictEntry>>> {
298        let mut results = Vec::new();
299        let mut index = first_index;
300
301        while index < self.count {
302            let entry = self.get(index)?;
303            if entry.surface == surface {
304                results.push(entry);
305                index += 1;
306            } else {
307                break;
308            }
309        }
310
311        Ok(results)
312    }
313
314    /// 모든 엔트리를 Vec으로 로드 (마이그레이션/변환용)
315    ///
316    /// 주의: 대용량 사전에서는 메모리를 많이 사용합니다.
317    ///
318    /// # Errors
319    ///
320    /// - 엔트리 읽기 실패한 경우
321    pub fn load_all(&self) -> Result<Vec<DictEntry>> {
322        let mut entries = Vec::with_capacity(self.count as usize);
323        for i in 0..self.count {
324            let entry = self.load_entry_from_disk(i)?;
325            entries.push(entry);
326        }
327        Ok(entries)
328    }
329
330    /// `DictEntry` 벡터를 entries.bin v2로 저장
331    ///
332    /// # Errors
333    ///
334    /// - 파일 쓰기 실패 시 에러
335    pub fn save_entries<P: AsRef<Path>>(entries: &[DictEntry], path: P) -> Result<()> {
336        use std::io::Write;
337
338        let path = path.as_ref();
339        let mut file = std::fs::File::create(path).map_err(DictError::Io)?;
340
341        let count = u32::try_from(entries.len())
342            .map_err(|_| DictError::Format("too many entries".into()))?;
343
344        // 1. 임시 헤더 작성 (index_offset은 나중에 업데이트)
345        file.write_all(LAZY_ENTRIES_MAGIC).map_err(DictError::Io)?;
346        file.write_u32::<LittleEndian>(LAZY_ENTRIES_VERSION)
347            .map_err(DictError::Io)?;
348        file.write_u32::<LittleEndian>(count)
349            .map_err(DictError::Io)?;
350        file.write_u64::<LittleEndian>(0) // placeholder
351            .map_err(DictError::Io)?;
352
353        // 2. 엔트리 데이터 작성, 오프셋 기록
354        let mut offsets = Vec::with_capacity(entries.len());
355
356        for entry in entries {
357            let offset = file.stream_position().map_err(DictError::Io)?;
358            offsets.push(offset);
359
360            file.write_u16::<LittleEndian>(entry.left_id)
361                .map_err(DictError::Io)?;
362            file.write_u16::<LittleEndian>(entry.right_id)
363                .map_err(DictError::Io)?;
364            file.write_i16::<LittleEndian>(entry.cost)
365                .map_err(DictError::Io)?;
366
367            let surface_bytes = entry.surface.as_bytes();
368            let surface_len = u16::try_from(surface_bytes.len())
369                .map_err(|_| DictError::Format("surface too long".into()))?;
370            file.write_u16::<LittleEndian>(surface_len)
371                .map_err(DictError::Io)?;
372
373            let feature_bytes = entry.feature.as_bytes();
374            let feature_len = u16::try_from(feature_bytes.len())
375                .map_err(|_| DictError::Format("feature too long".into()))?;
376            file.write_u16::<LittleEndian>(feature_len)
377                .map_err(DictError::Io)?;
378
379            file.write_all(surface_bytes).map_err(DictError::Io)?;
380            file.write_all(feature_bytes).map_err(DictError::Io)?;
381        }
382
383        // 3. 인덱스 테이블 작성
384        let index_offset = file.stream_position().map_err(DictError::Io)?;
385
386        for offset in offsets {
387            file.write_u64::<LittleEndian>(offset)
388                .map_err(DictError::Io)?;
389        }
390
391        // 4. 헤더의 index_offset 업데이트
392        file.seek(SeekFrom::Start(12)).map_err(DictError::Io)?;
393        file.write_u64::<LittleEndian>(index_offset)
394            .map_err(DictError::Io)?;
395
396        Ok(())
397    }
398}
399
400/// entries.bin v1을 v2로 마이그레이션
401///
402/// v1 파일을 읽어서 v2 포맷으로 저장합니다.
403/// v1 포맷: `[magic:4][version:u32][count:u32][entries...]`
404///
405/// # Errors
406///
407/// - 파일 읽기 실패 시 에러
408/// - 파일 쓰기 실패 시 에러
409pub fn migrate_entries_v1_to_v2<P: AsRef<Path>>(v1_path: P, v2_path: P) -> Result<()> {
410    use byteorder::ReadBytesExt;
411
412    let data = std::fs::read(v1_path.as_ref()).map_err(DictError::Io)?;
413    let mut cursor = std::io::Cursor::new(&data);
414
415    // v1 매직 넘버 검증 (MKED)
416    let mut magic = [0u8; 4];
417    cursor
418        .read_exact(&mut magic)
419        .map_err(|e| DictError::Format(format!("v1 magic: {e}")))?;
420    if &magic != b"MKED" {
421        return Err(DictError::Format(
422            "entries.bin v1: invalid magic number".into(),
423        ));
424    }
425
426    // v1 버전 검증
427    let version = cursor
428        .read_u32::<LittleEndian>()
429        .map_err(|e| DictError::Format(format!("v1 version: {e}")))?;
430    if version != 1 {
431        return Err(DictError::Format(format!(
432            "entries.bin v1: unsupported version {version}"
433        )));
434    }
435
436    // 엔트리 수
437    let count = cursor
438        .read_u32::<LittleEndian>()
439        .map_err(|e| DictError::Format(format!("v1 count: {e}")))?;
440
441    let mut entries = Vec::with_capacity(count as usize);
442    for i in 0..count {
443        let left_id = cursor
444            .read_u16::<LittleEndian>()
445            .map_err(|e| DictError::Format(format!("v1 entry {i} left_id: {e}")))?;
446        let right_id = cursor
447            .read_u16::<LittleEndian>()
448            .map_err(|e| DictError::Format(format!("v1 entry {i} right_id: {e}")))?;
449        let cost = cursor
450            .read_i16::<LittleEndian>()
451            .map_err(|e| DictError::Format(format!("v1 entry {i} cost: {e}")))?;
452        let surface_len = cursor
453            .read_u16::<LittleEndian>()
454            .map_err(|e| DictError::Format(format!("v1 entry {i} surface_len: {e}")))?
455            as usize;
456        let feature_len = cursor
457            .read_u16::<LittleEndian>()
458            .map_err(|e| DictError::Format(format!("v1 entry {i} feature_len: {e}")))?
459            as usize;
460
461        let mut surface_bytes = vec![0u8; surface_len];
462        cursor
463            .read_exact(&mut surface_bytes)
464            .map_err(|e| DictError::Format(format!("v1 entry {i} surface: {e}")))?;
465        let surface = String::from_utf8(surface_bytes)
466            .map_err(|e| DictError::Format(format!("v1 entry {i} surface utf8: {e}")))?;
467
468        let mut feature_bytes = vec![0u8; feature_len];
469        cursor
470            .read_exact(&mut feature_bytes)
471            .map_err(|e| DictError::Format(format!("v1 entry {i} feature: {e}")))?;
472        let feature = String::from_utf8(feature_bytes)
473            .map_err(|e| DictError::Format(format!("v1 entry {i} feature utf8: {e}")))?;
474
475        entries.push(DictEntry {
476            surface,
477            left_id,
478            right_id,
479            cost,
480            feature,
481        });
482    }
483
484    // v2로 저장
485    LazyEntries::save_entries(&entries, v2_path)?;
486
487    Ok(())
488}
489
#[cfg(test)]
mod tests {
    #![allow(clippy::expect_used, clippy::unwrap_used)]

    use super::*;
    use tempfile::tempdir;

    /// Encodes `(left_id, right_id, cost, surface, feature)` tuples as an
    /// entries.bin v1 image: `MKED`, version 1, count, then the entries.
    fn encode_v1(entries: &[(u16, u16, i16, &str, &str)]) -> Vec<u8> {
        let mut bytes = Vec::new();
        bytes.extend_from_slice(b"MKED");
        bytes.extend_from_slice(&1u32.to_le_bytes());
        bytes.extend_from_slice(&u32::try_from(entries.len()).unwrap().to_le_bytes());
        for &(left_id, right_id, cost, surface, feature) in entries {
            bytes.extend_from_slice(&left_id.to_le_bytes());
            bytes.extend_from_slice(&right_id.to_le_bytes());
            bytes.extend_from_slice(&cost.to_le_bytes());
            bytes.extend_from_slice(&u16::try_from(surface.len()).unwrap().to_le_bytes());
            bytes.extend_from_slice(&u16::try_from(feature.len()).unwrap().to_le_bytes());
            bytes.extend_from_slice(surface.as_bytes());
            bytes.extend_from_slice(feature.as_bytes());
        }
        bytes
    }

    #[test]
    fn test_lazy_entries_roundtrip() {
        let entries = vec![
            DictEntry::new("안녕", 1, 1, 100, "NNG,*,T,안녕,*,*,*,*"),
            DictEntry::new("하세요", 2, 2, 50, "VV,*,F,하세요,*,*,*,*"),
            DictEntry::new("감사", 3, 3, 80, "NNG,*,F,감사,*,*,*,*"),
        ];

        let dir = tempdir().expect("create temp dir");
        let path = dir.path().join("entries.bin");

        // Save
        LazyEntries::save_entries(&entries, &path).expect("save should work");

        // Load
        let lazy = LazyEntries::from_file(&path).expect("load should work");

        assert_eq!(lazy.len(), 3);
        assert!(!lazy.is_empty());

        // Entry lookup
        let e0 = lazy.get(0).expect("get entry 0");
        assert_eq!(e0.surface, "안녕");
        assert_eq!(e0.left_id, 1);
        assert_eq!(e0.cost, 100);

        let e1 = lazy.get(1).expect("get entry 1");
        assert_eq!(e1.surface, "하세요");

        let e2 = lazy.get(2).expect("get entry 2");
        assert_eq!(e2.surface, "감사");

        // Out-of-range index
        assert!(lazy.get(100).is_err());
    }

    #[test]
    fn test_lazy_entries_cache() {
        let entries = vec![
            DictEntry::new("가", 1, 1, 100, "NNG"),
            DictEntry::new("나", 2, 2, 200, "NNG"),
        ];

        let dir = tempdir().expect("create temp dir");
        let path = dir.path().join("entries.bin");
        LazyEntries::save_entries(&entries, &path).expect("save");

        let lazy = LazyEntries::from_file(&path).expect("load");

        // First lookup (cache miss)
        assert_eq!(lazy.cached_count(), 0);
        let _ = lazy.get(0).expect("get 0");
        assert_eq!(lazy.cached_count(), 1);

        // Second lookup (cache hit)
        let _ = lazy.get(0).expect("get 0 again");
        assert_eq!(lazy.cached_count(), 1);

        // Lookup of a different entry
        let _ = lazy.get(1).expect("get 1");
        assert_eq!(lazy.cached_count(), 2);

        // Clear the cache
        lazy.clear_cache();
        assert_eq!(lazy.cached_count(), 0);
    }

    #[test]
    fn test_get_entries_at() {
        let entries = vec![
            DictEntry::new("가", 1, 1, 100, "VV"),
            DictEntry::new("가", 2, 2, 50, "JKS"),
            DictEntry::new("나", 3, 3, 200, "NP"),
        ];

        let dir = tempdir().expect("create temp dir");
        let path = dir.path().join("entries.bin");
        LazyEntries::save_entries(&entries, &path).expect("save");

        let lazy = LazyEntries::from_file(&path).expect("load");

        // Consecutive entries whose surface is "가"
        let results = lazy.get_entries_at(0, "가").expect("get_entries_at");
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].feature, "VV");
        assert_eq!(results[1].feature, "JKS");

        // "나"
        let results = lazy.get_entries_at(2, "나").expect("get_entries_at");
        assert_eq!(results.len(), 1);

        // A surface that does not match the entry at the start index
        let results = lazy.get_entries_at(0, "나").expect("get_entries_at");
        assert!(results.is_empty());

        // A start index past the end of the table
        let results = lazy.get_entries_at(3, "가").expect("get_entries_at");
        assert!(results.is_empty());
    }

    #[test]
    fn test_load_all() {
        let entries = vec![
            DictEntry::new("가", 1, 1, 100, "NNG"),
            DictEntry::new("나", 2, 2, 200, "NNG"),
            DictEntry::new("다", 3, 3, 300, "NNG"),
        ];

        let dir = tempdir().expect("create temp dir");
        let path = dir.path().join("entries.bin");
        LazyEntries::save_entries(&entries, &path).expect("save");

        let lazy = LazyEntries::from_file(&path).expect("load");
        let loaded = lazy.load_all().expect("load_all");

        assert_eq!(loaded.len(), 3);
        assert_eq!(loaded[0].surface, "가");
        assert_eq!(loaded[1].surface, "나");
        assert_eq!(loaded[2].surface, "다");
    }

    #[test]
    fn test_lru_cache_eviction() {
        let mut cache =
            lru::LruCache::<u32, Arc<DictEntry>>::new(NonZeroUsize::new(2).unwrap());

        cache.put(0, Arc::new(DictEntry::new("가", 1, 1, 100, "")));
        cache.put(1, Arc::new(DictEntry::new("나", 2, 2, 200, "")));
        assert_eq!(cache.len(), 2);

        // Adding a new item evicts the oldest one (0)
        cache.put(2, Arc::new(DictEntry::new("다", 3, 3, 300, "")));
        assert_eq!(cache.len(), 2);
        assert!(cache.peek(&0).is_none()); // evicted
        assert!(cache.peek(&1).is_some());
        assert!(cache.peek(&2).is_some());
    }

    #[test]
    fn test_empty_dictionary_roundtrip() {
        let dir = tempdir().expect("create temp dir");
        let path = dir.path().join("entries.bin");
        LazyEntries::save_entries(&[], &path).expect("save empty");

        let lazy = LazyEntries::from_file(&path).expect("load empty");

        assert_eq!(lazy.len(), 0);
        assert!(lazy.is_empty());
        assert!(lazy.get(0).is_err());
        assert!(lazy.load_all().expect("load_all").is_empty());
        assert!(lazy.get_entries_at(0, "가").expect("get_entries_at").is_empty());
    }

    #[test]
    fn test_from_file_rejects_invalid_header() {
        let dir = tempdir().expect("create temp dir");

        // Shorter than the 20-byte header
        let short_path = dir.path().join("short.bin");
        std::fs::write(&short_path, b"MKE2").expect("write short file");
        assert!(LazyEntries::from_file(&short_path).is_err());

        // Full-size header with the wrong magic
        let magic_path = dir.path().join("magic.bin");
        let mut bytes = Vec::new();
        bytes.extend_from_slice(b"XXXX");
        bytes.extend_from_slice(&2u32.to_le_bytes());
        bytes.extend_from_slice(&0u32.to_le_bytes());
        bytes.extend_from_slice(&20u64.to_le_bytes());
        std::fs::write(&magic_path, &bytes).expect("write bad magic file");
        assert!(LazyEntries::from_file(&magic_path).is_err());

        // Correct magic but an unsupported version
        let version_path = dir.path().join("version.bin");
        let mut bytes = Vec::new();
        bytes.extend_from_slice(b"MKE2");
        bytes.extend_from_slice(&3u32.to_le_bytes());
        bytes.extend_from_slice(&0u32.to_le_bytes());
        bytes.extend_from_slice(&20u64.to_le_bytes());
        std::fs::write(&version_path, &bytes).expect("write bad version file");
        assert!(LazyEntries::from_file(&version_path).is_err());
    }

    #[test]
    fn test_cache_size_limits_cached_entries() {
        let entries = vec![
            DictEntry::new("가", 1, 1, 100, "NNG"),
            DictEntry::new("나", 2, 2, 200, "NNG"),
        ];

        let dir = tempdir().expect("create temp dir");
        let path = dir.path().join("entries.bin");
        LazyEntries::save_entries(&entries, &path).expect("save");
        let lazy = LazyEntries::from_file(&path).expect("load");

        // With capacity 1, loading a second entry evicts the first
        lazy.set_cache_size(1);
        let _ = lazy.get(0).expect("get 0");
        let _ = lazy.get(1).expect("get 1");
        assert_eq!(lazy.cached_count(), 1);

        // A requested capacity of 0 is treated as 1
        lazy.clear_cache();
        lazy.set_cache_size(0);
        let _ = lazy.get(0).expect("get 0");
        let _ = lazy.get(1).expect("get 1");
        assert_eq!(lazy.cached_count(), 1);
    }

    #[test]
    fn test_migrate_v1_to_v2() {
        let dir = tempdir().expect("create temp dir");
        let v1_path = dir.path().join("entries_v1.bin");
        let v2_path = dir.path().join("entries_v2.bin");

        let v1_bytes = encode_v1(&[(5, 7, -3, "가", "VV"), (9, 11, 42, "나", "NP")]);
        std::fs::write(&v1_path, &v1_bytes).expect("write v1 file");

        migrate_entries_v1_to_v2(&v1_path, &v2_path).expect("migrate");

        let lazy = LazyEntries::from_file(&v2_path).expect("load migrated file");
        assert_eq!(lazy.len(), 2);

        let e0 = lazy.get(0).expect("get entry 0");
        assert_eq!(e0.surface, "가");
        assert_eq!(e0.left_id, 5);
        assert_eq!(e0.right_id, 7);
        assert_eq!(e0.cost, -3);
        assert_eq!(e0.feature, "VV");

        let e1 = lazy.get(1).expect("get entry 1");
        assert_eq!(e1.surface, "나");
        assert_eq!(e1.left_id, 9);
        assert_eq!(e1.right_id, 11);
        assert_eq!(e1.cost, 42);
        assert_eq!(e1.feature, "NP");
    }

    #[test]
    fn test_migrate_rejects_bad_magic() {
        let dir = tempdir().expect("create temp dir");
        let v1_path = dir.path().join("entries_v1.bin");
        let v2_path = dir.path().join("entries_v2.bin");

        let mut v1_bytes = encode_v1(&[]);
        v1_bytes[..4].copy_from_slice(b"XXXX");
        std::fs::write(&v1_path, &v1_bytes).expect("write v1 file");

        assert!(migrate_entries_v1_to_v2(&v1_path, &v2_path).is_err());
    }
}