1use std::collections::HashMap;
25use std::sync::atomic::{AtomicUsize, Ordering};
26
27use parking_lot::RwLock;
28
29#[derive(Debug)]
34pub struct PosTagInterner {
35 tags: RwLock<HashMap<String, u16>>,
37 reverse: RwLock<Vec<String>>,
39 intern_count: AtomicUsize,
41 hit_count: AtomicUsize,
43}
44
45impl PosTagInterner {
46 #[must_use]
50 pub fn new() -> Self {
51 let interner = Self {
52 tags: RwLock::new(HashMap::with_capacity(64)),
53 reverse: RwLock::new(Vec::with_capacity(64)),
54 intern_count: AtomicUsize::new(0),
55 hit_count: AtomicUsize::new(0),
56 };
57
58 for tag in COMMON_POS_TAGS {
60 interner.intern(tag);
61 }
62
63 interner
64 }
65
66 #[allow(clippy::significant_drop_tightening)]
70 pub fn intern(&self, tag: &str) -> u16 {
71 self.intern_count.fetch_add(1, Ordering::Relaxed);
72
73 {
75 let tags = self.tags.read();
76 if let Some(&idx) = tags.get(tag) {
77 self.hit_count.fetch_add(1, Ordering::Relaxed);
78 return idx;
79 }
80 }
81
82 let mut tags = self.tags.write();
84 let mut reverse = self.reverse.write();
85
86 if let Some(&idx) = tags.get(tag) {
88 self.hit_count.fetch_add(1, Ordering::Relaxed);
89 return idx;
90 }
91
92 let idx = u16::try_from(reverse.len()).unwrap_or(u16::MAX);
93 tags.insert(tag.to_string(), idx);
94 reverse.push(tag.to_string());
95 idx
96 }
97
98 #[must_use]
100 pub fn resolve(&self, idx: u16) -> Option<String> {
101 let reverse = self.reverse.read();
102 reverse.get(idx as usize).cloned()
103 }
104
105 pub fn resolve_ref<F, R>(&self, idx: u16, f: F) -> Option<R>
107 where
108 F: FnOnce(&str) -> R,
109 {
110 let reverse = self.reverse.read();
111 reverse.get(idx as usize).map(|s| f(s.as_str()))
112 }
113
114 #[must_use]
116 pub fn len(&self) -> usize {
117 self.reverse.read().len()
118 }
119
120 #[must_use]
122 pub fn is_empty(&self) -> bool {
123 self.reverse.read().is_empty()
124 }
125
126 #[must_use]
128 #[allow(clippy::cast_precision_loss)]
129 pub fn stats(&self) -> InternerStats {
130 let intern_count = self.intern_count.load(Ordering::Relaxed);
131 let hit_count = self.hit_count.load(Ordering::Relaxed);
132 InternerStats {
133 unique_tags: self.len(),
134 intern_calls: intern_count,
135 cache_hits: hit_count,
136 hit_rate: if intern_count > 0 {
137 hit_count as f64 / intern_count as f64
138 } else {
139 0.0
140 },
141 }
142 }
143
144 #[must_use]
146 #[allow(clippy::significant_drop_tightening)]
147 pub fn memory_usage(&self) -> usize {
148 let reverse = self.reverse.read();
149 let tags = self.tags.read();
150
151 let vec_overhead = reverse.capacity() * std::mem::size_of::<String>();
153 let string_bytes: usize = reverse.iter().map(String::len).sum();
155 let map_overhead = tags.capacity() * (std::mem::size_of::<String>() + 2);
157
158 vec_overhead + string_bytes + map_overhead
159 }
160}
161
162impl Default for PosTagInterner {
163 fn default() -> Self {
164 Self::new()
165 }
166}
167
/// Tags interned up front by `PosTagInterner::new`.
///
/// Ids are handed out in insertion order, so the position of a tag in this
/// table is the id it receives in a freshly created interner. Do not reorder.
// NOTE(review): the grouping comments below assume a Sejong / mecab-ko style
// Korean tag set; confirm against the dictionary in use.
const COMMON_POS_TAGS: &[&str] = &[
    // Nominals.
    "NNG", "NNP", "NNB", "NR", "NP",
    // Predicates.
    "VV", "VA", "VX", "VCP", "VCN",
    // Modifiers and interjections.
    "MM", "MAG", "MAJ", "IC",
    // Particles.
    "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC",
    // Endings.
    "EP", "EF", "EC", "ETN", "ETM",
    // Affixes and roots.
    "XPN", "XSN", "XSV", "XSA", "XR",
    // Symbols, foreign text and numbers.
    "SF", "SE", "SS", "SP", "SO", "SL", "SH", "SN", "SW",
    // Unknown / placeholder markers and extras.
    "NA", "UNK", "UNKNOWN", "*", "NNBC",
];
182
/// Point-in-time usage counters of a POS tag interner.
#[derive(Debug, Clone, Copy)]
pub struct InternerStats {
    /// Number of distinct tags stored.
    pub unique_tags: usize,
    /// Total number of intern calls.
    pub intern_calls: usize,
    /// Number of intern calls that found an existing entry.
    pub cache_hits: usize,
    /// `cache_hits / intern_calls` as a fraction.
    pub hit_rate: f64,
}

impl InternerStats {
    /// Renders the statistics as a single human-readable line.
    ///
    /// The hit rate is shown as a percentage with one decimal place.
    #[must_use]
    pub fn format(&self) -> String {
        let hit_percent = self.hit_rate * 100.0;
        format!(
            "POS Interner: {} unique tags, {} calls, {:.1}% hit rate",
            self.unique_tags, self.intern_calls, hit_percent
        )
    }
}
208
/// Per-component memory figures, all expressed in bytes.
#[derive(Debug, Clone, Default)]
pub struct MemoryStats {
    /// Bytes attributed to the dictionary.
    pub dictionary_bytes: usize,
    /// Bytes attributed to the lattice.
    pub lattice_bytes: usize,
    /// Bytes attributed to the pool.
    pub pool_bytes: usize,
    /// Bytes attributed to the cache.
    pub cache_bytes: usize,
    /// Bytes attributed to the interner.
    pub interner_bytes: usize,
    /// Bytes attributed to tokens.
    pub token_bytes: usize,
}

impl MemoryStats {
    /// Sum of all component figures, in bytes.
    #[must_use]
    pub const fn estimate_total(&self) -> usize {
        let running = self.dictionary_bytes;
        let running = running + self.lattice_bytes;
        let running = running + self.pool_bytes;
        let running = running + self.cache_bytes;
        let running = running + self.interner_bytes;
        running + self.token_bytes
    }

    /// Renders a multi-line report with every figure in whole kibibytes.
    ///
    /// Each value is divided by 1024 with integer division, so fractions of
    /// a KB are dropped; the total is converted from the byte sum, not from
    /// the already truncated per-component values.
    #[must_use]
    pub fn format_human_readable(&self) -> String {
        let to_kb = |bytes: usize| bytes / 1024;
        format!(
            "Memory Usage:\n\
             - Dictionary: {} KB\n\
             - Lattice: {} KB\n\
             - Pool: {} KB\n\
             - Cache: {} KB\n\
             - Interner: {} KB\n\
             - Tokens: {} KB\n\
             - Total: {} KB",
            to_kb(self.dictionary_bytes),
            to_kb(self.lattice_bytes),
            to_kb(self.pool_bytes),
            to_kb(self.cache_bytes),
            to_kb(self.interner_bytes),
            to_kb(self.token_bytes),
            to_kb(self.estimate_total())
        )
    }
}
260
261#[derive(Debug)]
266pub struct FeatureCache {
267 features: RwLock<HashMap<String, u32>>,
269 reverse: RwLock<Vec<String>>,
271 max_size: usize,
273}
274
275impl FeatureCache {
276 #[must_use]
278 pub fn new(max_size: usize) -> Self {
279 Self {
280 features: RwLock::new(HashMap::with_capacity(max_size.min(10000))),
281 reverse: RwLock::new(Vec::with_capacity(max_size.min(10000))),
282 max_size,
283 }
284 }
285
286 #[allow(clippy::significant_drop_tightening)]
290 pub fn intern(&self, feature: &str) -> Option<u32> {
291 {
293 let features = self.features.read();
294 if let Some(&idx) = features.get(feature) {
295 return Some(idx);
296 }
297 }
298
299 let len = self.reverse.read().len();
301 if len >= self.max_size {
302 return None;
303 }
304
305 let mut features = self.features.write();
307 let mut reverse = self.reverse.write();
308
309 if let Some(&idx) = features.get(feature) {
310 return Some(idx);
311 }
312
313 if reverse.len() >= self.max_size {
314 return None;
315 }
316
317 let idx = u32::try_from(reverse.len()).ok()?;
318 features.insert(feature.to_string(), idx);
319 reverse.push(feature.to_string());
320 Some(idx)
321 }
322
323 #[must_use]
325 pub fn resolve(&self, idx: u32) -> Option<String> {
326 self.reverse.read().get(idx as usize).cloned()
327 }
328
329 #[must_use]
331 pub fn len(&self) -> usize {
332 self.reverse.read().len()
333 }
334
335 #[must_use]
337 pub fn is_empty(&self) -> bool {
338 self.reverse.read().is_empty()
339 }
340
341 #[must_use]
343 #[allow(clippy::significant_drop_tightening)]
344 pub fn memory_usage(&self) -> usize {
345 let reverse = self.reverse.read();
346 let features = self.features.read();
347
348 let vec_bytes: usize = reverse.iter().map(String::len).sum();
349 let map_overhead = features.capacity() * (std::mem::size_of::<String>() + 4);
350
351 vec_bytes + map_overhead
352 }
353}
354
355impl Default for FeatureCache {
356 fn default() -> Self {
357 Self::new(50000)
358 }
359}
360
361#[must_use]
365pub fn estimate_tokens_memory(tokens: &[crate::tokenizer::Token]) -> usize {
366 let base_size = std::mem::size_of_val(tokens);
367 let string_bytes: usize = tokens
368 .iter()
369 .map(|t| {
370 t.surface.len()
371 + t.pos.len()
372 + t.features.len()
373 + t.reading.as_ref().map_or(0, String::len)
374 + t.lemma.as_ref().map_or(0, String::len)
375 + t.normalized.as_ref().map_or(0, String::len)
376 })
377 .sum();
378
379 base_size + string_bytes
380}
381
#[cfg(test)]
mod tests {
    use super::*;

    /// Interning the same tag twice yields one id, and ids resolve back.
    #[test]
    fn test_pos_tag_interner() {
        let interner = PosTagInterner::new();

        let first = interner.intern("NNG");
        let again = interner.intern("NNG");
        assert_eq!(first, again);

        let custom = interner.intern("CUSTOM_TAG");
        assert_ne!(first, custom);

        assert_eq!(interner.resolve(first).as_deref(), Some("NNG"));
        assert_eq!(interner.resolve(custom).as_deref(), Some("CUSTOM_TAG"));
    }

    /// Repeated lookups of preloaded tags are counted and reported as hits.
    #[test]
    fn test_pos_interner_stats() {
        let interner = PosTagInterner::new();

        // 100 rounds of "NNG" followed by "VV".
        for tag in ["NNG", "VV"].iter().cycle().take(200) {
            interner.intern(tag);
        }

        let stats = interner.stats();
        assert!(stats.unique_tags > 0);
        // The preload performed by `new` is counted on top of the 200 calls.
        assert!(stats.intern_calls > 200);
        assert!(stats.hit_rate > 0.75, "hit_rate: {}", stats.hit_rate);
    }

    /// A feature string round-trips through intern and resolve.
    #[test]
    fn test_feature_cache() {
        const FEATURE: &str = "NNG,*,T,테스트,*,*,*,*";
        let cache = FeatureCache::new(100);

        let first = cache.intern(FEATURE);
        assert!(first.is_some());

        let second = cache.intern(FEATURE);
        assert_eq!(first, second);

        assert_eq!(cache.resolve(first.unwrap()), Some(FEATURE.to_string()));
    }

    /// Once the size limit is reached, new features are rejected.
    #[test]
    fn test_feature_cache_max_size() {
        let cache = FeatureCache::new(2);

        assert!(cache.intern("feature1").is_some());
        assert!(cache.intern("feature2").is_some());
        assert!(cache.intern("feature3").is_none());
    }

    /// The report lists components in KB and includes the total.
    #[test]
    fn test_memory_stats_format() {
        let kb = 1024;
        let stats = MemoryStats {
            dictionary_bytes: 100 * kb,
            lattice_bytes: 10 * kb,
            pool_bytes: 5 * kb,
            cache_bytes: 20 * kb,
            interner_bytes: kb,
            token_bytes: 2 * kb,
        };

        let report = stats.format_human_readable();
        assert!(report.contains("Dictionary: 100 KB"));
        assert!(report.contains("Total: 138 KB"));
    }

    /// Every common tag is available right after construction with a small id.
    #[test]
    fn test_common_pos_tags_preloaded() {
        let interner = PosTagInterner::new();

        assert!(interner.len() > 30);

        assert!(COMMON_POS_TAGS
            .iter()
            .all(|tag| interner.intern(tag) < 100));
    }
}