Skip to main content

mecab_ko_dict/
domain.rs

//! Domain overlay dictionary: stack multiple [`UserDictionary`] instances by priority.
//!
//! ## Design
//!
//! Multiple dictionaries can coexist with explicit priority ordering (0 = highest).
//! Searches across all domains are performed in priority order; higher-priority
//! domain entries appear first in results. This enables domain-specific vocabulary
//! to shadow or augment lower-priority entries without merging the underlying data.
//!
//! ## Example
//!
//! ```rust
//! use std::sync::Arc;
//! use mecab_ko_dict::domain::{DomainId, DomainStack};
//! use mecab_ko_dict::user_dict::UserDictionary;
//!
//! let mut stack = DomainStack::new();
//!
//! let mut news = UserDictionary::new();
//! news.add_entry("뉴스피드", "NNG", Some(-1000), None);
//!
//! let mut finance = UserDictionary::new();
//! finance.add_entry("코스피", "NNP", Some(-1000), None);
//!
//! stack.add_domain(DomainId("news".into()), 0, Arc::new(news), None);
//! stack.add_domain(DomainId("finance".into()), 1, Arc::new(finance), None);
//!
//! assert_eq!(stack.len(), 2);
//! ```
30
31use std::path::PathBuf;
32use std::sync::Arc;
33use std::time::SystemTime;
34
35use crate::user_dict::{UserDictionary, UserEntry};
36
/// Opaque identifier for a domain.
///
/// Wraps the domain name as a `String`. The derived `PartialEq`, `Eq` and
/// `Hash` all operate on that inner string, so two `DomainId`s built from
/// equal strings identify the same domain.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct DomainId(pub String);

impl std::fmt::Display for DomainId {
    /// Writes the inner domain name.
    ///
    /// The name is emitted through [`std::fmt::Formatter::pad`] rather than
    /// `write_str`, so width, fill, alignment and precision given in the
    /// format string (for example `{:>12}`) are applied the same way they
    /// would be for a plain string. With a bare `{}` the output is exactly
    /// the inner string.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.pad(&self.0)
    }
}
49
50/// A single domain overlay paired with its priority and metadata.
51pub struct DomainDictionary {
52    /// Domain identifier.
53    pub domain: DomainId,
54    /// Search priority — 0 is the highest (searched first).
55    pub priority: u8,
56    /// The underlying dictionary, shared via reference-counting.
57    pub dictionary: Arc<UserDictionary>,
58    /// Optional path from which the dictionary was loaded.
59    pub source_path: Option<PathBuf>,
60    /// Wall-clock time at which this domain was registered.
61    pub loaded_at: SystemTime,
62}
63
64impl std::fmt::Debug for DomainDictionary {
65    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
66        f.debug_struct("DomainDictionary")
67            .field("domain", &self.domain)
68            .field("priority", &self.priority)
69            .field("entry_count", &self.dictionary.len())
70            .field("source_path", &self.source_path)
71            .field("loaded_at", &self.loaded_at)
72            .finish()
73    }
74}
75
76impl DomainDictionary {
77    fn new(
78        domain: DomainId,
79        priority: u8,
80        dictionary: Arc<UserDictionary>,
81        source_path: Option<PathBuf>,
82    ) -> Self {
83        Self {
84            domain,
85            priority,
86            dictionary,
87            source_path,
88            loaded_at: SystemTime::now(),
89        }
90    }
91}
92
93/// Ordered stack of domain dictionaries searched in priority order.
94///
95/// The internal vector is kept sorted by `priority` ascending (lowest value =
96/// highest priority) at all times. `add_domain` and `remove_domain` preserve
97/// this invariant.
98#[derive(Debug, Default)]
99pub struct DomainStack {
100    // Invariant: sorted by `priority` ascending.
101    domains: Vec<DomainDictionary>,
102}
103
104impl DomainStack {
105    /// Create an empty domain stack.
106    #[must_use]
107    pub fn new() -> Self {
108        Self::default()
109    }
110
111    /// Add or replace a domain.
112    ///
113    /// If a domain with the same `DomainId` already exists it is replaced with
114    /// the new dictionary and priority. The stack remains sorted after the
115    /// operation.
116    pub fn add_domain(
117        &mut self,
118        domain: DomainId,
119        priority: u8,
120        dict: Arc<UserDictionary>,
121        source: Option<PathBuf>,
122    ) {
123        // Remove any existing entry with the same id so replacement is atomic.
124        self.domains.retain(|d| d.domain != domain);
125
126        let entry = DomainDictionary::new(domain, priority, dict, source);
127        self.domains.push(entry);
128        // Stable sort keeps equal-priority domains in insertion order.
129        self.domains.sort_by_key(|d| d.priority);
130    }
131
132    /// Remove a domain by id.
133    ///
134    /// Returns the removed `DomainDictionary`, or `None` if no domain with the
135    /// given id existed.
136    pub fn remove_domain(&mut self, domain: &DomainId) -> Option<DomainDictionary> {
137        if let Some(pos) = self.domains.iter().position(|d| &d.domain == domain) {
138            Some(self.domains.remove(pos))
139        } else {
140            None
141        }
142    }
143
144    /// Look up a domain by id.
145    #[must_use]
146    pub fn get_domain(&self, domain: &DomainId) -> Option<&DomainDictionary> {
147        self.domains.iter().find(|d| &d.domain == domain)
148    }
149
150    /// Return `(DomainId, priority, entry_count)` for every registered domain,
151    /// in priority order (highest priority first).
152    #[must_use]
153    pub fn list_domains(&self) -> Vec<(DomainId, u8, usize)> {
154        self.domains
155            .iter()
156            .map(|d| (d.domain.clone(), d.priority, d.dictionary.len()))
157            .collect()
158    }
159
160    /// Number of registered domains.
161    #[must_use]
162    pub fn len(&self) -> usize {
163        self.domains.len()
164    }
165
166    /// True when no domains are registered.
167    #[must_use]
168    pub fn is_empty(&self) -> bool {
169        self.domains.is_empty()
170    }
171
172    /// Common-prefix search across all domains.
173    ///
174    /// Returns all matching `UserEntry` references in priority order (higher
175    /// priority = lower numeric value appears first). Within the same domain,
176    /// entry order follows the domain's own iteration order.
177    ///
178    /// The returned references are valid for the lifetime of `&self`.
179    #[must_use]
180    pub fn common_prefix_search<'a>(&'a self, text: &str) -> Vec<&'a UserEntry> {
181        self.domains
182            .iter()
183            .flat_map(|d| d.dictionary.common_prefix_search(text))
184            .collect()
185    }
186
187    /// Exact surface lookup across all domains.
188    ///
189    /// Returns all `UserEntry` references whose surface equals `surface`,
190    /// in priority order.
191    ///
192    /// The returned references are valid for the lifetime of `&self`.
193    #[must_use]
194    pub fn lookup<'a>(&'a self, surface: &str) -> Vec<&'a UserEntry> {
195        self.domains
196            .iter()
197            .flat_map(|d| d.dictionary.lookup(surface))
198            .collect()
199    }
200}
201
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Shorthand for building a `DomainId` from a string slice.
    fn id(name: &str) -> DomainId {
        DomainId(name.into())
    }

    /// Builds a shared dictionary from `(surface, pos, cost)` rows.
    fn make_dict(entries: &[(&str, &str, i16)]) -> Arc<UserDictionary> {
        let mut dict = UserDictionary::new();
        for (surface, pos, cost) in entries.iter().copied() {
            dict.add_entry(surface, pos, Some(cost), None);
        }
        Arc::new(dict)
    }

    /// A fresh stack reports no domains and finds nothing.
    #[test]
    fn test_empty_stack() {
        let empty = DomainStack::new();

        assert_eq!(empty.len(), 0);
        assert!(empty.is_empty());
        assert!(empty.list_domains().is_empty());
        assert!(empty.common_prefix_search("anything").is_empty());
        assert!(empty.lookup("anything").is_empty());
    }

    /// Listing order follows priority, not the order of `add_domain` calls.
    #[test]
    fn test_add_two_domains_priority_ordering() {
        let mut stack = DomainStack::new();

        // Register the lower-priority domain first on purpose.
        stack.add_domain(id("low"), 10, make_dict(&[("하위", "NNG", -100)]), None);
        stack.add_domain(id("high"), 0, make_dict(&[("상위", "NNP", -1000)]), None);

        let listing = stack.list_domains();
        assert_eq!(listing.len(), 2);

        // Priority 0 ("high") precedes priority 10 ("low").
        let (first, second) = (&listing[0], &listing[1]);
        assert_eq!(first.0, id("high"));
        assert_eq!(first.1, 0);
        assert_eq!(second.0, id("low"));
        assert_eq!(second.1, 10);
    }

    /// Prefix matches from every domain are returned, higher priority first.
    #[test]
    fn test_common_prefix_search_returns_entries_from_all_domains() {
        let mut stack = DomainStack::new();
        stack.add_domain(
            id("d1"),
            0,
            make_dict(&[("형태", "NNG", -100), ("형태소", "NNG", -200)]),
            None,
        );
        stack.add_domain(id("d2"), 1, make_dict(&[("형태소분석", "NNG", -300)]), None);

        let hits = stack.common_prefix_search("형태소분석기");

        // Two matches from d1 followed by one from d2.
        let expected = ["형태", "형태소", "형태소분석"];
        assert_eq!(hits.len(), expected.len());
        for (hit, want) in hits.iter().zip(expected.iter()) {
            assert_eq!(hit.surface, *want);
        }
    }

    /// Removing hands back the matching record; a second removal finds nothing.
    #[test]
    fn test_remove_domain_returns_correct_domain() {
        let mut stack = DomainStack::new();
        stack.add_domain(id("alpha"), 0, make_dict(&[("단어1", "NNG", 0)]), None);
        stack.add_domain(id("beta"), 1, make_dict(&[("단어2", "NNG", 0)]), None);
        assert_eq!(stack.len(), 2);

        let target = id("alpha");
        let taken = stack
            .remove_domain(&target)
            .expect("alpha was registered above");
        assert_eq!(taken.domain, target);
        assert_eq!(stack.len(), 1);

        // The id is gone now, so removing it again yields `None`.
        assert!(stack.remove_domain(&target).is_none());
    }

    /// The listing carries each domain's entry count alongside its id.
    #[test]
    fn test_list_domains_returns_all_ids_with_entry_counts() {
        let mut stack = DomainStack::new();
        stack.add_domain(
            id("a"),
            2,
            make_dict(&[("x", "NNG", 0), ("y", "NNG", 0)]),
            None,
        );
        stack.add_domain(id("b"), 1, make_dict(&[("z", "NNG", 0)]), None);

        // Sorted by priority: b (1 entry) then a (2 entries).
        let summary: Vec<(DomainId, usize)> = stack
            .list_domains()
            .into_iter()
            .map(|(domain, _priority, count)| (domain, count))
            .collect();
        assert_eq!(summary, vec![(id("b"), 1), (id("a"), 2)]);
    }

    /// Adding under an existing id swaps the dictionary instead of stacking.
    #[test]
    fn test_duplicate_domain_add_replaces_existing() {
        let mut stack = DomainStack::new();

        stack.add_domain(id("same"), 0, make_dict(&[("old_entry", "NNG", 0)]), None);
        assert_eq!(stack.len(), 1);
        assert!(!stack.lookup("old_entry").is_empty());

        stack.add_domain(id("same"), 0, make_dict(&[("new_entry", "NNP", -500)]), None);

        // Still a single domain; only the new dictionary is searched.
        assert_eq!(stack.len(), 1);
        assert!(stack.lookup("old_entry").is_empty());
        assert!(!stack.lookup("new_entry").is_empty());
    }

    /// A surface present in two domains is reported once per domain, in priority order.
    #[test]
    fn test_lookup_returns_entries_in_priority_order() {
        let mut stack = DomainStack::new();
        stack.add_domain(id("high"), 0, make_dict(&[("공통", "NNP", -2000)]), None);
        stack.add_domain(id("low"), 5, make_dict(&[("공통", "NNG", -100)]), None);

        let hits = stack.lookup("공통");

        let expected_pos = ["NNP", "NNG"];
        assert_eq!(hits.len(), expected_pos.len());
        for (hit, want) in hits.iter().zip(expected_pos.iter()) {
            assert_eq!(hit.pos, *want);
        }
    }
}
337}