mecab_ko_core/sejong/
converter.rs1use std::collections::HashMap;
4
5use crate::tokenizer::Token;
6
7use super::corrections::apply_context_corrections;
8use super::ending_rules::init_ending_rules;
9use super::hangul::normalize_jamo as hangul_normalize_jamo;
10use super::lexicon::apply_lexicon_overrides;
11use super::postprocess::{
12 apply_decomposition_corrections, apply_token_merges, apply_vv_seyo_splits,
13};
14use super::splitter::{is_compound_tag, split_compound_tag, split_morpheme};
15use super::tag_map::tag_map;
16use super::types::{DecomposedMorpheme, EndingRule, SejongToken};
17
/// Converts tokenizer [`Token`]s into Sejong-style [`SejongToken`]s.
///
/// A converter bundles the lookup data the splitting helpers need together
/// with a switch that controls whether a token's feature string may be used
/// as a source of pre-decomposed morphemes.
pub struct SejongConverter {
    /// Shared tag table with `'static` lifetime, obtained from `tag_map()`
    /// and handed to the splitter helpers on every split.
    tag_map: &'static HashMap<String, Vec<String>>,
    /// Ending rules built once by `init_ending_rules()` and passed to the
    /// morpheme splitter.
    ending_rules: Vec<EndingRule>,
    /// When `true`, `convert_token` first tries the decomposition found in
    /// the token's feature string before falling back to rule-based splitting.
    use_decomposition: bool,
}
27
/// The default converter is exactly the one produced by [`SejongConverter::new`].
impl Default for SejongConverter {
    /// Builds a converter by delegating to `Self::new()`.
    fn default() -> Self {
        Self::new()
    }
}
33
34impl SejongConverter {
35 #[must_use]
37 pub fn new() -> Self {
38 Self {
39 tag_map: tag_map(),
40 ending_rules: init_ending_rules(),
41 use_decomposition: true, }
43 }
44
/// Builder-style setter for the decomposition switch.
///
/// Consumes the converter and returns it with `use_decomposition` replaced
/// by the given value. When the flag is `false`, `convert_token` never
/// consults the token's feature string and always uses rule-based splitting.
#[must_use]
pub const fn with_decomposition(mut self, use_decomposition: bool) -> Self {
    // Mutate the owned value in place and hand it back to the caller.
    self.use_decomposition = use_decomposition;
    self
}
54
55 #[must_use]
60 pub fn parse_decomposition(decomposition: &str) -> Vec<DecomposedMorpheme> {
61 if decomposition.is_empty() || decomposition == "*" {
62 return Vec::new();
63 }
64
65 let mut result = Vec::new();
66
67 for part in decomposition.split('+') {
69 let part = part.trim();
70 if part.is_empty() {
71 continue;
72 }
73
74 let segments: Vec<&str> = part.split('/').collect();
76 if segments.len() >= 2 {
77 let surface = segments[0].to_string();
78 let pos = segments[1].to_string();
79
80 if !surface.is_empty() && surface != "*" && !pos.is_empty() && pos != "*" {
82 result.push(DecomposedMorpheme { surface, pos });
83 }
84 }
85 }
86
87 result
88 }
89
/// Pulls the decomposition field out of a comma-separated feature string.
///
/// The decomposition is the eighth comma-separated field (index 7). The
/// field is trimmed; `None` is returned when the feature string has fewer
/// than eight fields, or when the trimmed field is empty or `"*"`.
#[must_use]
pub fn extract_decomposition(features: &str) -> Option<String> {
    features
        .split(',')
        .nth(7)
        .map(str::trim)
        .filter(|field| !field.is_empty() && *field != "*")
        .map(str::to_owned)
}
108
/// Reports whether `pos` is a compound tag.
///
/// Thin method wrapper around the free function `is_compound_tag` imported
/// from the `splitter` module; the converter's own state is not consulted.
#[must_use]
pub fn is_compound_tag(&self, pos: &str) -> bool {
    // Resolves to the imported free function, not to this method.
    is_compound_tag(pos)
}
118
/// Splits a compound tag into a list of tag strings.
///
/// Delegates to the `splitter` module's `split_compound_tag`, supplying this
/// converter's tag table.
#[must_use]
pub fn split_compound_tag(&self, pos: &str) -> Vec<String> {
    // Resolves to the imported free function, not to this method.
    split_compound_tag(self.tag_map, pos)
}
124
/// Splits a surface form with the given tag into `(surface, pos)` pairs.
///
/// Delegates to the `splitter` module's `split_morpheme`, supplying this
/// converter's tag table and ending rules.
#[must_use]
pub fn split_morpheme(&self, surface: &str, pos: &str) -> Vec<(String, String)> {
    // Resolves to the imported free function, not to this method.
    split_morpheme(surface, pos, self.tag_map, &self.ending_rules)
}
137
138 #[must_use]
146 pub fn convert_token(&self, token: &Token) -> Vec<SejongToken> {
147 let skip_decomposition = token.surface == "는다" && token.pos == "VV+EC";
150
151 let skip_decomp_verbs = ["들리다", "놀리다"];
158 let force_rule_based =
159 token.pos == "VV+EF" && skip_decomp_verbs.contains(&token.surface.as_str());
160
161 if self.use_decomposition
163 && !token.features.is_empty()
164 && !skip_decomposition
165 && !force_rule_based
166 {
167 if let Some(decomp) = Self::extract_decomposition(&token.features) {
168 let morphemes = Self::parse_decomposition(&decomp);
169 if !morphemes.is_empty() {
170 let decomp_pos: String = morphemes
172 .iter()
173 .map(|m| m.pos.as_str())
174 .collect::<Vec<_>>()
175 .join("+");
176 if decomp_pos == token.pos {
177 if morphemes.len() == 1 && morphemes[0].surface != token.surface {
179 } else {
181 return Self::morphemes_to_sejong_tokens(&morphemes, token);
182 }
183 }
184 }
186 }
187 }
188
189 let morphemes = self.split_morpheme(&token.surface, &token.pos);
191
192 if morphemes.len() == 1 {
193 return vec![SejongToken::new(
195 &token.surface,
196 &morphemes[0].1,
197 token.start_pos,
198 token.end_pos,
199 )];
200 }
201
202 let mut result = Vec::new();
204 let mut current_pos = token.start_pos;
205
206 for (surface, pos) in &morphemes {
207 let char_len = surface.chars().count();
208 let end_pos = current_pos + char_len;
209
210 result.push(SejongToken::from_split(
211 surface,
212 pos,
213 current_pos,
214 end_pos,
215 &token.surface,
216 &token.pos,
217 ));
218
219 current_pos = end_pos;
220 }
221
222 result
223 }
224
225 fn morphemes_to_sejong_tokens(
227 morphemes: &[DecomposedMorpheme],
228 original_token: &Token,
229 ) -> Vec<SejongToken> {
230 let mut result = Vec::new();
231 let mut current_pos = original_token.start_pos;
232
233 for morpheme in morphemes {
234 let char_len = morpheme.surface.chars().count();
235 let end_pos = current_pos + char_len;
236
237 result.push(SejongToken::from_split(
238 &morpheme.surface,
239 &morpheme.pos,
240 current_pos,
241 end_pos,
242 &original_token.surface,
243 &original_token.pos,
244 ));
245
246 current_pos = end_pos;
247 }
248
249 result
250 }
251
252 #[must_use]
254 pub fn convert_tokens(&self, tokens: &[Token]) -> Vec<SejongToken> {
255 let mut sejong_tokens: Vec<SejongToken> =
256 tokens.iter().flat_map(|t| self.convert_token(t)).collect();
257
258 apply_decomposition_corrections(&mut sejong_tokens);
260
261 apply_token_merges(&mut sejong_tokens);
263
264 apply_lexicon_overrides(&mut sejong_tokens);
266
267 sejong_tokens = apply_vv_seyo_splits(sejong_tokens);
269
270 apply_context_corrections(&mut sejong_tokens);
272
273 sejong_tokens
274 }
275
276 #[must_use]
278 pub fn format_sejong(&self, tokens: &[SejongToken]) -> String {
279 tokens
280 .iter()
281 .map(|t| {
282 let normalized_surface = hangul_normalize_jamo(&t.surface);
283 format!("{}/{}", normalized_surface, t.pos)
284 })
285 .collect::<Vec<_>>()
286 .join(" ")
287 }
288
289 #[must_use]
291 pub fn tokens_to_sejong_string(&self, tokens: &[Token]) -> String {
292 let sejong_tokens = self.convert_tokens(tokens);
293 self.format_sejong(&sejong_tokens)
294 }
295
/// Normalizes the jamo in `text` and returns the result as a new string.
///
/// Associated-function wrapper around the `hangul` module's
/// `normalize_jamo` (imported here as `hangul_normalize_jamo`), the same
/// routine `format_sejong` applies to every surface.
#[must_use]
pub fn normalize_jamo(text: &str) -> String {
    hangul_normalize_jamo(text)
}
303
/// Test-only shim exposing the `splitter` module's `split_prefinal_ending`.
///
/// Compiled only under `cfg(test)` and visible only within the crate.
/// Returns the pair of strings produced by the splitter for `ending`.
// NOTE(review): presumably the pair is (pre-final ending, remaining ending) — confirm against `splitter`.
#[cfg(test)]
#[must_use]
pub(crate) fn split_prefinal_ending(ending: &str) -> (String, String) {
    super::splitter::split_prefinal_ending(ending)
}
309}