1use crate::pos_tag::PosTag;
41use std::fmt;
42
/// Part-of-speech tag as named in the Kiwi tag set.
///
/// The enum is fieldless and `#[repr(u8)]`, so every variant fits in one byte
/// and its discriminant is its position in the declaration order below.
/// Reordering variants therefore changes the numeric values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
// Variant names mirror the tag strings verbatim (e.g. `W_URL`), which is not
// camel case; one allowance here covers every variant.
#[allow(non_camel_case_types)]
pub enum KiwiPosTag {
    // Nominals.
    NNG,
    NNP,
    NNB,
    NR,
    NP,

    // Predicates.
    VV,
    VA,
    VX,
    VCP,
    VCN,

    // `M*` tags.
    MM,
    MAG,
    MAJ,

    IC,

    // Particles (`J*`).
    JKS,
    JKC,
    JKG,
    JKO,
    JKB,
    JKV,
    JKQ,
    JX,
    JC,

    // `E*` tags.
    EP,
    EF,
    EC,
    ETN,
    ETM,

    // `X*` tags.
    XPN,
    XSN,
    XSV,
    XSA,
    XR,

    // Symbols.
    SF,
    SP,
    SS,
    SE,
    SO,
    SW,

    SL,
    SH,
    SN,

    // Web-specific tags.
    W_URL,
    W_EMAIL,
    W_HASHTAG,
    W_MENTION,
    W_EMOJI,
    W_OTHER,

    /// Fallback for anything that has no dedicated tag.
    Unknown,
}
194
195impl KiwiPosTag {
196 #[must_use]
207 #[allow(clippy::should_implement_trait)]
208 pub fn from_str(s: &str) -> Option<Self> {
209 match s {
210 "NNG" => Some(Self::NNG),
212 "NNP" => Some(Self::NNP),
213 "NNB" => Some(Self::NNB),
214 "NR" => Some(Self::NR),
215 "NP" => Some(Self::NP),
216 "VV" => Some(Self::VV),
218 "VA" => Some(Self::VA),
219 "VX" => Some(Self::VX),
220 "VCP" => Some(Self::VCP),
221 "VCN" => Some(Self::VCN),
222 "MM" => Some(Self::MM),
224 "MAG" => Some(Self::MAG),
225 "MAJ" => Some(Self::MAJ),
226 "IC" => Some(Self::IC),
228 "JKS" => Some(Self::JKS),
230 "JKC" => Some(Self::JKC),
231 "JKG" => Some(Self::JKG),
232 "JKO" => Some(Self::JKO),
233 "JKB" => Some(Self::JKB),
234 "JKV" => Some(Self::JKV),
235 "JKQ" => Some(Self::JKQ),
236 "JX" => Some(Self::JX),
237 "JC" => Some(Self::JC),
238 "EP" => Some(Self::EP),
240 "EF" => Some(Self::EF),
241 "EC" => Some(Self::EC),
242 "ETN" => Some(Self::ETN),
243 "ETM" => Some(Self::ETM),
244 "XPN" => Some(Self::XPN),
246 "XSN" => Some(Self::XSN),
247 "XSV" => Some(Self::XSV),
248 "XSA" => Some(Self::XSA),
249 "XR" => Some(Self::XR),
250 "SF" => Some(Self::SF),
252 "SP" => Some(Self::SP),
253 "SS" => Some(Self::SS),
254 "SE" => Some(Self::SE),
255 "SO" => Some(Self::SO),
256 "SW" => Some(Self::SW),
257 "SL" => Some(Self::SL),
258 "SH" => Some(Self::SH),
259 "SN" => Some(Self::SN),
260 "W_URL" => Some(Self::W_URL),
262 "W_EMAIL" => Some(Self::W_EMAIL),
263 "W_HASHTAG" => Some(Self::W_HASHTAG),
264 "W_MENTION" => Some(Self::W_MENTION),
265 "W_EMOJI" => Some(Self::W_EMOJI),
266 "W_OTHER" => Some(Self::W_OTHER),
267 "UNKNOWN" | "UNK" => Some(Self::Unknown),
269 _ => None,
270 }
271 }
272
273 #[must_use]
275 pub const fn as_str(&self) -> &'static str {
276 match self {
277 Self::NNG => "NNG",
279 Self::NNP => "NNP",
280 Self::NNB => "NNB",
281 Self::NR => "NR",
282 Self::NP => "NP",
283 Self::VV => "VV",
285 Self::VA => "VA",
286 Self::VX => "VX",
287 Self::VCP => "VCP",
288 Self::VCN => "VCN",
289 Self::MM => "MM",
291 Self::MAG => "MAG",
292 Self::MAJ => "MAJ",
293 Self::IC => "IC",
295 Self::JKS => "JKS",
297 Self::JKC => "JKC",
298 Self::JKG => "JKG",
299 Self::JKO => "JKO",
300 Self::JKB => "JKB",
301 Self::JKV => "JKV",
302 Self::JKQ => "JKQ",
303 Self::JX => "JX",
304 Self::JC => "JC",
305 Self::EP => "EP",
307 Self::EF => "EF",
308 Self::EC => "EC",
309 Self::ETN => "ETN",
310 Self::ETM => "ETM",
311 Self::XPN => "XPN",
313 Self::XSN => "XSN",
314 Self::XSV => "XSV",
315 Self::XSA => "XSA",
316 Self::XR => "XR",
317 Self::SF => "SF",
319 Self::SP => "SP",
320 Self::SS => "SS",
321 Self::SE => "SE",
322 Self::SO => "SO",
323 Self::SW => "SW",
324 Self::SL => "SL",
325 Self::SH => "SH",
326 Self::SN => "SN",
327 Self::W_URL => "W_URL",
329 Self::W_EMAIL => "W_EMAIL",
330 Self::W_HASHTAG => "W_HASHTAG",
331 Self::W_MENTION => "W_MENTION",
332 Self::W_EMOJI => "W_EMOJI",
333 Self::W_OTHER => "W_OTHER",
334 Self::Unknown => "UNKNOWN",
336 }
337 }
338}
339
340impl fmt::Display for KiwiPosTag {
341 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
342 write!(f, "{}", self.as_str())
343 }
344}
345
346#[must_use]
359pub const fn to_kiwi_tag(mecab_tag: PosTag) -> KiwiPosTag {
360 match mecab_tag {
361 PosTag::NNG => KiwiPosTag::NNG,
363 PosTag::NNP => KiwiPosTag::NNP,
364 PosTag::NNB | PosTag::NNBC => KiwiPosTag::NNB, PosTag::NP => KiwiPosTag::NP,
366 PosTag::NR => KiwiPosTag::NR,
367
368 PosTag::VV => KiwiPosTag::VV,
370 PosTag::VA => KiwiPosTag::VA,
371 PosTag::VX => KiwiPosTag::VX,
372 PosTag::VCP => KiwiPosTag::VCP,
373 PosTag::VCN => KiwiPosTag::VCN,
374
375 PosTag::MM => KiwiPosTag::MM,
377 PosTag::MAG => KiwiPosTag::MAG,
378 PosTag::MAJ => KiwiPosTag::MAJ,
379
380 PosTag::IC => KiwiPosTag::IC,
382
383 PosTag::JKS => KiwiPosTag::JKS,
385 PosTag::JKC => KiwiPosTag::JKC,
386 PosTag::JKG => KiwiPosTag::JKG,
387 PosTag::JKO => KiwiPosTag::JKO,
388 PosTag::JKB => KiwiPosTag::JKB,
389 PosTag::JKV => KiwiPosTag::JKV,
390 PosTag::JKQ => KiwiPosTag::JKQ,
391 PosTag::JX => KiwiPosTag::JX,
392 PosTag::JC => KiwiPosTag::JC,
393
394 PosTag::EP => KiwiPosTag::EP,
396 PosTag::EF => KiwiPosTag::EF,
397 PosTag::EC => KiwiPosTag::EC,
398 PosTag::ETN => KiwiPosTag::ETN,
399 PosTag::ETM => KiwiPosTag::ETM,
400
401 PosTag::XPN => KiwiPosTag::XPN,
403 PosTag::XSN => KiwiPosTag::XSN,
404 PosTag::XSV => KiwiPosTag::XSV,
405 PosTag::XSA => KiwiPosTag::XSA,
406 PosTag::XR => KiwiPosTag::XR,
407
408 PosTag::SF => KiwiPosTag::SF,
410 PosTag::SP | PosTag::SC => KiwiPosTag::SP, PosTag::SSO | PosTag::SSC => KiwiPosTag::SS, PosTag::SE => KiwiPosTag::SE,
413 PosTag::SY => KiwiPosTag::SO, PosTag::SL => KiwiPosTag::SL,
415 PosTag::SH => KiwiPosTag::SH,
416 PosTag::SN => KiwiPosTag::SN,
417
418 PosTag::Unknown => KiwiPosTag::Unknown,
420 }
421}
422
423#[must_use]
437pub const fn from_kiwi_tag(kiwi_tag: KiwiPosTag) -> PosTag {
438 match kiwi_tag {
439 KiwiPosTag::NNG => PosTag::NNG,
441 KiwiPosTag::NNP => PosTag::NNP,
442 KiwiPosTag::NNB => PosTag::NNB, KiwiPosTag::NP => PosTag::NP,
444 KiwiPosTag::NR => PosTag::NR,
445
446 KiwiPosTag::VV => PosTag::VV,
448 KiwiPosTag::VA => PosTag::VA,
449 KiwiPosTag::VX => PosTag::VX,
450 KiwiPosTag::VCP => PosTag::VCP,
451 KiwiPosTag::VCN => PosTag::VCN,
452
453 KiwiPosTag::MM => PosTag::MM,
455 KiwiPosTag::MAG => PosTag::MAG,
456 KiwiPosTag::MAJ => PosTag::MAJ,
457
458 KiwiPosTag::IC => PosTag::IC,
460
461 KiwiPosTag::JKS => PosTag::JKS,
463 KiwiPosTag::JKC => PosTag::JKC,
464 KiwiPosTag::JKG => PosTag::JKG,
465 KiwiPosTag::JKO => PosTag::JKO,
466 KiwiPosTag::JKB => PosTag::JKB,
467 KiwiPosTag::JKV => PosTag::JKV,
468 KiwiPosTag::JKQ => PosTag::JKQ,
469 KiwiPosTag::JX => PosTag::JX,
470 KiwiPosTag::JC => PosTag::JC,
471
472 KiwiPosTag::EP => PosTag::EP,
474 KiwiPosTag::EF => PosTag::EF,
475 KiwiPosTag::EC => PosTag::EC,
476 KiwiPosTag::ETN => PosTag::ETN,
477 KiwiPosTag::ETM => PosTag::ETM,
478
479 KiwiPosTag::XPN => PosTag::XPN,
481 KiwiPosTag::XSN => PosTag::XSN,
482 KiwiPosTag::XSV => PosTag::XSV,
483 KiwiPosTag::XSA => PosTag::XSA,
484 KiwiPosTag::XR => PosTag::XR,
485
486 KiwiPosTag::SF => PosTag::SF,
488 KiwiPosTag::SP => PosTag::SP, KiwiPosTag::SS => PosTag::SSO, KiwiPosTag::SE => PosTag::SE,
491 KiwiPosTag::SO | KiwiPosTag::SW => PosTag::SY, KiwiPosTag::SL
493 | KiwiPosTag::W_URL
494 | KiwiPosTag::W_EMAIL
495 | KiwiPosTag::W_HASHTAG
496 | KiwiPosTag::W_MENTION
497 | KiwiPosTag::W_EMOJI
498 | KiwiPosTag::W_OTHER => PosTag::SL, KiwiPosTag::SH => PosTag::SH,
500 KiwiPosTag::SN => PosTag::SN,
501
502 KiwiPosTag::Unknown => PosTag::Unknown,
504 }
505}
506
507#[derive(Debug, Clone, PartialEq)]
527pub struct KiwiToken {
528 pub form: String,
530 pub tag: KiwiPosTag,
532 pub start: usize,
534 pub length: usize,
536 pub score: f64,
538}
539
540impl KiwiToken {
541 pub fn new(
551 form: impl Into<String>,
552 tag: KiwiPosTag,
553 start: usize,
554 length: usize,
555 score: f64,
556 ) -> Self {
557 Self {
558 form: form.into(),
559 tag,
560 start,
561 length,
562 score,
563 }
564 }
565
566 #[must_use]
568 pub const fn end(&self) -> usize {
569 self.start + self.length
570 }
571
572 #[must_use]
574 pub const fn to_mecab_tag(&self) -> PosTag {
575 from_kiwi_tag(self.tag)
576 }
577}
578
579impl fmt::Display for KiwiToken {
580 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
581 write!(f, "{}/{}", self.form, self.tag)
582 }
583}
584
#[cfg(test)]
// Token scores are compared against the exact literals they were built from,
// so exact float equality is intended here.
#[allow(clippy::float_cmp)]
mod tests {
    use super::*;

    /// Every `KiwiPosTag` variant, in declaration order.
    ///
    /// Keep this list in sync with the enum when variants are added.
    const ALL_KIWI_TAGS: [KiwiPosTag; 49] = [
        KiwiPosTag::NNG,
        KiwiPosTag::NNP,
        KiwiPosTag::NNB,
        KiwiPosTag::NR,
        KiwiPosTag::NP,
        KiwiPosTag::VV,
        KiwiPosTag::VA,
        KiwiPosTag::VX,
        KiwiPosTag::VCP,
        KiwiPosTag::VCN,
        KiwiPosTag::MM,
        KiwiPosTag::MAG,
        KiwiPosTag::MAJ,
        KiwiPosTag::IC,
        KiwiPosTag::JKS,
        KiwiPosTag::JKC,
        KiwiPosTag::JKG,
        KiwiPosTag::JKO,
        KiwiPosTag::JKB,
        KiwiPosTag::JKV,
        KiwiPosTag::JKQ,
        KiwiPosTag::JX,
        KiwiPosTag::JC,
        KiwiPosTag::EP,
        KiwiPosTag::EF,
        KiwiPosTag::EC,
        KiwiPosTag::ETN,
        KiwiPosTag::ETM,
        KiwiPosTag::XPN,
        KiwiPosTag::XSN,
        KiwiPosTag::XSV,
        KiwiPosTag::XSA,
        KiwiPosTag::XR,
        KiwiPosTag::SF,
        KiwiPosTag::SP,
        KiwiPosTag::SS,
        KiwiPosTag::SE,
        KiwiPosTag::SO,
        KiwiPosTag::SW,
        KiwiPosTag::SL,
        KiwiPosTag::SH,
        KiwiPosTag::SN,
        KiwiPosTag::W_URL,
        KiwiPosTag::W_EMAIL,
        KiwiPosTag::W_HASHTAG,
        KiwiPosTag::W_MENTION,
        KiwiPosTag::W_EMOJI,
        KiwiPosTag::W_OTHER,
        KiwiPosTag::Unknown,
    ];

    #[test]
    fn test_kiwi_tag_from_str() {
        assert_eq!(KiwiPosTag::from_str("NNG"), Some(KiwiPosTag::NNG));
        assert_eq!(KiwiPosTag::from_str("VV"), Some(KiwiPosTag::VV));
        assert_eq!(KiwiPosTag::from_str("W_URL"), Some(KiwiPosTag::W_URL));
        assert_eq!(KiwiPosTag::from_str("UNKNOWN"), Some(KiwiPosTag::Unknown));
        assert_eq!(KiwiPosTag::from_str("INVALID"), None);
    }

    #[test]
    fn test_kiwi_tag_from_str_alias_and_rejections() {
        // "UNK" is accepted as a second spelling of the fallback tag.
        assert_eq!(KiwiPosTag::from_str("UNK"), Some(KiwiPosTag::Unknown));
        // Parsing is exact: no case folding, no trimming, no empty input.
        assert_eq!(KiwiPosTag::from_str("nng"), None);
        assert_eq!(KiwiPosTag::from_str(" NNG"), None);
        assert_eq!(KiwiPosTag::from_str(""), None);
    }

    #[test]
    fn test_kiwi_tag_as_str() {
        assert_eq!(KiwiPosTag::NNG.as_str(), "NNG");
        assert_eq!(KiwiPosTag::W_URL.as_str(), "W_URL");
        assert_eq!(KiwiPosTag::Unknown.as_str(), "UNKNOWN");
    }

    #[test]
    fn test_kiwi_tag_string_roundtrip() {
        // `as_str`, `from_str` and `Display` must agree for every variant.
        for tag in ALL_KIWI_TAGS {
            assert_eq!(
                KiwiPosTag::from_str(tag.as_str()),
                Some(tag),
                "String roundtrip failed for {tag:?}"
            );
            assert_eq!(tag.to_string(), tag.as_str());
        }
    }

    #[test]
    fn test_to_kiwi_tag_nominals() {
        assert_eq!(to_kiwi_tag(PosTag::NNG), KiwiPosTag::NNG);
        assert_eq!(to_kiwi_tag(PosTag::NNP), KiwiPosTag::NNP);
        assert_eq!(to_kiwi_tag(PosTag::NNB), KiwiPosTag::NNB);
        assert_eq!(to_kiwi_tag(PosTag::NNBC), KiwiPosTag::NNB);
        assert_eq!(to_kiwi_tag(PosTag::NP), KiwiPosTag::NP);
        assert_eq!(to_kiwi_tag(PosTag::NR), KiwiPosTag::NR);
    }

    #[test]
    fn test_to_kiwi_tag_predicates() {
        assert_eq!(to_kiwi_tag(PosTag::VV), KiwiPosTag::VV);
        assert_eq!(to_kiwi_tag(PosTag::VA), KiwiPosTag::VA);
        assert_eq!(to_kiwi_tag(PosTag::VX), KiwiPosTag::VX);
        assert_eq!(to_kiwi_tag(PosTag::VCP), KiwiPosTag::VCP);
        assert_eq!(to_kiwi_tag(PosTag::VCN), KiwiPosTag::VCN);
    }

    #[test]
    fn test_to_kiwi_tag_particles() {
        assert_eq!(to_kiwi_tag(PosTag::JKS), KiwiPosTag::JKS);
        assert_eq!(to_kiwi_tag(PosTag::JKO), KiwiPosTag::JKO);
        assert_eq!(to_kiwi_tag(PosTag::JX), KiwiPosTag::JX);
    }

    #[test]
    fn test_to_kiwi_tag_symbols() {
        assert_eq!(to_kiwi_tag(PosTag::SSO), KiwiPosTag::SS);
        assert_eq!(to_kiwi_tag(PosTag::SSC), KiwiPosTag::SS);
        assert_eq!(to_kiwi_tag(PosTag::SC), KiwiPosTag::SP);
        assert_eq!(to_kiwi_tag(PosTag::SY), KiwiPosTag::SO);
    }

    #[test]
    fn test_from_kiwi_tag_nominals() {
        assert_eq!(from_kiwi_tag(KiwiPosTag::NNG), PosTag::NNG);
        assert_eq!(from_kiwi_tag(KiwiPosTag::NNP), PosTag::NNP);
        assert_eq!(from_kiwi_tag(KiwiPosTag::NNB), PosTag::NNB);
    }

    #[test]
    fn test_from_kiwi_tag_symbols() {
        assert_eq!(from_kiwi_tag(KiwiPosTag::SS), PosTag::SSO);
        assert_eq!(from_kiwi_tag(KiwiPosTag::SO), PosTag::SY);
        assert_eq!(from_kiwi_tag(KiwiPosTag::SW), PosTag::SY);
    }

    #[test]
    fn test_from_kiwi_tag_web() {
        assert_eq!(from_kiwi_tag(KiwiPosTag::W_URL), PosTag::SL);
        assert_eq!(from_kiwi_tag(KiwiPosTag::W_EMAIL), PosTag::SL);
        assert_eq!(from_kiwi_tag(KiwiPosTag::W_HASHTAG), PosTag::SL);
        assert_eq!(from_kiwi_tag(KiwiPosTag::W_MENTION), PosTag::SL);
        assert_eq!(from_kiwi_tag(KiwiPosTag::W_EMOJI), PosTag::SL);
        assert_eq!(from_kiwi_tag(KiwiPosTag::W_OTHER), PosTag::SL);
    }

    #[test]
    fn test_roundtrip_conversion() {
        let tags = [
            PosTag::NNG,
            PosTag::VV,
            PosTag::JKS,
            PosTag::EP,
            PosTag::XPN,
            PosTag::SF,
        ];

        for tag in tags {
            let kiwi_tag = to_kiwi_tag(tag);
            let back = from_kiwi_tag(kiwi_tag);
            assert_eq!(tag, back, "Roundtrip failed for {tag:?}");
        }
    }

    #[test]
    fn test_kiwi_roundtrip_for_lossless_tags() {
        // `SW` and the `W_*` tags collapse onto `SY` / `SL` on the mecab side,
        // so they cannot come back unchanged; every other Kiwi tag must.
        for tag in ALL_KIWI_TAGS {
            let collapses = matches!(
                tag,
                KiwiPosTag::SW
                    | KiwiPosTag::W_URL
                    | KiwiPosTag::W_EMAIL
                    | KiwiPosTag::W_HASHTAG
                    | KiwiPosTag::W_MENTION
                    | KiwiPosTag::W_EMOJI
                    | KiwiPosTag::W_OTHER
            );
            if collapses {
                continue;
            }
            assert_eq!(
                to_kiwi_tag(from_kiwi_tag(tag)),
                tag,
                "Roundtrip failed for {tag:?}"
            );
        }
    }

    #[test]
    fn test_lossy_conversion() {
        assert_eq!(from_kiwi_tag(to_kiwi_tag(PosTag::NNBC)), PosTag::NNB);

        assert_eq!(from_kiwi_tag(to_kiwi_tag(PosTag::SSC)), PosTag::SSO);

        assert_eq!(from_kiwi_tag(to_kiwi_tag(PosTag::SC)), PosTag::SP);
    }

    #[test]
    fn test_kiwi_token_creation() {
        let token = KiwiToken::new("안녕", KiwiPosTag::NNG, 0, 6, -10.5);
        assert_eq!(token.form, "안녕");
        assert_eq!(token.tag, KiwiPosTag::NNG);
        assert_eq!(token.start, 0);
        assert_eq!(token.length, 6);
        assert_eq!(token.score, -10.5);
        assert_eq!(token.end(), 6);
    }

    #[test]
    fn test_kiwi_token_display() {
        let token = KiwiToken::new("하다", KiwiPosTag::VV, 0, 6, -5.0);
        assert_eq!(token.to_string(), "하다/VV");
    }

    #[test]
    fn test_kiwi_token_to_mecab() {
        let token = KiwiToken::new("것", KiwiPosTag::NNB, 0, 3, -8.2);
        assert_eq!(token.to_mecab_tag(), PosTag::NNB);

        let url_token = KiwiToken::new("http://example.com", KiwiPosTag::W_URL, 0, 18, -15.0);
        assert_eq!(url_token.to_mecab_tag(), PosTag::SL);
    }

    #[test]
    fn test_all_kiwi_tags_covered() {
        for tag in ALL_KIWI_TAGS {
            let mecab_tag = from_kiwi_tag(tag);
            assert_ne!(mecab_tag.as_str(), "", "Conversion failed for {tag:?}");
        }
    }

    #[test]
    fn test_all_mecab_tags_covered() {
        for tag in PosTag::all() {
            let kiwi_tag = to_kiwi_tag(*tag);
            assert_ne!(kiwi_tag.as_str(), "", "Conversion failed for {tag:?}");
        }
    }
}