diff --git a/Cargo.toml b/Cargo.toml index 2a2a036..ccfce3d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,19 +1,18 @@ [package] -name = "tinysegmenter" -version = "0.1.1" -authors = [ "woxtu " ] +authors = ["woxtu "] description = "Compact Japanese tokenizer" -repository = "https://github.com/woxtu/rust-tinysegmenter" license = "MIT" - -[dependencies] -lazy_static = "1.0" -maplit = "1.0" +name = "tinysegmenter" +repository = "https://github.com/woxtu/rust-tinysegmenter" +version = "0.1.1" +[[bench]] +name = "benchmark" +path = "benchmark/benchmark.rs" [[test]] name = "test" path = "test/test.rs" -[[bench]] -name = "benchmark" -path = "benchmark/benchmark.rs" +[dependencies] +fnv = "1.0.6" +lazy_static = "1.0" diff --git a/benchmark/benchmark.rs b/benchmark/benchmark.rs index 1f77c12..54cd818 100644 --- a/benchmark/benchmark.rs +++ b/benchmark/benchmark.rs @@ -3,18 +3,21 @@ extern crate test; extern crate tinysegmenter; -use std::io::prelude::*; use std::fs::File; +use std::io::prelude::*; use test::Bencher; #[bench] fn run(b: &mut Bencher) { - // http://www.genpaku.org/timemachine/timemachineu8j.txt - let mut f = - File::open("benchmark/timemachineu8j.txt") - .expect("Failed to read a benchmark text."); - let mut s = String::new(); - let _ = f.read_to_string(&mut s); + // http://www.genpaku.org/timemachine/timemachineu8j.txt + let mut f = + File::open("benchmark/timemachineu8j.txt").expect("Failed to read a benchmark text."); + let mut s = String::new(); + let _ = f.read_to_string(&mut s); - b.iter(|| tinysegmenter::tokenize(&s)); + b.iter(|| tinysegmenter::tokenize(&s)); +} +#[bench] +fn test_small(b: &mut test::Bencher) { + b.iter(|| tinysegmenter::tokenize("私はおでぶです")) } diff --git a/src/constants.rs b/src/constants.rs deleted file mode 100644 index d40d615..0000000 --- a/src/constants.rs +++ /dev/null @@ -1,61 +0,0 @@ -const BIAS: i32 = -332; - -lazy_static! 
{ - static ref B1: char = unsafe { char::from_u32_unchecked(0x110001) }; - static ref B2: char = unsafe { char::from_u32_unchecked(0x110002) }; - static ref B3: char = unsafe { char::from_u32_unchecked(0x110003) }; - static ref E1: char = unsafe { char::from_u32_unchecked(0x110004) }; - static ref E2: char = unsafe { char::from_u32_unchecked(0x110005) }; - static ref E3: char = unsafe { char::from_u32_unchecked(0x110006) }; -} - -lazy_static! { - static ref BC1: HashMap<(char, char), i32> = hashmap! { ('H', 'H') => 6, ('I', 'I') => 2461, ('K', 'H') => 406, ('O', 'H') => -1378, }; - static ref BC2: HashMap<(char, char), i32> = hashmap! { ('A', 'A') => -3267, ('A', 'I') => 2744, ('A', 'N') => -878, ('H', 'H') => -4070, ('H', 'M') => -1711, ('H', 'N') => 4012, ('H', 'O') => 3761, ('I', 'A') => 1327, ('I', 'H') => -1184, ('I', 'I') => -1332, ('I', 'K') => 1721, ('I', 'O') => 5492, ('K', 'I') => 3831, ('K', 'K') => -8741, ('M', 'H') => -3132, ('M', 'K') => 3334, ('O', 'O') => -2920, }; - static ref BC3: HashMap<(char, char), i32> = hashmap! { ('H', 'H') => 996, ('H', 'I') => 626, ('H', 'K') => -721, ('H', 'N') => -1307, ('H', 'O') => -836, ('I', 'H') => -301, ('K', 'K') => 2762, ('M', 'K') => 1079, ('M', 'M') => 4034, ('O', 'A') => -1652, ('O', 'H') => 266, }; - static ref BP1: HashMap<(char, char), i32> = hashmap! { ('B', 'B') => 295, ('O', 'B') => 304, ('O', 'O') => -125, ('U', 'B') => 352, }; - static ref BP2: HashMap<(char, char), i32> = hashmap! { ('B', 'O') => 60, ('O', 'O') => -1762, }; - static ref BQ1: HashMap<(char, char, char), i32> = hashmap! { ('B', 'H', 'H') => 1150, ('B', 'H', 'M') => 1521, ('B', 'I', 'I') => -1158, ('B', 'I', 'M') => 886, ('B', 'M', 'H') => 1208, ('B', 'N', 'H') => 449, ('B', 'O', 'H') => -91, ('B', 'O', 'O') => -2597, ('O', 'H', 'I') => 451, ('O', 'I', 'H') => -296, ('O', 'K', 'A') => 1851, ('O', 'K', 'H') => -1020, ('O', 'K', 'K') => 904, ('O', 'O', 'O') => 2965, }; - static ref BQ2: HashMap<(char, char, char), i32> = hashmap! 
{ ('B', 'H', 'H') => 118, ('B', 'H', 'I') => -1159, ('B', 'H', 'M') => 466, ('B', 'I', 'H') => -919, ('B', 'K', 'K') => -1720, ('B', 'K', 'O') => 864, ('O', 'H', 'H') => -1139, ('O', 'H', 'M') => -181, ('O', 'I', 'H') => 153, ('U', 'H', 'I') => -1146, }; - static ref BQ3: HashMap<(char, char, char), i32> = hashmap! { ('B', 'H', 'H') => -792, ('B', 'H', 'I') => 2664, ('B', 'I', 'I') => -299, ('B', 'K', 'I') => 419, ('B', 'M', 'H') => 937, ('B', 'M', 'M') => 8335, ('B', 'N', 'N') => 998, ('B', 'O', 'H') => 775, ('O', 'H', 'H') => 2174, ('O', 'H', 'M') => 439, ('O', 'I', 'I') => 280, ('O', 'K', 'H') => 1798, ('O', 'K', 'I') => -793, ('O', 'K', 'O') => -2242, ('O', 'M', 'H') => -2402, ('O', 'O', 'O') => 11699, }; - static ref BQ4: HashMap<(char, char, char), i32> = hashmap! { ('B', 'H', 'H') => -3895, ('B', 'I', 'H') => 3761, ('B', 'I', 'I') => -4654, ('B', 'I', 'K') => 1348, ('B', 'K', 'K') => -1806, ('B', 'M', 'I') => -3385, ('B', 'O', 'O') => -12396, ('O', 'A', 'H') => 926, ('O', 'H', 'H') => 266, ('O', 'H', 'K') => -2036, ('O', 'N', 'N') => -973, }; - static ref BW1: HashMap<(char, char), i32> = hashmap! 
{ (',', 'と') => 660, (',', '同') => 727, (*B1, 'あ') => 1404, (*B1, '同') => 542, ('、', 'と') => 660, ('、', '同') => 727, ('」', 'と') => 1682, ('あ', 'っ') => 1505, ('い', 'う') => 1743, ('い', 'っ') => -2055, ('い', 'る') => 672, ('う', 'し') => -4817, ('う', 'ん') => 665, ('か', 'ら') => 3472, ('が', 'ら') => 600, ('こ', 'う') => -790, ('こ', 'と') => 2083, ('こ', 'ん') => -1262, ('さ', 'ら') => -4143, ('さ', 'ん') => 4573, ('し', 'た') => 2641, ('し', 'て') => 1104, ('す', 'で') => -3399, ('そ', 'こ') => 1977, ('そ', 'れ') => -871, ('た', 'ち') => 1122, ('た', 'め') => 601, ('っ', 'た') => 3463, ('つ', 'い') => -802, ('て', 'い') => 805, ('て', 'き') => 1249, ('で', 'き') => 1127, ('で', 'す') => 3445, ('で', 'は') => 844, ('と', 'い') => -4915, ('と', 'み') => 1922, ('ど', 'こ') => 3887, ('な', 'い') => 5713, ('な', 'っ') => 3015, ('な', 'ど') => 7379, ('な', 'ん') => -1113, ('に', 'し') => 2468, ('に', 'は') => 1498, ('に', 'も') => 1671, ('に', '対') => -912, ('の', '一') => -501, ('の', '中') => 741, ('ま', 'せ') => 2448, ('ま', 'で') => 1711, ('ま', 'ま') => 2600, ('ま', 'る') => -2155, ('や', 'む') => -1947, ('よ', 'っ') => -2565, ('れ', 'た') => 2369, ('れ', 'で') => -913, ('を', 'し') => 1860, ('を', '見') => 731, ('亡', 'く') => -1886, ('京', '都') => 2558, ('取', 'り') => -2784, ('大', 'き') => -2604, ('大', '阪') => 1497, ('平', '方') => -2314, ('引', 'き') => -1336, ('日', '本') => -195, ('本', '当') => -2423, ('毎', '日') => -2113, ('目', '指') => -724, ('」', 'と') => 1682, }; - static ref BW2: HashMap<(char, char), i32> = hashmap! 
{ ('.', '.') => -11822, ('1', '1') => -669, ('―', '―') => -5730, ('−', '−') => -13175, ('い', 'う') => -1609, ('う', 'か') => 2490, ('か', 'し') => -1350, ('か', 'も') => -602, ('か', 'ら') => -7194, ('か', 'れ') => 4612, ('が', 'い') => 853, ('が', 'ら') => -3198, ('き', 'た') => 1941, ('く', 'な') => -1597, ('こ', 'と') => -8392, ('こ', 'の') => -4193, ('さ', 'せ') => 4533, ('さ', 'れ') => 13168, ('さ', 'ん') => -3977, ('し', 'い') => -1819, ('し', 'か') => -545, ('し', 'た') => 5078, ('し', 'て') => 972, ('し', 'な') => 939, ('そ', 'の') => -3744, ('た', 'い') => -1253, ('た', 'た') => -662, ('た', 'だ') => -3857, ('た', 'ち') => -786, ('た', 'と') => 1224, ('た', 'は') => -939, ('っ', 'た') => 4589, ('っ', 'て') => 1647, ('っ', 'と') => -2094, ('て', 'い') => 6144, ('て', 'き') => 3640, ('て', 'く') => 2551, ('て', 'は') => -3110, ('て', 'も') => -3065, ('で', 'い') => 2666, ('で', 'き') => -1528, ('で', 'し') => -3828, ('で', 'す') => -4761, ('で', 'も') => -4203, ('と', 'い') => 1890, ('と', 'こ') => -1746, ('と', 'と') => -2279, ('と', 'の') => 720, ('と', 'み') => 5168, ('と', 'も') => -3941, ('な', 'い') => -2488, ('な', 'が') => -1313, ('な', 'ど') => -6509, ('な', 'の') => 2614, ('な', 'ん') => 3099, ('に', 'お') => -1615, ('に', 'し') => 2748, ('に', 'な') => 2454, ('に', 'よ') => -7236, ('に', '対') => -14943, ('に', '従') => -4688, ('に', '関') => -11388, ('の', 'か') => 2093, ('の', 'で') => -7059, ('の', 'に') => -6041, ('の', 'の') => -6125, ('は', 'い') => 1073, ('は', 'が') => -1033, ('は', 'ず') => -2532, ('ば', 'れ') => 1813, ('ま', 'し') => -1316, ('ま', 'で') => -6621, ('ま', 'れ') => 5409, ('め', 'て') => -3153, ('も', 'い') => 2230, ('も', 'の') => -10713, ('ら', 'か') => -944, ('ら', 'し') => -1611, ('ら', 'に') => -1897, ('り', 'し') => 651, ('り', 'ま') => 1620, ('れ', 'た') => 4270, ('れ', 'て') => 849, ('れ', 'ば') => 4114, ('ろ', 'う') => 6067, ('わ', 'れ') => 7901, ('を', '通') => -11877, ('ん', 'だ') => 728, ('ん', 'な') => -4115, ('一', '人') => 602, ('一', '方') => -1375, ('一', '日') => 970, ('一', '部') => -1051, ('上', 'が') => -4479, ('会', '社') => -1116, ('出', 'て') => 2163, ('分', 'の') => -7758, ('同', 
'党') => 970, ('同', '日') => -913, ('大', '阪') => -2471, ('委', '員') => -1250, ('少', 'な') => -1050, ('年', '度') => -8669, ('年', '間') => -1626, ('府', '県') => -2363, ('手', '権') => -1982, ('新', '聞') => -4066, ('日', '新') => -722, ('日', '本') => -7068, ('日', '米') => 3372, ('曜', '日') => -601, ('朝', '鮮') => -2355, ('本', '人') => -2697, ('東', '京') => -1543, ('然', 'と') => -1384, ('社', '会') => -1276, ('立', 'て') => -990, ('第', 'に') => -1612, ('米', '国') => -4268, ('1', '1') => -669, ('ク', '゙') => 1319,}; - static ref BW3: HashMap<(char, char), i32> = hashmap! { ('あ', 'た') => -2194, ('あ', 'り') => 719, ('あ', 'る') => 3846, ('い', '.') => -1185, ('い', '。') => -1185, ('い', 'い') => 5308, ('い', 'え') => 2079, ('い', 'く') => 3029, ('い', 'た') => 2056, ('い', 'っ') => 1883, ('い', 'る') => 5600, ('い', 'わ') => 1527, ('う', 'ち') => 1117, ('う', 'と') => 4798, ('え', 'と') => 1454, ('か', '.') => 2857, ('か', '。') => 2857, ('か', 'け') => -743, ('か', 'っ') => -4098, ('か', 'に') => -669, ('か', 'ら') => 6520, ('か', 'り') => -2670, ('が', ',') => 1816, ('が', '、') => 1816, ('が', 'き') => -4855, ('が', 'け') => -1127, ('が', 'っ') => -913, ('が', 'ら') => -4977, ('が', 'り') => -2064, ('き', 'た') => 1645, ('け', 'ど') => 1374, ('こ', 'と') => 7397, ('こ', 'の') => 1542, ('こ', 'ろ') => -2757, ('さ', 'い') => -714, ('さ', 'を') => 976, ('し', ',') => 1557, ('し', '、') => 1557, ('し', 'い') => -3714, ('し', 'た') => 3562, ('し', 'て') => 1449, ('し', 'な') => 2608, ('し', 'ま') => 1200, ('す', '.') => -1310, ('す', '。') => -1310, ('す', 'る') => 6521, ('ず', ',') => 3426, ('ず', '、') => 3426, ('ず', 'に') => 841, ('そ', 'う') => 428, ('た', '.') => 8875, ('た', '。') => 8875, ('た', 'い') => -594, ('た', 'の') => 812, ('た', 'り') => -1183, ('た', 'る') => -853, ('だ', '.') => 4098, ('だ', '。') => 4098, ('だ', 'っ') => 1004, ('っ', 'た') => -4748, ('っ', 'て') => 300, ('て', 'い') => 6240, ('て', 'お') => 855, ('て', 'も') => 302, ('で', 'す') => 1437, ('で', 'に') => -1482, ('で', 'は') => 2295, ('と', 'う') => -1387, ('と', 'し') => 2266, ('と', 'の') => 541, ('と', 'も') => -3543, ('ど', 'う') => 4664, 
('な', 'い') => 1796, ('な', 'く') => -903, ('な', 'ど') => 2135, ('に', ',') => -1021, ('に', '、') => -1021, ('に', 'し') => 1771, ('に', 'な') => 1906, ('に', 'は') => 2644, ('の', ',') => -724, ('の', '、') => -724, ('の', '子') => -1000, ('は', ',') => 1337, ('は', '、') => 1337, ('べ', 'き') => 2181, ('ま', 'し') => 1113, ('ま', 'す') => 6943, ('ま', 'っ') => -1549, ('ま', 'で') => 6154, ('ま', 'れ') => -793, ('ら', 'し') => 1479, ('ら', 'れ') => 6820, ('る', 'る') => 3818, ('れ', ',') => 854, ('れ', '、') => 854, ('れ', 'た') => 1850, ('れ', 'て') => 1375, ('れ', 'ば') => -3246, ('れ', 'る') => 1091, ('わ', 'れ') => -605, ('ん', 'だ') => 606, ('ん', 'で') => 798, ('カ', '月') => 990, ('会', '議') => 860, ('入', 'り') => 1232, ('大', '会') => 2217, ('始', 'め') => 1681, ('市', ' ') => 965, ('新', '聞') => -5055, ('日', ',') => 974, ('日', '、') => 974, ('社', '会') => 2024, ('カ', '月') => 990, }; -} - -lazy_static! { - static ref TC1: HashMap<(char, char, char), i32> = hashmap! { ('A', 'A', 'A') => 1093, ('H', 'H', 'H') => 1029, ('H', 'H', 'M') => 580, ('H', 'I', 'I') => 998, ('H', 'O', 'H') => -390, ('H', 'O', 'M') => -331, ('I', 'H', 'I') => 1169, ('I', 'O', 'H') => -142, ('I', 'O', 'I') => -1015, ('I', 'O', 'M') => 467, ('M', 'M', 'H') => 187, ('O', 'O', 'I') => -1832, }; - static ref TC2: HashMap<(char, char, char), i32> = hashmap! { ('H', 'H', 'O') => 2088, ('H', 'I', 'I') => -1023, ('H', 'M', 'M') => -1154, ('I', 'H', 'I') => -1965, ('K', 'K', 'H') => 703, ('O', 'I', 'I') => -2649, }; - static ref TC3: HashMap<(char, char, char), i32> = hashmap! 
{ ('A', 'A', 'A') => -294, ('H', 'H', 'H') => 346, ('H', 'H', 'I') => -341, ('H', 'I', 'I') => -1088, ('H', 'I', 'K') => 731, ('H', 'O', 'H') => -1486, ('I', 'H', 'H') => 128, ('I', 'H', 'I') => -3041, ('I', 'H', 'O') => -1935, ('I', 'I', 'H') => -825, ('I', 'I', 'M') => -1035, ('I', 'O', 'I') => -542, ('K', 'H', 'H') => -1216, ('K', 'K', 'A') => 491, ('K', 'K', 'H') => -1217, ('K', 'O', 'K') => -1009, ('M', 'H', 'H') => -2694, ('M', 'H', 'M') => -457, ('M', 'H', 'O') => 123, ('M', 'M', 'H') => -471, ('N', 'N', 'H') => -1689, ('N', 'N', 'O') => 662, ('O', 'H', 'O') => -3393, }; - static ref TC4: HashMap<(char, char, char), i32> = hashmap! { ('H', 'H', 'H') => -203, ('H', 'H', 'I') => 1344, ('H', 'H', 'K') => 365, ('H', 'H', 'M') => -122, ('H', 'H', 'N') => 182, ('H', 'H', 'O') => 669, ('H', 'I', 'H') => 804, ('H', 'I', 'I') => 679, ('H', 'O', 'H') => 446, ('I', 'H', 'H') => 695, ('I', 'H', 'O') => -2324, ('I', 'I', 'H') => 321, ('I', 'I', 'I') => 1497, ('I', 'I', 'O') => 656, ('I', 'O', 'O') => 54, ('K', 'A', 'K') => 4845, ('K', 'K', 'A') => 3386, ('K', 'K', 'K') => 3065, ('M', 'H', 'H') => -405, ('M', 'H', 'I') => 201, ('M', 'M', 'H') => -241, ('M', 'M', 'M') => 661, ('M', 'O', 'M') => 841, }; - static ref TQ1: HashMap<(char, char, char, char), i32> = hashmap! { ('B', 'H', 'H', 'H') => -227, ('B', 'H', 'H', 'I') => 316, ('B', 'H', 'I', 'H') => -132, ('B', 'I', 'H', 'H') => 60, ('B', 'I', 'I', 'I') => 1595, ('B', 'N', 'H', 'H') => -744, ('B', 'O', 'H', 'H') => 225, ('B', 'O', 'O', 'O') => -908, ('O', 'A', 'K', 'K') => 482, ('O', 'H', 'H', 'H') => 281, ('O', 'H', 'I', 'H') => 249, ('O', 'I', 'H', 'I') => 200, ('O', 'I', 'I', 'H') => -68, }; - static ref TQ2: HashMap<(char, char, char, char), i32> = hashmap! { ('B', 'I', 'H', 'H') => -1401, ('B', 'I', 'I', 'I') => -1033, ('B', 'K', 'A', 'K') => -543, ('B', 'O', 'O', 'O') => -5591, }; - static ref TQ3: HashMap<(char, char, char, char), i32> = hashmap! 
{ ('B', 'H', 'H', 'H') => 478, ('B', 'H', 'H', 'M') => -1073, ('B', 'H', 'I', 'H') => 222, ('B', 'H', 'I', 'I') => -504, ('B', 'I', 'I', 'H') => -116, ('B', 'I', 'I', 'I') => -105, ('B', 'M', 'H', 'I') => -863, ('B', 'M', 'H', 'M') => -464, ('B', 'O', 'M', 'H') => 620, ('O', 'H', 'H', 'H') => 346, ('O', 'H', 'H', 'I') => 1729, ('O', 'H', 'I', 'I') => 997, ('O', 'H', 'M', 'H') => 481, ('O', 'I', 'H', 'H') => 623, ('O', 'I', 'I', 'H') => 1344, ('O', 'K', 'A', 'K') => 2792, ('O', 'K', 'H', 'H') => 587, ('O', 'K', 'K', 'A') => 679, ('O', 'O', 'H', 'H') => 110, ('O', 'O', 'I', 'I') => -685, }; - static ref TQ4: HashMap<(char, char, char, char), i32> = hashmap! { ('B', 'H', 'H', 'H') => -721, ('B', 'H', 'H', 'M') => -3604, ('B', 'H', 'I', 'I') => -966, ('B', 'I', 'I', 'H') => -607, ('B', 'I', 'I', 'I') => -2181, ('O', 'A', 'A', 'A') => -2763, ('O', 'A', 'K', 'K') => 180, ('O', 'H', 'H', 'H') => -294, ('O', 'H', 'H', 'I') => 2446, ('O', 'H', 'H', 'O') => 480, ('O', 'H', 'I', 'H') => -1573, ('O', 'I', 'H', 'H') => 1935, ('O', 'I', 'H', 'I') => -493, ('O', 'I', 'I', 'H') => 626, ('O', 'I', 'I', 'I') => -4007, ('O', 'K', 'A', 'K') => -8156, }; - static ref TW1: HashMap<(char, char, char), i32> = hashmap! { ('に', 'つ', 'い') => -4681, ('東', '京', '都') => 2026, }; - static ref TW2: HashMap<(char, char, char), i32> = hashmap! { ('あ', 'る', '程') => -2049, ('い', 'っ', 'た') => -1256, ('こ', 'ろ', 'が') => -2434, ('し', 'ょ', 'う') => 3873, ('そ', 'の', '後') => -4430, ('だ', 'っ', 'て') => -1049, ('て', 'い', 'た') => 1833, ('と', 'し', 'て') => -4657, ('と', 'も', 'に') => -4517, ('も', 'の', 'で') => 1882, ('一', '気', 'に') => -792, ('初', 'め', 'て') => -1512, ('同', '時', 'に') => -8097, ('大', 'き', 'な') => -1255, ('対', 'し', 'て') => -2721, ('社', '会', '党') => -3216, }; - static ref TW3: HashMap<(char, char, char), i32> = hashmap! 
{ ('い', 'た', 'だ') => -1734, ('し', 'て', 'い') => 1314, ('と', 'し', 'て') => -4314, ('に', 'つ', 'い') => -5483, ('に', 'と', 'っ') => -5989, ('に', '当', 'た') => -6247, ('の', 'で', ',') => -727, ('の', 'で', '、') => -727, ('の', 'も', 'の') => -600, ('れ', 'か', 'ら') => -3752, ('十', '二', '月') => -2287, }; - static ref TW4: HashMap<(char, char, char), i32> = hashmap! { ('い', 'う', '.') => 8576, ('い', 'う', '。') => 8576, ('か', 'ら', 'な') => -2348, ('し', 'て', 'い') => 2958, ('た', 'が', ',') => 1516, ('た', 'が', '、') => 1516, ('て', 'い', 'る') => 1538, ('と', 'い', 'う') => 1349, ('ま', 'し', 'た') => 5543, ('ま', 'せ', 'ん') => 1097, ('よ', 'う', 'と') => -4258, ('よ', 'る', 'と') => 5865, }; -} - -lazy_static! { - static ref UC1: HashMap = hashmap! { 'A' => 484, 'K' => 93, 'M' => 645, 'O' => -505, }; - static ref UC2: HashMap = hashmap! { 'A' => 819, 'H' => 1059, 'I' => 409, 'M' => 3987, 'N' => 5775, 'O' => 646, }; - static ref UC3: HashMap = hashmap! { 'A' => -1370, 'I' => 2311, }; - static ref UC4: HashMap = hashmap! { 'A' => -2643, 'H' => 1809, 'I' => -1032, 'K' => -3450, 'M' => 3565, 'N' => 3876, 'O' => 6646, }; - static ref UC5: HashMap = hashmap! { 'H' => 313, 'I' => -1238, 'K' => -799, 'M' => 539, 'O' => -831, }; - static ref UC6: HashMap = hashmap! { 'H' => -506, 'I' => -253, 'K' => 87, 'M' => 247, 'O' => -387, }; - static ref UP1: HashMap = hashmap! { 'O' => -214, }; - static ref UP2: HashMap = hashmap! { 'B' => 69, 'O' => 935, }; - static ref UP3: HashMap = hashmap! { 'B' => 189, }; - static ref UQ1: HashMap<(char, char), i32> = hashmap! { ('B', 'H') => 21, ('B', 'I') => -12, ('B', 'K') => -99, ('B', 'N') => 142, ('B', 'O') => -56, ('O', 'H') => -95, ('O', 'I') => 477, ('O', 'K') => 410, ('O', 'O') => -2422, }; - static ref UQ2: HashMap<(char, char), i32> = hashmap! { ('B', 'H') => 216, ('B', 'I') => 113, ('O', 'K') => 1759, }; - static ref UQ3: HashMap<(char, char), i32> = hashmap! 
{ ('B', 'A') => -479, ('B', 'H') => 42, ('B', 'I') => 1913, ('B', 'K') => -7198, ('B', 'M') => 3160, ('B', 'N') => 6427, ('B', 'O') => 14761, ('O', 'I') => -827, ('O', 'N') => -3212, }; - static ref UW1: HashMap = hashmap! { ',' => 156, '、' => 156, '「' => -463, 'あ' => -941, 'う' => -127, 'が' => -553, 'き' => 121, 'こ' => 505, 'で' => -201, 'と' => -547, 'ど' => -123, 'に' => -789, 'の' => -185, 'は' => -847, 'も' => -466, 'や' => -470, 'よ' => 182, 'ら' => -292, 'り' => 208, 'れ' => 169, 'を' => -446, 'ん' => -137, '・' => -135, '主' => -402, '京' => -268, '区' => -912, '午' => 871, '国' => -460, '大' => 561, '委' => 729, '市' => -411, '日' => -141, '理' => 361, '生' => -408, '県' => -386, '都' => -718, '「' => -463, '・' => -135, }; - static ref UW2: HashMap = hashmap! { ',' => -829, '、' => -829, '〇' => 892, '「' => -645, '」' => 3145, 'あ' => -538, 'い' => 505, 'う' => 134, 'お' => -502, 'か' => 1454, 'が' => -856, 'く' => -412, 'こ' => 1141, 'さ' => 878, 'ざ' => 540, 'し' => 1529, 'す' => -675, 'せ' => 300, 'そ' => -1011, 'た' => 188, 'だ' => 1837, 'つ' => -949, 'て' => -291, 'で' => -268, 'と' => -981, 'ど' => 1273, 'な' => 1063, 'に' => -1764, 'の' => 130, 'は' => -409, 'ひ' => -1273, 'べ' => 1261, 'ま' => 600, 'も' => -1263, 'や' => -402, 'よ' => 1639, 'り' => -579, 'る' => -694, 'れ' => 571, 'を' => -2516, 'ん' => 2095, 'ア' => -587, 'カ' => 306, 'キ' => 568, 'ッ' => 831, '三' => -758, '不' => -2150, '世' => -302, '中' => -968, '主' => -861, '事' => 492, '人' => -123, '会' => 978, '保' => 362, '入' => 548, '初' => -3025, '副' => -1566, '北' => -3414, '区' => -422, '大' => -1769, '天' => -865, '太' => -483, '子' => -1519, '学' => 760, '実' => 1023, '小' => -2009, '市' => -813, '年' => -1060, '強' => 1067, '手' => -1519, '揺' => -1033, '政' => 1522, '文' => -1355, '新' => -1682, '日' => -1815, '明' => -1462, '最' => -630, '朝' => -1843, '本' => -1650, '東' => -931, '果' => -665, '次' => -2378, '民' => -180, '気' => -1740, '理' => 752, '発' => 529, '目' => -1584, '相' => -242, '県' => -1165, '立' => -763, '第' => 810, '米' => 509, '自' => -1353, '行' => 838, '西' => -744, '見' => 
-3874, '調' => 1010, '議' => 1198, '込' => 3041, '開' => 1758, '間' => -1257, '「' => -645, '」' => 3145, 'ッ' => 831, 'ア' => -587, 'カ' => 306, 'キ' => 568, }; - static ref UW3: HashMap = hashmap! { ',' => 4889, '1' => -800, '−' => -1723, '、' => 4889, '々' => -2311, '〇' => 5827, '」' => 2670, '〓' => -3573, 'あ' => -2696, 'い' => 1006, 'う' => 2342, 'え' => 1983, 'お' => -4864, 'か' => -1163, 'が' => 3271, 'く' => 1004, 'け' => 388, 'げ' => 401, 'こ' => -3552, 'ご' => -3116, 'さ' => -1058, 'し' => -395, 'す' => 584, 'せ' => 3685, 'そ' => -5228, 'た' => 842, 'ち' => -521, 'っ' => -1444, 'つ' => -1081, 'て' => 6167, 'で' => 2318, 'と' => 1691, 'ど' => -899, 'な' => -2788, 'に' => 2745, 'の' => 4056, 'は' => 4555, 'ひ' => -2171, 'ふ' => -1798, 'へ' => 1199, 'ほ' => -5516, 'ま' => -4384, 'み' => -120, 'め' => 1205, 'も' => 2323, 'や' => -788, 'よ' => -202, 'ら' => 727, 'り' => 649, 'る' => 5905, 'れ' => 2773, 'わ' => -1207, 'を' => 6620, 'ん' => -518, 'ア' => 551, 'グ' => 1319, 'ス' => 874, 'ッ' => -1350, 'ト' => 521, 'ム' => 1109, 'ル' => 1591, 'ロ' => 2201, 'ン' => 278, '・' => -3794, '一' => -1619, '下' => -1759, '世' => -2087, '両' => 3815, '中' => 653, '主' => -758, '予' => -1193, '二' => 974, '人' => 2742, '今' => 792, '他' => 1889, '以' => -1368, '低' => 811, '何' => 4265, '作' => -361, '保' => -2439, '元' => 4858, '党' => 3593, '全' => 1574, '公' => -3030, '六' => 755, '共' => -1880, '円' => 5807, '再' => 3095, '分' => 457, '初' => 2475, '別' => 1129, '前' => 2286, '副' => 4437, '力' => 365, '動' => -949, '務' => -1872, '化' => 1327, '北' => -1038, '区' => 4646, '千' => -2309, '午' => -783, '協' => -1006, '口' => 483, '右' => 1233, '各' => 3588, '合' => -241, '同' => 3906, '和' => -837, '員' => 4513, '国' => 642, '型' => 1389, '場' => 1219, '外' => -241, '妻' => 2016, '学' => -1356, '安' => -423, '実' => -1008, '家' => 1078, '小' => -513, '少' => -3102, '州' => 1155, '市' => 3197, '平' => -1804, '年' => 2416, '広' => -1030, '府' => 1605, '度' => 1452, '建' => -2352, '当' => -3885, '得' => 1905, '思' => -1291, '性' => 1822, '戸' => -488, '指' => -3973, '政' => -2013, '教' => -1479, '数' => 3222, '文' 
=> -1489, '新' => 1764, '日' => 2099, '旧' => 5792, '昨' => -661, '時' => -1248, '曜' => -951, '最' => -937, '月' => 4125, '期' => 360, '李' => 3094, '村' => 364, '東' => -805, '核' => 5156, '森' => 2438, '業' => 484, '氏' => 2613, '民' => -1694, '決' => -1073, '法' => 1868, '海' => -495, '無' => 979, '物' => 461, '特' => -3850, '生' => -273, '用' => 914, '町' => 1215, '的' => 7313, '直' => -1835, '省' => 792, '県' => 6293, '知' => -1528, '私' => 4231, '税' => 401, '立' => -960, '第' => 1201, '米' => 7767, '系' => 3066, '約' => 3663, '級' => 1384, '統' => -4229, '総' => 1163, '線' => 1255, '者' => 6457, '能' => 725, '自' => -2869, '英' => 785, '見' => 1044, '調' => -562, '財' => -733, '費' => 1777, '車' => 1835, '軍' => 1375, '込' => -1504, '通' => -1136, '選' => -681, '郎' => 1026, '郡' => 4404, '部' => 1200, '金' => 2163, '長' => 421, '開' => -1432, '間' => 1302, '関' => -1282, '雨' => 2009, '電' => -1045, '非' => 2066, '駅' => 1620, '1' => -800, '」' => 2670, '・' => -3794, 'ッ' => -1350, 'ア' => 551, 'ス' => 874, 'ト' => 521, 'ム' => 1109, 'ル' => 1591, 'ロ' => 2201, 'ン' => 278, }; - static ref UW4: HashMap = hashmap! { ',' => 3930, '.' 
=> 3508, '―' => -4841, '、' => 3930, '。' => 3508, '〇' => 4999, '「' => 1895, '」' => 3798, '〓' => -5156, 'あ' => 4752, 'い' => -3435, 'う' => -640, 'え' => -2514, 'お' => 2405, 'か' => 530, 'が' => 6006, 'き' => -4482, 'ぎ' => -3821, 'く' => -3788, 'け' => -4376, 'げ' => -4734, 'こ' => 2255, 'ご' => 1979, 'さ' => 2864, 'し' => -843, 'じ' => -2506, 'す' => -731, 'ず' => 1251, 'せ' => 181, 'そ' => 4091, 'た' => 5034, 'だ' => 5408, 'ち' => -3654, 'っ' => -5882, 'つ' => -1659, 'て' => 3994, 'で' => 7410, 'と' => 4547, 'な' => 5433, 'に' => 6499, 'ぬ' => 1853, 'ね' => 1413, 'の' => 7396, 'は' => 8578, 'ば' => 1940, 'ひ' => 4249, 'び' => -4134, 'ふ' => 1345, 'へ' => 6665, 'べ' => -744, 'ほ' => 1464, 'ま' => 1051, 'み' => -2082, 'む' => -882, 'め' => -5046, 'も' => 4169, 'ゃ' => -2666, 'や' => 2795, 'ょ' => -1544, 'よ' => 3351, 'ら' => -2922, 'り' => -9726, 'る' => -14896, 'れ' => -2613, 'ろ' => -4570, 'わ' => -1783, 'を' => 13150, 'ん' => -2352, 'カ' => 2145, 'コ' => 1789, 'セ' => 1287, 'ッ' => -724, 'ト' => -403, 'メ' => -1635, 'ラ' => -881, 'リ' => -541, 'ル' => -856, 'ン' => -3637, '・' => -4371, 'ー' => -11870, '一' => -2069, '中' => 2210, '予' => 782, '事' => -190, '井' => -1768, '人' => 1036, '以' => 544, '会' => 950, '体' => -1286, '作' => 530, '側' => 4292, '先' => 601, '党' => -2006, '共' => -1212, '内' => 584, '円' => 788, '初' => 1347, '前' => 1623, '副' => 3879, '力' => -302, '動' => -740, '務' => -2715, '化' => 776, '区' => 4517, '協' => 1013, '参' => 1555, '合' => -1834, '和' => -681, '員' => -910, '器' => -851, '回' => 1500, '国' => -619, '園' => -1200, '地' => 866, '場' => -1410, '塁' => -2094, '士' => -1413, '多' => 1067, '大' => 571, '子' => -4802, '学' => -1397, '定' => -1057, '寺' => -809, '小' => 1910, '屋' => -1328, '山' => -1500, '島' => -2056, '川' => -2667, '市' => 2771, '年' => 374, '庁' => -4556, '後' => 456, '性' => 553, '感' => 916, '所' => -1566, '支' => 856, '改' => 787, '政' => 2182, '教' => 704, '文' => 522, '方' => -856, '日' => 1798, '時' => 1829, '最' => 845, '月' => -9066, '木' => -485, '来' => -442, '校' => -360, '業' => -1043, '氏' => 5388, '民' => -2716, '気' => -910, '沢' => 
-939, '済' => -543, '物' => -735, '率' => 672, '球' => -1267, '生' => -1286, '産' => -1101, '田' => -2900, '町' => 1826, '的' => 2586, '目' => 922, '省' => -3485, '県' => 2997, '空' => -867, '立' => -2112, '第' => 788, '米' => 2937, '系' => 786, '約' => 2171, '経' => 1146, '統' => -1169, '総' => 940, '線' => -994, '署' => 749, '者' => 2145, '能' => -730, '般' => -852, '行' => -792, '規' => 792, '警' => -1184, '議' => -244, '谷' => -1000, '賞' => 730, '車' => -1481, '軍' => 1158, '輪' => -1433, '込' => -3370, '近' => 929, '道' => -1291, '選' => 2596, '郎' => -4866, '都' => 1192, '野' => -1100, '銀' => -2213, '長' => 357, '間' => -2344, '院' => -2297, '際' => -2604, '電' => -878, '領' => -1659, '題' => -792, '館' => -1984, '首' => 1749, '高' => 2120, '「' => 1895, '」' => 3798, '・' => -4371, 'ッ' => -724, 'ー' => -11870, 'カ' => 2145, 'コ' => 1789, 'セ' => 1287, 'ト' => -403, 'メ' => -1635, 'ラ' => -881, 'リ' => -541, 'ル' => -856, 'ン' => -3637, }; - static ref UW5: HashMap = hashmap! { ',' => 465, '.' => -299, '1' => -514, *E2 => -32768, ']' => -2762, '、' => 465, '。' => -299, '「' => 363, 'あ' => 1655, 'い' => 331, 'う' => -503, 'え' => 1199, 'お' => 527, 'か' => 647, 'が' => -421, 'き' => 1624, 'ぎ' => 1971, 'く' => 312, 'げ' => -983, 'さ' => -1537, 'し' => -1371, 'す' => -852, 'だ' => -1186, 'ち' => 1093, 'っ' => 52, 'つ' => 921, 'て' => -18, 'で' => -850, 'と' => -127, 'ど' => 1682, 'な' => -787, 'に' => -1224, 'の' => -635, 'は' => -578, 'べ' => 1001, 'み' => 502, 'め' => 865, 'ゃ' => 3350, 'ょ' => 854, 'り' => -208, 'る' => 429, 'れ' => 504, 'わ' => 419, 'を' => -1264, 'ん' => 327, 'イ' => 241, 'ル' => 451, 'ン' => -343, '中' => -871, '京' => 722, '会' => -1153, '党' => -654, '務' => 3519, '区' => -901, '告' => 848, '員' => 2104, '大' => -1296, '学' => -548, '定' => 1785, '嵐' => -1304, '市' => -2991, '席' => 921, '年' => 1763, '思' => 872, '所' => -814, '挙' => 1618, '新' => -1682, '日' => 218, '月' => -4353, '査' => 932, '格' => 1356, '機' => -1508, '氏' => -1347, '田' => 240, '町' => -3912, '的' => -3149, '相' => 1319, '省' => -1052, '県' => -4003, '研' => -997, '社' => -278, '空' => -813, '統' 
=> 1955, '者' => -2233, '表' => 663, '語' => -1073, '議' => 1219, '選' => -1018, '郎' => -368, '長' => 786, '間' => 1191, '題' => 2368, '館' => -689, '1' => -514, '「' => 363, 'イ' => 241, 'ル' => 451, 'ン' => -343, }; - static ref UW6: HashMap = hashmap! { ',' => 227, '.' => 808, '1' => -270, *E1 => 306, '、' => 227, '。' => 808, 'あ' => -307, 'う' => 189, 'か' => 241, 'が' => -73, 'く' => -121, 'こ' => -200, 'じ' => 1782, 'す' => 383, 'た' => -428, 'っ' => 573, 'て' => -1014, 'で' => 101, 'と' => -105, 'な' => -253, 'に' => -149, 'の' => -417, 'は' => -236, 'も' => -206, 'り' => 187, 'る' => -135, 'を' => 195, 'ル' => -673, 'ン' => -496, '一' => -277, '中' => 201, '件' => -800, '会' => 624, '前' => 302, '区' => 1792, '員' => -1212, '委' => 798, '学' => -960, '市' => 887, '広' => -695, '後' => 535, '業' => -697, '相' => 753, '社' => -507, '福' => 974, '空' => -822, '者' => 1811, '連' => 463, '郎' => 1082, '1' => -270, 'ル' => -673, 'ン' => -496, }; -} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 85e033e..b60501b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,109 +1,1563 @@ -#[macro_use] extern crate lazy_static; -#[macro_use] extern crate maplit; - +#[macro_use] +extern crate lazy_static; use std::char; -use std::collections::HashMap; -use std::hash::Hash; -include!("constants.rs"); +const BIAS: i32 = -332; + +// B1-B3 and E1-E3 are begin and end markers; they are set to invalid characters so no character collision can occur +lazy_static! 
{ + static ref B1: char = unsafe { char::from_u32_unchecked(0x110001) }; + static ref B2: char = unsafe { char::from_u32_unchecked(0x110002) }; + static ref B3: char = unsafe { char::from_u32_unchecked(0x110003) }; + static ref E1: char = unsafe { char::from_u32_unchecked(0x110004) }; + static ref E2: char = unsafe { char::from_u32_unchecked(0x110005) }; + static ref E3: char = unsafe { char::from_u32_unchecked(0x110006) }; +} + +fn get_ctype(c: char) -> Ctype { + match c as u32 { + 0x4E00 | 0x4E8C | 0x4E09 | 0x56DB | 0x4E94 | 0x516D | 0x4E03 | 0x516B | 0x4E5D | 0x5341 => { + Ctype::M + } + 0x767E | 0x5343 | 0x4E07 | 0x5104 | 0x5146 => Ctype::M, + 0x4E00...0x9FA0 | 0x3005 | 0x3006 | 0x30F5 | 0x30F6 => Ctype::H, + 0x3041...0x3093 => Ctype::I, + 0x30A1...0x30F4 | 0x30FC | 0xFF71...0xFF9D | 0xFF9E | 0xFF70 => Ctype::K, + 0x61...0x7A | 0x41...0x5A | 0xFF41...0xFF5A | 0xFF21...0xFF3A => Ctype::A, + 0x30...0x3a | 0xFF10...0xFF19 => Ctype::N, + _ => Ctype::O, + } +} -fn get_score(d: &HashMap, s: &T) -> i32 { - d.get(s).cloned().unwrap_or(0) +#[derive(Debug, Clone, Copy)] +enum Marker { + U, + O, + B, } -fn get_ctype(c: char) -> char { - match c as u32 { - 0x4E00|0x4E8C|0x4E09|0x56DB|0x4E94|0x516D|0x4E03|0x516B|0x4E5D|0x5341 => 'M', - 0x767E|0x5343|0x4E07|0x5104|0x5146 => 'M', - 0x4E00...0x9FA0|0x3005|0x3006|0x30F5|0x30F6 => 'H', - 0x3041...0x3093 => 'I', - 0x30A1...0x30F4|0x30FC|0xFF71...0xFF9D|0xFF9E|0xFF70 => 'K', - 0x61...0x7A|0x41...0x5A|0xFF41...0xFF5A|0xFF21...0xFF3A => 'A', - 0x30...0x3a|0xFF10...0xFF19 => 'N', - _ => 'O', - } +#[derive(Debug, Clone, Copy)] +enum Ctype { + M, + H, + I, + K, + A, + N, + O } pub fn tokenize(s: &str) -> Vec { - if s.is_empty() { - return Vec::new(); - } - - let mut result = Vec::with_capacity(s.chars().count()); - - let segments = - vec!(*B3, *B2, *B1).into_iter() - .chain(s.chars()) - .chain(vec!(*E1, *E2, *E3).into_iter()) - .collect::>(); - - let ctypes = - vec!('O'; 3).into_iter() - .chain(s.chars().map(get_ctype)) - 
.chain(vec!('O'; 3).into_iter()) - .collect::>(); - - let mut word = segments[3].to_string(); - let mut p = vec!('U'; 3); - - for index in 4 .. segments.len() - 3 { - let mut score = BIAS; - let w = &segments[index - 3 .. index + 3]; - let c = &ctypes[index - 3 .. index + 3]; - - score = score + get_score(&*UP1, &p[0]); - score = score + get_score(&*UP2, &p[1]); - score = score + get_score(&*UP3, &p[2]); - score = score + get_score(&*BP1, &(p[0], p[1])); - score = score + get_score(&*BP2, &(p[1], p[2])); - score = score + get_score(&*UW1, &w[0]); - score = score + get_score(&*UW2, &w[1]); - score = score + get_score(&*UW3, &w[2]); - score = score + get_score(&*UW4, &w[3]); - score = score + get_score(&*UW5, &w[4]); - score = score + get_score(&*UW6, &w[5]); - score = score + get_score(&*BW1, &(w[1], w[2])); - score = score + get_score(&*BW2, &(w[2], w[3])); - score = score + get_score(&*BW3, &(w[3], w[4])); - score = score + get_score(&*TW1, &(w[0], w[1], w[2])); - score = score + get_score(&*TW2, &(w[1], w[2], w[3])); - score = score + get_score(&*TW3, &(w[2], w[3], w[4])); - score = score + get_score(&*TW4, &(w[3], w[4], w[5])); - score = score + get_score(&*UC1, &c[0]); - score = score + get_score(&*UC2, &c[1]); - score = score + get_score(&*UC3, &c[2]); - score = score + get_score(&*UC4, &c[3]); - score = score + get_score(&*UC5, &c[4]); - score = score + get_score(&*UC6, &c[5]); - score = score + get_score(&*BC1, &(c[1], c[2])); - score = score + get_score(&*BC2, &(c[2], c[3])); - score = score + get_score(&*BC3, &(c[3], c[4])); - score = score + get_score(&*TC1, &(c[0], c[1], c[2])); - score = score + get_score(&*TC2, &(c[1], c[2], c[3])); - score = score + get_score(&*TC3, &(c[2], c[3], c[4])); - score = score + get_score(&*TC4, &(c[3], c[4], c[5])); - score = score + get_score(&*UQ1, &(p[0], c[0])); - score = score + get_score(&*UQ2, &(p[1], c[1])); - score = score + get_score(&*UQ3, &(p[2], c[2])); - score = score + get_score(&*BQ1, &(p[1], c[1], c[2])); - 
score = score + get_score(&*BQ2, &(p[1], c[2], c[3])); - score = score + get_score(&*BQ3, &(p[2], c[1], c[2])); - score = score + get_score(&*BQ4, &(p[2], c[2], c[3])); - score = score + get_score(&*TQ1, &(p[1], c[0], c[1], c[2])); - score = score + get_score(&*TQ2, &(p[1], c[1], c[2], c[3])); - score = score + get_score(&*TQ3, &(p[2], c[0], c[1], c[2])); - score = score + get_score(&*TQ4, &(p[2], c[1], c[2], c[3])); - - p.remove(0); - p.push(if score < 0 { 'O' } else { 'B' }); - - if 0 < score { - result.push(word.clone()); - word.clear(); + if s.is_empty() { + return Vec::new(); + } + + let mut result = Vec::with_capacity(s.chars().count()); + + let segments = vec![*B3, *B2, *B1] + .into_iter() + .chain(s.chars()) + .chain(vec![*E1, *E2, *E3].into_iter()) + .collect::>(); + + let ctypes = vec![Ctype::O; 3] + .into_iter() + .chain(s.chars().map(get_ctype)) + .chain(vec![Ctype::O; 3].into_iter()) + .collect::>(); + + let mut word = segments[3].to_string(); + let mut p = vec![Marker::U; 3]; + + for index in 4..segments.len() - 3 { + let mut score = BIAS; + let w = &segments[index - 3..index + 3]; + let c = &ctypes[index - 3..index + 3]; + + score += match &(c[0], c[1], c[2]) { + (Ctype::A, Ctype::A, Ctype::A) => 1093, + (Ctype::H, Ctype::H, Ctype::H) => 1029, + (Ctype::H, Ctype::H, Ctype::M) => 580, + (Ctype::H, Ctype::I, Ctype::I) => 998, + (Ctype::H, Ctype::O, Ctype::H) => -390, + (Ctype::H, Ctype::O, Ctype::M) => -331, + (Ctype::I, Ctype::H, Ctype::I) => 1169, + (Ctype::I, Ctype::O, Ctype::H) => -142, + (Ctype::I, Ctype::O, Ctype::I) => -1015, + (Ctype::I, Ctype::O, Ctype::M) => 467, + (Ctype::M, Ctype::M, Ctype::H) => 187, + (Ctype::O, Ctype::O, Ctype::I) => -1832, + _ => 0, + }; + score += match &(c[1], c[2], c[3]) { + (Ctype::H, Ctype::H, Ctype::O) => 2088, + (Ctype::H, Ctype::I, Ctype::I) => -1023, + (Ctype::H, Ctype::M, Ctype::M) => -1154, + (Ctype::I, Ctype::H, Ctype::I) => -1965, + (Ctype::K, Ctype::K, Ctype::H) => 703, + (Ctype::O, Ctype::I, Ctype::I) => 
-2649, + _ => 0, + }; + score += match &(c[2], c[3], c[4]) { + (Ctype::A, Ctype::A, Ctype::A) => -294, + (Ctype::H, Ctype::H, Ctype::H) => 346, + (Ctype::H, Ctype::H, Ctype::I) => -341, + (Ctype::H, Ctype::I, Ctype::I) => -1088, + (Ctype::H, Ctype::I, Ctype::K) => 731, + (Ctype::H, Ctype::O, Ctype::H) => -1486, + (Ctype::I, Ctype::H, Ctype::H) => 128, + (Ctype::I, Ctype::H, Ctype::I) => -3041, + (Ctype::I, Ctype::H, Ctype::O) => -1935, + (Ctype::I, Ctype::I, Ctype::H) => -825, + (Ctype::I, Ctype::I, Ctype::M) => -1035, + (Ctype::I, Ctype::O, Ctype::I) => -542, + (Ctype::K, Ctype::H, Ctype::H) => -1216, + (Ctype::K, Ctype::K, Ctype::A) => 491, + (Ctype::K, Ctype::K, Ctype::H) => -1217, + (Ctype::K, Ctype::O, Ctype::K) => -1009, + (Ctype::M, Ctype::H, Ctype::H) => -2694, + (Ctype::M, Ctype::H, Ctype::M) => -457, + (Ctype::M, Ctype::H, Ctype::O) => 123, + (Ctype::M, Ctype::M, Ctype::H) => -471, + (Ctype::N, Ctype::N, Ctype::H) => -1689, + (Ctype::N, Ctype::N, Ctype::O) => 662, + (Ctype::O, Ctype::H, Ctype::O) => -3393, + _ => 0, + }; + score += match &(c[3], c[4], c[5]) { + (Ctype::H, Ctype::H, Ctype::H) => -203, + (Ctype::H, Ctype::H, Ctype::I) => 1344, + (Ctype::H, Ctype::H, Ctype::K) => 365, + (Ctype::H, Ctype::H, Ctype::M) => -122, + (Ctype::H, Ctype::H, Ctype::N) => 182, + (Ctype::H, Ctype::H, Ctype::O) => 669, + (Ctype::H, Ctype::I, Ctype::H) => 804, + (Ctype::H, Ctype::I, Ctype::I) => 679, + (Ctype::H, Ctype::O, Ctype::H) => 446, + (Ctype::I, Ctype::H, Ctype::H) => 695, + (Ctype::I, Ctype::H, Ctype::O) => -2324, + (Ctype::I, Ctype::I, Ctype::H) => 321, + (Ctype::I, Ctype::I, Ctype::I) => 1497, + (Ctype::I, Ctype::I, Ctype::O) => 656, + (Ctype::I, Ctype::O, Ctype::O) => 54, + (Ctype::K, Ctype::A, Ctype::K) => 4845, + (Ctype::K, Ctype::K, Ctype::A) => 3386, + (Ctype::K, Ctype::K, Ctype::K) => 3065, + (Ctype::M, Ctype::H, Ctype::H) => -405, + (Ctype::M, Ctype::H, Ctype::I) => 201, + (Ctype::M, Ctype::M, Ctype::H) => -241, + (Ctype::M, Ctype::M, Ctype::M) => 661, + 
(Ctype::M, Ctype::O, Ctype::M) => 841, + _ => 0, + }; + score += match &(p[1], c[0], c[1], c[2]) { + (Marker::B, Ctype::H, Ctype::H, Ctype::H) => -227, + (Marker::B, Ctype::H, Ctype::H, Ctype::I) => 316, + (Marker::B, Ctype::H, Ctype::I, Ctype::H) => -132, + (Marker::B, Ctype::I, Ctype::H, Ctype::H) => 60, + (Marker::B, Ctype::I, Ctype::I, Ctype::I) => 1595, + (Marker::B, Ctype::N, Ctype::H, Ctype::H) => -744, + (Marker::B, Ctype::O, Ctype::H, Ctype::H) => 225, + (Marker::B, Ctype::O, Ctype::O, Ctype::O) => -908, + (Marker::O, Ctype::A, Ctype::K, Ctype::K) => 482, + (Marker::O, Ctype::H, Ctype::H, Ctype::H) => 281, + (Marker::O, Ctype::H, Ctype::I, Ctype::H) => 249, + (Marker::O, Ctype::I, Ctype::H, Ctype::I) => 200, + (Marker::O, Ctype::I, Ctype::I, Ctype::H) => -68, + _ => 0, + }; + score += match &(p[1], c[1], c[2], c[3]) { + (Marker::B, Ctype::I, Ctype::H, Ctype::H) => -1401, + (Marker::B, Ctype::I, Ctype::I, Ctype::I) => -1033, + (Marker::B, Ctype::K, Ctype::A, Ctype::K) => -543, + (Marker::B, Ctype::O, Ctype::O, Ctype::O) => -5591, + _ => 0, + }; + score += match &(p[2], c[0], c[1], c[2]) { + (Marker::B, Ctype::H, Ctype::H, Ctype::H) => 478, + (Marker::B, Ctype::H, Ctype::H, Ctype::M) => -1073, + (Marker::B, Ctype::H, Ctype::I, Ctype::H) => 222, + (Marker::B, Ctype::H, Ctype::I, Ctype::I) => -504, + (Marker::B, Ctype::I, Ctype::I, Ctype::H) => -116, + (Marker::B, Ctype::I, Ctype::I, Ctype::I) => -105, + (Marker::B, Ctype::M, Ctype::H, Ctype::I) => -863, + (Marker::B, Ctype::M, Ctype::H, Ctype::M) => -464, + (Marker::B, Ctype::O, Ctype::M, Ctype::H) => 620, + (Marker::O, Ctype::H, Ctype::H, Ctype::H) => 346, + (Marker::O, Ctype::H, Ctype::H, Ctype::I) => 1729, + (Marker::O, Ctype::H, Ctype::I, Ctype::I) => 997, + (Marker::O, Ctype::H, Ctype::M, Ctype::H) => 481, + (Marker::O, Ctype::I, Ctype::H, Ctype::H) => 623, + (Marker::O, Ctype::I, Ctype::I, Ctype::H) => 1344, + (Marker::O, Ctype::K, Ctype::A, Ctype::K) => 2792, + (Marker::O, Ctype::K, Ctype::H, Ctype::H) 
=> 587, + (Marker::O, Ctype::K, Ctype::K, Ctype::A) => 679, + (Marker::O, Ctype::O, Ctype::H, Ctype::H) => 110, + (Marker::O, Ctype::O, Ctype::I, Ctype::I) => -685, + _ => 0, + }; + score += match &(p[2], c[1], c[2], c[3]) { + (Marker::B, Ctype::H, Ctype::H, Ctype::H) => -721, + (Marker::B, Ctype::H, Ctype::H, Ctype::M) => -3604, + (Marker::B, Ctype::H, Ctype::I, Ctype::I) => -966, + (Marker::B, Ctype::I, Ctype::I, Ctype::H) => -607, + (Marker::B, Ctype::I, Ctype::I, Ctype::I) => -2181, + (Marker::O, Ctype::A, Ctype::A, Ctype::A) => -2763, + (Marker::O, Ctype::A, Ctype::K, Ctype::K) => 180, + (Marker::O, Ctype::H, Ctype::H, Ctype::H) => -294, + (Marker::O, Ctype::H, Ctype::H, Ctype::I) => 2446, + (Marker::O, Ctype::H, Ctype::H, Ctype::O) => 480, + (Marker::O, Ctype::H, Ctype::I, Ctype::H) => -1573, + (Marker::O, Ctype::I, Ctype::H, Ctype::H) => 1935, + (Marker::O, Ctype::I, Ctype::H, Ctype::I) => -493, + (Marker::O, Ctype::I, Ctype::I, Ctype::H) => 626, + (Marker::O, Ctype::I, Ctype::I, Ctype::I) => -4007, + (Marker::O, Ctype::K, Ctype::A, Ctype::K) => -8156, + _ => 0, + }; + score += match &(w[0], w[1], w[2]) { + ('に', 'つ', 'い') => -4681, + ('東', '京', '都') => 2026, + _ => 0, + }; + score += match &(w[1], w[2], w[3]) { + ('あ', 'る', '程') => -2049, + ('い', 'っ', 'た') => -1256, + ('こ', 'ろ', 'が') => -2434, + ('し', 'ょ', 'う') => 3873, + ('そ', 'の', '後') => -4430, + ('だ', 'っ', 'て') => -1049, + ('て', 'い', 'た') => 1833, + ('と', 'し', 'て') => -4657, + ('と', 'も', 'に') => -4517, + ('も', 'の', 'で') => 1882, + ('一', '気', 'に') => -792, + ('初', 'め', 'て') => -1512, + ('同', '時', 'に') => -8097, + ('大', 'き', 'な') => -1255, + ('対', 'し', 'て') => -2721, + ('社', '会', '党') => -3216, + _ => 0, + }; + score += match &(w[2], w[3], w[4]) { + ('い', 'た', 'だ') => -1734, + ('し', 'て', 'い') => 1314, + ('と', 'し', 'て') => -4314, + ('に', 'つ', 'い') => -5483, + ('に', 'と', 'っ') => -5989, + ('に', '当', 'た') => -6247, + ('の', 'で', ',') => -727, + ('の', 'で', '、') => -727, + ('の', 'も', 'の') => -600, + ('れ', 'か', 
'ら') => -3752, + ('十', '二', '月') => -2287, + _ => 0, + }; + score += match &(w[3], w[4], w[5]) { + ('い', 'う', '.') => 8576, + ('い', 'う', '。') => 8576, + ('か', 'ら', 'な') => -2348, + ('し', 'て', 'い') => 2958, + ('た', 'が', ',') => 1516, + ('た', 'が', '、') => 1516, + ('て', 'い', 'る') => 1538, + ('と', 'い', 'う') => 1349, + ('ま', 'し', 'た') => 5543, + ('ま', 'せ', 'ん') => 1097, + ('よ', 'う', 'と') => -4258, + ('よ', 'る', 'と') => 5865, + _ => 0, + }; + + score += match &c[0] { + Ctype::A => 484, + Ctype::K => 93, + Ctype::M => 645, + Ctype::O => -505, + _ => 0, + }; + score += match &c[1] { + Ctype::A => 819, + Ctype::H => 1059, + Ctype::I => 409, + Ctype::M => 3987, + Ctype::N => 5775, + Ctype::O => 646, + _ => 0, + }; + score += match &c[2] { + Ctype::A => -1370, + Ctype::I => 2311, + _ => 0, + }; + score += match &c[3] { + Ctype::A => -2643, + Ctype::H => 1809, + Ctype::I => -1032, + Ctype::K => -3450, + Ctype::M => 3565, + Ctype::N => 3876, + Ctype::O => 6646, + }; + score += match &c[4] { + Ctype::H => 313, + Ctype::I => -1238, + Ctype::K => -799, + Ctype::M => 539, + Ctype::O => -831, + _ => 0, + }; + score += match &c[5] { + Ctype::H => -506, + Ctype::I => -253, + Ctype::K => 87, + Ctype::M => 247, + Ctype::O => -387, + _ => 0, + }; + + score += match &p[0] { + Marker::O => -214, + _ => 0, + }; + score += match &p[1] { + Marker::B => 69, + Marker::O => 935, + _ => 0, + }; + score += match &p[2] { + Marker::B => 189, + _ => 0, + }; + score += match &(p[0], c[0]) { + (Marker::B, Ctype::H) => 21, + (Marker::B, Ctype::I) => -12, + (Marker::B, Ctype::K) => -99, + (Marker::B, Ctype::N) => 142, + (Marker::B, Ctype::O) => -56, + (Marker::O, Ctype::H) => -95, + (Marker::O, Ctype::I) => 477, + (Marker::O, Ctype::K) => 410, + (Marker::O, Ctype::O) => -2422, + _ => 0, + }; + score += match &(p[1], c[1]) { + (Marker::B, Ctype::H) => 216, + (Marker::B, Ctype::I) => 113, + (Marker::O, Ctype::K) => 1759, + _ => 0, + }; + score += match &(p[2], c[2]) { + (Marker::B, Ctype::A) => -479, + 
(Marker::B, Ctype::H) => 42, + (Marker::B, Ctype::I) => 1913, + (Marker::B, Ctype::K) => -7198, + (Marker::B, Ctype::M) => 3160, + (Marker::B, Ctype::N) => 6427, + (Marker::B, Ctype::O) => 14761, + (Marker::O, Ctype::I) => -827, + (Marker::O, Ctype::N) => -3212, + _ => 0, + }; + score += match &w[0] { + ',' => 156, + '、' => 156, + '「' => -463, + 'あ' => -941, + 'う' => -127, + 'が' => -553, + 'き' => 121, + 'こ' => 505, + 'で' => -201, + 'と' => -547, + 'ど' => -123, + 'に' => -789, + 'の' => -185, + 'は' => -847, + 'も' => -466, + 'や' => -470, + 'よ' => 182, + 'ら' => -292, + 'り' => 208, + 'れ' => 169, + 'を' => -446, + 'ん' => -137, + '・' => -135, + '主' => -402, + '京' => -268, + '区' => -912, + '午' => 871, + '国' => -460, + '大' => 561, + '委' => 729, + '市' => -411, + '日' => -141, + '理' => 361, + '生' => -408, + '県' => -386, + '都' => -718, + '「' => -463, + '・' => -135, + _ => 0, + }; + score += match &w[1] { + ',' => -829, + '、' => -829, + '〇' => 892, + '「' => -645, + '」' => 3145, + 'あ' => -538, + 'い' => 505, + 'う' => 134, + 'お' => -502, + 'か' => 1454, + 'が' => -856, + 'く' => -412, + 'こ' => 1141, + 'さ' => 878, + 'ざ' => 540, + 'し' => 1529, + 'す' => -675, + 'せ' => 300, + 'そ' => -1011, + 'た' => 188, + 'だ' => 1837, + 'つ' => -949, + 'て' => -291, + 'で' => -268, + 'と' => -981, + 'ど' => 1273, + 'な' => 1063, + 'に' => -1764, + 'の' => 130, + 'は' => -409, + 'ひ' => -1273, + 'べ' => 1261, + 'ま' => 600, + 'も' => -1263, + 'や' => -402, + 'よ' => 1639, + 'り' => -579, + 'る' => -694, + 'れ' => 571, + 'を' => -2516, + 'ん' => 2095, + 'ア' => -587, + 'カ' => 306, + 'キ' => 568, + 'ッ' => 831, + '三' => -758, + '不' => -2150, + '世' => -302, + '中' => -968, + '主' => -861, + '事' => 492, + '人' => -123, + '会' => 978, + '保' => 362, + '入' => 548, + '初' => -3025, + '副' => -1566, + '北' => -3414, + '区' => -422, + '大' => -1769, + '天' => -865, + '太' => -483, + '子' => -1519, + '学' => 760, + '実' => 1023, + '小' => -2009, + '市' => -813, + '年' => -1060, + '強' => 1067, + '手' => -1519, + '揺' => -1033, + '政' => 1522, + '文' => -1355, + 
'新' => -1682, + '日' => -1815, + '明' => -1462, + '最' => -630, + '朝' => -1843, + '本' => -1650, + '東' => -931, + '果' => -665, + '次' => -2378, + '民' => -180, + '気' => -1740, + '理' => 752, + '発' => 529, + '目' => -1584, + '相' => -242, + '県' => -1165, + '立' => -763, + '第' => 810, + '米' => 509, + '自' => -1353, + '行' => 838, + '西' => -744, + '見' => -3874, + '調' => 1010, + '議' => 1198, + '込' => 3041, + '開' => 1758, + '間' => -1257, + '「' => -645, + '」' => 3145, + 'ッ' => 831, + 'ア' => -587, + 'カ' => 306, + 'キ' => 568, + _ => 0, + }; + score += match &w[2] { + ',' => 4889, + '1' => -800, + '−' => -1723, + '、' => 4889, + '々' => -2311, + '〇' => 5827, + '」' => 2670, + '〓' => -3573, + 'あ' => -2696, + 'い' => 1006, + 'う' => 2342, + 'え' => 1983, + 'お' => -4864, + 'か' => -1163, + 'が' => 3271, + 'く' => 1004, + 'け' => 388, + 'げ' => 401, + 'こ' => -3552, + 'ご' => -3116, + 'さ' => -1058, + 'し' => -395, + 'す' => 584, + 'せ' => 3685, + 'そ' => -5228, + 'た' => 842, + 'ち' => -521, + 'っ' => -1444, + 'つ' => -1081, + 'て' => 6167, + 'で' => 2318, + 'と' => 1691, + 'ど' => -899, + 'な' => -2788, + 'に' => 2745, + 'の' => 4056, + 'は' => 4555, + 'ひ' => -2171, + 'ふ' => -1798, + 'へ' => 1199, + 'ほ' => -5516, + 'ま' => -4384, + 'み' => -120, + 'め' => 1205, + 'も' => 2323, + 'や' => -788, + 'よ' => -202, + 'ら' => 727, + 'り' => 649, + 'る' => 5905, + 'れ' => 2773, + 'わ' => -1207, + 'を' => 6620, + 'ん' => -518, + 'ア' => 551, + 'グ' => 1319, + 'ス' => 874, + 'ッ' => -1350, + 'ト' => 521, + 'ム' => 1109, + 'ル' => 1591, + 'ロ' => 2201, + 'ン' => 278, + '・' => -3794, + '一' => -1619, + '下' => -1759, + '世' => -2087, + '両' => 3815, + '中' => 653, + '主' => -758, + '予' => -1193, + '二' => 974, + '人' => 2742, + '今' => 792, + '他' => 1889, + '以' => -1368, + '低' => 811, + '何' => 4265, + '作' => -361, + '保' => -2439, + '元' => 4858, + '党' => 3593, + '全' => 1574, + '公' => -3030, + '六' => 755, + '共' => -1880, + '円' => 5807, + '再' => 3095, + '分' => 457, + '初' => 2475, + '別' => 1129, + '前' => 2286, + '副' => 4437, + '力' => 365, + '動' => -949, + '務' => 
-1872, + '化' => 1327, + '北' => -1038, + '区' => 4646, + '千' => -2309, + '午' => -783, + '協' => -1006, + '口' => 483, + '右' => 1233, + '各' => 3588, + '合' => -241, + '同' => 3906, + '和' => -837, + '員' => 4513, + '国' => 642, + '型' => 1389, + '場' => 1219, + '外' => -241, + '妻' => 2016, + '学' => -1356, + '安' => -423, + '実' => -1008, + '家' => 1078, + '小' => -513, + '少' => -3102, + '州' => 1155, + '市' => 3197, + '平' => -1804, + '年' => 2416, + '広' => -1030, + '府' => 1605, + '度' => 1452, + '建' => -2352, + '当' => -3885, + '得' => 1905, + '思' => -1291, + '性' => 1822, + '戸' => -488, + '指' => -3973, + '政' => -2013, + '教' => -1479, + '数' => 3222, + '文' => -1489, + '新' => 1764, + '日' => 2099, + '旧' => 5792, + '昨' => -661, + '時' => -1248, + '曜' => -951, + '最' => -937, + '月' => 4125, + '期' => 360, + '李' => 3094, + '村' => 364, + '東' => -805, + '核' => 5156, + '森' => 2438, + '業' => 484, + '氏' => 2613, + '民' => -1694, + '決' => -1073, + '法' => 1868, + '海' => -495, + '無' => 979, + '物' => 461, + '特' => -3850, + '生' => -273, + '用' => 914, + '町' => 1215, + '的' => 7313, + '直' => -1835, + '省' => 792, + '県' => 6293, + '知' => -1528, + '私' => 4231, + '税' => 401, + '立' => -960, + '第' => 1201, + '米' => 7767, + '系' => 3066, + '約' => 3663, + '級' => 1384, + '統' => -4229, + '総' => 1163, + '線' => 1255, + '者' => 6457, + '能' => 725, + '自' => -2869, + '英' => 785, + '見' => 1044, + '調' => -562, + '財' => -733, + '費' => 1777, + '車' => 1835, + '軍' => 1375, + '込' => -1504, + '通' => -1136, + '選' => -681, + '郎' => 1026, + '郡' => 4404, + '部' => 1200, + '金' => 2163, + '長' => 421, + '開' => -1432, + '間' => 1302, + '関' => -1282, + '雨' => 2009, + '電' => -1045, + '非' => 2066, + '駅' => 1620, + '1' => -800, + '」' => 2670, + '・' => -3794, + 'ッ' => -1350, + 'ア' => 551, + 'ス' => 874, + 'ト' => 521, + 'ム' => 1109, + 'ル' => 1591, + 'ロ' => 2201, + 'ン' => 278, + _ => 0, + }; + score += match &w[3] { + ',' => 3930, + '.' 
=> 3508, + '―' => -4841, + '、' => 3930, + '。' => 3508, + '〇' => 4999, + '「' => 1895, + '」' => 3798, + '〓' => -5156, + 'あ' => 4752, + 'い' => -3435, + 'う' => -640, + 'え' => -2514, + 'お' => 2405, + 'か' => 530, + 'が' => 6006, + 'き' => -4482, + 'ぎ' => -3821, + 'く' => -3788, + 'け' => -4376, + 'げ' => -4734, + 'こ' => 2255, + 'ご' => 1979, + 'さ' => 2864, + 'し' => -843, + 'じ' => -2506, + 'す' => -731, + 'ず' => 1251, + 'せ' => 181, + 'そ' => 4091, + 'た' => 5034, + 'だ' => 5408, + 'ち' => -3654, + 'っ' => -5882, + 'つ' => -1659, + 'て' => 3994, + 'で' => 7410, + 'と' => 4547, + 'な' => 5433, + 'に' => 6499, + 'ぬ' => 1853, + 'ね' => 1413, + 'の' => 7396, + 'は' => 8578, + 'ば' => 1940, + 'ひ' => 4249, + 'び' => -4134, + 'ふ' => 1345, + 'へ' => 6665, + 'べ' => -744, + 'ほ' => 1464, + 'ま' => 1051, + 'み' => -2082, + 'む' => -882, + 'め' => -5046, + 'も' => 4169, + 'ゃ' => -2666, + 'や' => 2795, + 'ょ' => -1544, + 'よ' => 3351, + 'ら' => -2922, + 'り' => -9726, + 'る' => -14896, + 'れ' => -2613, + 'ろ' => -4570, + 'わ' => -1783, + 'を' => 13150, + 'ん' => -2352, + 'カ' => 2145, + 'コ' => 1789, + 'セ' => 1287, + 'ッ' => -724, + 'ト' => -403, + 'メ' => -1635, + 'ラ' => -881, + 'リ' => -541, + 'ル' => -856, + 'ン' => -3637, + '・' => -4371, + 'ー' => -11870, + '一' => -2069, + '中' => 2210, + '予' => 782, + '事' => -190, + '井' => -1768, + '人' => 1036, + '以' => 544, + '会' => 950, + '体' => -1286, + '作' => 530, + '側' => 4292, + '先' => 601, + '党' => -2006, + '共' => -1212, + '内' => 584, + '円' => 788, + '初' => 1347, + '前' => 1623, + '副' => 3879, + '力' => -302, + '動' => -740, + '務' => -2715, + '化' => 776, + '区' => 4517, + '協' => 1013, + '参' => 1555, + '合' => -1834, + '和' => -681, + '員' => -910, + '器' => -851, + '回' => 1500, + '国' => -619, + '園' => -1200, + '地' => 866, + '場' => -1410, + '塁' => -2094, + '士' => -1413, + '多' => 1067, + '大' => 571, + '子' => -4802, + '学' => -1397, + '定' => -1057, + '寺' => -809, + '小' => 1910, + '屋' => -1328, + '山' => -1500, + '島' => -2056, + '川' => -2667, + '市' => 2771, + '年' => 374, + '庁' => -4556, + '後' => 456, + 
'性' => 553, + '感' => 916, + '所' => -1566, + '支' => 856, + '改' => 787, + '政' => 2182, + '教' => 704, + '文' => 522, + '方' => -856, + '日' => 1798, + '時' => 1829, + '最' => 845, + '月' => -9066, + '木' => -485, + '来' => -442, + '校' => -360, + '業' => -1043, + '氏' => 5388, + '民' => -2716, + '気' => -910, + '沢' => -939, + '済' => -543, + '物' => -735, + '率' => 672, + '球' => -1267, + '生' => -1286, + '産' => -1101, + '田' => -2900, + '町' => 1826, + '的' => 2586, + '目' => 922, + '省' => -3485, + '県' => 2997, + '空' => -867, + '立' => -2112, + '第' => 788, + '米' => 2937, + '系' => 786, + '約' => 2171, + '経' => 1146, + '統' => -1169, + '総' => 940, + '線' => -994, + '署' => 749, + '者' => 2145, + '能' => -730, + '般' => -852, + '行' => -792, + '規' => 792, + '警' => -1184, + '議' => -244, + '谷' => -1000, + '賞' => 730, + '車' => -1481, + '軍' => 1158, + '輪' => -1433, + '込' => -3370, + '近' => 929, + '道' => -1291, + '選' => 2596, + '郎' => -4866, + '都' => 1192, + '野' => -1100, + '銀' => -2213, + '長' => 357, + '間' => -2344, + '院' => -2297, + '際' => -2604, + '電' => -878, + '領' => -1659, + '題' => -792, + '館' => -1984, + '首' => 1749, + '高' => 2120, + '「' => 1895, + '」' => 3798, + '・' => -4371, + 'ッ' => -724, + 'ー' => -11870, + 'カ' => 2145, + 'コ' => 1789, + 'セ' => 1287, + 'ト' => -403, + 'メ' => -1635, + 'ラ' => -881, + 'リ' => -541, + 'ル' => -856, + 'ン' => -3637, + _ => 0, + }; + score += match &w[4] { + ',' => 465, + '.' 
=> -299, + '1' => -514, + ']' => -2762, + '、' => 465, + '。' => -299, + '「' => 363, + 'あ' => 1655, + 'い' => 331, + 'う' => -503, + 'え' => 1199, + 'お' => 527, + 'か' => 647, + 'が' => -421, + 'き' => 1624, + 'ぎ' => 1971, + 'く' => 312, + 'げ' => -983, + 'さ' => -1537, + 'し' => -1371, + 'す' => -852, + 'だ' => -1186, + 'ち' => 1093, + 'っ' => 52, + 'つ' => 921, + 'て' => -18, + 'で' => -850, + 'と' => -127, + 'ど' => 1682, + 'な' => -787, + 'に' => -1224, + 'の' => -635, + 'は' => -578, + 'べ' => 1001, + 'み' => 502, + 'め' => 865, + 'ゃ' => 3350, + 'ょ' => 854, + 'り' => -208, + 'る' => 429, + 'れ' => 504, + 'わ' => 419, + 'を' => -1264, + 'ん' => 327, + 'イ' => 241, + 'ル' => 451, + 'ン' => -343, + '中' => -871, + '京' => 722, + '会' => -1153, + '党' => -654, + '務' => 3519, + '区' => -901, + '告' => 848, + '員' => 2104, + '大' => -1296, + '学' => -548, + '定' => 1785, + '嵐' => -1304, + '市' => -2991, + '席' => 921, + '年' => 1763, + '思' => 872, + '所' => -814, + '挙' => 1618, + '新' => -1682, + '日' => 218, + '月' => -4353, + '査' => 932, + '格' => 1356, + '機' => -1508, + '氏' => -1347, + '田' => 240, + '町' => -3912, + '的' => -3149, + '相' => 1319, + '省' => -1052, + '県' => -4003, + '研' => -997, + '社' => -278, + '空' => -813, + '統' => 1955, + '者' => -2233, + '表' => 663, + '語' => -1073, + '議' => 1219, + '選' => -1018, + '郎' => -368, + '長' => 786, + '間' => 1191, + '題' => 2368, + '館' => -689, + '1' => -514, + '「' => 363, + 'イ' => 241, + 'ル' => 451, + 'ン' => -343, + _ => 0, + }; + if w[4] == *E2 { + score += -32768; + } + score += match &w[5] { + ',' => 227, + '.' 
=> 808, + '1' => -270, + '、' => 227, + '。' => 808, + 'あ' => -307, + 'う' => 189, + 'か' => 241, + 'が' => -73, + 'く' => -121, + 'こ' => -200, + 'じ' => 1782, + 'す' => 383, + 'た' => -428, + 'っ' => 573, + 'て' => -1014, + 'で' => 101, + 'と' => -105, + 'な' => -253, + 'に' => -149, + 'の' => -417, + 'は' => -236, + 'も' => -206, + 'り' => 187, + 'る' => -135, + 'を' => 195, + 'ル' => -673, + 'ン' => -496, + '一' => -277, + '中' => 201, + '件' => -800, + '会' => 624, + '前' => 302, + '区' => 1792, + '員' => -1212, + '委' => 798, + '学' => -960, + '市' => 887, + '広' => -695, + '後' => 535, + '業' => -697, + '相' => 753, + '社' => -507, + '福' => 974, + '空' => -822, + '者' => 1811, + '連' => 463, + '郎' => 1082, + '1' => -270, + 'ル' => -673, + 'ン' => -496, + _ => 0, + }; + if w[5] == *E1 { + score += 306; + } + + score += match &(c[1], c[2]) { + (Ctype::H, Ctype::H) => 6, + (Ctype::I, Ctype::I) => 2461, + (Ctype::K, Ctype::H) => 406, + (Ctype::O, Ctype::H) => -1378, + _ => 0, + }; + score += match &(c[2], c[3]) { + (Ctype::A, Ctype::A) => -3267, + (Ctype::A, Ctype::I) => 2744, + (Ctype::A, Ctype::N) => -878, + (Ctype::H, Ctype::H) => -4070, + (Ctype::H, Ctype::M) => -1711, + (Ctype::H, Ctype::N) => 4012, + (Ctype::H, Ctype::O) => 3761, + (Ctype::I, Ctype::A) => 1327, + (Ctype::I, Ctype::H) => -1184, + (Ctype::I, Ctype::I) => -1332, + (Ctype::I, Ctype::K) => 1721, + (Ctype::I, Ctype::O) => 5492, + (Ctype::K, Ctype::I) => 3831, + (Ctype::K, Ctype::K) => -8741, + (Ctype::M, Ctype::H) => -3132, + (Ctype::M, Ctype::K) => 3334, + (Ctype::O, Ctype::O) => -2920, + _ => 0, + }; + score += match &(c[3], c[4]) { + (Ctype::H, Ctype::H) => 996, + (Ctype::H, Ctype::I) => 626, + (Ctype::H, Ctype::K) => -721, + (Ctype::H, Ctype::N) => -1307, + (Ctype::H, Ctype::O) => -836, + (Ctype::I, Ctype::H) => -301, + (Ctype::K, Ctype::K) => 2762, + (Ctype::M, Ctype::K) => 1079, + (Ctype::M, Ctype::M) => 4034, + (Ctype::O, Ctype::A) => -1652, + (Ctype::O, Ctype::H) => 266, + _ => 0, + }; + score += match &(p[0], p[1]) { + 
(Marker::B, Marker::B) => 295, + (Marker::O, Marker::B) => 304, + (Marker::O, Marker::O) => -125, + (Marker::U, Marker::B) => 352, + _ => 0, + }; + score += match &(p[1], p[2]) { + (Marker::B, Marker::O) => 60, + (Marker::O, Marker::O) => -1762, + _ => 0, + }; + score += match &(p[1], c[1], c[2]) { + (Marker::B, Ctype::H, Ctype::H) => 1150, + (Marker::B, Ctype::H, Ctype::M) => 1521, + (Marker::B, Ctype::I, Ctype::I) => -1158, + (Marker::B, Ctype::I, Ctype::M) => 886, + (Marker::B, Ctype::M, Ctype::H) => 1208, + (Marker::B, Ctype::N, Ctype::H) => 449, + (Marker::B, Ctype::O, Ctype::H) => -91, + (Marker::B, Ctype::O, Ctype::O) => -2597, + (Marker::O, Ctype::H, Ctype::I) => 451, + (Marker::O, Ctype::I, Ctype::H) => -296, + (Marker::O, Ctype::K, Ctype::A) => 1851, + (Marker::O, Ctype::K, Ctype::H) => -1020, + (Marker::O, Ctype::K, Ctype::K) => 904, + (Marker::O, Ctype::O, Ctype::O) => 2965, + _ => 0, + }; + score += match &(p[1], c[2], c[3]) { + (Marker::B, Ctype::H, Ctype::H) => 118, + (Marker::B, Ctype::H, Ctype::I) => -1159, + (Marker::B, Ctype::H, Ctype::M) => 466, + (Marker::B, Ctype::I, Ctype::H) => -919, + (Marker::B, Ctype::K, Ctype::K) => -1720, + (Marker::B, Ctype::K, Ctype::O) => 864, + (Marker::O, Ctype::H, Ctype::H) => -1139, + (Marker::O, Ctype::H, Ctype::M) => -181, + (Marker::O, Ctype::I, Ctype::H) => 153, + (Marker::U, Ctype::H, Ctype::I) => -1146, + _ => 0, + }; + score += match &(p[2], c[1], c[2]) { + (Marker::B, Ctype::H, Ctype::H) => -792, + (Marker::B, Ctype::H, Ctype::I) => 2664, + (Marker::B, Ctype::I, Ctype::I) => -299, + (Marker::B, Ctype::K, Ctype::I) => 419, + (Marker::B, Ctype::M, Ctype::H) => 937, + (Marker::B, Ctype::M, Ctype::M) => 8335, + (Marker::B, Ctype::N, Ctype::N) => 998, + (Marker::B, Ctype::O, Ctype::H) => 775, + (Marker::O, Ctype::H, Ctype::H) => 2174, + (Marker::O, Ctype::H, Ctype::M) => 439, + (Marker::O, Ctype::I, Ctype::I) => 280, + (Marker::O, Ctype::K, Ctype::H) => 1798, + (Marker::O, Ctype::K, Ctype::I) => -793, + 
(Marker::O, Ctype::K, Ctype::O) => -2242, + (Marker::O, Ctype::M, Ctype::H) => -2402, + (Marker::O, Ctype::O, Ctype::O) => 11699, + _ => 0, + }; + score += match &(p[2], c[2], c[3]) { + (Marker::B, Ctype::H, Ctype::H) => -3895, + (Marker::B, Ctype::I, Ctype::H) => 3761, + (Marker::B, Ctype::I, Ctype::I) => -4654, + (Marker::B, Ctype::I, Ctype::K) => 1348, + (Marker::B, Ctype::K, Ctype::K) => -1806, + (Marker::B, Ctype::M, Ctype::I) => -3385, + (Marker::B, Ctype::O, Ctype::O) => -12396, + (Marker::O, Ctype::A, Ctype::H) => 926, + (Marker::O, Ctype::H, Ctype::H) => 266, + (Marker::O, Ctype::H, Ctype::K) => -2036, + (Marker::O, Ctype::N, Ctype::N) => -973, + _ => 0, + }; + score += match &(w[1], w[2]) { + (',', 'と') => 660, + (',', '同') => 727, + ('、', 'と') => 660, + ('、', '同') => 727, + ('」', 'と') => 1682, + ('あ', 'っ') => 1505, + ('い', 'う') => 1743, + ('い', 'っ') => -2055, + ('い', 'る') => 672, + ('う', 'し') => -4817, + ('う', 'ん') => 665, + ('か', 'ら') => 3472, + ('が', 'ら') => 600, + ('こ', 'う') => -790, + ('こ', 'と') => 2083, + ('こ', 'ん') => -1262, + ('さ', 'ら') => -4143, + ('さ', 'ん') => 4573, + ('し', 'た') => 2641, + ('し', 'て') => 1104, + ('す', 'で') => -3399, + ('そ', 'こ') => 1977, + ('そ', 'れ') => -871, + ('た', 'ち') => 1122, + ('た', 'め') => 601, + ('っ', 'た') => 3463, + ('つ', 'い') => -802, + ('て', 'い') => 805, + ('て', 'き') => 1249, + ('で', 'き') => 1127, + ('で', 'す') => 3445, + ('で', 'は') => 844, + ('と', 'い') => -4915, + ('と', 'み') => 1922, + ('ど', 'こ') => 3887, + ('な', 'い') => 5713, + ('な', 'っ') => 3015, + ('な', 'ど') => 7379, + ('な', 'ん') => -1113, + ('に', 'し') => 2468, + ('に', 'は') => 1498, + ('に', 'も') => 1671, + ('に', '対') => -912, + ('の', '一') => -501, + ('の', '中') => 741, + ('ま', 'せ') => 2448, + ('ま', 'で') => 1711, + ('ま', 'ま') => 2600, + ('ま', 'る') => -2155, + ('や', 'む') => -1947, + ('よ', 'っ') => -2565, + ('れ', 'た') => 2369, + ('れ', 'で') => -913, + ('を', 'し') => 1860, + ('を', '見') => 731, + ('亡', 'く') => -1886, + ('京', '都') => 2558, + ('取', 'り') => -2784, + ('大', 'き') 
=> -2604, + ('大', '阪') => 1497, + ('平', '方') => -2314, + ('引', 'き') => -1336, + ('日', '本') => -195, + ('本', '当') => -2423, + ('毎', '日') => -2113, + ('目', '指') => -724, + ('」', 'と') => 1682, + _ => 0, + }; + if (w[1], w[2]) == (*B1, 'あ') { + score += 1404; + } + if (w[1], w[2]) == (*B1, '同') { + score += 542; + } + + score += match &(w[2], w[3]) { + ('.', '.') => -11822, + ('1', '1') => -669, + ('―', '―') => -5730, + ('−', '−') => -13175, + ('い', 'う') => -1609, + ('う', 'か') => 2490, + ('か', 'し') => -1350, + ('か', 'も') => -602, + ('か', 'ら') => -7194, + ('か', 'れ') => 4612, + ('が', 'い') => 853, + ('が', 'ら') => -3198, + ('き', 'た') => 1941, + ('く', 'な') => -1597, + ('こ', 'と') => -8392, + ('こ', 'の') => -4193, + ('さ', 'せ') => 4533, + ('さ', 'れ') => 13168, + ('さ', 'ん') => -3977, + ('し', 'い') => -1819, + ('し', 'か') => -545, + ('し', 'た') => 5078, + ('し', 'て') => 972, + ('し', 'な') => 939, + ('そ', 'の') => -3744, + ('た', 'い') => -1253, + ('た', 'た') => -662, + ('た', 'だ') => -3857, + ('た', 'ち') => -786, + ('た', 'と') => 1224, + ('た', 'は') => -939, + ('っ', 'た') => 4589, + ('っ', 'て') => 1647, + ('っ', 'と') => -2094, + ('て', 'い') => 6144, + ('て', 'き') => 3640, + ('て', 'く') => 2551, + ('て', 'は') => -3110, + ('て', 'も') => -3065, + ('で', 'い') => 2666, + ('で', 'き') => -1528, + ('で', 'し') => -3828, + ('で', 'す') => -4761, + ('で', 'も') => -4203, + ('と', 'い') => 1890, + ('と', 'こ') => -1746, + ('と', 'と') => -2279, + ('と', 'の') => 720, + ('と', 'み') => 5168, + ('と', 'も') => -3941, + ('な', 'い') => -2488, + ('な', 'が') => -1313, + ('な', 'ど') => -6509, + ('な', 'の') => 2614, + ('な', 'ん') => 3099, + ('に', 'お') => -1615, + ('に', 'し') => 2748, + ('に', 'な') => 2454, + ('に', 'よ') => -7236, + ('に', '対') => -14943, + ('に', '従') => -4688, + ('に', '関') => -11388, + ('の', 'か') => 2093, + ('の', 'で') => -7059, + ('の', 'に') => -6041, + ('の', 'の') => -6125, + ('は', 'い') => 1073, + ('は', 'が') => -1033, + ('は', 'ず') => -2532, + ('ば', 'れ') => 1813, + ('ま', 'し') => -1316, + ('ま', 'で') => -6621, + ('ま', 'れ') => 5409, + 
('め', 'て') => -3153, + ('も', 'い') => 2230, + ('も', 'の') => -10713, + ('ら', 'か') => -944, + ('ら', 'し') => -1611, + ('ら', 'に') => -1897, + ('り', 'し') => 651, + ('り', 'ま') => 1620, + ('れ', 'た') => 4270, + ('れ', 'て') => 849, + ('れ', 'ば') => 4114, + ('ろ', 'う') => 6067, + ('わ', 'れ') => 7901, + ('を', '通') => -11877, + ('ん', 'だ') => 728, + ('ん', 'な') => -4115, + ('一', '人') => 602, + ('一', '方') => -1375, + ('一', '日') => 970, + ('一', '部') => -1051, + ('上', 'が') => -4479, + ('会', '社') => -1116, + ('出', 'て') => 2163, + ('分', 'の') => -7758, + ('同', '党') => 970, + ('同', '日') => -913, + ('大', '阪') => -2471, + ('委', '員') => -1250, + ('少', 'な') => -1050, + ('年', '度') => -8669, + ('年', '間') => -1626, + ('府', '県') => -2363, + ('手', '権') => -1982, + ('新', '聞') => -4066, + ('日', '新') => -722, + ('日', '本') => -7068, + ('日', '米') => 3372, + ('曜', '日') => -601, + ('朝', '鮮') => -2355, + ('本', '人') => -2697, + ('東', '京') => -1543, + ('然', 'と') => -1384, + ('社', '会') => -1276, + ('立', 'て') => -990, + ('第', 'に') => -1612, + ('米', '国') => -4268, + ('1', '1') => -669, + ('ク', '゙') => 1319, + _ => 0, + }; + score += match &(w[3], w[4]) { + ('あ', 'た') => -2194, + ('あ', 'り') => 719, + ('あ', 'る') => 3846, + ('い', '.') => -1185, + ('い', '。') => -1185, + ('い', 'い') => 5308, + ('い', 'え') => 2079, + ('い', 'く') => 3029, + ('い', 'た') => 2056, + ('い', 'っ') => 1883, + ('い', 'る') => 5600, + ('い', 'わ') => 1527, + ('う', 'ち') => 1117, + ('う', 'と') => 4798, + ('え', 'と') => 1454, + ('か', '.') => 2857, + ('か', '。') => 2857, + ('か', 'け') => -743, + ('か', 'っ') => -4098, + ('か', 'に') => -669, + ('か', 'ら') => 6520, + ('か', 'り') => -2670, + ('が', ',') => 1816, + ('が', '、') => 1816, + ('が', 'き') => -4855, + ('が', 'け') => -1127, + ('が', 'っ') => -913, + ('が', 'ら') => -4977, + ('が', 'り') => -2064, + ('き', 'た') => 1645, + ('け', 'ど') => 1374, + ('こ', 'と') => 7397, + ('こ', 'の') => 1542, + ('こ', 'ろ') => -2757, + ('さ', 'い') => -714, + ('さ', 'を') => 976, + ('し', ',') => 1557, + ('し', '、') => 1557, + ('し', 'い') => -3714, + ('し', 
'た') => 3562, + ('し', 'て') => 1449, + ('し', 'な') => 2608, + ('し', 'ま') => 1200, + ('す', '.') => -1310, + ('す', '。') => -1310, + ('す', 'る') => 6521, + ('ず', ',') => 3426, + ('ず', '、') => 3426, + ('ず', 'に') => 841, + ('そ', 'う') => 428, + ('た', '.') => 8875, + ('た', '。') => 8875, + ('た', 'い') => -594, + ('た', 'の') => 812, + ('た', 'り') => -1183, + ('た', 'る') => -853, + ('だ', '.') => 4098, + ('だ', '。') => 4098, + ('だ', 'っ') => 1004, + ('っ', 'た') => -4748, + ('っ', 'て') => 300, + ('て', 'い') => 6240, + ('て', 'お') => 855, + ('て', 'も') => 302, + ('で', 'す') => 1437, + ('で', 'に') => -1482, + ('で', 'は') => 2295, + ('と', 'う') => -1387, + ('と', 'し') => 2266, + ('と', 'の') => 541, + ('と', 'も') => -3543, + ('ど', 'う') => 4664, + ('な', 'い') => 1796, + ('な', 'く') => -903, + ('な', 'ど') => 2135, + ('に', ',') => -1021, + ('に', '、') => -1021, + ('に', 'し') => 1771, + ('に', 'な') => 1906, + ('に', 'は') => 2644, + ('の', ',') => -724, + ('の', '、') => -724, + ('の', '子') => -1000, + ('は', ',') => 1337, + ('は', '、') => 1337, + ('べ', 'き') => 2181, + ('ま', 'し') => 1113, + ('ま', 'す') => 6943, + ('ま', 'っ') => -1549, + ('ま', 'で') => 6154, + ('ま', 'れ') => -793, + ('ら', 'し') => 1479, + ('ら', 'れ') => 6820, + ('る', 'る') => 3818, + ('れ', ',') => 854, + ('れ', '、') => 854, + ('れ', 'た') => 1850, + ('れ', 'て') => 1375, + ('れ', 'ば') => -3246, + ('れ', 'る') => 1091, + ('わ', 'れ') => -605, + ('ん', 'だ') => 606, + ('ん', 'で') => 798, + ('カ', '月') => 990, + ('会', '議') => 860, + ('入', 'り') => 1232, + ('大', '会') => 2217, + ('始', 'め') => 1681, + ('市', ' ') => 965, + ('新', '聞') => -5055, + ('日', ',') => 974, + ('日', '、') => 974, + ('社', '会') => 2024, + ('カ', '月') => 990, + _ => 0, + }; + + p.remove(0); + p.push(if score < 0 { Marker::O } else { Marker::B }); + + if 0 < score { + result.push(word.clone()); + word.clear(); + } + word.push(segments[index]); } - word.push(segments[index]); - } - result.push(word.clone()); - result + result.push(word.clone()); + result } diff --git a/test/test.rs b/test/test.rs index 
33381de..8b17792 100644 --- a/test/test.rs +++ b/test/test.rs @@ -2,13 +2,28 @@ extern crate tinysegmenter; #[test] fn tokenize() { - assert_eq!( - tinysegmenter::tokenize("私の名前は中野です"), - ["私", "の", "名前", "は", "中野", "です"]); + assert_eq!( + tinysegmenter::tokenize("私の名前は中野です"), + ["私", "の", "名前", "は", "中野", "です"] + ); - assert_eq!( - tinysegmenter::tokenize("TinySegmenterは25kBで書かれています。"), - ["TinySegmenter", "は", "2", "5", "kB", "で", "書か", "れ", "て", "い", "ます", "。"]); + assert_eq!( + tinysegmenter::tokenize("TinySegmenterは25kBで書かれています。"), + [ + "TinySegmenter", + "は", + "2", + "5", + "kB", + "で", + "書か", + "れ", + "て", + "い", + "ます", + "。" + ] + ); - assert_eq!(tinysegmenter::tokenize(""), [] as [&str; 0]); + assert_eq!(tinysegmenter::tokenize(""), [] as [&str; 0]); }