diff --git a/crates/bpe-openai/Cargo.toml b/crates/bpe-openai/Cargo.toml index 3f35ede..cc385c8 100644 --- a/crates/bpe-openai/Cargo.toml +++ b/crates/bpe-openai/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "bpe-openai" -version = "0.2.1" +version = "0.3.0" edition = "2021" description = "Prebuilt fast byte-pair encoders for OpenAI." repository = "https://github.com/github/rust-gems" @@ -13,18 +13,19 @@ crate-type = ["lib", "staticlib"] bench = false [dependencies] -bpe = { version = "0.2.0", path = "../bpe" } +bpe = { version = "0.2", path = "../bpe" } either = "1.13" regex-automata = "0.4" rmp-serde = "1" +unicode-normalization = "0.1" [dev-dependencies] -bpe = { version = "0.2.0", path = "../bpe", features = ["rand"] } +bpe = { version = "0.2", path = "../bpe", features = ["rand"] } tiktoken-rs = "0.6" [build-dependencies] -base64 = "0.22.1" -bpe = { version = "0.2.0", path = "../bpe", features = ["tiktoken"] } +base64 = "0.22" +bpe = { version = "0.2", path = "../bpe", features = ["tiktoken"] } flate2 = "1.0" rmp-serde = "1" serde = "1" diff --git a/crates/bpe-openai/build.rs b/crates/bpe-openai/build.rs index 528eae6..6976e91 100644 --- a/crates/bpe-openai/build.rs +++ b/crates/bpe-openai/build.rs @@ -17,6 +17,11 @@ fn main() { include_bytes!("data/o200k_base.tiktoken.gz"), 17846336922010275747, ); + serialize_tiktoken_bpe( + "voyage3_base", + include_bytes!("data/voyage3_base.tiktoken.gz"), + 17846336922010275747, + ); println!("cargo::rerun-if-changed=build.rs"); } diff --git a/crates/bpe-openai/data/voyage3_base.tiktoken.gz b/crates/bpe-openai/data/voyage3_base.tiktoken.gz new file mode 100644 index 0000000..96b3bc6 Binary files /dev/null and b/crates/bpe-openai/data/voyage3_base.tiktoken.gz differ diff --git a/crates/bpe-openai/src/lib.rs b/crates/bpe-openai/src/lib.rs index 385749e..b6d9add 100644 --- a/crates/bpe-openai/src/lib.rs +++ b/crates/bpe-openai/src/lib.rs @@ -8,6 +8,11 @@ use regex_automata::{ Anchored, Input, }; +pub mod normalizer; + +pub use 
bpe::*; +pub use normalizer::{Normalizable, NormalizedString}; + // Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead. // The look-ahead character is dropped from the match by the Pretokenizer iterator. // Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character! @@ -18,7 +23,7 @@ static BPE_CL100K_BASE: LazyLock = LazyLock::new(|| { let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$"; let pat2 = "\\s+\\s"; let pat3 = "\\s+"; - Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)]) + Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], false) .expect("valid regex") }); @@ -35,11 +40,19 @@ static BPE_O200K_BASE: LazyLock = LazyLock::new(|| { ].join("|"); let pat2 = "\\s+\\s"; let pat3 = "\\s+"; - Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)]) + Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)], false) .expect("valid regex") }); -pub use bpe::*; +static BPE_VOYAGE3_BASE: LazyLock = LazyLock::new(|| { + let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_voyage3_base.dict")); + let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data"); + let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$"; + let pat2 = "\\s+\\s"; + let pat3 = "\\s+"; + Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], true) + .expect("valid regex") +}); /// A byte-pair encoding tokenizer that supports a pre-tokenization regex. /// The direct methods on this type pre-tokenize the input text and should @@ -52,6 +65,8 @@ pub struct Tokenizer { pub bpe: BytePairEncoding, /// The pattern regex used to split the input. pub pre: Option, + /// Indicates whether the input should be normalized with NFC. 
+ nfc: bool, } pub struct Pretokenizer { @@ -64,9 +79,9 @@ pub struct Pretokenizer { impl Tokenizer { /// Build a tokenizer with an optional pretokenization regex pattern. #[allow(clippy::result_large_err)] - pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result { + pub fn new(bpe: BytePairEncoding, pat: Option<&str>, nfc: bool) -> Result { let pre = pat.map(Pretokenizer::new).transpose()?; - Ok(Self { bpe, pre }) + Ok(Self { nfc, bpe, pre }) } /// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true, @@ -75,15 +90,17 @@ impl Tokenizer { pub fn new_lookahead( bpe: BytePairEncoding, patterns: &[(&str, bool)], + nfc: bool, ) -> Result { let pre = Some(Pretokenizer::new_lookahead(patterns)?); - Ok(Self { bpe, pre }) + Ok(Self { nfc, bpe, pre }) } /// Count the number of tokens produced when encoding the text. Applies pre-tokenization /// before counting. - pub fn count(&self, text: &str) -> usize { - self.split(text) + pub fn count<'a, I: Normalizable<'a>>(&self, text: I) -> usize { + let text = self.normalize(text); + self.split(text.as_str()) .map(|piece| self.bpe.count(piece.as_bytes())) .sum() } @@ -91,18 +108,23 @@ impl Tokenizer { /// Returns the token count iff the total token count stays below the specified token_limit. /// Otherwise, it returns none. This function can be faster than [`Self::count`]` when the /// token limit is much smaller than the provided text. Applies pre-tokenization before counting. - pub fn count_till_limit(&self, text: &str, token_limit: usize) -> Option { - self.split(text).try_fold(0, |consumed, piece| { + /// + /// Note: This function assumes that the text is already normalized, so that this function can run + /// in roughly O(token_limit) time. 
+ pub fn count_till_limit(&self, text: &NormalizedString, token_limit: usize) -> Option { + let res: Option = self.split(text.as_str()).try_fold(0, |consumed, piece| { self.bpe .count_till_limit(piece.as_bytes(), token_limit - consumed) .map(|piece_count| consumed + piece_count) - }) + }); + res } /// Returns the tokens for the encoding of the given text. Applies pre-tokenization before /// encoding. - pub fn encode(&self, text: &str) -> Vec { - self.split(text) + pub fn encode<'a, I: Normalizable<'a>>(&self, text: I) -> Vec { + let text: NormalizedString<'_> = self.normalize(text); + self.split(text.as_str()) .flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes())) .collect() } @@ -114,12 +136,18 @@ impl Tokenizer { /// Returns an iterator with the text pieces resulting from pre-tokenization. If this /// tokenizer does not have pre-tokenization, the iterator returns the full text. - pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator + 'a { + pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator { match &self.pre { Some(pre) => Either::Left(pre.split(text)), None => Either::Right(std::iter::once(text)), } } + + /// Returns the normalized text if the tokenizer requires normalization. + /// If the input was already normalized, this function is a noop. + pub fn normalize<'a, I: Normalizable<'a>>(&self, text: I) -> NormalizedString<'a> { + text.normalize(self.nfc) + } } impl Pretokenizer { @@ -143,7 +171,7 @@ impl Pretokenizer { } /// Returns an iterator with the text pieces after splitting with the regular expression. 
- pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator + 'a { + pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator { Splits { pat: &self.pat, lookahead: &self.lookahead, @@ -201,6 +229,10 @@ pub fn o200k_base() -> &'static Tokenizer { &BPE_O200K_BASE } +pub fn voyage3_base() -> &'static Tokenizer { + &BPE_VOYAGE3_BASE +} + #[cfg(test)] mod tests { use bpe::byte_pair_encoding::{create_test_string, select_test_string}; @@ -233,9 +265,21 @@ mod tests { #[test] fn test_count_till_limit() { - assert_eq!(cl100k_base().count_till_limit("abc", 3), Some(1)); - assert_eq!(cl100k_base().count_till_limit("abcabc", 3), Some(2)); - assert_eq!(cl100k_base().count_till_limit("abcabcabc", 3), Some(3)); - assert_eq!(cl100k_base().count_till_limit("abcabcabcabc", 3), None); + assert_eq!( + cl100k_base().count_till_limit(&cl100k_base().normalize("abc"), 3), + Some(1) + ); + assert_eq!( + cl100k_base().count_till_limit(&cl100k_base().normalize("abcabc"), 3), + Some(2) + ); + assert_eq!( + cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabc"), 3), + Some(3) + ); + assert_eq!( + cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabcabc"), 3), + None + ); } } diff --git a/crates/bpe-openai/src/normalizer.rs b/crates/bpe-openai/src/normalizer.rs new file mode 100644 index 0000000..50f3309 --- /dev/null +++ b/crates/bpe-openai/src/normalizer.rs @@ -0,0 +1,58 @@ +use std::borrow::Cow; + +use unicode_normalization::UnicodeNormalization; + +/// Type which represents a normalized string. +/// This is to avoid calling normalize multiple times or forgetting to call normalization! +/// +/// TODO: Annotate the type with the normalization type, once there are more than one. +pub struct NormalizedString<'a>(Cow<'a, str>); + +impl<'a> NormalizedString<'a> { + /// Returns the normalized inner str buffer. + pub fn as_str(&self) -> &str { + &self.0 + } + + /// This function is unsafe, since the caller must ensure that the correct normalization + /// was used. 
The normalization may vary by tokenizer. This is mostly a backdoor which might + /// be handy for certain optimizations or for testing. + /// + /// # Safety + /// This is safe if `s` is in fact correctly normalized already. The caller is + /// responsible for ensuring that. + pub unsafe fn from_str(s: &'a str) -> NormalizedString<'a> { + NormalizedString(Cow::Borrowed(s)) + } +} + +/// Helper trait which converts string types into NormalizedString. +/// Calling normalize on a NormalizedString is a no-op. +pub trait Normalizable<'a> { + fn normalize(self, nfc: bool) -> NormalizedString<'a>; +} + +impl<'a> Normalizable<'a> for &'a str { + fn normalize(self, nfc: bool) -> NormalizedString<'a> { + if nfc { + NormalizedString(self.nfc().collect()) + } else { + NormalizedString(Cow::Borrowed(self)) + } + } +} + +impl<'a, T> Normalizable<'a> for &'a T +where + T: AsRef, +{ + fn normalize(self, nfc: bool) -> NormalizedString<'a> { + self.as_ref().normalize(nfc) + } +} + +impl<'a> Normalizable<'a> for NormalizedString<'a> { + fn normalize(self, _: bool) -> NormalizedString<'a> { + self + } +} diff --git a/crates/bpe/Cargo.toml b/crates/bpe/Cargo.toml index fc562ce..9e10955 100644 --- a/crates/bpe/Cargo.toml +++ b/crates/bpe/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "bpe" -version = "0.2.0" +version = "0.2.1" edition = "2021" description = "Fast byte-pair encoding implementation." 
repository = "https://github.com/github/rust-gems" diff --git a/crates/bpe/benchmarks/Cargo.toml b/crates/bpe/benchmarks/Cargo.toml index 765b5b1..a332b27 100644 --- a/crates/bpe/benchmarks/Cargo.toml +++ b/crates/bpe/benchmarks/Cargo.toml @@ -18,7 +18,7 @@ path = "equivalence.rs" test = true [dependencies] -bpe = { path = "../../bpe" } +bpe = { path = "../../bpe", features = ["rand", "tiktoken"] } bpe-openai = { path = "../../bpe-openai" } criterion = "0.5" rand = "0.9" diff --git a/crates/bpe/benchmarks/equivalence.rs b/crates/bpe/benchmarks/equivalence.rs index b3df973..4019602 100644 --- a/crates/bpe/benchmarks/equivalence.rs +++ b/crates/bpe/benchmarks/equivalence.rs @@ -1,19 +1,71 @@ +use std::collections::HashSet; + use bpe::byte_pair_encoding::{create_test_string, select_test_string}; use bpe_benchmarks::*; -#[cfg(test)] -const N: usize = 32; +/// Converts a unicode character back into the byte it stands for (the inverse of the byte-to-unicode mapping). +/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9 +/// Hugging face uses the same mapping to work with unicode instead of byte characters. 
+fn char_to_byte(c: char) -> u8 { + match c as u32 { + 0x21..0x7f => c as u8, // 94 + 0xa1..=0xac => c as u8, // 12 + 0xae..=0xff => c as u8, // 82 + 0x7f..0xa1 => c as u8 - 0x7f + 221, + 0x100..0x121 => (c as u32 - 0x100) as u8, + 0x121..0x143 => (c as u32 - 0x121) as u8 + 0x7f, + 0x143..0x144 => 0xad, + _ => panic!("Invalid character: {c} {}", c as u32), + } +} #[test] -fn test_huggingface_encoding_equivalence_without_pretokenization() { - for (_, bpe, _, huggingface) in TOKENIZERS.iter() { +fn test_compare_dictionary() { + for (name, bpe, _, huggingface) in TOKENIZERS.iter() { let huggingface = without_pretokenizer(huggingface); - let text = create_test_string(&bpe.bpe, 80_000); - let texts = (0..N) - .map(|_| select_test_string(&text, 100)) + let mut hugging_tokens = huggingface.get_vocab(false); + // HACK: There are incorrect vocabularies in huggingface which have the added tokens stored together with the base tokens. + // This is a workaround to remove them. + for added_token in huggingface.get_added_vocabulary().get_vocab().keys() { + hugging_tokens.remove(added_token); + } + let mut hugging_tokens: Vec<_> = hugging_tokens.into_iter().collect(); + hugging_tokens.sort_by(|(_, a), (_, b)| a.cmp(b)); + let hugging_tokens: Vec<_> = hugging_tokens + .into_iter() + .map(|(token, _)| token.chars().map(char_to_byte).collect()) + .collect(); + let bpe_tokens: Vec<_> = (0..bpe.bpe.num_tokens()) + .map(|id| bpe.bpe.token_bytes(id as u32).to_vec()) + .collect(); + let hugging_set: HashSet<_> = hugging_tokens.iter().cloned().collect(); + let bpe_set: HashSet<_> = bpe_tokens.iter().cloned().collect(); + let diff: Vec<_> = hugging_set.symmetric_difference(&bpe_set).collect(); + assert!(diff.is_empty(), "{name}: Token sets differ"); + // Uncomment the following lines to write the tokens to a file in tiktoken format + /* + let mut file = + std::fs::File::create(std::path::Path::new(name)).expect("can create output file"); + std::io::Write::write_all( + &mut file, + 
bpe::byte_pair_encoding::write_tiktoken(hugging_tokens).as_bytes(), + ) + .expect("can write output to file"); + */ + } +} + +#[test] +fn test_huggingface_encoding_equivalence_without_pretokenization() { + for (name, bpe, _, huggingface) in TOKENIZERS.iter() { + let text: String = create_test_string(&bpe.bpe, 200_000); + let text = bpe.normalize(&text); + let texts = (0..300) + .map(|_| select_test_string(text.as_str(), 100)) .chain(std::iter::once( "You should see the Greek word 'kosme': \"κόσμε\"", )); + let huggingface = without_pretokenizer(huggingface); for text in texts { let out = bpe.bpe.encode_via_backtracking(text.as_bytes()); let huggingface_out = huggingface @@ -26,14 +78,10 @@ fn test_huggingface_encoding_equivalence_without_pretokenization() { let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap(); if huggingface_text != text { panic!( - "huggingface tokens and text differ: {:?} != {:?}", - text, huggingface_text + "{name}: huggingface tokens and text differ: {text:?} != {huggingface_text:?}", ); } else { - panic!( - "huggingface tokens differ: {:?} != {:?}", - out, huggingface_out - ); + panic!("{name}: huggingface tokens differ: {out:?} != {huggingface_out:?}"); } } } @@ -42,9 +90,9 @@ fn test_huggingface_encoding_equivalence_without_pretokenization() { #[test] fn test_huggingface_encoding_equivalence_with_pretokenization() { - for (_, bpe, _, huggingface) in TOKENIZERS.iter() { - let text = create_test_string(&bpe.bpe, 80_000); - let texts = (0..N) + for (name, bpe, _, huggingface) in TOKENIZERS.iter() { + let text = create_test_string(&bpe.bpe, 200_000); + let texts = (0..300) .map(|_| select_test_string(&text, 100)) .chain(std::iter::once( "You should see the Greek word 'kosme': \"κόσμε\" ", @@ -62,14 +110,10 @@ fn test_huggingface_encoding_equivalence_with_pretokenization() { let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap(); if huggingface_text != text { panic!( - "huggingface tokens and text differ: 
{:?} != {:?}", - text, huggingface_text + "{name}: huggingface tokens and text differ: {text:?} != {huggingface_text:?}", ); } else { - panic!( - "huggingface tokens differ: {:?} != {:?}", - out, huggingface_out - ); + panic!("{name}: huggingface tokens differ: {out:?} != {huggingface_out:?}"); } } } diff --git a/crates/bpe/benchmarks/lib.rs b/crates/bpe/benchmarks/lib.rs index d364df8..00ffd4a 100644 --- a/crates/bpe/benchmarks/lib.rs +++ b/crates/bpe/benchmarks/lib.rs @@ -5,27 +5,35 @@ use tiktoken_rs::CoreBPE as TiktokenTokenizer; use tokenizers::pre_tokenizers::byte_level::ByteLevel as HuggingfaceByteLevel; use tokenizers::tokenizer::Tokenizer as HuggingfaceTokenizer; +#[allow(clippy::type_complexity)] pub static TOKENIZERS: LazyLock< [( &'static str, &'static Tokenizer, - TiktokenTokenizer, + Option, HuggingfaceTokenizer, - ); 2], + ); 3], > = LazyLock::new(|| { [ ( "cl100k", bpe_openai::cl100k_base(), - tiktoken_rs::cl100k_base().expect("tokenizer available"), + Some(tiktoken_rs::cl100k_base().expect("tokenizer available")), HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).expect("model available"), ), ( "o200k", bpe_openai::o200k_base(), - tiktoken_rs::o200k_base().expect("tokenizer available"), + Some(tiktoken_rs::o200k_base().expect("tokenizer available")), HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).expect("model available"), ), + ( + "voyage3", + bpe_openai::voyage3_base(), + None, + HuggingfaceTokenizer::from_pretrained("voyageai/voyage-code-3", None) + .expect("model available"), + ), ] }); diff --git a/crates/bpe/benchmarks/performance.rs b/crates/bpe/benchmarks/performance.rs index 21ebff8..4259498 100644 --- a/crates/bpe/benchmarks/performance.rs +++ b/crates/bpe/benchmarks/performance.rs @@ -163,13 +163,15 @@ fn comparison_benchmark(c: &mut Criterion) { ) }, ); - group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| { - b.iter_batched( - || select_test_string(&text, *bytes), - |text| 
tiktoken.encode_ordinary(text), - criterion::BatchSize::SmallInput, - ) - }); + if let Some(tiktoken) = tiktoken { + group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| { + b.iter_batched( + || select_test_string(&text, *bytes), + |text| tiktoken.encode_ordinary(text), + criterion::BatchSize::SmallInput, + ) + }); + } group.bench_with_input( BenchmarkId::new("huggingface", bytes), &bytes, @@ -206,13 +208,15 @@ fn worstcase_comparison_benchmark(c: &mut Criterion) { ) }, ); - group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| { - b.iter_batched( - || select_test_string(&text, *bytes), - |text| tiktoken.encode_ordinary(text), - criterion::BatchSize::SmallInput, - ) - }); + if let Some(tiktoken) = tiktoken { + group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| { + b.iter_batched( + || select_test_string(&text, *bytes), + |text| tiktoken.encode_ordinary(text), + criterion::BatchSize::SmallInput, + ) + }); + } group.bench_with_input( BenchmarkId::new("huggingface", bytes), &bytes,