Skip to content

Commit e39fdcd

Browse files
author
guoyongzhi
committed
vector ngram
1 parent 287956d commit e39fdcd

File tree

2 files changed

+12
-11
lines changed

2 files changed

+12
-11
lines changed

src/detector.jl

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ const ALL_LANGUAGES = [f[1:end-4] for f in readdir(PROFILE_PATH)]
55
const LANGUAGES = String[]
66
const PROFILES = Vector{Dict{Vector{UInt8},Float32}}()
77
const UNK = UInt8[]
8-
NGRAM::UnitRange{Int64} = 0:0
8+
const NGRAM = Int[]
99

1010
"""
1111
supported_languages() -> Vector{String}
@@ -29,25 +29,26 @@ Initialize the language detector with the given parameters. Different parameters
2929
3030
# Arguments
3131
- `languages::Vector{String}`: A list of languages to be used for language detection. If this argument is not provided, all the languages returned by the [`supported_languages`](@ref) function will be used.
32-
- `ngram::Union{Int, AbstractRange}`: The length of utf-8 byte n-grams to use for language detection. A range can be provided to use multiple n-gram sizes. An integer value will be converted to a range from 1 to the given value. The default value is 4.
32+
- `ngram::Union{Int, AbstractVector}`: The length of UTF-8 byte n-grams to use for language detection. A vector or range of lengths can be provided to use multiple n-gram sizes. An integer value will be converted to the range from 1 to the given value. The default value is 4.
3333
- `cutoff::Float64`: The cutoff value of the cumulative probability of the n-grams to use for language detection. The default value is 0.85, and it must be between 0 and 1.
3434
- `vocabulary::Union{Int, AbstractRange}`: The size range of the vocabulary of each language. The default value is 1000:5000.
3535
"""
3636
function initialize(; languages=supported_languages(), ngram=4, cutoff=0.85, vocabulary=1000:5000)
37-
ngram = ngram isa AbstractRange ? ngram : 1:ngram
3837
vocabulary = vocabulary isa AbstractRange ? vocabulary : 1:vocabulary
38+
ngram = ngram isa AbstractVector ? ngram : 1:ngram
39+
empty!(NGRAM)
40+
append!(NGRAM, ngram)
3941
empty!(LANGUAGES)
4042
append!(LANGUAGES, languages)
4143
empty!(PROFILES)
4244
for lang in LANGUAGES
43-
push!(PROFILES, load_profile(lang, ngram, cutoff, vocabulary))
45+
push!(PROFILES, load_profile(lang, NGRAM, cutoff, vocabulary))
4446
end
4547
unk_decay = 0.01
4648
for P in PROFILES
4749
logp = minimum(values(P), init=typemax(Float32)) + log(unk_decay)
4850
P[UNK] = logp
4951
end
50-
global NGRAM = ngram
5152
nothing
5253
end
5354

@@ -58,15 +59,15 @@ function makesure_initialized()
5859
end
5960
end
6061

61-
function load_profile(lang, ngramrange::AbstractRange, cutoff, vocabularyrange)
62+
function load_profile(lang, ngram_list::AbstractVector, cutoff, vocabularyrange)
6263
vocmin, vocmax = first(vocabularyrange), last(vocabularyrange)
6364
hd, rows = ngram_table(joinpath(PROFILE_PATH, lang * ".txt"))
64-
total = sum(hd[ngramrange])
65+
total = sum(hd[ngram_list])
6566
threshold = cutoff * total
6667
cums = 0.0
6768
P = Pair{Vector{UInt8},Float32}[]
6869
for (k, v) in rows
69-
if length(k) in ngramrange
70+
if length(k) in ngram_list
7071
cums += v
7172
push!(P, k => v)
7273
if (length(P) >= vocmin) && (cums >= threshold || length(P) >= vocmax)
@@ -105,7 +106,7 @@ Return the language of the given text based on the provided language profiles.
105106
- `text::AbstractString`: The text to identify the language of.
106107
- `languages::Vector{String}`: The list of languages to choose from. Omitting this argument will use all supported languages.
107108
- `profiles::Vector{Dict{Vector{UInt8}, Float32}}`: The language profiles to use for identification. Omitting this argument will use the default profiles.
108-
- `ngram::Union{Int, AbstractRange}`: The length of utf-8 byte n-grams to use for language detection. The default value is the value set in [`initialize`](@ref), and should not exceed that value.
109+
- `ngram::Union{Int, AbstractVector}`: The length of UTF-8 byte n-grams to use for language detection. Defaults to the value set in [`initialize`](@ref); a value passed here should not exceed that setting.
109110
# Returns
110111
- The language of the given text.
111112
"""
@@ -134,7 +135,7 @@ Returns the probability distribution of the language of the given text based on
134135
- `languages::Vector{String}`: A list of languages to choose from. If this argument is not provided, all the languages returned by the [`supported_languages`](@ref) function will be used.
135136
- `profiles::Vector{Dict{Vector{UInt8}, Float32}}`: The language profiles to use for identification. If this argument is not provided, the default profiles will be used.
136137
- `topk::Int`: The number of candidates to return. The default value is 5.
137-
- `ngram::Union{Int, AbstractRange}`: The length of utf-8 byte n-grams to use for language detection. The default value is the value set in [`initialize`](@ref), and should not exceed that value.
138+
- `ngram::Union{Int, AbstractVector}`: The length of UTF-8 byte n-grams to use for language detection. Defaults to the value set in [`initialize`](@ref); a value passed here should not exceed that setting.
138139
139140
# Returns
140141
- A list of the `topk` languages and their probabilities.

src/ngrams.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ function count_ngrams(text::AbstractString, n, counter=Dict{Vector{UInt8},Float3
1919
end
2020
counter
2121
end
22-
function count_all_ngrams(text::AbstractString, rg::AbstractRange=1:5, counter=Dict{Vector{UInt8},Float32}(); kwargs...)
22+
function count_all_ngrams(text::AbstractString, rg::AbstractVector=1:5, counter=Dict{Vector{UInt8},Float32}(); kwargs...)
2323
text = normalize_text(text; kwargs...)
2424
text = transcode(UInt8, string(text))
2525
for k in rg

0 commit comments

Comments
 (0)