Skip to content

Commit bdf534d

Browse files
author
guoyongzhi
committed
support string list input
1 parent 0aac077 commit bdf534d

File tree

3 files changed

+19
-11
lines changed

3 files changed

+19
-11
lines changed

src/detector.jl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -98,40 +98,40 @@ function loglikelihood(p_dict, logq_dict)
9898
end
9999

100100
"""
101-
langid(text::AbstractString, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8}, Float32}}; ngram=NGRAM)
101+
langid(text, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8}, Float32}}; ngram=NGRAM)
102102
103103
Return the language of the given text based on the provided language profiles.
104104
105105
# Arguments
106-
- `text::AbstractString`: The text to identify the language of.
106+
- `text`: A string or a collection of strings to be analyzed for language identification.
107107
- `languages::Vector{String}`: The list of languages to choose from. Omitting this argument will use all supported languages.
108108
- `profiles::Vector{Dict{Vector{UInt8}, Float32}}`: The language profiles to use for identification. Omitting this argument will use the default profiles.
109109
- `ngram::Union{Int, AbstractVector}`: The length of utf-8 byte n-grams to use for language detection. The default value is the value set in [`initialize`](@ref), and should not exceed that value.
110110
# Returns
111111
- The language of the given text.
112112
"""
113-
function langid(text::AbstractString, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8},Float32}}; ngram=NGRAM)
113+
function langid(text, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8},Float32}}; ngram=NGRAM)
114114
p = count_all_ngrams(text, ngram)
115115
lls = loglikelihood.(Ref(p), profiles)
116116
languages[argmax(lls)]
117117
end
118-
function langid(text::AbstractString, languages::Vector{String}; kwargs...)
118+
function langid(text, languages::Vector{String}; kwargs...)
119119
makesure_initialized()
120120
inds = [findfirst(isequal(l), LANGUAGES) for l in languages]
121121
langid(text, languages, PROFILES[inds]; kwargs...)
122122
end
123-
function langid(text::AbstractString; kwargs...)
123+
function langid(text; kwargs...)
124124
makesure_initialized()
125125
langid(text, LANGUAGES, PROFILES; kwargs...)
126126
end
127127

128128
"""
129-
langprob(text::AbstractString, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8}, Float32}}; topk=5, ngram=NGRAM)
129+
langprob(text, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8}, Float32}}; topk=5, ngram=NGRAM)
130130
131131
Returns the probability distribution of the language of the given text based on the provided language profiles.
132132
133133
# Arguments
134-
- `text::AbstractString`: The text to identify the language of.
134+
- `text`: A string or a collection of strings to be analyzed for language identification.
135135
- `languages::Vector{String}`: A list of languages to choose from. If this argument is not provided, all the languages returned by the [`supported_languages`](@ref) function will be used.
136136
- `profiles::Vector{Dict{Vector{UInt8}, Float32}}`: The language profiles to use for identification. If this argument is not provided, the default profiles will be used.
137137
- `topk::Int`: The number of candidates to return. The default value is 5.
@@ -140,7 +140,7 @@ Returns the probability distribution of the language of the given text based on
140140
# Returns
141141
- A list of the `topk` languages and their probabilities.
142142
"""
143-
function langprob(text::AbstractString, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8},Float32}}; topk=5, ngram=NGRAM)
143+
function langprob(text, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8},Float32}}; topk=5, ngram=NGRAM)
144144
p = count_all_ngrams(text, ngram)
145145
vs = sum(values(p))
146146
map!(v -> v / vs, values(p))
@@ -150,12 +150,12 @@ function langprob(text::AbstractString, languages::Vector{String}, profiles::Vec
150150
si = sortperm(ls, rev=true)[1:min(end, topk)]
151151
[k => v for (k, v) in zip(languages[si], ls[si])]
152152
end
153-
function langprob(text::AbstractString, languages::Vector{String}; kwargs...)
153+
function langprob(text, languages::Vector{String}; kwargs...)
154154
makesure_initialized()
155155
inds = [findfirst(isequal(l), LANGUAGES) for l in languages]
156156
langprob(text, languages, PROFILES[inds]; kwargs...)
157157
end
158-
function langprob(text::AbstractString; kwargs...)
158+
function langprob(text; kwargs...)
159159
makesure_initialized()
160160
langprob(text, LANGUAGES, PROFILES; kwargs...)
161161
end

src/ngrams.jl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,12 @@ end
3333
function count_all_ngrams(text::AbstractString, n::Int, counter=Dict{Vector{UInt8},Float32}(); kwargs...)
3434
count_all_ngrams(text, 1:n, counter; kwargs...)
3535
end
36-
36+
function count_all_ngrams(text_list, n, counter = Dict{Vector{UInt8},Float32}(); kwargs...)
37+
for text in text_list
38+
count_all_ngrams(text, n, counter; kwargs...)
39+
end
40+
counter
41+
end
3742

3843
# function count_dataset_ngrams(dataset, n; kwargs...)
3944
# counters = [Dict{Vector{UInt8},Float32}() for i in 1:n]

test/runtests.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,11 @@ using Test
1919
langid("", ngram=3)
2020
langid(" ", ngram=3:4)
2121
langid(" ", ngram=5:7)
22+
@test langid(Set(["This", "is", "a", "test", "."])) == "eng"
23+
@test langid(["这是", "一个", "测试", ""]) == "zho"
2224
@test sum(last.(langprob("This is a test.", topk=length(LI.supported_languages())))) ≈ 1.0
2325
@test langprob("这是一个测试。", topk=1) |> only |> first == "zho"
26+
@test langprob(["这是", "一个", "测试", ""], topk=1) |> only |> first == "zho"
2427
@test langprob("これはテストです。", ["zho", "ara"], topk=30) |> length == 2
2528
LI.initialize(vocabulary=200)
2629
@test all(last.(LI.vocabulary_sizes()) .== 201)

0 commit comments

Comments
 (0)