Support collections of strings as input to langid and langprob

guoyongzhi · guoyongzhi · commit bdf534d7adcf · 2023-09-07T14:48:21.000+08:00
diff --git a/src/detector.jl b/src/detector.jl
@@ -98,40 +98,40 @@ function loglikelihood(p_dict, logq_dict)
 end
 
 """
-    langid(text::AbstractString, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8}, Float32}}; ngram=NGRAM)
+    langid(text, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8}, Float32}}; ngram=NGRAM)
 
 Return the language of the given text based on the provided language profiles.
 
 # Arguments
-- `text::AbstractString`: The text to identify the language of.
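+- `text`: A string, or a collection of strings (e.g. a `Vector` or `Set`). For a collection, the n-gram counts of all elements are pooled and a single language is returned for the whole collection.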
 - `languages::Vector{String}`: The list of languages to choose from. Omitting this argument will use all supported languages.
 - `profiles::Vector{Dict{Vector{UInt8}, Float32}}`: The language profiles to use for identification. Omitting this argument will use the default profiles.
 - `ngram::Union{Int, AbstractVector}`: The length of utf-8 byte n-grams to use for language detection. The default value is the value set in [`initialize`](@ref), and should not exceed that value.
 # Returns
 - The language of the given text.
 """
-function langid(text::AbstractString, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8},Float32}}; ngram=NGRAM)
+function langid(text, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8},Float32}}; ngram=NGRAM)
     p = count_all_ngrams(text, ngram)
     lls = loglikelihood.(Ref(p), profiles)
     languages[argmax(lls)]
 end
-function langid(text::AbstractString, languages::Vector{String}; kwargs...)
+function langid(text, languages::Vector{String}; kwargs...)
     makesure_initialized()
     inds = [findfirst(isequal(l), LANGUAGES) for l in languages]
     langid(text, languages, PROFILES[inds]; kwargs...)
 end
-function langid(text::AbstractString; kwargs...)
+function langid(text; kwargs...)
     makesure_initialized()
     langid(text, LANGUAGES, PROFILES; kwargs...)
 end
 
 """
-    langprob(text::AbstractString, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8}, Float32}}; topk=5, ngram=NGRAM)
+    langprob(text, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8}, Float32}}; topk=5, ngram=NGRAM)
 
 Returns the probability distribution of the language of the given text based on the provided language profiles.
 
 # Arguments
-- `text::AbstractString`: The text to identify the language of.
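+- `text`: A string, or a collection of strings (e.g. a `Vector` or `Set`). For a collection, the n-gram counts of all elements are pooled and a single ranking is returned for the whole collection.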
 - `languages::Vector{String}`: A list of languages to choose from. If this argument is not provided, all the languages returned by the [`supported_languages`](@ref) function will be used.
 - `profiles::Vector{Dict{Vector{UInt8}, Float32}}`: The language profiles to use for identification. If this argument is not provided, the default profiles will be used.
 - `topk::Int`: The number of candidates to return. The default value is 5.
@@ -140,7 +140,7 @@ Returns the probability distribution of the language of the given text based on
 # Returns
 - A list of the `topk` languages and their probabilities.
 """
-function langprob(text::AbstractString, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8},Float32}}; topk=5, ngram=NGRAM)
+function langprob(text, languages::Vector{String}, profiles::Vector{Dict{Vector{UInt8},Float32}}; topk=5, ngram=NGRAM)
     p = count_all_ngrams(text, ngram)
     vs = sum(values(p))
     map!(v -> v / vs, values(p))
@@ -150,12 +150,12 @@ function langprob(text::AbstractString, languages::Vector{String}, profiles::Vec
     si = sortperm(ls, rev=true)[1:min(end, topk)]
     [k => v for (k, v) in zip(languages[si], ls[si])]
 end
-function langprob(text::AbstractString, languages::Vector{String}; kwargs...)
+function langprob(text, languages::Vector{String}; kwargs...)
     makesure_initialized()
     inds = [findfirst(isequal(l), LANGUAGES) for l in languages]
     langprob(text, languages, PROFILES[inds]; kwargs...)
 end
-function langprob(text::AbstractString; kwargs...)
+function langprob(text; kwargs...)
     makesure_initialized()
     langprob(text, LANGUAGES, PROFILES; kwargs...)
 end
diff --git a/src/ngrams.jl b/src/ngrams.jl
@@ -33,7 +33,12 @@ end
 function count_all_ngrams(text::AbstractString, n::Int, counter=Dict{Vector{UInt8},Float32}(); kwargs...)
     count_all_ngrams(text, 1:n, counter; kwargs...)
 end
-
+function count_all_ngrams(text_list, n, counter = Dict{Vector{UInt8},Float32}(); kwargs...)
+    for text in text_list
+        count_all_ngrams(text, n, counter; kwargs...)
+    end
+    counter
+end
 
 # function count_dataset_ngrams(dataset, n; kwargs...)
 #     counters = [Dict{Vector{UInt8},Float32}() for i in 1:n]
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -19,8 +19,11 @@ using Test
     langid("", ngram=3)
     langid(" ", ngram=3:4)
     langid(" ", ngram=5:7)
+    @test langid(Set(["This", "is", "a", "test", "."])) == "eng"
+    @test langid(["这是", "一个", "测试", "。"]) == "zho"
     @test sum(last.(langprob("This is a test.", topk=length(LI.supported_languages())))) ≈ 1.0
     @test langprob("这是一个测试。", topk=1) |> only |> first == "zho"
+    @test langprob(["这是", "一个", "测试", "。"], topk=1) |> only |> first == "zho"
     @test langprob("これはテストです。", ["zho", "ara"], topk=30) |> length == 2
     LI.initialize(vocabulary=200)
     @test all(last.(LI.vocabulary_sizes()) .== 201)