Comment out unused code

guoyongzhi · guoyongzhi · commit e54ff1b0ab14 · 2023-09-04T00:21:08.000+08:00
diff --git a/src/compress.jl b/src/compress.jl
@@ -8,63 +8,63 @@ function RLCS(rank::Int=63)
     return RLCS(rank, fill("", rank), 0)
 end
 
-function lcs(s1::AbstractString, s2::AbstractString)
-    m, n = length(s1), length(s2)
-    len, pos1, pos2 = 0, 0, 0
-    dp = zeros(Int, m + 1, n + 1)
-    for i in 1:m
-        for j in 1:n
-            if s1[i] == s2[j]
-                dp[i+1, j+1] = dp[i, j] + 1
-                if dp[i+1, j+1] > len
-                    len = dp[i+1, j+1]
-                    pos1 = i
-                    pos2 = j
-                end
-            end
-        end
-    end
-    return pos1-len+1:pos1, pos2-len+1:pos2
-end
+# function lcs(s1::AbstractString, s2::AbstractString)
+#     m, n = length(s1), length(s2)
+#     len, pos1, pos2 = 0, 0, 0
+#     dp = zeros(Int, m + 1, n + 1)
+#     for i in 1:m
+#         for j in 1:n
+#             if s1[i] == s2[j]
+#                 dp[i+1, j+1] = dp[i, j] + 1
+#                 if dp[i+1, j+1] > len
+#                     len = dp[i+1, j+1]
+#                     pos1 = i
+#                     pos2 = j
+#                 end
+#             end
+#         end
+#     end
+#     return pos1-len+1:pos1, pos2-len+1:pos2
+# end
 
-function lcs_zip(str, refer) # `str` and `refer` must not contain any uppercase letters
-    rg1, rg2 = lcs(refer, str)
-    if length(rg1) > 2 || (length(rg1) == 2 && rg1[1] == 1)
-        b1, e1 = first(rg1), last(rg1)
-        b2, e2 = first(rg2), last(rg2)
-        l1 = e1 - b1 + 1
-        if b1 <= 26 && l1 <= 26
-            bc = b1 == 1 ? "" : 'A' + (b1 - 1)
-            lc = 'A' + (l1 - 1)
-            code = bc * lc
-            return l1 - length(code), str[1:b2-1], code, str[e2+1:end]
-        end
-    end
-    return 0, str, "", ""
-end
-function rlcs_zip(Z, str)
-    ec(i) = i <= 11 ? ('0' + i - 2) : (i <= 37 ? 'a' + i - 12 : 'A' + i - 38)
-    best = 0, str, "", ""
-    for rk in 1:min(Z.rank, Z.counter)
-        refer = Z.pool[(Z.counter-rk+1)%Z.rank+1]
-        # @show rk refer
-        l, h, c, t = lcs_zip(str, refer)
-        rk > 1 && (l -= 2)
-        # @show l
-        if l > best[1]
-            s = rk == 1 ? "" : 'A' * ec(rk)
-            best = l, h, s * c, t
-            # @show s
-        end
-        if length(str) - l < 2
-            break
-        end
-    end
-    Z.counter += 1
-    zip_str = string(best[2], best[3], best[4])
-    Z.rank > 0 && (Z.pool[Z.counter%Z.rank+1] = str)
-    return zip_str
-end
+# function lcs_zip(str, refer) # `str` and `refer` must not contain any uppercase letters
+#     rg1, rg2 = lcs(refer, str)
+#     if length(rg1) > 2 || (length(rg1) == 2 && rg1[1] == 1)
+#         b1, e1 = first(rg1), last(rg1)
+#         b2, e2 = first(rg2), last(rg2)
+#         l1 = e1 - b1 + 1
+#         if b1 <= 26 && l1 <= 26
+#             bc = b1 == 1 ? "" : 'A' + (b1 - 1)
+#             lc = 'A' + (l1 - 1)
+#             code = bc * lc
+#             return l1 - length(code), str[1:b2-1], code, str[e2+1:end]
+#         end
+#     end
+#     return 0, str, "", ""
+# end
+# function rlcs_zip(Z, str)
+#     ec(i) = i <= 11 ? ('0' + i - 2) : (i <= 37 ? 'a' + i - 12 : 'A' + i - 38)
+#     best = 0, str, "", ""
+#     for rk in 1:min(Z.rank, Z.counter)
+#         refer = Z.pool[(Z.counter-rk+1)%Z.rank+1]
+#         # @show rk refer
+#         l, h, c, t = lcs_zip(str, refer)
+#         rk > 1 && (l -= 2)
+#         # @show l
+#         if l > best[1]
+#             s = rk == 1 ? "" : 'A' * ec(rk)
+#             best = l, h, s * c, t
+#             # @show s
+#         end
+#         if length(str) - l < 2
+#             break
+#         end
+#     end
+#     Z.counter += 1
+#     zip_str = string(best[2], best[3], best[4])
+#     Z.rank > 0 && (Z.pool[Z.counter%Z.rank+1] = str)
+#     return zip_str
+# end
 
 function rlcs_unzip(Z, str)
     dc(c) = c <= '9' ? (c - '0' + 2) : (c >= 'a' ? c - 'a' + 12 : c - 'A' + 38)
diff --git a/src/ngrams.jl b/src/ngrams.jl
@@ -11,14 +11,14 @@ function normalize_text(text; blacklist=String[])
     end
     text = replace(text, r"\s\s+" => " ")
 end
-function count_ngrams(text::AbstractString, n, counter=Dict{Vector{UInt8},Float32}())
-    text = transcode(UInt8, string(text))
-    for i in 1:length(text)-n+1
-        p = text[i:i+n-1]
-        counter[p] = get(counter, p, 0.0) + 1.0
-    end
-    counter
-end
+# function count_ngrams(text::AbstractString, n, counter=Dict{Vector{UInt8},Float32}())
+#     text = transcode(UInt8, string(text))
+#     for i in 1:length(text)-n+1
+#         p = text[i:i+n-1]
+#         counter[p] = get(counter, p, 0.0) + 1.0
+#     end
+#     counter
+# end
 function count_all_ngrams(text::AbstractString, rg::AbstractVector=1:5, counter=Dict{Vector{UInt8},Float32}(); kwargs...)
     text = normalize_text(text; kwargs...)
     text = transcode(UInt8, string(text))
@@ -35,48 +35,48 @@ function count_all_ngrams(text::AbstractString, n::Int, counter=Dict{Vector{UInt
 end
 
 
-function count_dataset_ngrams(dataset, n; kwargs...)
-    counters = [Dict{Vector{UInt8},Float32}() for i in 1:n]
-    for (text, lang) in dataset
-        text = normalize_text(text; kwargs...)
-        for i in 1:n
-            count_ngrams(text, i, counters[i])
-        end
-    end
-    counters
-end
+# function count_dataset_ngrams(dataset, n; kwargs...)
+#     counters = [Dict{Vector{UInt8},Float32}() for i in 1:n]
+#     for (text, lang) in dataset
+#         text = normalize_text(text; kwargs...)
+#         for i in 1:n
+#             count_ngrams(text, i, counters[i])
+#         end
+#     end
+#     counters
+# end
 
-function count_dataset_all_ngrams(dataset, n; kwargs...)
-    counter = Dict{Vector{UInt8},Float32}()
-    for (text, lang) in dataset
-        count_all_ngrams(text, n, counter; kwargs...)
-    end
-    counter
-end
+# function count_dataset_all_ngrams(dataset, n; kwargs...)
+#     counter = Dict{Vector{UInt8},Float32}()
+#     for (text, lang) in dataset
+#         count_all_ngrams(text, n, counter; kwargs...)
+#     end
+#     counter
+# end
 
-function dump_ngram_table(head::Vector{Float32}, D, filename; compress_level=63)
-    Z1, Z2 = RLCS(compress_level), RLCS(compress_level)
-    open(filename, "w") do f
-        write(f, "total:")
-        write(f, join(head, ","))
-        write(f, "\n")
-        last_v = 0.0
-        for (k, v) in D
-            @assert k isa Vector{UInt8}
-            k = join(string.(k, base=16), "")
-            kz = rlcs_zip(Z1, k)
-            write(f, kz)
-            if last_v != v
-                last_v = v
-                write(f, ",")
-                vstr = string(v)
-                vstrz = rlcs_zip(Z2, vstr)
-                write(f, vstrz)
-            end
-            write(f, "\n")
-        end
-    end
-end
+# function dump_ngram_table(head::Vector{Float32}, D, filename; compress_level=63)
+#     Z1, Z2 = RLCS(compress_level), RLCS(compress_level)
+#     open(filename, "w") do f
+#         write(f, "total:")
+#         write(f, join(head, ","))
+#         write(f, "\n")
+#         last_v = 0.0
+#         for (k, v) in D
+#             @assert k isa Vector{UInt8}
+#             k = join(string.(k, base=16), "")
+#             kz = rlcs_zip(Z1, k)
+#             write(f, kz)
+#             if last_v != v
+#                 last_v = v
+#                 write(f, ",")
+#                 vstr = string(v)
+#                 vstrz = rlcs_zip(Z2, vstr)
+#                 write(f, vstrz)
+#             end
+#             write(f, "\n")
+#         end
+#     end
+# end
 
 function ngram_table(filename)
     el = eachline(filename)
@@ -104,7 +104,7 @@ function ngram_table(filename)
     hd, Channel(producer)
 end
 
-function load_ngram_table(filename)
-    hd, tb = ngram_table(filename)
-    hd, collect(tb)
-end
+# function load_ngram_table(filename)
+#     hd, tb = ngram_table(filename)
+#     hd, collect(tb)
+# end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -14,11 +14,17 @@ using Test
     @test langid("यह एक परीक्षण है।") == "hin"
     @test langid("এটি একটি পরীক্ষা।") == "ben"
     @test langid("این یک آزمایش است.") == "fas"
+    @test langid("این یک آزمایش است.", ["jpn", "eng"]) in ["jpn", "eng"]
+    langid("این یک آزمایش است.", ngram=[2, 4])
+    langid("", ngram=3)
+    langid(" ", ngram=3:4)
     @test sum(last.(langprob("This is a test.", topk=50))) ≈ 1.0
     @test langprob("这是一个测试。", topk=1) |> only |> first == "zho"
     @test langprob("これはテストです。", ["zho", "ara"], topk=30) |> length == 2
     LI.initialize(vocabulary=200)
     @test all(last.(LI.vocabulary_sizes()) .== 201)
     LI.initialize(cutoff=0.5)
     LI.initialize(cutoff=0.75, vocabulary=200:1000)
+    LI.initialize(languages=["rus", "ara", "hin"])
+    @test length(LI.PROFILES) == 3
 end