Skip to content

Commit e54ff1b

Browse files
author
guoyongzhi
committed
comment unused code
1 parent 17938d9 commit e54ff1b

File tree

3 files changed

+114
-108
lines changed

3 files changed

+114
-108
lines changed

src/compress.jl

Lines changed: 56 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -8,63 +8,63 @@ function RLCS(rank::Int=63)
88
return RLCS(rank, fill("", rank), 0)
99
end
1010

11-
function lcs(s1::AbstractString, s2::AbstractString)
12-
m, n = length(s1), length(s2)
13-
len, pos1, pos2 = 0, 0, 0
14-
dp = zeros(Int, m + 1, n + 1)
15-
for i in 1:m
16-
for j in 1:n
17-
if s1[i] == s2[j]
18-
dp[i+1, j+1] = dp[i, j] + 1
19-
if dp[i+1, j+1] > len
20-
len = dp[i+1, j+1]
21-
pos1 = i
22-
pos2 = j
23-
end
24-
end
25-
end
26-
end
27-
return pos1-len+1:pos1, pos2-len+1:pos2
28-
end
11+
# function lcs(s1::AbstractString, s2::AbstractString)
12+
# m, n = length(s1), length(s2)
13+
# len, pos1, pos2 = 0, 0, 0
14+
# dp = zeros(Int, m + 1, n + 1)
15+
# for i in 1:m
16+
# for j in 1:n
17+
# if s1[i] == s2[j]
18+
# dp[i+1, j+1] = dp[i, j] + 1
19+
# if dp[i+1, j+1] > len
20+
# len = dp[i+1, j+1]
21+
# pos1 = i
22+
# pos2 = j
23+
# end
24+
# end
25+
# end
26+
# end
27+
# return pos1-len+1:pos1, pos2-len+1:pos2
28+
# end
2929

30-
function lcs_zip(str, refer) # `str` and `refer` must not contain any uppercase letters
31-
rg1, rg2 = lcs(refer, str)
32-
if length(rg1) > 2 || (length(rg1) == 2 && rg1[1] == 1)
33-
b1, e1 = first(rg1), last(rg1)
34-
b2, e2 = first(rg2), last(rg2)
35-
l1 = e1 - b1 + 1
36-
if b1 <= 26 && l1 <= 26
37-
bc = b1 == 1 ? "" : 'A' + (b1 - 1)
38-
lc = 'A' + (l1 - 1)
39-
code = bc * lc
40-
return l1 - length(code), str[1:b2-1], code, str[e2+1:end]
41-
end
42-
end
43-
return 0, str, "", ""
44-
end
45-
function rlcs_zip(Z, str)
46-
ec(i) = i <= 11 ? ('0' + i - 2) : (i <= 37 ? 'a' + i - 12 : 'A' + i - 38)
47-
best = 0, str, "", ""
48-
for rk in 1:min(Z.rank, Z.counter)
49-
refer = Z.pool[(Z.counter-rk+1)%Z.rank+1]
50-
# @show rk refer
51-
l, h, c, t = lcs_zip(str, refer)
52-
rk > 1 && (l -= 2)
53-
# @show l
54-
if l > best[1]
55-
s = rk == 1 ? "" : 'A' * ec(rk)
56-
best = l, h, s * c, t
57-
# @show s
58-
end
59-
if length(str) - l < 2
60-
break
61-
end
62-
end
63-
Z.counter += 1
64-
zip_str = string(best[2], best[3], best[4])
65-
Z.rank > 0 && (Z.pool[Z.counter%Z.rank+1] = str)
66-
return zip_str
67-
end
30+
# function lcs_zip(str, refer) # `str` and `refer` must not contain any uppercase letters
31+
# rg1, rg2 = lcs(refer, str)
32+
# if length(rg1) > 2 || (length(rg1) == 2 && rg1[1] == 1)
33+
# b1, e1 = first(rg1), last(rg1)
34+
# b2, e2 = first(rg2), last(rg2)
35+
# l1 = e1 - b1 + 1
36+
# if b1 <= 26 && l1 <= 26
37+
# bc = b1 == 1 ? "" : 'A' + (b1 - 1)
38+
# lc = 'A' + (l1 - 1)
39+
# code = bc * lc
40+
# return l1 - length(code), str[1:b2-1], code, str[e2+1:end]
41+
# end
42+
# end
43+
# return 0, str, "", ""
44+
# end
45+
# function rlcs_zip(Z, str)
46+
# ec(i) = i <= 11 ? ('0' + i - 2) : (i <= 37 ? 'a' + i - 12 : 'A' + i - 38)
47+
# best = 0, str, "", ""
48+
# for rk in 1:min(Z.rank, Z.counter)
49+
# refer = Z.pool[(Z.counter-rk+1)%Z.rank+1]
50+
# # @show rk refer
51+
# l, h, c, t = lcs_zip(str, refer)
52+
# rk > 1 && (l -= 2)
53+
# # @show l
54+
# if l > best[1]
55+
# s = rk == 1 ? "" : 'A' * ec(rk)
56+
# best = l, h, s * c, t
57+
# # @show s
58+
# end
59+
# if length(str) - l < 2
60+
# break
61+
# end
62+
# end
63+
# Z.counter += 1
64+
# zip_str = string(best[2], best[3], best[4])
65+
# Z.rank > 0 && (Z.pool[Z.counter%Z.rank+1] = str)
66+
# return zip_str
67+
# end
6868

6969
function rlcs_unzip(Z, str)
7070
dc(c) = c <= '9' ? (c - '0' + 2) : (c >= 'a' ? c - 'a' + 12 : c - 'A' + 38)

src/ngrams.jl

Lines changed: 52 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -11,14 +11,14 @@ function normalize_text(text; blacklist=String[])
1111
end
1212
text = replace(text, r"\s\s+" => " ")
1313
end
14-
function count_ngrams(text::AbstractString, n, counter=Dict{Vector{UInt8},Float32}())
15-
text = transcode(UInt8, string(text))
16-
for i in 1:length(text)-n+1
17-
p = text[i:i+n-1]
18-
counter[p] = get(counter, p, 0.0) + 1.0
19-
end
20-
counter
21-
end
14+
# function count_ngrams(text::AbstractString, n, counter=Dict{Vector{UInt8},Float32}())
15+
# text = transcode(UInt8, string(text))
16+
# for i in 1:length(text)-n+1
17+
# p = text[i:i+n-1]
18+
# counter[p] = get(counter, p, 0.0) + 1.0
19+
# end
20+
# counter
21+
# end
2222
function count_all_ngrams(text::AbstractString, rg::AbstractVector=1:5, counter=Dict{Vector{UInt8},Float32}(); kwargs...)
2323
text = normalize_text(text; kwargs...)
2424
text = transcode(UInt8, string(text))
@@ -35,48 +35,48 @@ function count_all_ngrams(text::AbstractString, n::Int, counter=Dict{Vector{UInt
3535
end
3636

3737

38-
function count_dataset_ngrams(dataset, n; kwargs...)
39-
counters = [Dict{Vector{UInt8},Float32}() for i in 1:n]
40-
for (text, lang) in dataset
41-
text = normalize_text(text; kwargs...)
42-
for i in 1:n
43-
count_ngrams(text, i, counters[i])
44-
end
45-
end
46-
counters
47-
end
38+
# function count_dataset_ngrams(dataset, n; kwargs...)
39+
# counters = [Dict{Vector{UInt8},Float32}() for i in 1:n]
40+
# for (text, lang) in dataset
41+
# text = normalize_text(text; kwargs...)
42+
# for i in 1:n
43+
# count_ngrams(text, i, counters[i])
44+
# end
45+
# end
46+
# counters
47+
# end
4848

49-
function count_dataset_all_ngrams(dataset, n; kwargs...)
50-
counter = Dict{Vector{UInt8},Float32}()
51-
for (text, lang) in dataset
52-
count_all_ngrams(text, n, counter; kwargs...)
53-
end
54-
counter
55-
end
49+
# function count_dataset_all_ngrams(dataset, n; kwargs...)
50+
# counter = Dict{Vector{UInt8},Float32}()
51+
# for (text, lang) in dataset
52+
# count_all_ngrams(text, n, counter; kwargs...)
53+
# end
54+
# counter
55+
# end
5656

57-
function dump_ngram_table(head::Vector{Float32}, D, filename; compress_level=63)
58-
Z1, Z2 = RLCS(compress_level), RLCS(compress_level)
59-
open(filename, "w") do f
60-
write(f, "total:")
61-
write(f, join(head, ","))
62-
write(f, "\n")
63-
last_v = 0.0
64-
for (k, v) in D
65-
@assert k isa Vector{UInt8}
66-
k = join(string.(k, base=16), "")
67-
kz = rlcs_zip(Z1, k)
68-
write(f, kz)
69-
if last_v != v
70-
last_v = v
71-
write(f, ",")
72-
vstr = string(v)
73-
vstrz = rlcs_zip(Z2, vstr)
74-
write(f, vstrz)
75-
end
76-
write(f, "\n")
77-
end
78-
end
79-
end
57+
# function dump_ngram_table(head::Vector{Float32}, D, filename; compress_level=63)
58+
# Z1, Z2 = RLCS(compress_level), RLCS(compress_level)
59+
# open(filename, "w") do f
60+
# write(f, "total:")
61+
# write(f, join(head, ","))
62+
# write(f, "\n")
63+
# last_v = 0.0
64+
# for (k, v) in D
65+
# @assert k isa Vector{UInt8}
66+
# k = join(string.(k, base=16), "")
67+
# kz = rlcs_zip(Z1, k)
68+
# write(f, kz)
69+
# if last_v != v
70+
# last_v = v
71+
# write(f, ",")
72+
# vstr = string(v)
73+
# vstrz = rlcs_zip(Z2, vstr)
74+
# write(f, vstrz)
75+
# end
76+
# write(f, "\n")
77+
# end
78+
# end
79+
# end
8080

8181
function ngram_table(filename)
8282
el = eachline(filename)
@@ -104,7 +104,7 @@ function ngram_table(filename)
104104
hd, Channel(producer)
105105
end
106106

107-
function load_ngram_table(filename)
108-
hd, tb = ngram_table(filename)
109-
hd, collect(tb)
110-
end
107+
# function load_ngram_table(filename)
108+
# hd, tb = ngram_table(filename)
109+
# hd, collect(tb)
110+
# end

test/runtests.jl

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,11 +14,17 @@ using Test
1414
@test langid("यह एक परीक्षण है।") == "hin"
1515
@test langid("এটি একটি পরীক্ষা।") == "ben"
1616
@test langid("این یک آزمایش است.") == "fas"
17+
@test langid("این یک آزمایش است.", ["jpn", "eng"]) in ["jpn", "eng"]
18+
langid("این یک آزمایش است.", ngram=[2, 4])
19+
langid("", ngram=3)
20+
langid(" ", ngram=3:4)
1721
@test sum(last.(langprob("This is a test.", topk=50))) ≈ 1.0
1822
@test langprob("这是一个测试。", topk=1) |> only |> first == "zho"
1923
@test langprob("これはテストです。", ["zho", "ara"], topk=30) |> length == 2
2024
LI.initialize(vocabulary=200)
2125
@test all(last.(LI.vocabulary_sizes()) .== 201)
2226
LI.initialize(cutoff=0.5)
2327
LI.initialize(cutoff=0.75, vocabulary=200:1000)
28+
LI.initialize(languages=["rus", "ara", "hin"])
29+
@test length(LI.PROFILES) == 3
2430
end

0 commit comments

Comments
 (0)