@@ -11,14 +11,14 @@ function normalize_text(text; blacklist=String[])
1111 end
1212 text = replace (text, r" \s\s +" => " " )
1313end
14- function count_ngrams (text:: AbstractString , n, counter= Dict {Vector{UInt8},Float32} ())
15- text = transcode (UInt8, string (text))
16- for i in 1 : length (text)- n+ 1
17- p = text[i: i+ n- 1 ]
18- counter[p] = get (counter, p, 0.0 ) + 1.0
19- end
20- counter
21- end
14+ # function count_ngrams(text::AbstractString, n, counter=Dict{Vector{UInt8},Float32}())
15+ # text = transcode(UInt8, string(text))
16+ # for i in 1:length(text)-n+1
17+ # p = text[i:i+n-1]
18+ # counter[p] = get(counter, p, 0.0) + 1.0
19+ # end
20+ # counter
21+ # end
2222function count_all_ngrams (text:: AbstractString , rg:: AbstractVector = 1 : 5 , counter= Dict {Vector{UInt8},Float32} (); kwargs... )
2323 text = normalize_text (text; kwargs... )
2424 text = transcode (UInt8, string (text))
@@ -35,48 +35,48 @@ function count_all_ngrams(text::AbstractString, n::Int, counter=Dict{Vector{UInt
3535end
3636
3737
38- function count_dataset_ngrams (dataset, n; kwargs... )
39- counters = [Dict {Vector{UInt8},Float32} () for i in 1 : n]
40- for (text, lang) in dataset
41- text = normalize_text (text; kwargs... )
42- for i in 1 : n
43- count_ngrams (text, i, counters[i])
44- end
45- end
46- counters
47- end
38+ # function count_dataset_ngrams(dataset, n; kwargs...)
39+ # counters = [Dict{Vector{UInt8},Float32}() for i in 1:n]
40+ # for (text, lang) in dataset
41+ # text = normalize_text(text; kwargs...)
42+ # for i in 1:n
43+ # count_ngrams(text, i, counters[i])
44+ # end
45+ # end
46+ # counters
47+ # end
4848
49- function count_dataset_all_ngrams (dataset, n; kwargs... )
50- counter = Dict {Vector{UInt8},Float32} ()
51- for (text, lang) in dataset
52- count_all_ngrams (text, n, counter; kwargs... )
53- end
54- counter
55- end
49+ # function count_dataset_all_ngrams(dataset, n; kwargs...)
50+ # counter = Dict{Vector{UInt8},Float32}()
51+ # for (text, lang) in dataset
52+ # count_all_ngrams(text, n, counter; kwargs...)
53+ # end
54+ # counter
55+ # end
5656
57- function dump_ngram_table (head:: Vector{Float32} , D, filename; compress_level= 63 )
58- Z1, Z2 = RLCS (compress_level), RLCS (compress_level)
59- open (filename, " w" ) do f
60- write (f, " total:" )
61- write (f, join (head, " ," ))
62- write (f, " \n " )
63- last_v = 0.0
64- for (k, v) in D
65- @assert k isa Vector{UInt8}
66- k = join (string .(k, base= 16 ), " " )
67- kz = rlcs_zip (Z1, k)
68- write (f, kz)
69- if last_v != v
70- last_v = v
71- write (f, " ," )
72- vstr = string (v)
73- vstrz = rlcs_zip (Z2, vstr)
74- write (f, vstrz)
75- end
76- write (f, " \n " )
77- end
78- end
79- end
57+ # function dump_ngram_table(head::Vector{Float32}, D, filename; compress_level=63)
58+ # Z1, Z2 = RLCS(compress_level), RLCS(compress_level)
59+ # open(filename, "w") do f
60+ # write(f, "total:")
61+ # write(f, join(head, ","))
62+ # write(f, "\n")
63+ # last_v = 0.0
64+ # for (k, v) in D
65+ # @assert k isa Vector{UInt8}
66+ # k = join(string.(k, base=16), "")
67+ # kz = rlcs_zip(Z1, k)
68+ # write(f, kz)
69+ # if last_v != v
70+ # last_v = v
71+ # write(f, ",")
72+ # vstr = string(v)
73+ # vstrz = rlcs_zip(Z2, vstr)
74+ # write(f, vstrz)
75+ # end
76+ # write(f, "\n")
77+ # end
78+ # end
79+ # end
8080
8181function ngram_table (filename)
8282 el = eachline (filename)
@@ -104,7 +104,7 @@ function ngram_table(filename)
104104 hd, Channel (producer)
105105end
106106
107- function load_ngram_table (filename)
108- hd, tb = ngram_table (filename)
109- hd, collect (tb)
110- end
107+ # function load_ngram_table(filename)
108+ # hd, tb = ngram_table(filename)
109+ # hd, collect(tb)
110+ # end
0 commit comments