diff --git a/docs/src/features.md b/docs/src/features.md
index eb95e6db..6f2a3676 100644
--- a/docs/src/features.md
+++ b/docs/src/features.md
@@ -102,6 +102,20 @@ julia> hash_dtv(crps[1])
  0 0 0 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 0 0
 ```
 
+## Top Features
+
+We can use the function `top_terms(x, n)` to quickly view the `n` most frequent terms of a `Document`, `DocumentTermMatrix` or `Corpus`. Terms are ordered by descending count, with ties broken alphabetically.
+
+```julia
+julia> top_terms(m, 5)
+5-element Vector{Pair{String, Int64}}:
+ "To" => 2
+ "be" => 2
+ "become" => 2
+ "not" => 2
+ "or" => 2
+```
+
 ## TF (Term Frequency)
 
 Often we need to find out what proportion of a document is contributed by each term. This can be done using the term frequency function:
diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 2d763b6b..ab653697 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -54,6 +54,7 @@ export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity
 export tf!, tf_idf!, bm_25!, lda!
 export remove_patterns!, remove_patterns
 export prune!
+export top_terms
 
 export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation
 export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
diff --git a/src/corpus.jl b/src/corpus.jl
index 9d3b273b..c4d83360 100644
--- a/src/corpus.jl
+++ b/src/corpus.jl
@@ -298,3 +298,49 @@ function standardize!(crps::Corpus, ::Type{T}) where {T<:AbstractDocument}
         crps.documents[i] = convert(T, crps.documents[i])
     end
 end
+
+
+"""
+    top_terms(x, n)
+
+Return the top `n` most frequent terms from a lexicon-like object.
+
+The function accepts:
+- `Dict{String,Int}`: a mapping of terms to their frequencies
+- `Corpus`: extracts its lexicon internally via `lexicon(crps)`
+
+Terms are sorted by:
+1. Descending frequency
+2. Alphabetical order (to break ties)
+
+# Arguments
+- `x`: A `Dict{String,Int}` or a `Corpus`
+- `n::Integer`: Number of top terms to return
+
+# Returns
+A `Vector{Pair{String,Int}}` containing up to `n` term-frequency pairs. All terms are
+returned when `n` exceeds their number; an empty vector is returned when `n <= 0`.
+
+# Examples
+```julia
+julia> top_terms(Dict("a" => 3, "b" => 1, "c" => 3), 2)
+2-element Vector{Pair{String, Int64}}:
+ "a" => 3
+ "c" => 3
+```
+"""
+top_terms(lx::Dict{String,Int}, n::Integer) = _top_pairs(collect(pairs(lx)), n)
+top_terms(crps::Corpus, n::Integer) = top_terms(lexicon(crps), n)
+# `Val(n)` form kept for backward compatibility. `n` is a runtime value, so the work is
+# done by the `Integer` methods rather than by one compiled specialization per `n`.
+top_terms(lx::Dict{String,Int}, ::Val{N}) where {N} = top_terms(lx, N)
+
+# Select the `n` highest-count pairs of `termcounts`: count descending, ties broken
+# alphabetically. Shared by every `top_terms` method.
+function _top_pairs(termcounts::AbstractVector{<:Pair}, n::Integer)
+    k = clamp(n, 0, length(termcounts))
+    # Answer the empty request directly instead of sorting over an empty range.
+    k == 0 && return termcounts[1:0]
+    idx = partialsortperm(termcounts, 1:k; by = p -> (-p.second, p.first))
+    return termcounts[idx]
+end
diff --git a/src/document.jl b/src/document.jl
index e933f9a7..29ca9bea 100644
--- a/src/document.jl
+++ b/src/document.jl
@@ -398,3 +398,32 @@ Base.convert(::Type{NGramDocument}, d::NGramDocument) = d
 ##############################################################################
 
 Base.getindex(d::AbstractDocument, term::AbstractString) = ngrams(d)[term]
+
+"""
+    top_terms(d, n)
+
+Return the top `n` most frequent terms in an `AbstractDocument`.
+
+The document is tokenized with [`tokens`](@ref), terms are counted, and the
+result is returned as a vector of term-count pairs ordered by:
+
+1. descending term frequency
+2. alphabetical order to break ties
+
+# Arguments
+- `d::AbstractDocument`: document to analyze
+- `n::Integer`: number of top terms to return
+
+# Returns
+A vector of term-count pairs containing up to `n` entries.
+
+# Notes
+If `n` is larger than the number of distinct tokens in `d`, all distinct terms
+are returned. If `n <= 0`, an empty vector is returned.
+"""
+function top_terms(d::AbstractDocument, n::Integer)
+    termcounts = collect(pairs(countmap(tokens(d))))
+    return _top_pairs(termcounts, n)
+end
+# `Val(n)` form kept for backward compatibility; it forwards to the `Integer` method.
+top_terms(d::AbstractDocument, ::Val{N}) where {N} = top_terms(d, N)
diff --git a/src/dtm.jl b/src/dtm.jl
index 35c9cc7c..0555a50d 100644
--- a/src/dtm.jl
+++ b/src/dtm.jl
@@ -440,3 +440,19 @@ function merge!(dtm1::DocumentTermMatrix{T}, dtm2::DocumentTermMatrix{T}) where
 
     dtm1
 end
+
+"""
+    top_terms(D::DocumentTermMatrix, n)
+
+Return the `n` most frequent terms of `D` as term-count pairs.
+A term's count is the sum of its column of `D.dtm`.
+Pairs are sorted by descending count; ties are sorted alphabetically.
+At most `length(D.terms)` pairs are returned, and none when `n <= 0`.
+"""
+function top_terms(D::DocumentTermMatrix, n::Integer)
+    # Summing over dims=1 leaves one total per column, which is paired with `D.terms`.
+    counts = vec(sum(D.dtm; dims=1))
+    return _top_pairs(D.terms .=> counts, n)
+end
+# `Val(n)` form kept for backward compatibility; it forwards to the `Integer` method.
+top_terms(D::DocumentTermMatrix, ::Val{N}) where {N} = top_terms(D, N)
diff --git a/test/corpus.jl b/test/corpus.jl
index 044c89a6..3495697b 100644
--- a/test/corpus.jl
+++ b/test/corpus.jl
@@ -39,6 +39,11 @@
     update_lexicon!(crps)
     answer = Dict("1" => 2, "2" => 1, "4" => 1)
 
+    @test top_terms(crps, 1) == top_terms(crps[1], 1)
+    @test top_terms(answer, 10) == ["1" => 2, "2" => 1, "4" => 1]
+    @test isempty(top_terms(answer, 0))
+    @test top_terms(answer, Val(2)) == top_terms(answer, 2)
+
     @test answer == lexicon(crps)
 end
 
diff --git a/test/document.jl b/test/document.jl
index 8ffa3ef3..8fe0aa29 100644
--- a/test/document.jl
+++ b/test/document.jl
@@ -66,6 +66,14 @@
     @test isa(ngd, NGramDocument)
     @test "To" in keys(ngrams(ngd))
 
+    # Test top features
+    top = top_terms(sd, 5)
+    @test [pair.first for pair in top] == ["be", "To", "not", "or", "to"]
+    @test [pair.second for pair in top] == [2, 1, 1, 1, 1]
+    @test top_terms(sd, 2) == ["be" => 2, "To" => 1]
+    @test isempty(top_terms(sd, 0))
+    @test top_terms(sd, Val(2)) == top_terms(sd, 2)
+
     sd = StringDocument(hamlet_text)
     td = TokenDocument(hamlet_text)
     ngd = NGramDocument(hamlet_text)
diff --git a/test/dtm.jl b/test/dtm.jl
index 0a2f01fd..e7a5a5f4 100644
--- a/test/dtm.jl
+++ b/test/dtm.jl
@@ -109,4 +109,13 @@
     @test dtm2.terms == ["five", "four", "three", "two"]
     @test size(dtm2.dtm) == (2, 4)
     @test sum(dtm2.dtm, dims=(1,)) == [1 2 2 1]
+
+    # Test top_terms
+    crps3 = Corpus([FileDocument(sample_file)])
+    update_lexicon!(crps3)
+    m3 = DocumentTermMatrix(crps3)
+    top5 = top_terms(m3, 5)
+    @test top5 isa Vector{<:Pair}
+    @test [pair.first for pair in top5] == [",", "thou", "And", "and", ";"]
+    @test [pair.second for pair in top5] == [29, 6, 5, 5, 3]
 end