Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions docs/src/features.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,20 @@ julia> hash_dtv(crps[1])
0 0 0 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 0 0
```

## Top Features

Use `top_terms(x, n)` to view the `n` most frequent terms of a document (any `AbstractDocument`), a `DocumentTermMatrix`, or a `Corpus`. Terms are ordered by descending count, with ties broken alphabetically.

```julia
julia> top_terms(m, 5)
5-element Vector{Pair{String, Int64}}:
"To" => 2
"be" => 2
"become" => 2
"not" => 2
"or" => 2
```

## TF (Term Frequency)

Often we need to find out what proportion of a document is contributed by each term. This can be done using the term frequency function:
Expand Down
1 change: 1 addition & 0 deletions src/TextAnalysis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity
export tf!, tf_idf!, bm_25!, lda!
export remove_patterns!, remove_patterns
export prune!
export top_terms

export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation
export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
Expand Down
42 changes: 42 additions & 0 deletions src/corpus.jl
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,45 @@ function standardize!(crps::Corpus, ::Type{T}) where {T<:AbstractDocument}
crps.documents[i] = convert(T, crps.documents[i])
end
end


"""
top_terms(x, n)

Return the top `n` most frequent terms from a lexicon-like object.

The function accepts:
- `Dict{String,Int}`: a mapping of terms to their frequencies
- `Corpus`: extracts its lexicon internally via `lexicon(crps)`

Terms are sorted by:
1. Descending frequency
2. Alphabetical order (to break ties)

# Arguments
- `x`: A `Dict{String,Int}` or a `Corpus`
- `n::Int`: Number of top terms to return

# Returns
A `Vector{Pair{String,Int}}` containing up to `n` term-frequency pairs.

# Examples
```julia
julia> top_terms(Dict("be" => 2, "to" => 2, "or" => 1, "an" => 1), 3)
3-element Vector{Pair{String, Int64}}:
 "be" => 2
 "to" => 2
 "an" => 1
```
"""
function top_terms(lx::Dict{String,Int}, n::Integer)
    # Materialize the lexicon as `term => count` pairs so it can be ranked.
    entries = collect(pairs(lx))
    # Never request more entries than exist. `min` promotes to `Int` here, so the
    # index range below stays `Int`-typed even when `n` is a narrower integer.
    k = Int(min(n, length(entries)))
    # Rank by descending count; the term itself breaks ties alphabetically.
    # Only the first `k` positions are ordered, which avoids a full sort.
    order = partialsortperm(entries, 1:k; by = p -> (-p.second, p.first))
    return entries[order]
end
# Backward-compatible entry point for callers that pass `Val(n)`. The count is a
# runtime value, so the integer method above is the real implementation: lifting
# `n` into the type domain would compile one specialization per distinct `n`.
top_terms(lx::Dict{String,Int}, ::Val{N}) where {N} = top_terms(lx, N)
# Corpus entry point: rank the terms of the corpus lexicon (`lexicon(crps)`).
# `n` is forwarded as a plain `Int` rather than `Val(n)`, so a runtime count is
# not lifted into the type domain on every call.
top_terms(crps::Corpus, n::Integer) = top_terms(lexicon(crps), Int(n))
31 changes: 31 additions & 0 deletions src/document.jl
Original file line number Diff line number Diff line change
Expand Up @@ -398,3 +398,34 @@ Base.convert(::Type{NGramDocument}, d::NGramDocument) = d
##############################################################################

Base.getindex(d::AbstractDocument, term::AbstractString) = ngrams(d)[term]

"""
top_terms(d, n)

Return the top `n` most frequent terms in an `AbstractDocument`.

The document is tokenized with [`tokens`](@ref), terms are counted, and the
result is returned as a vector of `Pair{String,Int}`-like values ordered by:

1. descending term frequency
2. alphabetical order to break ties

# Arguments
- `d::AbstractDocument`: document to analyze
- `n::Int`: number of top terms to return

# Returns
A vector of term-count pairs containing up to `n` entries.

# Notes
If `n` is larger than the number of distinct tokens in `d`, all distinct terms
are returned.
"""
function top_terms(d::AbstractDocument, n::Integer)
    # Tally how often each token of the document occurs, as `term => count` pairs.
    entries = collect(pairs(countmap(tokens(d))))
    # Never request more entries than there are distinct tokens; `Int(...)` keeps
    # the index range `Int`-typed for any integer type of `n`.
    k = Int(min(n, length(entries)))
    # Rank by descending count; the term itself breaks ties alphabetically.
    order = partialsortperm(entries, 1:k; by = p -> (-p.second, p.first))
    return entries[order]
end
# Backward-compatible entry point for callers that pass `Val(n)`. The integer
# method above does the work, so a runtime count never becomes a type parameter
# (which would compile one specialization per distinct `n`).
top_terms(d::AbstractDocument, ::Val{N}) where {N} = top_terms(d, N)
18 changes: 18 additions & 0 deletions src/dtm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -440,3 +440,21 @@ function merge!(dtm1::DocumentTermMatrix{T}, dtm2::DocumentTermMatrix{T}) where

dtm1
end

"""
    top_terms(x, n)

Return the top `n` terms of `x`, sorted by descending frequency; if `x` has fewer than
`n` distinct terms, all of them are returned.
Accepts a `Corpus`, `AbstractDocument`, lexicon `Dict`, or `DocumentTermMatrix`.
Ties are sorted alphabetically.
"""
function top_terms(D::DocumentTermMatrix, n::Integer)
    # Sum the matrix along its first dimension: one total per column, which is
    # paired element-wise with `D.terms` below.
    totals = vec(sum(D.dtm; dims=1))
    entries = D.terms .=> totals
    # Never request more entries than there are terms; `Int(...)` keeps the index
    # range `Int`-typed for any integer type of `n`.
    k = Int(min(n, length(entries)))
    # Rank by descending count; the term itself breaks ties alphabetically.
    order = partialsortperm(entries, 1:k; by = p -> (-p.second, p.first))
    return entries[order]
end
# Backward-compatible entry point for callers that pass `Val(n)`. The integer
# method above does the work, so a runtime count never becomes a type parameter
# (which would compile one specialization per distinct `n`).
top_terms(D::DocumentTermMatrix, ::Val{N}) where {N} = top_terms(D, N)
2 changes: 2 additions & 0 deletions test/corpus.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
update_lexicon!(crps)
answer = Dict("1" => 2, "2" => 1, "4" => 1)

@test top_terms(crps, 1) == top_terms(crps[1], 1)

@test answer == lexicon(crps)
end

Expand Down
6 changes: 6 additions & 0 deletions test/document.jl
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@
@test isa(ngd, NGramDocument)
@test "To" in keys(ngrams(ngd))

# Test top features
top = top_terms(sd, 5)
@test [pair.first for pair in top] == ["be", "To", "not", "or", "to"]
@test [pair.second for pair in top] == [2, 1, 1, 1, 1]
@test top_terms(sd, 2) == ["be" => 2, "To" => 1]

sd = StringDocument(hamlet_text)
td = TokenDocument(hamlet_text)
ngd = NGramDocument(hamlet_text)
Expand Down
9 changes: 9 additions & 0 deletions test/dtm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,13 @@
@test dtm2.terms == ["five", "four", "three", "two"]
@test size(dtm2.dtm) == (2, 4)
@test sum(dtm2.dtm, dims=(1,)) == [1 2 2 1]

# Test top_terms
crps3 = Corpus([FileDocument(sample_file)])
update_lexicon!(crps3)
m3 = DocumentTermMatrix(crps3)
top5 = top_terms(m3, 5)
@test top5 isa Vector{<:Pair}
@test [pair.first for pair in top5] == [",", "thou", "And", "and", ";"]
@test [pair.second for pair in top5] == [29, 6, 5, 5, 3]
end
Loading