JuliaText · rssdev10 · Apr 8, 2026 · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026
diff --git a/docs/src/features.md b/docs/src/features.md
@@ -102,6 +102,20 @@ julia> hash_dtv(crps[1])
  0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 ```
 
+## Top Features
+
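+Use `top_terms(x, n)` to view the `n` most frequent terms of a document (any `AbstractDocument`), a `DocumentTermMatrix`, a `Corpus`, or a lexicon `Dict{String,Int}`. Terms are ordered by descending count, with ties broken alphabetically.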
+
+```julia
+julia> top_terms(m, 5)
+5-element Vector{Pair{String, Int64}}:
+     "To" => 2
+     "be" => 2
+ "become" => 2
+    "not" => 2
+     "or" => 2
+```
+
 ## TF (Term Frequency)
 
 Often we need to find out what proportion of a document is contributed by each term. This can be done using the term frequency function:

diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
@@ -54,6 +54,7 @@ export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity
 export tf!, tf_idf!, bm_25!, lda!
 export remove_patterns!, remove_patterns
 export prune!
+export top_terms
 
 export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation
 export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles

diff --git a/src/corpus.jl b/src/corpus.jl
@@ -298,3 +298,45 @@ function standardize!(crps::Corpus, ::Type{T}) where {T<:AbstractDocument}
         crps.documents[i] = convert(T, crps.documents[i])
     end
 end
+
+
+"""
+    top_terms(x, n)
+
+Return the top `n` most frequent terms from a lexicon-like object.
+
+The function accepts:
+- `Dict{String,Int}`: a mapping of terms to their frequencies
+- `Corpus`: extracts its lexicon internally via `lexicon(crps)`
+
+Terms are sorted by:
+1. Descending frequency
+2. Alphabetical order (to break ties)
+
+# Arguments
+- `x`: A `Dict{String,Int}` or a `Corpus`
+- `n`: Number of top terms to return (a lexicon `Dict` also accepts `Val(n)`)
+
+# Returns
+A `Vector{Pair{String,Int}}` containing up to `n` term-frequency pairs.
+
+# Examples
+```julia
+julia> top_terms(Dict("be" => 2, "To" => 1, "or" => 1), 2)
+2-element Vector{Pair{String, Int64}}:
+ "be" => 2
+ "To" => 1
+```
+"""
+function top_terms(lx::Dict{String,Int}, n::Integer)
+    term_counts = collect(pairs(lx))
+    k = min(n, length(term_counts))
+    # `n` is deliberately a runtime value: routing it through `Val(n)` forces
+    # dynamic dispatch and compiles a new specialization for every distinct `n`.
+    # Sort key: count descending, then term ascending to break ties.
+    idx = partialsortperm(term_counts, 1:k; by = p -> (-p.second, p.first))
+    return term_counts[idx]
+end
+# Backward-compatible entry point for existing `top_terms(lx, Val(n))` callers.
+top_terms(lx::Dict{String,Int}, ::Val{N}) where {N} = top_terms(lx, N)
+top_terms(crps::Corpus, n::Int) = top_terms(lexicon(crps), n)
diff --git a/src/document.jl b/src/document.jl
@@ -398,3 +398,34 @@ Base.convert(::Type{NGramDocument}, d::NGramDocument) = d
 ##############################################################################
 
 Base.getindex(d::AbstractDocument, term::AbstractString) = ngrams(d)[term]
+
+"""
+    top_terms(d, n)
+
+Return the top `n` most frequent terms in an `AbstractDocument`.
+
+The document is tokenized with [`tokens`](@ref), terms are counted, and the
+result is returned as a vector of term-count pairs ordered by:
+
+1. descending term frequency
+2. alphabetical order to break ties
+
+# Arguments
+- `d::AbstractDocument`: document to analyze
+- `n`: number of top terms to return (`Val(n)` is also accepted)
+
+# Returns
+A vector of term-count pairs containing up to `n` entries.
+
+# Notes
+If `n` is larger than the number of distinct tokens in `d`, all distinct terms
+are returned.
+"""
+function top_terms(d::AbstractDocument, n::Integer)
+    term_counts = collect(pairs(countmap(tokens(d))))
+    k = min(n, length(term_counts))
+    # Count descending, ties alphabetical; `n` stays a runtime value (no `Val` dispatch).
+    idx = partialsortperm(term_counts, 1:k; by = p -> (-p.second, p.first))
+    return term_counts[idx]
+end
+top_terms(d::AbstractDocument, ::Val{N}) where {N} = top_terms(d, N)
diff --git a/src/dtm.jl b/src/dtm.jl
@@ -440,3 +440,21 @@ function merge!(dtm1::DocumentTermMatrix{T}, dtm2::DocumentTermMatrix{T}) where
 
     dtm1
 end
+
+"""
+    top_terms(x)
+    top_terms(x, n)
+
+Return `term => count` pairs sorted in descending frequency; ties are sorted
+alphabetically. With `n`, return only the top `n` terms.
+Accepts a `Corpus`, `AbstractDocument`, lexicon `Dict`, or `DocumentTermMatrix`.
+"""
+function top_terms(D::DocumentTermMatrix, n::Integer)
+    # Summing over documents (rows) leaves one total per term (column).
+    term_counts = D.terms .=> vec(sum(D.dtm; dims=1))
+    k = min(n, length(term_counts))
+    idx = partialsortperm(term_counts, 1:k; by = p -> (-p.second, p.first))
+    return term_counts[idx]
+end
+top_terms(D::DocumentTermMatrix, ::Val{N}) where {N} = top_terms(D, N)
+top_terms(x) = top_terms(x, typemax(Int))  # every method caps `n` at the term count
diff --git a/test/corpus.jl b/test/corpus.jl
@@ -39,6 +39,8 @@
     update_lexicon!(crps)
     answer = Dict("1" => 2, "2" => 1, "4" => 1)
 
+    @test top_terms(crps, 1) == top_terms(crps[1], 1)  # NOTE(review): holds only if doc 1's top term (and its count) is also the corpus-wide top — confirm fixture
+
     @test answer == lexicon(crps)
 end
 

diff --git a/test/document.jl b/test/document.jl
@@ -66,6 +66,12 @@
     @test isa(ngd, NGramDocument)
     @test "To" in keys(ngrams(ngd))
 
+    # top_terms: descending count, ties broken alphabetically (uppercase "To" sorts before lowercase terms)
+    top = top_terms(sd, 5)
+    @test [pair.first for pair in top] == ["be", "To", "not", "or", "to"]
+    @test [pair.second for pair in top] == [2, 1, 1, 1, 1]
+    @test top_terms(sd, 2) == ["be" => 2, "To" => 1]
+
     sd = StringDocument(hamlet_text)
     td = TokenDocument(hamlet_text)
     ngd = NGramDocument(hamlet_text)

diff --git a/test/dtm.jl b/test/dtm.jl
@@ -109,4 +109,13 @@
     @test dtm2.terms == ["five", "four", "three", "two"]
     @test size(dtm2.dtm) == (2, 4)
     @test sum(dtm2.dtm, dims=(1,)) == [1 2 2 1]
+
+    # top_terms on a DocumentTermMatrix: descending count, ties alphabetical ("And" before "and")
+    crps3 = Corpus([FileDocument(sample_file)])
+    update_lexicon!(crps3)
+    m3 = DocumentTermMatrix(crps3)
+    top5 = top_terms(m3, 5)
+    @test top5 isa Vector{<:Pair}
+    @test [pair.first for pair in top5] == [",", "thou", "And", "and", ";"]
+    @test [pair.second for pair in top5] == [29, 6, 5, 5, 3]
 end