-
Notifications
You must be signed in to change notification settings - Fork 5
fix(trends): use TF-IDF to filter out low-signal words (#329) #338
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,12 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import asyncio | ||
| import logging | ||
| from collections import Counter | ||
| from dataclasses import dataclass | ||
|
|
||
| from sklearn.feature_extraction.text import TfidfVectorizer | ||
|
|
||
| from src.database import Database | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
@@ -44,47 +48,82 @@ class PeakHour: | |
| class TrendService: | ||
| """Trend analysis over collected messages.""" | ||
|
|
||
| _TOPIC_BATCH_SIZE = 5000 | ||
| _MAX_TOPIC_DOCUMENTS = 10000 | ||
|
|
||
| def __init__(self, db: Database) -> None: | ||
| self._db = db | ||
|
|
||
| async def get_trending_topics(self, days: int = 7, limit: int = 20) -> list[TrendingTopic]: | ||
| """Return top keywords by frequency in messages from the last N days. | ||
| """Return top keywords ranked by TF-IDF from recent messages. | ||
|
|
||
| Uses a simple word-frequency approach on the messages table so it works | ||
| even without FTS5. Short words (<4 chars) and stop-words are skipped. | ||
| Processes rows in batches to avoid loading all texts into memory at once. | ||
| Uses a bounded corpus of recent messages to avoid unbounded memory usage | ||
| and keeps raw mention counts so downstream callers can continue to render | ||
| the results as "mentions". | ||
| """ | ||
| batch_size = 5000 | ||
| if limit <= 0: | ||
| return [] | ||
|
|
||
| offset = 0 | ||
| word_counts: dict[str, int] = {} | ||
| stop_words = { | ||
| "и", "в", "на", "с", "по", "не", "это", "то", "что", | ||
| "как", "из", "за", "от", "для", "или", "но", "а", | ||
| "the", "and", "is", "in", "to", "of", "a", "for", | ||
| } | ||
| while True: | ||
| texts: list[str] = [] | ||
|
|
||
| while len(texts) < self._MAX_TOPIC_DOCUMENTS: | ||
| batch_size = min(self._TOPIC_BATCH_SIZE, self._MAX_TOPIC_DOCUMENTS - len(texts)) | ||
| rows = await self._db.execute_fetchall( | ||
| """ | ||
| SELECT text FROM messages | ||
| WHERE date >= date('now', ?) | ||
| AND COALESCE(TRIM(text), '') <> '' | ||
| ORDER BY date DESC, id DESC | ||
| LIMIT ? OFFSET ? | ||
| """, | ||
| (f"-{days} days", batch_size, offset), | ||
| ) | ||
| if not rows: | ||
| break | ||
| for row in rows: | ||
| text = row["text"] or "" | ||
| for word in text.split(): | ||
| w = word.lower().strip(".,!?:;\"'()[]{}–—") | ||
| if len(w) >= 4 and w not in stop_words and w.isalpha(): | ||
| word_counts[w] = word_counts.get(w, 0) + 1 | ||
| texts.extend(row["text"] or "" for row in rows) | ||
| if len(rows) < batch_size: | ||
| break | ||
| offset += batch_size | ||
| sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True) | ||
| return [TrendingTopic(keyword=w, count=c) for w, c in sorted_words[:limit]] | ||
| offset += len(rows) | ||
|
|
||
| if not texts: | ||
| return [] | ||
|
|
||
| if len(texts) == self._MAX_TOPIC_DOCUMENTS: | ||
| logger.info( | ||
| "Capped trending-topic corpus at %d most recent messages", | ||
| self._MAX_TOPIC_DOCUMENTS, | ||
| ) | ||
|
|
||
| return await asyncio.to_thread(self._rank_trending_topics, texts, limit) | ||
|
|
||
| def _rank_trending_topics(self, texts: list[str], limit: int) -> list[TrendingTopic]: | ||
| vectorizer = TfidfVectorizer( | ||
| token_pattern=r"(?u)\b[а-яёa-z]{4,}\b", | ||
| max_df=0.85, | ||
| min_df=2, | ||
| ) | ||
| try: | ||
| tfidf_matrix = vectorizer.fit_transform(texts) | ||
| except ValueError: | ||
|
Comment on lines +101 to +108
|
||
| return [] | ||
|
|
||
| feature_names = vectorizer.get_feature_names_out() | ||
| scores = tfidf_matrix.sum(axis=0).A1 | ||
| mention_counts: Counter[str] = Counter() | ||
| analyzer = vectorizer.build_analyzer() | ||
|
|
||
| for text in texts: | ||
| mention_counts.update(analyzer(text)) | ||
|
|
||
| top_indices = scores.argsort()[::-1] | ||
| topics: list[TrendingTopic] = [] | ||
| for index in top_indices: | ||
| keyword = feature_names[index] | ||
| topics.append(TrendingTopic(keyword=keyword, count=mention_counts[keyword])) | ||
| if len(topics) >= limit: | ||
| break | ||
| return topics | ||
|
|
||
| async def get_trending_channels(self, days: int = 7, limit: int = 10) -> list[TrendingChannel]: | ||
| """Return channels with the highest average views in the last N days.""" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
PR description mentions “configurable thresholds”, but `max_df=0.85`, `min_df=2`, and the token pattern are hard-coded here. If these are intended to be configurable, consider adding parameters (with defaults) or reading them from config so behavior can be tuned without code changes.