Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,7 @@
## 2026-02-10 - Group-By for Multi-Count Statistics
**Learning:** Executing multiple `count()` queries with different filters (e.g., for different statuses) causes redundant database scans and network round-trips.
**Action:** Use a single SQL `GROUP BY` query to fetch counts for all categories/statuses at once, then process the results in Python.

## 2026-02-12 - O(1) LRU vs O(N) LFU in High-Concurrency Cache
**Learning:** Using a simple dictionary with an access counter for eviction results in O(N) complexity for every `set` operation when the cache is full, as it must scan all keys to find the minimum. Under high concurrency and large cache sizes, this introduces significant latency and lock contention.
**Action:** Use `collections.OrderedDict` to implement an O(1) LRU cache. `move_to_end()` on `get` and `popitem(last=False)` on `set` (when full) ensures constant time complexity for eviction, keeping the event loop responsive even under load.
69 changes: 50 additions & 19 deletions backend/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,60 +3,68 @@
import threading
from typing import Any, Optional
from datetime import datetime, timedelta
from collections import OrderedDict

logger = logging.getLogger(__name__)

class ThreadSafeCache:
"""
Thread-safe cache implementation with TTL and memory management.
Fixes race conditions and implements proper cache expiration.
Thread-safe cache implementation with TTL and O(1) LRU eviction.
Uses OrderedDict to maintain access order for efficient memory management.
"""

def __init__(self, ttl: int = 300, max_size: int = 100):
self._data = {}
self._data = OrderedDict()
self._timestamps = {}
self._ttl = ttl # Time to live in seconds
self._max_size = max_size # Maximum number of cache entries
self._lock = threading.RLock() # Reentrant lock for thread safety
self._access_count = {} # Track access frequency for LRU eviction
self._hits = 0
self._misses = 0

def get(self, key: str = "default") -> Optional[Any]:
"""
Thread-safe get operation with automatic cleanup.
Thread-safe get operation with automatic cleanup and LRU update.
Performance: O(1)
"""
with self._lock:
current_time = time.time()

# Check if key exists and is not expired
if key in self._data and key in self._timestamps:
if current_time - self._timestamps[key] < self._ttl:
# Update access count for LRU
self._access_count[key] = self._access_count.get(key, 0) + 1
# Move to end (Mark as Most Recently Used)
self._data.move_to_end(key)
self._hits += 1
return self._data[key]
else:
# Expired entry - remove it
self._remove_key(key)

self._misses += 1
return None

def set(self, data: Any, key: str = "default") -> None:
"""
Thread-safe set operation with memory management.
Thread-safe set operation with O(1) LRU eviction.
Performance: O(1)
"""
with self._lock:
current_time = time.time()

# Clean up expired entries before adding new one
self._cleanup_expired()

Comment on lines +49 to 57
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

set() write path is still O(N) due unconditional TTL sweep.

Calling _cleanup_expired() on every write makes set linear in cache size under lock, which can bring back contention under load.

⚙️ Suggested adjustment
 def set(self, data: Any, key: str = "default") -> None:
@@
-            # Clean up expired entries before adding new one
-            self._cleanup_expired()
+            # Avoid full sweep on every write; sweep when needed
+            if key not in self._data and len(self._data) >= self._max_size:
+                self._cleanup_expired()
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/cache.py` around lines 49 - 57, The set() path currently calls
_cleanup_expired() under self._lock on every write, making writes O(N); remove
the unconditional TTL sweep from set() and instead trigger expiration cleanup
only conditionally (e.g., when cache size exceeds capacity, on a probabilistic
sampling chance, or via a dedicated background janitor thread), while preserving
thread-safety and O(1) LRU operations; update references in set(),
_cleanup_expired(), and any eviction logic so the lock usage remains correct and
expired entries are still removed in time without making set() linear.

# If cache is full, evict least recently used entry
if len(self._data) >= self._max_size and key not in self._data:
# If key exists, move to end before updating
if key in self._data:
self._data.move_to_end(key)
# If cache is full and key is new, evict oldest (first) entry
elif len(self._data) >= self._max_size:
self._evict_lru()

# Set new data atomically
self._data[key] = data
self._timestamps[key] = current_time
self._access_count[key] = 1

logger.debug(f"Cache set: key={key}, size={len(self._data)}")

Expand All @@ -75,7 +83,6 @@ def clear(self) -> None:
with self._lock:
self._data.clear()
self._timestamps.clear()
self._access_count.clear()
logger.debug("Cache cleared")

def get_stats(self) -> dict:
Expand All @@ -93,7 +100,9 @@ def get_stats(self) -> dict:
"total_entries": len(self._data),
"expired_entries": expired_count,
"max_size": self._max_size,
"ttl_seconds": self._ttl
"ttl_seconds": self._ttl,
"hits": self._hits,
"misses": self._misses
}

def _remove_key(self, key: str) -> None:
Expand All @@ -103,14 +112,33 @@ def _remove_key(self, key: str) -> None:
"""
self._data.pop(key, None)
self._timestamps.pop(key, None)
self._access_count.pop(key, None)

def _cleanup_expired(self) -> None:
"""
Internal method to clean up expired entries.
Must be called within lock context.
"""
current_time = time.time()
# Note: In OrderedDict, we could stop early if we encounter a non-expired item,
# but since items can be updated (moving to end), we stick to full scan or
# just check the oldest. However, multiple items can expire.
# Efficient cleanup: check from the beginning (oldest)
expired_keys = []
for key in self._data:
if current_time - self._timestamps[key] >= self._ttl:
expired_keys.append(key)
else:
# Since we move accessed/updated items to the end,
# we can't assume total temporal ordering here if TTL varies,
# but with fixed TTL, items at the front are older.
# Actually, move_to_end breaks strict temporal ordering of 'set' time.
# So we keep the list comprehension for safety or just check all.
pass

# Re-evaluating: move_to_end is for LRU. If we want TTL to be efficient,
# we'd need another structure. But for max_size=100, full scan is fine.

# To be safe and simple:
Comment on lines +122 to +141
Copy link

Copilot AI Feb 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_cleanup_expired builds expired_keys via a loop above, but then immediately overwrites expired_keys with a list comprehension. This makes the earlier scan dead code and adds unnecessary work under the cache lock. Remove the redundant loop/commentary and keep a single implementation for collecting expired keys.

Suggested change
# Note: In OrderedDict, we could stop early if we encounter a non-expired item,
# but since items can be updated (moving to end), we stick to full scan or
# just check the oldest. However, multiple items can expire.
# Efficient cleanup: check from the beginning (oldest)
expired_keys = []
for key in self._data:
if current_time - self._timestamps[key] >= self._ttl:
expired_keys.append(key)
else:
# Since we move accessed/updated items to the end,
# we can't assume total temporal ordering here if TTL varies,
# but with fixed TTL, items at the front are older.
# Actually, move_to_end breaks strict temporal ordering of 'set' time.
# So we keep the list comprehension for safety or just check all.
pass
# Re-evaluating: move_to_end is for LRU. If we want TTL to be efficient,
# we'd need another structure. But for max_size=100, full scan is fine.
# To be safe and simple:
# Full scan over timestamps is acceptable for typical cache sizes.

Copilot uses AI. Check for mistakes.
expired_keys = [
Comment on lines +126 to 142
Copy link
Contributor

@cubic-dev-ai cubic-dev-ai bot Feb 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: expired_keys is computed twice; the new loop is dead work because it’s overwritten by the list comprehension, adding unnecessary O(n) overhead on every cache write. Remove the redundant scan and keep a single computation.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At backend/cache.py, line 126:

<comment>`expired_keys` is computed twice; the new loop is dead work because it’s overwritten by the list comprehension, adding unnecessary O(n) overhead on every cache write. Remove the redundant scan and keep a single computation.</comment>

<file context>
@@ -103,14 +112,33 @@ def _remove_key(self, key: str) -> None:
+        # but since items can be updated (moving to end), we stick to full scan or
+        # just check the oldest. However, multiple items can expire.
+        # Efficient cleanup: check from the beginning (oldest)
+        expired_keys = []
+        for key in self._data:
+            if current_time - self._timestamps[key] >= self._ttl:
</file context>
Suggested change
expired_keys = []
for key in self._data:
if current_time - self._timestamps[key] >= self._ttl:
expired_keys.append(key)
else:
# Since we move accessed/updated items to the end,
# we can't assume total temporal ordering here if TTL varies,
# but with fixed TTL, items at the front are older.
# Actually, move_to_end breaks strict temporal ordering of 'set' time.
# So we keep the list comprehension for safety or just check all.
pass
# Re-evaluating: move_to_end is for LRU. If we want TTL to be efficient,
# we'd need another structure. But for max_size=100, full scan is fine.
# To be safe and simple:
expired_keys = [
# To be safe and simple:
expired_keys = [
key for key, timestamp in self._timestamps.items()
if current_time - timestamp >= self._ttl
]
Fix with Cubic

key for key, timestamp in self._timestamps.items()
if current_time - timestamp >= self._ttl
Expand All @@ -125,15 +153,16 @@ def _cleanup_expired(self) -> None:
def _evict_lru(self) -> None:
"""
Internal method to evict least recently used entry.
OrderedDict.popitem(last=False) is O(1).
Must be called within lock context.
"""
if not self._access_count:
if not self._data:
return

# Find key with lowest access count
lru_key = min(self._access_count.keys(), key=lambda k: self._access_count[k])
self._remove_key(lru_key)
logger.debug(f"Evicted LRU cache entry: {lru_key}")
# Pop the first item (Least Recently Used)
key, _ = self._data.popitem(last=False)
self._timestamps.pop(key, None)
logger.debug(f"Evicted LRU cache entry: {key}")

class SimpleCache:
"""
Expand All @@ -156,3 +185,5 @@ def invalidate(self):
recent_issues_cache = ThreadSafeCache(ttl=300, max_size=20) # 5 minutes TTL, max 20 entries
nearby_issues_cache = ThreadSafeCache(ttl=60, max_size=100) # 1 minute TTL, max 100 entries
user_upload_cache = ThreadSafeCache(ttl=3600, max_size=1000) # 1 hour TTL for upload limits
user_issues_cache = ThreadSafeCache(ttl=300, max_size=50) # 5 minutes TTL, max 50 entries
blockchain_last_hash_cache = ThreadSafeCache(ttl=3600, max_size=1) # Cache for last blockchain hash
10 changes: 8 additions & 2 deletions backend/init_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,11 @@ def index_exists(table, index_name):
logger.info("Added action_plan column to issues")

if not column_exists("issues", "integrity_hash"):
conn.execute(text("ALTER TABLE issues ADD COLUMN integrity_hash VARCHAR"))
conn.execute(text("ALTER TABLE issues ADD COLUMN integrity_hash VARCHAR(255)"))
logger.info("Added integrity_hash column to issues")

if not column_exists("issues", "previous_integrity_hash"):
conn.execute(text("ALTER TABLE issues ADD COLUMN previous_integrity_hash VARCHAR"))
conn.execute(text("ALTER TABLE issues ADD COLUMN previous_integrity_hash VARCHAR(255)"))
logger.info("Added previous_integrity_hash column to issues")

# Indexes (using IF NOT EXISTS syntax where supported or check first)
Expand All @@ -95,6 +95,12 @@ def index_exists(table, index_name):
if not index_exists("issues", "ix_issues_user_email"):
conn.execute(text("CREATE INDEX IF NOT EXISTS ix_issues_user_email ON issues (user_email)"))

if not index_exists("issues", "ix_issues_integrity_hash"):
conn.execute(text("CREATE INDEX IF NOT EXISTS ix_issues_integrity_hash ON issues (integrity_hash)"))

if not index_exists("issues", "ix_issues_previous_integrity_hash"):
conn.execute(text("CREATE INDEX IF NOT EXISTS ix_issues_previous_integrity_hash ON issues (previous_integrity_hash)"))

# Voice and Language Support Columns (Issue #291)
if not column_exists("issues", "submission_type"):
conn.execute(text("ALTER TABLE issues ADD COLUMN submission_type VARCHAR DEFAULT 'text'"))
Expand Down
3 changes: 2 additions & 1 deletion backend/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ class Issue(Base):
longitude = Column(Float, nullable=True, index=True)
location = Column(String, nullable=True)
action_plan = Column(JSON, nullable=True)
integrity_hash = Column(String, nullable=True) # Blockchain integrity seal
integrity_hash = Column(String(255), index=True, nullable=True) # Blockchain integrity seal
previous_integrity_hash = Column(String(255), index=True, nullable=True) # Link to previous block

# Voice and Language Support (Issue #291)
submission_type = Column(String, default="text") # 'text', 'voice'
Expand Down
Loading