thewebscraping
diff --git a/.env.example b/.env.example
Lines changed: 55 additions & 12 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 84 additions & 0 deletions
@@ -1,35 +1,78 @@
-# OpenAI (for embeddings)
+# ===================================================================
+# Embedding Providers
+# ===================================================================
+
+# OpenAI
 OPENAI_API_KEY=sk-your-key-here
-OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+
+# Gemini (Google)
+GEMINI_API_KEY=your-gemini-api-key
+
+# Embedding Model (optional, shared across providers)
+# If not set, each adapter uses its own default:
+#   - OpenAI: text-embedding-3-small
+#   - Gemini: gemini-embedding-001
+# OpenAI options: text-embedding-3-small, text-embedding-3-large, text-embedding-ada-002
+# Gemini options: gemini-embedding-001, text-embedding-004, text-embedding-005
+# VECTOR_EMBEDDING_MODEL=gemini-embedding-001
+
+# ===================================================================
+# Vector Databases
+# ===================================================================
 
 # AstraDB
 ASTRA_DB_APPLICATION_TOKEN=AstraCS:your-token-here
 ASTRA_DB_API_ENDPOINT=https://your-id.apps.astra.datastax.com
-ASTRA_DB_COLLECTION_NAME=vector_documents
 
-# ChromaDB Cloud (optional)
+# ChromaDB Cloud
+# Note: Choose ONE ChromaDB deployment mode (Cloud, HTTP Server, or Local Persistence)
+# and leave the variables for the other modes unset or commented out.
 CHROMA_API_KEY=your-chroma-api-key
 CHROMA_TENANT=your-tenant
 CHROMA_DATABASE=your-database
 
-# ChromaDB HTTP Server (optional)
+# ChromaDB HTTP Server
+# Important: CHROMA_HOST and CHROMA_PERSIST_DIR are mutually exclusive.
+# Set CHROMA_HOST/CHROMA_PORT only when connecting to an HTTP server.
 CHROMA_HOST=localhost
 CHROMA_PORT=8000
 
-# ChromaDB Local (optional)
+# ChromaDB Local Persistence
+# Important: CHROMA_PERSIST_DIR and CHROMA_HOST are mutually exclusive.
+# Set CHROMA_PERSIST_DIR only when using local on-disk persistence.
 CHROMA_PERSIST_DIR=./chroma_data
 
-# Milvus
-MILVUS_API_ENDPOINT=https://your-endpoint.zillizcloud.com
-MILVUS_USER=your-user
-MILVUS_PASSWORD=your-password
+# Milvus / Zilliz Cloud
+MILVUS_API_ENDPOINT=http://localhost:19530
+MILVUS_API_KEY=your-milvus-api-key
 
 # PGVector (PostgreSQL with pgvector extension)
 PGVECTOR_HOST=localhost
 PGVECTOR_PORT=5432
-PGVECTOR_DBNAME=vectordb
 PGVECTOR_USER=postgres
 PGVECTOR_PASSWORD=your-password
 
-# Vector metric (cosine, dot_product, euclidean)
+# ===================================================================
+# Vector Engine Settings
+# ===================================================================
+
+# Default collection name (also used by PGVector as the database name)
+VECTOR_COLLECTION_NAME=vector_db
+
+# Distance metric: cosine, dot_product, euclidean
 VECTOR_METRIC=cosine
+
+# Store original text with vectors (true/false)
+VECTOR_STORE_TEXT=false
+
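+# Vector embedding dimension; must match the output size of the embedding model
+# (e.g. 1536 for OpenAI text-embedding-3-small)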
+VECTOR_DIM=1536
+
+# Default search result limit
+VECTOR_SEARCH_LIMIT=10
+
+# Primary key generation mode: uuid, hash_text, hash_vector, int64, auto
+PRIMARY_KEY_MODE=uuid
+
+# Optional: Custom PK factory (dotted path to callable)
+# PRIMARY_KEY_FACTORY=mymodule.custom_pk_function
+
+# Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL
+LOG_LEVEL=INFO
@@ -1,5 +1,89 @@
 # CrossVector - Changelog
 
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.0.0] - 2025-12-06 🎉
+
+**First Production Release!**
+
+### Added
+
+**Benchmarking System:**
+- Created comprehensive `scripts/benchmark.py` tool for performance testing
+- Support for 4 database backends (pgvector, astradb, milvus, chroma)
+- Support for 2 embedding providers (OpenAI, Gemini)
+- 7 operation types tested: bulk/individual create, vector/metadata search, Query DSL operators, update, delete
+- `--skip-slow` flag to skip cloud backends for faster local testing
+- Smart Query DSL optimization: 4 operators for slow backends, 10 for fast backends
+- Detailed markdown reports with performance metrics
+- Performance summary shows tested vs skipped backends clearly
+
+**Engine Improvements:**
+- Added `VectorEngine.drop_collection()` method for collection cleanup
+- Better collection lifecycle management
+
+**Documentation:**
+- Added benchmarking section to README.md (102 lines)
+- Created comprehensive `docs/benchmarking.md` guide (385 lines)
+- Updated `docs/contributing.md` with benchmarking workflow
+- Added usage examples and best practices
+- Cost estimation and troubleshooting guides
+
+**Testing:**
+- Added 50+ new unit tests
+- Test coverage for ABC adapters (82%)
+- Test coverage for logger (100%)
+- Extended engine tests
+- Schema, utils, and Q object coverage tests
+- Total: 365 tests passing (from ~300)
+
+**Architecture:**
+- Enhanced ABC base class with unified initialization
+- Improved adapter architecture
+- Better error reporting in benchmarks
+- Truncated error messages in reports for readability
+
+### Changed
+
+- Collection name defaults now use `api_settings.VECTOR_COLLECTION_NAME` instead of class constant
+- Improved Milvus metadata-only search support verification
+- Updated all adapter documentation
+- Modernized contributing.md with uv, pre-commit, ruff
+
+### Removed
+
+- Removed `scripts/e2e.py` (replaced with `pytest scripts/tests`)
+- Removed `DEFAULT_COLLECTION_NAME` class constant from adapters
+
+### Fixed
+
+- Fixed Milvus tests to verify metadata-only search functionality
+- Fixed collection name handling across all adapters
+- Better error messages in benchmark reports
+- Proper cleanup in benchmark tests
+
+### Breaking Changes
+
+- `DEFAULT_COLLECTION_NAME` class constant removed - use `api_settings.VECTOR_COLLECTION_NAME` in settings instead
+- Stricter ChromaDB config validation: conflicting settings are now rejected (e.g. `CHROMA_HOST` and `CHROMA_PERSIST_DIR` cannot both be set)
+
+### Performance
+
+- Benchmark results show ~60% reduction in API calls for cloud backends with optimization
+- Local testing with `--skip-slow`: ~2-3 minutes vs 10+ minutes
+- PGVector: ~6-10 docs/sec bulk create, ~0.5ms metadata queries
+- Gemini: 1.5x faster search vs OpenAI for same operations
+
+### Documentation Updates
+
+- Repository URLs and references updated
+- Enhanced architecture diagrams
+- Improved API documentation
+- Fixed all broken links
+
 ## [0.1.3] - 2025-11-30
 
 ### Testing Infrastructure