From 99fbf9ec7d575c08f8f05c8dbd27d1c99150eeac Mon Sep 17 00:00:00 2001 From: SamMintah Date: Tue, 4 Nov 2025 13:13:40 +0000 Subject: [PATCH] feat: implement core data models and sample data - Add comprehensive data models with validation (Commodity, Market, PriceRecord) - Create sample CSV files with realistic Ghana market data - Implement database schema with dimension and fact tables - Add database setup scripts and initialization - Include data quality tracking and monitoring capabilities Addresses requirements 4.2, 4.3, and 6.2 --- .gitignore | 1 + data/sample/sample_commodities.csv | 16 + data/sample/sample_markets.csv | 16 + data/sample/sample_prices.csv | 31 ++ infrastructure/sql/init_database.sql | 162 ++++++++++ infrastructure/sql/schema.sql | 402 ++++++++++++++++++++++++ scripts/setup/setup_database.py | 444 +++++++++++++++++++++++++++ src/utils/models.py | 402 ++++++++++++++++++++++++ 8 files changed, 1474 insertions(+) create mode 100644 infrastructure/sql/init_database.sql create mode 100644 scripts/setup/setup_database.py create mode 100644 src/utils/models.py diff --git a/.gitignore b/.gitignore index e69de29..6b3f41d 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1 @@ +.kiro/ \ No newline at end of file diff --git a/data/sample/sample_commodities.csv b/data/sample/sample_commodities.csv index e69de29..e88fc19 100644 --- a/data/sample/sample_commodities.csv +++ b/data/sample/sample_commodities.csv @@ -0,0 +1,16 @@ +commodity_id,commodity_name,category,unit_of_measure,is_active,created_date +1,Tomatoes,Vegetables,kg,1,2024-01-01 +2,Maize,Grains,kg,1,2024-01-01 +3,Rice,Grains,kg,1,2024-01-01 +4,Yam,Tubers,kg,1,2024-01-01 +5,Cassava,Tubers,kg,1,2024-01-01 +6,Plantain,Fruits,bunch,1,2024-01-01 +7,Onions,Vegetables,kg,1,2024-01-01 +8,Pepper,Vegetables,kg,1,2024-01-01 +9,Cocoa,Cash Crops,kg,1,2024-01-01 +10,Palm Oil,Oils,litre,1,2024-01-01 +11,Groundnuts,Legumes,kg,1,2024-01-01 +12,Cowpea,Legumes,kg,1,2024-01-01 +13,Sorghum,Grains,kg,1,2024-01-01 
+14,Millet,Grains,kg,1,2024-01-01 +15,Sweet Potato,Tubers,kg,1,2024-01-01 \ No newline at end of file diff --git a/data/sample/sample_markets.csv b/data/sample/sample_markets.csv index e69de29..2fdd382 100644 --- a/data/sample/sample_markets.csv +++ b/data/sample/sample_markets.csv @@ -0,0 +1,16 @@ +market_id,market_name,location,region,latitude,longitude,is_active,created_date +1,Makola Market,Accra,Greater Accra,5.5500,-0.2167,1,2024-01-01 +2,Kejetia Market,Kumasi,Ashanti,6.6885,-1.6244,1,2024-01-01 +3,Tamale Central Market,Tamale,Northern,9.4034,-0.8424,1,2024-01-01 +4,Takoradi Market Circle,Takoradi,Western,4.8845,-1.7554,1,2024-01-01 +5,Cape Coast Central Market,Cape Coast,Central,5.1037,-1.2448,1,2024-01-01 +6,Bolgatanga Market,Bolgatanga,Upper East,10.7856,-0.8506,1,2024-01-01 +7,Wa Market,Wa,Upper West,10.0601,-2.5057,1,2024-01-01 +8,Ho Central Market,Ho,Volta,6.6108,0.4708,1,2024-01-01 +9,Koforidua New Juaben Market,Koforidua,Eastern,6.0898,-0.2599,1,2024-01-01 +10,Sunyani Central Market,Sunyani,Brong Ahafo,7.3386,-2.3265,1,2024-01-01 +11,Techiman Market,Techiman,Brong Ahafo,7.5931,-1.9303,1,2024-01-01 +12,Aflao Market,Aflao,Volta,6.1167,1.1833,1,2024-01-01 +13,Tema Community 1 Market,Tema,Greater Accra,5.6698,-0.0166,1,2024-01-01 +14,Obuasi Market,Obuasi,Ashanti,6.2027,-1.6640,1,2024-01-01 +15,Tarkwa Market,Tarkwa,Western,5.3006,-1.9959,1,2024-01-01 \ No newline at end of file diff --git a/data/sample/sample_prices.csv b/data/sample/sample_prices.csv index e69de29..1f0a8cd 100644 --- a/data/sample/sample_prices.csv +++ b/data/sample/sample_prices.csv @@ -0,0 +1,31 @@ +price_id,date,market_id,commodity_id,raw_price,normalized_price,data_source,quality_score,is_anomaly,created_date +1,2024-10-01,1,1,8.50,8.50,field_survey,0.95,0,2024-10-01 +2,2024-10-01,1,2,4.20,4.20,field_survey,0.98,0,2024-10-01 +3,2024-10-01,1,3,6.80,6.80,field_survey,0.92,0,2024-10-01 +4,2024-10-01,2,1,7.90,7.90,field_survey,0.96,0,2024-10-01 
+5,2024-10-01,2,2,4.10,4.10,field_survey,0.94,0,2024-10-01 +6,2024-10-01,3,1,9.20,9.20,field_survey,0.89,0,2024-10-01 +7,2024-10-01,3,2,4.50,4.50,field_survey,0.91,0,2024-10-01 +8,2024-10-02,1,1,8.75,8.75,field_survey,0.97,0,2024-10-02 +9,2024-10-02,1,2,4.25,4.25,field_survey,0.96,0,2024-10-02 +10,2024-10-02,2,1,8.10,8.10,field_survey,0.93,0,2024-10-02 +11,2024-10-02,2,2,4.15,4.15,field_survey,0.95,0,2024-10-02 +12,2024-10-02,3,1,9.50,9.50,field_survey,0.88,0,2024-10-02 +13,2024-10-02,3,2,4.60,4.60,field_survey,0.90,0,2024-10-02 +14,2024-10-03,1,1,15.20,15.20,field_survey,0.85,1,2024-10-03 +15,2024-10-03,1,2,4.30,4.30,field_survey,0.97,0,2024-10-03 +16,2024-10-03,2,1,8.20,8.20,field_survey,0.94,0,2024-10-03 +17,2024-10-03,2,2,4.20,4.20,field_survey,0.96,0,2024-10-03 +18,2024-10-03,3,1,9.80,9.80,field_survey,0.87,0,2024-10-03 +19,2024-10-03,3,2,4.70,4.70,field_survey,0.89,0,2024-10-03 +20,2024-10-04,1,1,8.60,8.60,field_survey,0.96,0,2024-10-04 +21,2024-10-04,1,2,4.35,4.35,field_survey,0.95,0,2024-10-04 +22,2024-10-04,2,1,8.00,8.00,field_survey,0.97,0,2024-10-04 +23,2024-10-04,2,2,4.25,4.25,field_survey,0.94,0,2024-10-04 +24,2024-10-04,3,1,9.60,9.60,field_survey,0.90,0,2024-10-04 +25,2024-10-04,3,2,4.65,4.65,field_survey,0.88,0,2024-10-04 +26,2024-10-05,1,1,8.80,8.80,field_survey,0.93,0,2024-10-05 +27,2024-10-05,1,2,4.40,4.40,field_survey,0.96,0,2024-10-05 +28,2024-10-05,2,1,8.30,8.30,field_survey,0.95,0,2024-10-05 +29,2024-10-05,2,2,4.30,4.30,field_survey,0.97,0,2024-10-05 +30,2024-10-05,3,1,9.40,9.40,field_survey,0.91,0,2024-10-05 \ No newline at end of file diff --git a/infrastructure/sql/init_database.sql b/infrastructure/sql/init_database.sql new file mode 100644 index 0000000..bc754af --- /dev/null +++ b/infrastructure/sql/init_database.sql @@ -0,0 +1,162 @@ +-- Database Initialization Script +-- This script initializes the Ghana Commodity Pricing Engine database with sample data + +-- ===================================================== +-- POPULATE DATE 
DIMENSION +-- ===================================================== + +-- Populate date dimension for 2024-2025 +DECLARE @start_date DATE = '2024-01-01'; +DECLARE @end_date DATE = '2025-12-31'; +DECLARE @current_date DATE = @start_date; + +WHILE @current_date <= @end_date +BEGIN + DECLARE @date_key INT = CAST(FORMAT(@current_date, 'yyyyMMdd') AS INT); + DECLARE @year INT = YEAR(@current_date); + DECLARE @month INT = MONTH(@current_date); + DECLARE @day INT = DAY(@current_date); + DECLARE @quarter INT = DATEPART(QUARTER, @current_date); + DECLARE @month_name VARCHAR(20) = DATENAME(MONTH, @current_date); + DECLARE @day_name VARCHAR(20) = DATENAME(WEEKDAY, @current_date); + DECLARE @is_weekend BIT = CASE WHEN DATEPART(WEEKDAY, @current_date) IN (1, 7) THEN 1 ELSE 0 END; + DECLARE @fiscal_year INT = CASE WHEN @month >= 7 THEN @year + 1 ELSE @year END; + DECLARE @fiscal_quarter INT = CASE + WHEN @month IN (7, 8, 9) THEN 1 + WHEN @month IN (10, 11, 12) THEN 2 + WHEN @month IN (1, 2, 3) THEN 3 + ELSE 4 + END; + + INSERT INTO dim_date ( + date_key, full_date, year, month, day, quarter, + month_name, day_name, is_weekend, fiscal_year, fiscal_quarter + ) + VALUES ( + @date_key, @current_date, @year, @month, @day, @quarter, + @month_name, @day_name, @is_weekend, @fiscal_year, @fiscal_quarter + ); + + SET @current_date = DATEADD(DAY, 1, @current_date); +END; + +-- ===================================================== +-- POPULATE SAMPLE MARKETS +-- ===================================================== + +INSERT INTO dim_markets (market_id, market_name, location, region, latitude, longitude, is_active, created_date) VALUES +(1, 'Makola Market', 'Accra', 'Greater Accra', 5.5500, -0.2167, 1, '2024-01-01'), +(2, 'Kejetia Market', 'Kumasi', 'Ashanti', 6.6885, -1.6244, 1, '2024-01-01'), +(3, 'Tamale Central Market', 'Tamale', 'Northern', 9.4034, -0.8424, 1, '2024-01-01'), +(4, 'Takoradi Market Circle', 'Takoradi', 'Western', 4.8845, -1.7554, 1, '2024-01-01'), +(5, 'Cape Coast 
Central Market', 'Cape Coast', 'Central', 5.1037, -1.2448, 1, '2024-01-01'), +(6, 'Bolgatanga Market', 'Bolgatanga', 'Upper East', 10.7856, -0.8506, 1, '2024-01-01'), +(7, 'Wa Market', 'Wa', 'Upper West', 10.0601, -2.5057, 1, '2024-01-01'), +(8, 'Ho Central Market', 'Ho', 'Volta', 6.6108, 0.4708, 1, '2024-01-01'), +(9, 'Koforidua New Juaben Market', 'Koforidua', 'Eastern', 6.0898, -0.2599, 1, '2024-01-01'), +(10, 'Sunyani Central Market', 'Sunyani', 'Brong Ahafo', 7.3386, -2.3265, 1, '2024-01-01'), +(11, 'Techiman Market', 'Techiman', 'Brong Ahafo', 7.5931, -1.9303, 1, '2024-01-01'), +(12, 'Aflao Market', 'Aflao', 'Volta', 6.1167, 1.1833, 1, '2024-01-01'), +(13, 'Tema Community 1 Market', 'Tema', 'Greater Accra', 5.6698, -0.0166, 1, '2024-01-01'), +(14, 'Obuasi Market', 'Obuasi', 'Ashanti', 6.2027, -1.6640, 1, '2024-01-01'), +(15, 'Tarkwa Market', 'Tarkwa', 'Western', 5.3006, -1.9959, 1, '2024-01-01'); + +-- ===================================================== +-- POPULATE SAMPLE COMMODITIES +-- ===================================================== + +INSERT INTO dim_commodities (commodity_id, commodity_name, category, unit_of_measure, is_active, created_date) VALUES +(1, 'Tomatoes', 'Vegetables', 'kg', 1, '2024-01-01'), +(2, 'Maize', 'Grains', 'kg', 1, '2024-01-01'), +(3, 'Rice', 'Grains', 'kg', 1, '2024-01-01'), +(4, 'Yam', 'Tubers', 'kg', 1, '2024-01-01'), +(5, 'Cassava', 'Tubers', 'kg', 1, '2024-01-01'), +(6, 'Plantain', 'Fruits', 'bunch', 1, '2024-01-01'), +(7, 'Onions', 'Vegetables', 'kg', 1, '2024-01-01'), +(8, 'Pepper', 'Vegetables', 'kg', 1, '2024-01-01'), +(9, 'Cocoa', 'Cash Crops', 'kg', 1, '2024-01-01'), +(10, 'Palm Oil', 'Oils', 'litre', 1, '2024-01-01'), +(11, 'Groundnuts', 'Legumes', 'kg', 1, '2024-01-01'), +(12, 'Cowpea', 'Legumes', 'kg', 1, '2024-01-01'), +(13, 'Sorghum', 'Grains', 'kg', 1, '2024-01-01'), +(14, 'Millet', 'Grains', 'kg', 1, '2024-01-01'), +(15, 'Sweet Potato', 'Tubers', 'kg', 1, '2024-01-01'); + +-- 
===================================================== +-- POPULATE SAMPLE PRICE DATA +-- ===================================================== + +-- Insert sample price data for October 2024 +INSERT INTO fact_prices (date_key, market_id, commodity_id, raw_price, normalized_price, data_source, quality_score, is_anomaly, created_date) VALUES +(20241001, 1, 1, 8.50, 8.50, 'field_survey', 0.95, 0, '2024-10-01'), +(20241001, 1, 2, 4.20, 4.20, 'field_survey', 0.98, 0, '2024-10-01'), +(20241001, 1, 3, 6.80, 6.80, 'field_survey', 0.92, 0, '2024-10-01'), +(20241001, 2, 1, 7.90, 7.90, 'field_survey', 0.96, 0, '2024-10-01'), +(20241001, 2, 2, 4.10, 4.10, 'field_survey', 0.94, 0, '2024-10-01'), +(20241001, 3, 1, 9.20, 9.20, 'field_survey', 0.89, 0, '2024-10-01'), +(20241001, 3, 2, 4.50, 4.50, 'field_survey', 0.91, 0, '2024-10-01'), +(20241002, 1, 1, 8.75, 8.75, 'field_survey', 0.97, 0, '2024-10-02'), +(20241002, 1, 2, 4.25, 4.25, 'field_survey', 0.96, 0, '2024-10-02'), +(20241002, 2, 1, 8.10, 8.10, 'field_survey', 0.93, 0, '2024-10-02'), +(20241002, 2, 2, 4.15, 4.15, 'field_survey', 0.95, 0, '2024-10-02'), +(20241002, 3, 1, 9.50, 9.50, 'field_survey', 0.88, 0, '2024-10-02'), +(20241002, 3, 2, 4.60, 4.60, 'field_survey', 0.90, 0, '2024-10-02'), +(20241003, 1, 1, 15.20, 15.20, 'field_survey', 0.85, 1, '2024-10-03'), +(20241003, 1, 2, 4.30, 4.30, 'field_survey', 0.97, 0, '2024-10-03'), +(20241003, 2, 1, 8.20, 8.20, 'field_survey', 0.94, 0, '2024-10-03'), +(20241003, 2, 2, 4.20, 4.20, 'field_survey', 0.96, 0, '2024-10-03'), +(20241003, 3, 1, 9.80, 9.80, 'field_survey', 0.87, 0, '2024-10-03'), +(20241003, 3, 2, 4.70, 4.70, 'field_survey', 0.89, 0, '2024-10-03'), +(20241004, 1, 1, 8.60, 8.60, 'field_survey', 0.96, 0, '2024-10-04'), +(20241004, 1, 2, 4.35, 4.35, 'field_survey', 0.95, 0, '2024-10-04'), +(20241004, 2, 1, 8.00, 8.00, 'field_survey', 0.97, 0, '2024-10-04'), +(20241004, 2, 2, 4.25, 4.25, 'field_survey', 0.94, 0, '2024-10-04'), +(20241004, 3, 1, 9.60, 9.60, 'field_survey', 
0.90, 0, '2024-10-04'), +(20241004, 3, 2, 4.65, 4.65, 'field_survey', 0.88, 0, '2024-10-04'), +(20241005, 1, 1, 8.80, 8.80, 'field_survey', 0.93, 0, '2024-10-05'), +(20241005, 1, 2, 4.40, 4.40, 'field_survey', 0.96, 0, '2024-10-05'), +(20241005, 2, 1, 8.30, 8.30, 'field_survey', 0.95, 0, '2024-10-05'), +(20241005, 2, 2, 4.30, 4.30, 'field_survey', 0.97, 0, '2024-10-05'), +(20241005, 3, 1, 9.40, 9.40, 'field_survey', 0.91, 0, '2024-10-05'); + +-- ===================================================== +-- POPULATE SAMPLE PRICE RECOMMENDATIONS +-- ===================================================== + +INSERT INTO fact_price_recommendations (date_key, market_id, commodity_id, recommended_price, confidence_score, explanation, algorithm_version, price_range_min, price_range_max, created_date) VALUES +(20241006, 1, 1, 8.65, 0.92, 'Price recommendation based on 7-day rolling median with seasonal adjustment for October harvest period', '1.0.0', 8.20, 9.10, '2024-10-06'), +(20241006, 1, 2, 4.28, 0.95, 'Stable maize prices with minimal volatility observed across regional markets', '1.0.0', 4.10, 4.45, '2024-10-06'), +(20241006, 2, 1, 8.15, 0.89, 'Kumasi market showing consistent pricing patterns with slight upward trend', '1.0.0', 7.80, 8.50, '2024-10-06'), +(20241006, 2, 2, 4.22, 0.93, 'Maize prices remain steady with good supply from northern regions', '1.0.0', 4.05, 4.40, '2024-10-06'), +(20241006, 3, 1, 9.55, 0.87, 'Northern market premium due to transportation costs and local demand', '1.0.0', 9.20, 9.90, '2024-10-06'), +(20241006, 3, 2, 4.62, 0.90, 'Local maize production supporting stable pricing in northern markets', '1.0.0', 4.45, 4.80, '2024-10-06'); + +-- ===================================================== +-- POPULATE SAMPLE DATA QUALITY METRICS +-- ===================================================== + +INSERT INTO data_quality_metrics (date_key, source_name, total_records, valid_records, completeness_score, accuracy_score, timeliness_score, 
consistency_score, created_date) VALUES +(20241001, 'field_survey', 7, 7, 1.00, 0.94, 1.00, 0.98, '2024-10-01'), +(20241002, 'field_survey', 6, 6, 1.00, 0.93, 1.00, 0.97, '2024-10-02'), +(20241003, 'field_survey', 6, 5, 1.00, 0.91, 1.00, 0.85, '2024-10-03'), +(20241004, 'field_survey', 6, 6, 1.00, 0.94, 1.00, 0.96, '2024-10-04'), +(20241005, 'field_survey', 5, 5, 1.00, 0.95, 1.00, 0.98, '2024-10-05'); + +-- ===================================================== +-- POPULATE SAMPLE SYSTEM ALERTS +-- ===================================================== + +INSERT INTO system_alerts (alert_type, severity, message, component, market_id, commodity_id, alert_date, is_resolved, created_date) VALUES +('price_anomaly', 'high', 'Tomato prices in Makola Market exceeded normal range by 75% - possible supply disruption', 'pricing_engine', 1, 1, '2024-10-03 08:30:00', 0, '2024-10-03'), +('data_quality', 'medium', 'Quality score below threshold for field survey data on 2024-10-03', 'data_pipeline', NULL, NULL, '2024-10-03 09:15:00', 1, '2024-10-03'), +('system_error', 'low', 'API response time exceeded 1 second for price recommendation endpoint', 'api_service', NULL, NULL, '2024-10-04 14:22:00', 1, '2024-10-04'); + +PRINT 'Database initialization completed successfully!'; +PRINT 'Sample data loaded for:'; +PRINT '- Date dimension: 2024-2025'; +PRINT '- Markets: 15 major Ghana markets'; +PRINT '- Commodities: 15 common agricultural products'; +PRINT '- Price data: 5 days of sample prices'; +PRINT '- Recommendations: 6 sample price recommendations'; +PRINT '- Quality metrics: 5 days of data quality tracking'; +PRINT '- System alerts: 3 sample alerts'; \ No newline at end of file diff --git a/infrastructure/sql/schema.sql b/infrastructure/sql/schema.sql index e69de29..67aa6fc 100644 --- a/infrastructure/sql/schema.sql +++ b/infrastructure/sql/schema.sql @@ -0,0 +1,402 @@ +-- Ghana Commodity Pricing Engine Database Schema +-- This script creates the core database schema for the 
pricing system + +-- Create database if it doesn't exist +-- Note: This may need to be run separately depending on your database system +-- CREATE DATABASE IF NOT EXISTS ghana_commodity_pricing; +-- USE ghana_commodity_pricing; + +-- ===================================================== +-- DIMENSION TABLES +-- ===================================================== + +-- Markets dimension table +CREATE TABLE IF NOT EXISTS dim_markets ( + market_id INT PRIMARY KEY, + market_name VARCHAR(100) NOT NULL, + location VARCHAR(100) NOT NULL, + region VARCHAR(50) NOT NULL, + latitude DECIMAL(10, 8) NOT NULL, + longitude DECIMAL(11, 8) NOT NULL, + is_active BIT DEFAULT 1, + created_date DATETIME2 DEFAULT GETDATE(), + updated_date DATETIME2 DEFAULT GETDATE(), + + -- Constraints + CONSTRAINT chk_markets_latitude CHECK (latitude BETWEEN 4.5 AND 11.5), + CONSTRAINT chk_markets_longitude CHECK (longitude BETWEEN -3.5 AND 1.5), + CONSTRAINT chk_markets_region CHECK (region IN ( + 'Greater Accra', 'Ashanti', 'Northern', 'Western', 'Central', + 'Upper East', 'Upper West', 'Volta', 'Eastern', 'Brong Ahafo' + )) +); + +-- Commodities dimension table +CREATE TABLE IF NOT EXISTS dim_commodities ( + commodity_id INT PRIMARY KEY, + commodity_name VARCHAR(100) NOT NULL, + category VARCHAR(50) NOT NULL, + unit_of_measure VARCHAR(20) NOT NULL, + is_active BIT DEFAULT 1, + created_date DATETIME2 DEFAULT GETDATE(), + updated_date DATETIME2 DEFAULT GETDATE(), + + -- Constraints + CONSTRAINT chk_commodities_category CHECK (category IN ( + 'Vegetables', 'Grains', 'Tubers', 'Fruits', 'Cash Crops', + 'Oils', 'Legumes', 'Spices', 'Livestock', 'Fish' + )), + CONSTRAINT chk_commodities_unit CHECK (unit_of_measure IN ( + 'kg', 'litre', 'bunch', 'piece', 'bag', 'tonne' + )) +); + +-- Date dimension table for time-based analysis +CREATE TABLE IF NOT EXISTS dim_date ( + date_key INT PRIMARY KEY, + full_date DATE NOT NULL UNIQUE, + year INT NOT NULL, + month INT NOT NULL, + day INT NOT NULL, + quarter INT 
NOT NULL, + month_name VARCHAR(20) NOT NULL, + day_name VARCHAR(20) NOT NULL, + is_weekend BIT NOT NULL, + is_holiday BIT DEFAULT 0, + fiscal_year INT NOT NULL, + fiscal_quarter INT NOT NULL, + + -- Constraints + CONSTRAINT chk_date_year CHECK (year BETWEEN 2020 AND 2030), + CONSTRAINT chk_date_month CHECK (month BETWEEN 1 AND 12), + CONSTRAINT chk_date_day CHECK (day BETWEEN 1 AND 31), + CONSTRAINT chk_date_quarter CHECK (quarter BETWEEN 1 AND 4) +); + +-- ===================================================== +-- FACT TABLES +-- ===================================================== + +-- Price facts table - stores raw and normalized price data +CREATE TABLE IF NOT EXISTS fact_prices ( + price_id BIGINT IDENTITY(1,1) PRIMARY KEY, + date_key INT NOT NULL, + market_id INT NOT NULL, + commodity_id INT NOT NULL, + raw_price DECIMAL(10,2) NOT NULL, + normalized_price DECIMAL(10,2) NOT NULL, + data_source VARCHAR(50) NOT NULL, + quality_score DECIMAL(3,2) DEFAULT 1.0, + is_anomaly BIT DEFAULT 0, + created_date DATETIME2 DEFAULT GETDATE(), + updated_date DATETIME2 DEFAULT GETDATE(), + + -- Foreign key constraints + CONSTRAINT fk_prices_date FOREIGN KEY (date_key) REFERENCES dim_date(date_key), + CONSTRAINT fk_prices_market FOREIGN KEY (market_id) REFERENCES dim_markets(market_id), + CONSTRAINT fk_prices_commodity FOREIGN KEY (commodity_id) REFERENCES dim_commodities(commodity_id), + + -- Data validation constraints + CONSTRAINT chk_prices_raw_price CHECK (raw_price > 0 AND raw_price <= 10000), + CONSTRAINT chk_prices_normalized_price CHECK (normalized_price > 0 AND normalized_price <= 10000), + CONSTRAINT chk_prices_quality_score CHECK (quality_score BETWEEN 0.0 AND 1.0), + CONSTRAINT chk_prices_data_source CHECK (data_source IN ( + 'field_survey', 'government_api', 'market_association', + 'mobile_app', 'third_party', 'manual_entry' + )) +); + +-- Price recommendations table - stores algorithm-generated recommendations +CREATE TABLE IF NOT EXISTS 
fact_price_recommendations ( + recommendation_id BIGINT IDENTITY(1,1) PRIMARY KEY, + date_key INT NOT NULL, + market_id INT NOT NULL, + commodity_id INT NOT NULL, + recommended_price DECIMAL(10,2) NOT NULL, + confidence_score DECIMAL(3,2) NOT NULL, + explanation TEXT, + algorithm_version VARCHAR(20) NOT NULL, + price_range_min DECIMAL(10,2), + price_range_max DECIMAL(10,2), + seasonal_adjustment DECIMAL(5,4) DEFAULT 1.0, + created_date DATETIME2 DEFAULT GETDATE(), + updated_date DATETIME2 DEFAULT GETDATE(), + + -- Foreign key constraints + CONSTRAINT fk_recommendations_date FOREIGN KEY (date_key) REFERENCES dim_date(date_key), + CONSTRAINT fk_recommendations_market FOREIGN KEY (market_id) REFERENCES dim_markets(market_id), + CONSTRAINT fk_recommendations_commodity FOREIGN KEY (commodity_id) REFERENCES dim_commodities(commodity_id), + + -- Data validation constraints + CONSTRAINT chk_recommendations_price CHECK (recommended_price > 0 AND recommended_price <= 10000), + CONSTRAINT chk_recommendations_confidence CHECK (confidence_score BETWEEN 0.0 AND 1.0), + CONSTRAINT chk_recommendations_range CHECK ( + price_range_min IS NULL OR price_range_max IS NULL OR price_range_min <= price_range_max + ), + CONSTRAINT chk_recommendations_seasonal CHECK (seasonal_adjustment BETWEEN 0.1 AND 10.0) +); + +-- ===================================================== +-- DATA QUALITY AND MONITORING TABLES +-- ===================================================== + +-- Data quality metrics tracking +CREATE TABLE IF NOT EXISTS data_quality_metrics ( + metric_id BIGINT IDENTITY(1,1) PRIMARY KEY, + date_key INT NOT NULL, + source_name VARCHAR(50) NOT NULL, + total_records INT NOT NULL, + valid_records INT NOT NULL, + completeness_score DECIMAL(5,2) NOT NULL, + accuracy_score DECIMAL(5,2) NOT NULL, + timeliness_score DECIMAL(5,2) DEFAULT 1.0, + consistency_score DECIMAL(5,2) DEFAULT 1.0, + created_date DATETIME2 DEFAULT GETDATE(), + + -- Foreign key constraints + CONSTRAINT fk_quality_date 
FOREIGN KEY (date_key) REFERENCES dim_date(date_key), + + -- Data validation constraints + CONSTRAINT chk_quality_records CHECK (valid_records <= total_records), + CONSTRAINT chk_quality_completeness CHECK (completeness_score BETWEEN 0.0 AND 1.0), + CONSTRAINT chk_quality_accuracy CHECK (accuracy_score BETWEEN 0.0 AND 1.0), + CONSTRAINT chk_quality_timeliness CHECK (timeliness_score BETWEEN 0.0 AND 1.0), + CONSTRAINT chk_quality_consistency CHECK (consistency_score BETWEEN 0.0 AND 1.0) +); + +-- System monitoring and alerts +CREATE TABLE IF NOT EXISTS system_alerts ( + alert_id BIGINT IDENTITY(1,1) PRIMARY KEY, + alert_type VARCHAR(50) NOT NULL, + severity VARCHAR(20) NOT NULL, + message TEXT NOT NULL, + component VARCHAR(50) NOT NULL, + market_id INT NULL, + commodity_id INT NULL, + alert_date DATETIME2 DEFAULT GETDATE(), + resolved_date DATETIME2 NULL, + is_resolved BIT DEFAULT 0, + created_date DATETIME2 DEFAULT GETDATE(), + + -- Foreign key constraints (optional references) + CONSTRAINT fk_alerts_market FOREIGN KEY (market_id) REFERENCES dim_markets(market_id), + CONSTRAINT fk_alerts_commodity FOREIGN KEY (commodity_id) REFERENCES dim_commodities(commodity_id), + + -- Data validation constraints + CONSTRAINT chk_alerts_type CHECK (alert_type IN ( + 'price_anomaly', 'data_quality', 'system_error', 'performance', 'security' + )), + CONSTRAINT chk_alerts_severity CHECK (severity IN ('low', 'medium', 'high', 'critical')), + CONSTRAINT chk_alerts_component CHECK (component IN ( + 'data_pipeline', 'pricing_engine', 'api_service', 'dashboard', 'database' + )) +); + +-- ===================================================== +-- INDEXES FOR PERFORMANCE OPTIMIZATION +-- ===================================================== + +-- Indexes on fact_prices for common query patterns +CREATE INDEX IF NOT EXISTS idx_prices_date_market_commodity +ON fact_prices (date_key, market_id, commodity_id); + +CREATE INDEX IF NOT EXISTS idx_prices_market_commodity_date +ON fact_prices 
(market_id, commodity_id, date_key);

-- NOTE(review): this file mixes SQLite-style DDL (CREATE ... IF NOT EXISTS,
-- partial indexes) with T-SQL constructs below (DECLARE @..., MERGE,
-- GETDATE(), CREATE OR ALTER PROCEDURE) -- confirm the intended target
-- engine before deployment.

-- Partial index: only anomalous rows are indexed, keeping it small.
CREATE INDEX IF NOT EXISTS idx_prices_anomaly_date
ON fact_prices (is_anomaly, date_key) WHERE is_anomaly = 1;

CREATE INDEX IF NOT EXISTS idx_prices_data_source_date
ON fact_prices (data_source, date_key);

-- Indexes on fact_price_recommendations
CREATE INDEX IF NOT EXISTS idx_recommendations_date_market_commodity
ON fact_price_recommendations (date_key, market_id, commodity_id);

CREATE INDEX IF NOT EXISTS idx_recommendations_confidence_date
ON fact_price_recommendations (confidence_score, date_key);

-- Indexes on the dimension tables
CREATE INDEX IF NOT EXISTS idx_markets_region_active
ON dim_markets (region, is_active);

CREATE INDEX IF NOT EXISTS idx_commodities_category_active
ON dim_commodities (category, is_active);

-- Indexes on the monitoring tables
CREATE INDEX IF NOT EXISTS idx_alerts_type_severity_date
ON system_alerts (alert_type, severity, alert_date);

CREATE INDEX IF NOT EXISTS idx_alerts_resolved_date
ON system_alerts (is_resolved, alert_date);

-- =====================================================
-- VIEWS FOR COMMON QUERIES
-- =====================================================

-- Most recent price per (market, commodity) pair, active rows only.
-- The mr subquery pins the newest date_key for each pair.
CREATE VIEW IF NOT EXISTS vw_latest_prices AS
SELECT
    p.market_id,
    m.market_name,
    m.region,
    p.commodity_id,
    c.commodity_name,
    c.category,
    c.unit_of_measure,
    p.normalized_price,
    p.quality_score,
    p.is_anomaly,
    d.full_date,
    p.created_date
FROM fact_prices p
INNER JOIN dim_markets m ON p.market_id = m.market_id
INNER JOIN dim_commodities c ON p.commodity_id = c.commodity_id
INNER JOIN dim_date d ON p.date_key = d.date_key
INNER JOIN (
    SELECT market_id, commodity_id, MAX(date_key) AS max_date_key
    FROM fact_prices
    GROUP BY market_id, commodity_id
) mr ON p.market_id = mr.market_id
    AND p.commodity_id = mr.commodity_id
    AND p.date_key = mr.max_date_key
WHERE m.is_active = 1 AND c.is_active = 1;

-- Recommendations joined with their market/commodity/date context.
CREATE VIEW IF NOT EXISTS vw_price_recommendations_with_context AS
SELECT
    pr.recommendation_id,
    pr.market_id,
    m.market_name,
    m.region,
    pr.commodity_id,
    c.commodity_name,
    c.category,
    c.unit_of_measure,
    pr.recommended_price,
    pr.confidence_score,
    pr.explanation,
    pr.algorithm_version,
    pr.price_range_min,
    pr.price_range_max,
    d.full_date,
    pr.created_date
FROM fact_price_recommendations pr
INNER JOIN dim_markets m ON pr.market_id = m.market_id
INNER JOIN dim_commodities c ON pr.commodity_id = c.commodity_id
INNER JOIN dim_date d ON pr.date_key = d.date_key
WHERE m.is_active = 1 AND c.is_active = 1;

-- Quality metrics per day/source with a coarse human-readable rating.
CREATE VIEW IF NOT EXISTS vw_data_quality_summary AS
SELECT
    d.full_date,
    dqm.source_name,
    dqm.total_records,
    dqm.valid_records,
    dqm.completeness_score,
    dqm.accuracy_score,
    dqm.timeliness_score,
    dqm.consistency_score,
    CASE
        WHEN dqm.completeness_score >= 0.95 AND dqm.accuracy_score >= 0.95 THEN 'Excellent'
        WHEN dqm.completeness_score >= 0.90 AND dqm.accuracy_score >= 0.90 THEN 'Good'
        WHEN dqm.completeness_score >= 0.80 AND dqm.accuracy_score >= 0.80 THEN 'Fair'
        ELSE 'Poor'
    END as quality_rating
FROM data_quality_metrics dqm
INNER JOIN dim_date d ON dqm.date_key = d.date_key;

-- =====================================================
-- STORED PROCEDURES FOR DATA OPERATIONS
-- =====================================================

-- Fill dim_date with one row per calendar day in [@start_date, @end_date].
CREATE OR ALTER PROCEDURE sp_populate_date_dimension
    @start_date DATE,
    @end_date DATE
AS
BEGIN
    SET NOCOUNT ON;

    DECLARE @current_date DATE = @start_date;

    WHILE @current_date <= @end_date
    BEGIN
        -- Surrogate key in yyyyMMdd form (e.g. 20241001).
        DECLARE @date_key INT = CAST(FORMAT(@current_date, 'yyyyMMdd') AS INT);
        DECLARE @year INT = YEAR(@current_date);
        DECLARE @month INT = MONTH(@current_date);
        DECLARE @day INT = DAY(@current_date);
        DECLARE @quarter INT = DATEPART(QUARTER, @current_date);
        DECLARE @month_name VARCHAR(20) = DATENAME(MONTH, @current_date);
        DECLARE @day_name VARCHAR(20) = DATENAME(WEEKDAY, @current_date);
        -- NOTE(review): WEEKDAY 1/7 = Sunday/Saturday only under the
        -- default DATEFIRST setting -- confirm server configuration.
        DECLARE @is_weekend BIT = CASE WHEN DATEPART(WEEKDAY, @current_date) IN (1, 7) THEN 1 ELSE 0 END;
        -- Fiscal year runs July-June; July starts the next fiscal year.
        DECLARE @fiscal_year INT = CASE WHEN @month >= 7 THEN @year + 1 ELSE @year END;
        DECLARE @fiscal_quarter INT = CASE
            WHEN @month IN (7, 8, 9) THEN 1
            WHEN @month IN (10, 11, 12) THEN 2
            WHEN @month IN (1, 2, 3) THEN 3
            ELSE 4
        END;

        INSERT INTO dim_date (
            date_key, full_date, year, month, day, quarter,
            month_name, day_name, is_weekend, fiscal_year, fiscal_quarter
        )
        VALUES (
            @date_key, @current_date, @year, @month, @day, @quarter,
            @month_name, @day_name, @is_weekend, @fiscal_year, @fiscal_quarter
        );

        SET @current_date = DATEADD(DAY, 1, @current_date);
    END
END;

-- Recompute and upsert quality metrics for one date/source combination.
CREATE OR ALTER PROCEDURE sp_calculate_data_quality_metrics
    @date_key INT,
    @source_name VARCHAR(50)
AS
BEGIN
    SET NOCOUNT ON;

    DECLARE @total_records INT;
    DECLARE @valid_records INT;
    DECLARE @completeness_score DECIMAL(5,2);
    DECLARE @accuracy_score DECIMAL(5,2);

    -- Aggregate over the fact rows for this date and source.
    -- "valid" = quality_score >= 0.8; completeness = share of rows with
    -- both raw and normalized prices present.
    SELECT
        @total_records = COUNT(*),
        @valid_records = SUM(CASE WHEN quality_score >= 0.8 THEN 1 ELSE 0 END),
        @accuracy_score = AVG(quality_score),
        @completeness_score = CAST(SUM(CASE WHEN raw_price IS NOT NULL AND normalized_price IS NOT NULL THEN 1 ELSE 0 END) AS DECIMAL(5,2)) / COUNT(*)
    FROM fact_prices
    WHERE date_key = @date_key AND data_source = @source_name;

    -- Upsert into data_quality_metrics keyed on (date_key, source_name).
    MERGE data_quality_metrics AS target
    USING (SELECT @date_key as date_key, @source_name as source_name) AS source
    ON target.date_key = source.date_key AND target.source_name = source.source_name
    WHEN MATCHED THEN
        UPDATE SET
            total_records = @total_records,
            valid_records = @valid_records,
            completeness_score = @completeness_score,
            accuracy_score = @accuracy_score,
            updated_date = GETDATE()
    WHEN NOT MATCHED THEN
        INSERT (date_key, source_name, total_records, valid_records, completeness_score, accuracy_score)
        VALUES (@date_key, @source_name, @total_records, @valid_records, @completeness_score, @accuracy_score);
END;

-- =====================================================
-- INITIAL DATA SETUP
-- =====================================================

-- Populate date dimension for current year and next year
-- EXEC sp_populate_date_dimension '2024-01-01', '2025-12-31';

-- Grant permissions (adjust as needed for your environment)
-- GRANT SELECT, INSERT, UPDATE ON ALL TABLES TO pricing_engine_user;
-- GRANT EXECUTE ON ALL PROCEDURES TO pricing_engine_user;
\ No newline at end of file
diff --git a/scripts/setup/setup_database.py b/scripts/setup/setup_database.py
new file mode 100644
index 0000000..ab55404
--- /dev/null
+++ b/scripts/setup/setup_database.py
@@ -0,0 +1,444 @@
#!/usr/bin/env python3
"""
Database setup script for Ghana Commodity Pricing Engine.
This script creates the database schema and loads sample data.
+""" + +import os +import sys +import csv +import logging +from datetime import datetime +from decimal import Decimal +from pathlib import Path +from typing import List, Dict, Any + +# Add src to path for imports +sys.path.append(str(Path(__file__).parent.parent.parent / 'src')) + +from utils.models import Commodity, Market, PriceRecord, ValidationError +from utils.database import DatabaseManager +from utils.config import get_database_config + + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +class DatabaseSetup: + """Handles database schema creation and sample data loading.""" + + def __init__(self, db_manager: DatabaseManager): + self.db_manager = db_manager + self.project_root = Path(__file__).parent.parent.parent + + def create_schema(self) -> bool: + """Create database schema from SQL file.""" + try: + schema_file = self.project_root / 'infrastructure' / 'sql' / 'schema.sql' + + if not schema_file.exists(): + logger.error(f"Schema file not found: {schema_file}") + return False + + logger.info("Creating database schema...") + + with open(schema_file, 'r', encoding='utf-8') as f: + schema_sql = f.read() + + # Execute schema creation (split by GO statements if using SQL Server) + sql_statements = schema_sql.split('GO') + + for statement in sql_statements: + statement = statement.strip() + if statement: + self.db_manager.execute_sql(statement) + + logger.info("Database schema created successfully") + return True + + except Exception as e: + logger.error(f"Error creating database schema: {e}") + return False + + def load_sample_data(self) -> bool: + """Load sample data from CSV files.""" + try: + logger.info("Loading sample data...") + + # Load in order due to foreign key dependencies + success = ( + self._load_markets() and + self._load_commodities() and + self._load_price_data() + ) + + if success: + logger.info("Sample data loaded successfully") + 
else: + logger.error("Failed to load sample data") + + return success + + except Exception as e: + logger.error(f"Error loading sample data: {e}") + return False + + def _load_markets(self) -> bool: + """Load market data from CSV file.""" + try: + markets_file = self.project_root / 'data' / 'sample' / 'sample_markets.csv' + + if not markets_file.exists(): + logger.error(f"Markets file not found: {markets_file}") + return False + + logger.info("Loading markets data...") + + with open(markets_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + markets = [] + + for row in reader: + try: + market = Market( + market_id=int(row['market_id']), + market_name=row['market_name'], + location=row['location'], + region=row['region'], + latitude=float(row['latitude']), + longitude=float(row['longitude']), + is_active=bool(int(row['is_active'])), + created_date=datetime.fromisoformat(row['created_date']) + ) + markets.append(market) + + except (ValidationError, ValueError) as e: + logger.warning(f"Skipping invalid market record: {row} - {e}") + continue + + # Insert markets into database + for market in markets: + self._insert_market(market) + + logger.info(f"Loaded {len(markets)} markets") + return True + + except Exception as e: + logger.error(f"Error loading markets: {e}") + return False + + def _load_commodities(self) -> bool: + """Load commodity data from CSV file.""" + try: + commodities_file = self.project_root / 'data' / 'sample' / 'sample_commodities.csv' + + if not commodities_file.exists(): + logger.error(f"Commodities file not found: {commodities_file}") + return False + + logger.info("Loading commodities data...") + + with open(commodities_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + commodities = [] + + for row in reader: + try: + commodity = Commodity( + commodity_id=int(row['commodity_id']), + commodity_name=row['commodity_name'], + category=row['category'], + unit_of_measure=row['unit_of_measure'], + 
is_active=bool(int(row['is_active'])), + created_date=datetime.fromisoformat(row['created_date']) + ) + commodities.append(commodity) + + except (ValidationError, ValueError) as e: + logger.warning(f"Skipping invalid commodity record: {row} - {e}") + continue + + # Insert commodities into database + for commodity in commodities: + self._insert_commodity(commodity) + + logger.info(f"Loaded {len(commodities)} commodities") + return True + + except Exception as e: + logger.error(f"Error loading commodities: {e}") + return False + + def _load_price_data(self) -> bool: + """Load price data from CSV file.""" + try: + prices_file = self.project_root / 'data' / 'sample' / 'sample_prices.csv' + + if not prices_file.exists(): + logger.error(f"Prices file not found: {prices_file}") + return False + + logger.info("Loading price data...") + + with open(prices_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + prices = [] + + for row in reader: + try: + price_record = PriceRecord( + price_id=int(row['price_id']), + date=datetime.fromisoformat(row['date']), + market_id=int(row['market_id']), + commodity_id=int(row['commodity_id']), + raw_price=Decimal(row['raw_price']), + normalized_price=Decimal(row['normalized_price']), + data_source=row['data_source'], + quality_score=float(row['quality_score']), + is_anomaly=bool(int(row['is_anomaly'])), + created_date=datetime.fromisoformat(row['created_date']) + ) + prices.append(price_record) + + except (ValidationError, ValueError) as e: + logger.warning(f"Skipping invalid price record: {row} - {e}") + continue + + # Insert prices into database + for price in prices: + self._insert_price_record(price) + + logger.info(f"Loaded {len(prices)} price records") + return True + + except Exception as e: + logger.error(f"Error loading price data: {e}") + return False + + def _insert_market(self, market: Market) -> bool: + """Insert market record into database.""" + try: + sql = """ + INSERT INTO dim_markets ( + market_id, 
market_name, location, region, + latitude, longitude, is_active, created_date + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """ + + params = ( + market.market_id, + market.market_name, + market.location, + market.region, + market.latitude, + market.longitude, + market.is_active, + market.created_date + ) + + self.db_manager.execute_sql(sql, params) + return True + + except Exception as e: + logger.error(f"Error inserting market {market.market_id}: {e}") + return False + + def _insert_commodity(self, commodity: Commodity) -> bool: + """Insert commodity record into database.""" + try: + sql = """ + INSERT INTO dim_commodities ( + commodity_id, commodity_name, category, + unit_of_measure, is_active, created_date + ) VALUES (?, ?, ?, ?, ?, ?) + """ + + params = ( + commodity.commodity_id, + commodity.commodity_name, + commodity.category, + commodity.unit_of_measure, + commodity.is_active, + commodity.created_date + ) + + self.db_manager.execute_sql(sql, params) + return True + + except Exception as e: + logger.error(f"Error inserting commodity {commodity.commodity_id}: {e}") + return False + + def _insert_price_record(self, price: PriceRecord) -> bool: + """Insert price record into database.""" + try: + # First, ensure date dimension record exists + date_key = int(price.date.strftime('%Y%m%d')) + self._ensure_date_record(date_key, price.date) + + sql = """ + INSERT INTO fact_prices ( + date_key, market_id, commodity_id, raw_price, + normalized_price, data_source, quality_score, + is_anomaly, created_date + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """ + + params = ( + date_key, + price.market_id, + price.commodity_id, + price.raw_price, + price.normalized_price, + price.data_source, + price.quality_score, + price.is_anomaly, + price.created_date + ) + + self.db_manager.execute_sql(sql, params) + return True + + except Exception as e: + logger.error(f"Error inserting price record {price.price_id}: {e}") + return False + + def _ensure_date_record(self, date_key: int, date: datetime) -> bool: + """Ensure date dimension record exists for the given date.""" + try: + # Check if date record exists + check_sql = "SELECT COUNT(*) FROM dim_date WHERE date_key = ?" + result = self.db_manager.fetch_one(check_sql, (date_key,)) + + if result[0] == 0: + # Insert date record + sql = """ + INSERT INTO dim_date ( + date_key, full_date, year, month, day, quarter, + month_name, day_name, is_weekend, fiscal_year, fiscal_quarter + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """ + + quarter = (date.month - 1) // 3 + 1 + is_weekend = date.weekday() >= 5 # Saturday = 5, Sunday = 6 + fiscal_year = date.year + 1 if date.month >= 7 else date.year + fiscal_quarter = ((date.month - 7) % 12) // 3 + 1 + + params = ( + date_key, + date.date(), + date.year, + date.month, + date.day, + quarter, + date.strftime('%B'), + date.strftime('%A'), + is_weekend, + fiscal_year, + fiscal_quarter + ) + + self.db_manager.execute_sql(sql, params) + + return True + + except Exception as e: + logger.error(f"Error ensuring date record for {date}: {e}") + return False + + def validate_setup(self) -> bool: + """Validate that the database setup was successful.""" + try: + logger.info("Validating database setup...") + + # Check table existence and record counts + tables_to_check = [ + ('dim_markets', 'Markets'), + ('dim_commodities', 'Commodities'), + ('fact_prices', 'Price records'), + ('dim_date', 'Date records') + ] + + for table_name, description in tables_to_check: + count_sql = f"SELECT COUNT(*) FROM {table_name}" + result = 
self.db_manager.fetch_one(count_sql) + count = result[0] if result else 0 + + logger.info(f"{description}: {count} records") + + if count == 0: + logger.warning(f"No records found in {table_name}") + + # Test a sample query + test_sql = """ + SELECT m.market_name, c.commodity_name, p.normalized_price + FROM fact_prices p + JOIN dim_markets m ON p.market_id = m.market_id + JOIN dim_commodities c ON p.commodity_id = c.commodity_id + LIMIT 5 + """ + + results = self.db_manager.fetch_all(test_sql) + + if results: + logger.info("Sample query successful - database setup validated") + return True + else: + logger.error("Sample query returned no results") + return False + + except Exception as e: + logger.error(f"Error validating database setup: {e}") + return False + + +def main(): + """Main function to set up the database.""" + try: + logger.info("Starting database setup...") + + # Get database configuration + db_config = get_database_config() + + # Initialize database manager + db_manager = DatabaseManager(db_config) + + # Initialize setup handler + setup = DatabaseSetup(db_manager) + + # Create schema + if not setup.create_schema(): + logger.error("Failed to create database schema") + return False + + # Load sample data + if not setup.load_sample_data(): + logger.error("Failed to load sample data") + return False + + # Validate setup + if not setup.validate_setup(): + logger.error("Database setup validation failed") + return False + + logger.info("Database setup completed successfully!") + return True + + except Exception as e: + logger.error(f"Database setup failed: {e}") + return False + + finally: + # Close database connection + if 'db_manager' in locals(): + db_manager.close() + + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) \ No newline at end of file diff --git a/src/utils/models.py b/src/utils/models.py new file mode 100644 index 0000000..5af25a0 --- /dev/null +++ b/src/utils/models.py @@ -0,0 +1,402 @@ +""" +Data models for the 
Ghana Commodity Pricing Engine. +Implements validation and data structures for core entities. +""" + +from dataclasses import dataclass, field +from datetime import datetime +from typing import Optional, List, Dict, Any +from decimal import Decimal +import re + + +class ValidationError(Exception): + """Custom exception for data validation errors.""" + pass + + +@dataclass +class Commodity: + """Data model for commodity information with validation.""" + + commodity_id: int + commodity_name: str + category: str + unit_of_measure: str + is_active: bool = True + created_date: Optional[datetime] = field(default_factory=datetime.now) + + def __post_init__(self): + """Validate commodity data after initialization.""" + self._validate_commodity_id() + self._validate_commodity_name() + self._validate_category() + self._validate_unit_of_measure() + + def _validate_commodity_id(self): + """Validate commodity ID is positive integer.""" + if not isinstance(self.commodity_id, int) or self.commodity_id <= 0: + raise ValidationError(f"Commodity ID must be a positive integer, got: {self.commodity_id}") + + def _validate_commodity_name(self): + """Validate commodity name is non-empty string.""" + if not isinstance(self.commodity_name, str) or not self.commodity_name.strip(): + raise ValidationError(f"Commodity name must be a non-empty string, got: {self.commodity_name}") + + if len(self.commodity_name) > 100: + raise ValidationError(f"Commodity name must be 100 characters or less, got: {len(self.commodity_name)}") + + def _validate_category(self): + """Validate commodity category.""" + valid_categories = { + 'Vegetables', 'Grains', 'Tubers', 'Fruits', 'Cash Crops', + 'Oils', 'Legumes', 'Spices', 'Livestock', 'Fish' + } + + if not isinstance(self.category, str) or self.category not in valid_categories: + raise ValidationError(f"Category must be one of {valid_categories}, got: {self.category}") + + def _validate_unit_of_measure(self): + """Validate unit of measure.""" + valid_units = 
{'kg', 'litre', 'bunch', 'piece', 'bag', 'tonne'} + + if not isinstance(self.unit_of_measure, str) or self.unit_of_measure not in valid_units: + raise ValidationError(f"Unit of measure must be one of {valid_units}, got: {self.unit_of_measure}") + + def to_dict(self) -> Dict[str, Any]: + """Convert commodity to dictionary representation.""" + return { + 'commodity_id': self.commodity_id, + 'commodity_name': self.commodity_name, + 'category': self.category, + 'unit_of_measure': self.unit_of_measure, + 'is_active': self.is_active, + 'created_date': self.created_date.isoformat() if self.created_date else None + } + + +@dataclass +class Market: + """Data model for market information with validation.""" + + market_id: int + market_name: str + location: str + region: str + latitude: float + longitude: float + is_active: bool = True + created_date: Optional[datetime] = field(default_factory=datetime.now) + + def __post_init__(self): + """Validate market data after initialization.""" + self._validate_market_id() + self._validate_market_name() + self._validate_location() + self._validate_region() + self._validate_coordinates() + + def _validate_market_id(self): + """Validate market ID is positive integer.""" + if not isinstance(self.market_id, int) or self.market_id <= 0: + raise ValidationError(f"Market ID must be a positive integer, got: {self.market_id}") + + def _validate_market_name(self): + """Validate market name is non-empty string.""" + if not isinstance(self.market_name, str) or not self.market_name.strip(): + raise ValidationError(f"Market name must be a non-empty string, got: {self.market_name}") + + if len(self.market_name) > 100: + raise ValidationError(f"Market name must be 100 characters or less, got: {len(self.market_name)}") + + def _validate_location(self): + """Validate location is non-empty string.""" + if not isinstance(self.location, str) or not self.location.strip(): + raise ValidationError(f"Location must be a non-empty string, got: {self.location}") 
+ + if len(self.location) > 100: + raise ValidationError(f"Location must be 100 characters or less, got: {len(self.location)}") + + def _validate_region(self): + """Validate region is a valid Ghana region.""" + valid_regions = { + 'Greater Accra', 'Ashanti', 'Northern', 'Western', 'Central', + 'Upper East', 'Upper West', 'Volta', 'Eastern', 'Brong Ahafo' + } + + if not isinstance(self.region, str) or self.region not in valid_regions: + raise ValidationError(f"Region must be one of {valid_regions}, got: {self.region}") + + def _validate_coordinates(self): + """Validate latitude and longitude are within Ghana's bounds.""" + # Ghana's approximate bounds + if not isinstance(self.latitude, (int, float)) or not (4.5 <= self.latitude <= 11.5): + raise ValidationError(f"Latitude must be between 4.5 and 11.5 for Ghana, got: {self.latitude}") + + if not isinstance(self.longitude, (int, float)) or not (-3.5 <= self.longitude <= 1.5): + raise ValidationError(f"Longitude must be between -3.5 and 1.5 for Ghana, got: {self.longitude}") + + def to_dict(self) -> Dict[str, Any]: + """Convert market to dictionary representation.""" + return { + 'market_id': self.market_id, + 'market_name': self.market_name, + 'location': self.location, + 'region': self.region, + 'latitude': self.latitude, + 'longitude': self.longitude, + 'is_active': self.is_active, + 'created_date': self.created_date.isoformat() if self.created_date else None + } + + +@dataclass +class PriceRecord: + """Data model for price records with validation.""" + + price_id: int + date: datetime + market_id: int + commodity_id: int + raw_price: Decimal + normalized_price: Decimal + data_source: str + quality_score: float = 1.0 + is_anomaly: bool = False + created_date: Optional[datetime] = field(default_factory=datetime.now) + + def __post_init__(self): + """Validate price record data after initialization.""" + self._validate_price_id() + self._validate_date() + self._validate_market_id() + self._validate_commodity_id() + 
self._validate_prices() + self._validate_data_source() + self._validate_quality_score() + + def _validate_price_id(self): + """Validate price ID is positive integer.""" + if not isinstance(self.price_id, int) or self.price_id <= 0: + raise ValidationError(f"Price ID must be a positive integer, got: {self.price_id}") + + def _validate_date(self): + """Validate date is a datetime object and not in the future.""" + if not isinstance(self.date, datetime): + raise ValidationError(f"Date must be a datetime object, got: {type(self.date)}") + + if self.date > datetime.now(): + raise ValidationError(f"Date cannot be in the future, got: {self.date}") + + def _validate_market_id(self): + """Validate market ID is positive integer.""" + if not isinstance(self.market_id, int) or self.market_id <= 0: + raise ValidationError(f"Market ID must be a positive integer, got: {self.market_id}") + + def _validate_commodity_id(self): + """Validate commodity ID is positive integer.""" + if not isinstance(self.commodity_id, int) or self.commodity_id <= 0: + raise ValidationError(f"Commodity ID must be a positive integer, got: {self.commodity_id}") + + def _validate_prices(self): + """Validate price values are positive decimals.""" + if not isinstance(self.raw_price, (Decimal, int, float)) or float(self.raw_price) <= 0: + raise ValidationError(f"Raw price must be a positive number, got: {self.raw_price}") + + if not isinstance(self.normalized_price, (Decimal, int, float)) or float(self.normalized_price) <= 0: + raise ValidationError(f"Normalized price must be a positive number, got: {self.normalized_price}") + + # Convert to Decimal if not already + if not isinstance(self.raw_price, Decimal): + self.raw_price = Decimal(str(self.raw_price)) + + if not isinstance(self.normalized_price, Decimal): + self.normalized_price = Decimal(str(self.normalized_price)) + + # Check for reasonable price ranges (0.01 to 10000 GHS) + if not (Decimal('0.01') <= self.raw_price <= Decimal('10000')): + raise 
ValidationError(f"Raw price must be between 0.01 and 10000, got: {self.raw_price}") + + if not (Decimal('0.01') <= self.normalized_price <= Decimal('10000')): + raise ValidationError(f"Normalized price must be between 0.01 and 10000, got: {self.normalized_price}") + + def _validate_data_source(self): + """Validate data source is from approved sources.""" + valid_sources = { + 'field_survey', 'government_api', 'market_association', + 'mobile_app', 'third_party', 'manual_entry' + } + + if not isinstance(self.data_source, str) or self.data_source not in valid_sources: + raise ValidationError(f"Data source must be one of {valid_sources}, got: {self.data_source}") + + def _validate_quality_score(self): + """Validate quality score is between 0.0 and 1.0.""" + if not isinstance(self.quality_score, (int, float)) or not (0.0 <= self.quality_score <= 1.0): + raise ValidationError(f"Quality score must be between 0.0 and 1.0, got: {self.quality_score}") + + def to_dict(self) -> Dict[str, Any]: + """Convert price record to dictionary representation.""" + return { + 'price_id': self.price_id, + 'date': self.date.isoformat(), + 'market_id': self.market_id, + 'commodity_id': self.commodity_id, + 'raw_price': float(self.raw_price), + 'normalized_price': float(self.normalized_price), + 'data_source': self.data_source, + 'quality_score': self.quality_score, + 'is_anomaly': self.is_anomaly, + 'created_date': self.created_date.isoformat() if self.created_date else None + } + + +@dataclass +class PriceRecommendation: + """Data model for price recommendations with validation.""" + + recommendation_id: int + date: datetime + market_id: int + commodity_id: int + recommended_price: Decimal + confidence_score: float + explanation: str + algorithm_version: str + created_date: Optional[datetime] = field(default_factory=datetime.now) + + def __post_init__(self): + """Validate price recommendation data after initialization.""" + self._validate_recommendation_id() + self._validate_date() + 
self._validate_market_id() + self._validate_commodity_id() + self._validate_recommended_price() + self._validate_confidence_score() + self._validate_explanation() + self._validate_algorithm_version() + + def _validate_recommendation_id(self): + """Validate recommendation ID is positive integer.""" + if not isinstance(self.recommendation_id, int) or self.recommendation_id <= 0: + raise ValidationError(f"Recommendation ID must be a positive integer, got: {self.recommendation_id}") + + def _validate_date(self): + """Validate date is a datetime object.""" + if not isinstance(self.date, datetime): + raise ValidationError(f"Date must be a datetime object, got: {type(self.date)}") + + def _validate_market_id(self): + """Validate market ID is positive integer.""" + if not isinstance(self.market_id, int) or self.market_id <= 0: + raise ValidationError(f"Market ID must be a positive integer, got: {self.market_id}") + + def _validate_commodity_id(self): + """Validate commodity ID is positive integer.""" + if not isinstance(self.commodity_id, int) or self.commodity_id <= 0: + raise ValidationError(f"Commodity ID must be a positive integer, got: {self.commodity_id}") + + def _validate_recommended_price(self): + """Validate recommended price is positive decimal.""" + if not isinstance(self.recommended_price, (Decimal, int, float)) or float(self.recommended_price) <= 0: + raise ValidationError(f"Recommended price must be a positive number, got: {self.recommended_price}") + + # Convert to Decimal if not already + if not isinstance(self.recommended_price, Decimal): + self.recommended_price = Decimal(str(self.recommended_price)) + + # Check for reasonable price ranges (0.01 to 10000 GHS) + if not (Decimal('0.01') <= self.recommended_price <= Decimal('10000')): + raise ValidationError(f"Recommended price must be between 0.01 and 10000, got: {self.recommended_price}") + + def _validate_confidence_score(self): + """Validate confidence score is between 0.0 and 1.0.""" + if not 
isinstance(self.confidence_score, (int, float)) or not (0.0 <= self.confidence_score <= 1.0): + raise ValidationError(f"Confidence score must be between 0.0 and 1.0, got: {self.confidence_score}") + + def _validate_explanation(self): + """Validate explanation is non-empty string.""" + if not isinstance(self.explanation, str) or not self.explanation.strip(): + raise ValidationError(f"Explanation must be a non-empty string, got: {self.explanation}") + + if len(self.explanation) > 1000: + raise ValidationError(f"Explanation must be 1000 characters or less, got: {len(self.explanation)}") + + def _validate_algorithm_version(self): + """Validate algorithm version follows semantic versioning.""" + if not isinstance(self.algorithm_version, str): + raise ValidationError(f"Algorithm version must be a string, got: {type(self.algorithm_version)}") + + # Basic semantic versioning pattern (e.g., "1.0.0", "2.1.3") + version_pattern = r'^\d+\.\d+\.\d+$' + if not re.match(version_pattern, self.algorithm_version): + raise ValidationError(f"Algorithm version must follow semantic versioning (x.y.z), got: {self.algorithm_version}") + + def to_dict(self) -> Dict[str, Any]: + """Convert price recommendation to dictionary representation.""" + return { + 'recommendation_id': self.recommendation_id, + 'date': self.date.isoformat(), + 'market_id': self.market_id, + 'commodity_id': self.commodity_id, + 'recommended_price': float(self.recommended_price), + 'confidence_score': self.confidence_score, + 'explanation': self.explanation, + 'algorithm_version': self.algorithm_version, + 'created_date': self.created_date.isoformat() if self.created_date else None + } + + +class DataValidator: + """Utility class for validating data integrity and business rules.""" + + @staticmethod + def validate_price_consistency(price_record: PriceRecord) -> List[str]: + """Validate price record consistency and return list of warnings.""" + warnings = [] + + # Check if normalized price differs significantly from raw 
price + price_diff = abs(float(price_record.normalized_price) - float(price_record.raw_price)) + price_ratio = price_diff / float(price_record.raw_price) + + if price_ratio > 0.1: # More than 10% difference + warnings.append(f"Normalized price differs by {price_ratio:.1%} from raw price") + + # Check quality score consistency with anomaly flag + if price_record.is_anomaly and price_record.quality_score > 0.8: + warnings.append("High quality score for anomalous price may indicate data inconsistency") + + return warnings + + @staticmethod + def validate_market_commodity_combination(market_id: int, commodity_id: int, + markets: List[Market], + commodities: List[Commodity]) -> bool: + """Validate that market and commodity combination is valid.""" + market_exists = any(m.market_id == market_id and m.is_active for m in markets) + commodity_exists = any(c.commodity_id == commodity_id and c.is_active for c in commodities) + + return market_exists and commodity_exists + + @staticmethod + def validate_data_completeness(data: List[Dict[str, Any]], required_fields: List[str]) -> Dict[str, Any]: + """Validate data completeness and return completeness metrics.""" + total_records = len(data) + if total_records == 0: + return {'completeness_score': 0.0, 'missing_fields': required_fields} + + field_completeness = {} + for field in required_fields: + non_null_count = sum(1 for record in data if record.get(field) is not None and record.get(field) != '') + field_completeness[field] = non_null_count / total_records + + overall_completeness = sum(field_completeness.values()) / len(required_fields) + + missing_fields = [field for field, completeness in field_completeness.items() if completeness < 1.0] + + return { + 'completeness_score': overall_completeness, + 'field_completeness': field_completeness, + 'missing_fields': missing_fields, + 'total_records': total_records + } \ No newline at end of file