# Workflow file for run:
# "Add Full pipeline with Refactored Classifier and LLM modules" (PR #173)

# CI workflow: installs system/Python deps, runs the Drive-streamed ML
# pipeline, checks code quality (Black/Flake8), validates the preprocessing
# outputs, runs unit tests with coverage, and uploads artifacts.
name: ML Pipeline CI

# NOTE: generic YAML 1.1 parsers read the bare `on` key as boolean `true`;
# GitHub's own loader handles it, so it is conventionally left unquoted.
on: [push, pull_request]

jobs:
  run:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version stays a string (unquoted 3.10-style values
          # are parsed as floats and silently truncated).
          python-version: "3.11"

      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # CI pipeline (streams 20 PDFs per class from Drive and trains model)
      - name: CI pipeline (Drive stream)
        env:
          GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_JSON }}
          GOOGLE_DRIVE_ROOT_FOLDER_ID: ${{ secrets.GOOGLE_DRIVE_ROOT_FOLDER_ID }}
          # Env values are strings to the shell; quote to avoid int coercion.
          PYTHONUNBUFFERED: "1"
        run: |
          python -u scripts/ci_pipeline.py

      # Model cache keyed on the hash of the model source tree.
      - name: Cache model artifacts
        # v4: actions/cache v1-v3 are deprecated; v4 is a drop-in replacement.
        uses: actions/cache@v4
        with:
          path: |
            src/model/models
            ~/.joblib
            ~/.sklearn
          key: ${{ runner.os }}-sklearn-model-${{ hashFiles('src/model/**/*.py') }}
          restore-keys: |
            ${{ runner.os }}-sklearn-model-

      # Code quality checks
      - name: Check code formatting (Black)
        run: |
          echo "Checking code format with Black..."
          black --check src tests

      - name: Run linter (Flake8)
        run: |
          echo "Running flake8 linter..."
          # `|| true` keeps linting advisory — do not fail the build on lint.
          flake8 src/ tests/ --statistics --count || true
          echo "Flake8 check completed."

      # Data preprocessing pipeline
      # Optional: still run a single-file extraction test on repo test asset
      - name: Run PDF extraction script (repo test asset)
        run: |
          INPUT_PDF="tests/test.pdf"
          python src/preprocessing/pdf_text_extraction.py "$INPUT_PDF"

      # - name: Generate labels
      #   run: |
      #     echo "Generating labels for extracted text..."
      #     python src/preprocessing/generate_labels.py
      # - name: Load and preprocess data
      #   run: |
      #     echo "Loading and preprocessing dataset..."
      #     python src/preprocessing/data_loader.py
      # No full pipeline here; use full_training_pipeline.py on main or a scheduled workflow

      - name: Validate preprocessing pipeline
        run: |
          # Validate PDF extraction: output file must exist and be non-empty.
          INPUT_PDF="tests/test.pdf"
          OUTPUT_FILE="data/processed-text/$(basename "$INPUT_PDF" .pdf).txt"
          echo "Validating PDF extraction..."
          [ -f "$OUTPUT_FILE" ] && [ -s "$OUTPUT_FILE" ] || exit 1
          # Validate labels.
          # NOTE(review): the "Generate labels" step above is commented out —
          # presumably scripts/ci_pipeline.py produces data/labels.json; confirm.
          echo "Validating label generation..."
          [ -f "data/labels.json" ] || exit 1
          # Validate data loading: labels.json must be parseable JSON.
          echo "Validating data loading..."
          python -c "import json; f=open('data/labels.json'); json.load(f); f.close()" || exit 1

      # Model workflow temporarily disabled (training & classification skipped)
      # Skip model training in CI for speed; exercised in full-training job on main/schedule

      # Unit testing and coverage
      - name: Run unit tests with coverage
        run: |
          echo "Running pytest with coverage..."
          coverage run -m pytest tests/
          coverage report -m
          coverage html

      - name: Upload coverage report
        uses: actions/upload-artifact@v4
        with:
          name: coverage-report
          path: htmlcov/

      # Upload trained model artifacts when present. Training is currently
      # disabled in this job, so missing files are ignored rather than warned.
      - name: Upload trained model
        uses: actions/upload-artifact@v4
        with:
          name: trained-model
          path: |
            src/model/models/pdf_classifier_model.pkl
            src/model/models/tfidf_vectorizer.pkl
          if-no-files-found: ignore

      - name: Show pipeline summary
        run: |
          echo "Pipeline Summary:"
          echo "1. PDF Extraction Results:"
          ls -l data/processed-text/
          echo "2. Label Generation Results:"
          ls -l data/labels.json
          echo "3. Model Training Results:"
          # NOTE(review): fails if the cache did not restore src/model/models/
          # and no training ran — confirm this directory is always present.
          ls -l src/model/models/