Add Full pipeline with Refactored Classifier and LLM modules #173
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# CI workflow: streams training PDFs from Google Drive, trains the classifier,
# runs formatting/lint checks, validates preprocessing outputs, and runs the
# unit-test suite with coverage.
name: ML Pipeline CI

# yamllint disable-line rule:truthy -- "on" is a workflow keyword, not a boolean
on: [push, pull_request]

jobs:
  # NOTE(review): job id "run" is unusual but kept as-is — renaming it would
  # break any branch-protection rules referencing this check by name.
  run:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # CI pipeline (streams 20 PDFs per class from Drive and trains model)
      - name: CI pipeline (Drive stream)
        env:
          GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_JSON }}
          GOOGLE_DRIVE_ROOT_FOLDER_ID: ${{ secrets.GOOGLE_DRIVE_ROOT_FOLDER_ID }}
          # Env values are strings; quoted to avoid YAML int typing.
          PYTHONUNBUFFERED: "1"
        run: |
          python -u scripts/ci_pipeline.py

      # Model cache. actions/cache@v3 is deprecated (older cache service
      # backends are being shut down) — bumped to v4, consistent with the
      # other actions in this workflow already being on current majors.
      - name: Cache model artifacts
        uses: actions/cache@v4
        with:
          path: |
            src/model/models
            ~/.joblib
            ~/.sklearn
          key: ${{ runner.os }}-sklearn-model-${{ hashFiles('src/model/**/*.py') }}
          restore-keys: |
            ${{ runner.os }}-sklearn-model-

      # Code quality checks
      # NOTE(review): black, flake8, coverage, and pytest are assumed to be
      # pinned in requirements.txt — confirm, or add an explicit install step.
      - name: Check code formatting (Black)
        run: |
          echo "Checking code format with Black..."
          black --check src tests

      - name: Run linter (Flake8)
        run: |
          echo "Running flake8 linter..."
          # "|| true" keeps linting advisory (never fails the build) by design.
          flake8 src/ tests/ --statistics --count || true
          echo "Flake8 check completed."

      # Data preprocessing pipeline
      # Optional: still run a single-file extraction test on repo test asset
      - name: Run PDF extraction script (repo test asset)
        run: |
          INPUT_PDF="tests/test.pdf"
          python src/preprocessing/pdf_text_extraction.py "$INPUT_PDF"

      # - name: Generate labels
      #   run: |
      #     echo "Generating labels for extracted text..."
      #     python src/preprocessing/generate_labels.py
      # - name: Load and preprocess data
      #   run: |
      #     echo "Loading and preprocessing dataset..."
      #     python src/preprocessing/data_loader.py
      # No full pipeline here; use full_training_pipeline.py on main or a
      # scheduled workflow.

      # NOTE(review): this step asserts data/labels.json exists even though
      # the "Generate labels" step is commented out — presumably
      # scripts/ci_pipeline.py writes it; verify, or this step always fails.
      - name: Validate preprocessing pipeline
        run: |
          # Validate PDF extraction
          INPUT_PDF="tests/test.pdf"
          OUTPUT_FILE="data/processed-text/$(basename "$INPUT_PDF" .pdf).txt"
          echo "Validating PDF extraction..."
          [ -f "$OUTPUT_FILE" ] && [ -s "$OUTPUT_FILE" ] || exit 1
          # Validate labels
          echo "Validating label generation..."
          [ -f "data/labels.json" ] || exit 1
          # Validate data loading
          echo "Validating data loading..."
          python -c "import json; f=open('data/labels.json'); json.load(f); f.close()" || exit 1

      # Model workflow temporarily disabled (training & classification skipped)
      # Skip model training in CI for speed; exercised in full-training job on
      # main/schedule.

      # Unit testing and coverage
      - name: Run unit tests with coverage
        run: |
          echo "Running pytest with coverage..."
          coverage run -m pytest tests/
          coverage report -m
          coverage html

      - name: Upload coverage report
        uses: actions/upload-artifact@v4
        with:
          name: coverage-report
          path: htmlcov/

      # OPTIONAL: Upload trained model artifacts (enable once model steps are active)
      - name: Upload trained model
        uses: actions/upload-artifact@v4
        with:
          name: trained-model
          path: |
            src/model/models/pdf_classifier_model.pkl
            src/model/models/tfidf_vectorizer.pkl

      - name: Show pipeline summary
        run: |
          echo "Pipeline Summary:"
          echo "1. PDF Extraction Results:"
          ls -l data/processed-text/
          echo "2. Label Generation Results:"
          ls -l data/labels.json
          echo "3. Model Training Results:"
          ls -l src/model/models/