# Workflow file for run:
# "Add Full pipeline with Refactored Classifier and LLM modules" (PR #173)

# CI workflow: installs system/Python deps, runs the Drive-streamed ML
# pipeline, checks code quality (Black/Flake8), validates the preprocessing
# outputs, runs unit tests with coverage, and uploads artifacts.
name: ML Pipeline CI

# NOTE: generic YAML 1.1 parsers read the bare `on` key as boolean `true`;
# GitHub's own loader handles it, so it is conventionally left unquoted.
on: [push, pull_request]

jobs:
  run:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version stays a string (unquoted 3.10-style values
          # are parsed as floats and silently truncated).
          python-version: "3.11"

      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # CI pipeline (streams 20 PDFs per class from Drive and trains model)
      - name: CI pipeline (Drive stream)
        env:
          GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_JSON }}
          GOOGLE_DRIVE_ROOT_FOLDER_ID: ${{ secrets.GOOGLE_DRIVE_ROOT_FOLDER_ID }}
          # Env values are strings to the shell; quote to avoid int coercion.
          PYTHONUNBUFFERED: "1"
        run: |
          python -u scripts/ci_pipeline.py

      # Model cache keyed on the hash of the model source tree.
      - name: Cache model artifacts
        # v4: actions/cache v1-v3 are deprecated; v4 is a drop-in replacement.
        uses: actions/cache@v4
        with:
          path: |
            src/model/models
            ~/.joblib
            ~/.sklearn
          key: ${{ runner.os }}-sklearn-model-${{ hashFiles('src/model/**/*.py') }}
          restore-keys: |
            ${{ runner.os }}-sklearn-model-

      # Code quality checks
      - name: Check code formatting (Black)
        run: |
          echo "Checking code format with Black..."
          black --check src tests

      - name: Run linter (Flake8)
        run: |
          echo "Running flake8 linter..."
          # `|| true` keeps linting advisory — do not fail the build on lint.
          flake8 src/ tests/ --statistics --count || true
          echo "Flake8 check completed."

      # Data preprocessing pipeline
      # Optional: still run a single-file extraction test on repo test asset
      - name: Run PDF extraction script (repo test asset)
        run: |
          INPUT_PDF="tests/test.pdf"
          python src/preprocessing/pdf_text_extraction.py "$INPUT_PDF"

      # - name: Generate labels
      #   run: |
      #     echo "Generating labels for extracted text..."
      #     python src/preprocessing/generate_labels.py
      # - name: Load and preprocess data
      #   run: |
      #     echo "Loading and preprocessing dataset..."
      #     python src/preprocessing/data_loader.py
      # No full pipeline here; use full_training_pipeline.py on main or a scheduled workflow

      - name: Validate preprocessing pipeline
        run: |
          # Validate PDF extraction: output file must exist and be non-empty.
          INPUT_PDF="tests/test.pdf"
          OUTPUT_FILE="data/processed-text/$(basename "$INPUT_PDF" .pdf).txt"
          echo "Validating PDF extraction..."
          [ -f "$OUTPUT_FILE" ] && [ -s "$OUTPUT_FILE" ] || exit 1
          # Validate labels.
          # NOTE(review): the "Generate labels" step above is commented out —
          # presumably scripts/ci_pipeline.py produces data/labels.json; confirm.
          echo "Validating label generation..."
          [ -f "data/labels.json" ] || exit 1
          # Validate data loading: labels.json must be parseable JSON.
          echo "Validating data loading..."
          python -c "import json; f=open('data/labels.json'); json.load(f); f.close()" || exit 1

      # Model workflow temporarily disabled (training & classification skipped)
      # Skip model training in CI for speed; exercised in full-training job on main/schedule

      # Unit testing and coverage
      - name: Run unit tests with coverage
        run: |
          echo "Running pytest with coverage..."
          coverage run -m pytest tests/
          coverage report -m
          coverage html

      - name: Upload coverage report
        uses: actions/upload-artifact@v4
        with:
          name: coverage-report
          path: htmlcov/

      # Upload trained model artifacts when present. Training is currently
      # disabled in this job, so missing files are ignored rather than warned.
      - name: Upload trained model
        uses: actions/upload-artifact@v4
        with:
          name: trained-model
          path: |
            src/model/models/pdf_classifier_model.pkl
            src/model/models/tfidf_vectorizer.pkl
          if-no-files-found: ignore

      - name: Show pipeline summary
        run: |
          echo "Pipeline Summary:"
          echo "1. PDF Extraction Results:"
          ls -l data/processed-text/
          echo "2. Label Generation Results:"
          ls -l data/labels.json
          echo "3. Model Training Results:"
          # NOTE(review): fails if the cache did not restore src/model/models/
          # and no training ran — confirm this directory is always present.
          ls -l src/model/models/