feat: Add PowerPoint (PPTX) extraction support

anthonycamilleri · claude · anthonycamilleri · commit cc84cb55831f · 2025-10-30T20:44:56.000Z
This commit adds comprehensive support for extracting text from PowerPoint presentations (.pptx files) to the docprocessor library. Changes: - Add python-pptx>=0.6.21 dependency - Implement _extract_pptx() method in ContentExtractor - Extract text from slides, tables, and speaker notes - Add slide separators and notes labels for clarity - Include metadata: slide_count, shape_count, has_tables - Add 6 comprehensive tests for PPTX extraction - Update README.md to document PPTX support - Update CHANGELOG.md for v1.1.0 release - Bump version from 1.0.0 to 1.1.0 Test results: - All 6 new PPTX tests pass - All 23 extractor tests pass (2 skipped) - Code passes Black, isort, and flake8 linting 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.1.0] - 2025-10-30
+
+### Added
+- **PowerPoint (PPTX) support**: Extract text from PowerPoint presentations
+  - Text extraction from all slides with slide separators
+  - Table content extraction within slides
+  - Speaker notes extraction
+  - Metadata including slide count, shape count, and table detection
+- `python-pptx>=0.6.21` dependency for PPTX processing
+- Comprehensive test suite for PPTX extraction (6 new tests):
+  - Test for missing dependency handling
+  - Test for slide content extraction
+  - Test for table extraction
+  - Test for speaker notes extraction
+  - Test for empty presentations
+  - Test for corrupted file handling
+- Updated documentation to include PPTX in supported formats
+
+### Changed
+- Updated `ContentExtractor` to support `.pptx` file extension
+- Enhanced README.md to list PPTX in multi-format support and dependencies
+
+## [Earlier Unreleased]
+
 ### Added
 - Production-grade test suite with 144 tests and 81% coverage
 - Configuration management module with ProcessorConfig and MeiliSearchConfig
@@ -60,5 +84,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - **Flexible Integration**: BYOC pattern allows any LLM provider
 - **Environment Isolation**: Index prefixing for multi-environment deployments
 
-[Unreleased]: https://github.com/Knowledge-Innovation-Centre/doc-processor/compare/v1.0.0...HEAD
+[Unreleased]: https://github.com/Knowledge-Innovation-Centre/doc-processor/compare/v1.1.0...HEAD
+[1.1.0]: https://github.com/Knowledge-Innovation-Centre/doc-processor/compare/v1.0.0...v1.1.0
 [1.0.0]: https://github.com/Knowledge-Innovation-Centre/doc-processor/releases/tag/v1.0.0
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ A Python library for processing documents with OCR, semantic chunking, and LLM-b
 
 ## Features
 
-- **Multi-format Support**: PDF, DOCX, TXT, MD, and images (PNG, JPG, GIF, BMP)
+- **Multi-format Support**: PDF, DOCX, PPTX, TXT, MD, and images (PNG, JPG, GIF, BMP)
 - **Intelligent OCR**: Layout-aware PDF text extraction with OCR fallback for images
 - **Semantic Chunking**: Smart text segmentation using LangChain's RecursiveCharacterTextSplitter
 - **LLM Summarization**: Generate concise document summaries (with fallback)
@@ -279,6 +279,7 @@ Each component can be used independently or through the unified `DocumentProcess
 - opencv-python - Image preprocessing
 - Pillow - Image handling
 - python-docx - DOCX extraction
+- python-pptx - PPTX extraction
 - langchain-text-splitters - Semantic chunking
 - tiktoken - Token counting
 
diff --git a/docprocessor/core/extractor.py b/docprocessor/core/extractor.py
@@ -29,6 +29,7 @@ class ContentExtractor:
     - PDF: Using OCR pipeline (extract_pdf_for_llm)
     - TXT/MD: Direct text read
     - DOCX: python-docx extraction
+    - PPTX: python-pptx extraction
     - Images: OCR fallback
     """
 
@@ -38,6 +39,7 @@ def __init__(self):
             ".txt",
             ".md",
             ".docx",
+            ".pptx",
             ".png",
             ".jpg",
             ".jpeg",
@@ -77,6 +79,8 @@ def extract(self, file_path: Path) -> Dict[str, Any]:
                 return self._extract_text(file_path)
             elif extension == ".docx":
                 return self._extract_docx(file_path)
+            elif extension == ".pptx":
+                return self._extract_pptx(file_path)
             elif extension in {".png", ".jpg", ".jpeg", ".gif", ".bmp"}:
                 return self._extract_image(file_path)
             else:
@@ -165,6 +169,74 @@ def _extract_docx(self, file_path: Path) -> Dict[str, Any]:
             logger.error(f"Error extracting DOCX: {e}")
             raise ContentExtractionError(f"Failed to extract DOCX: {str(e)}")
 
+    def _extract_pptx(self, file_path: Path) -> Dict[str, Any]:
+        """Extract text from a PPTX presentation.
+
+        Slides are visited in order. For each slide the text of every shape that
+        exposes ``text`` is collected, tables are flattened to one line per row
+        with cells joined by " | ", and shapes nested inside group shapes are
+        visited as well. Speaker notes, when present and non-empty, are emitted
+        as a separate labelled section after the slide they belong to.
+
+        Args:
+            file_path: Path to the .pptx file.
+
+        Returns:
+            Dict with "text", "page_count" (the number of slides) and "metadata"
+            holding "format", "extraction_method", "slide_count", "shape_count"
+            (top-level shapes only) and "has_tables".
+
+        Raises:
+            ContentExtractionError: If python-pptx is not installed or the file
+                cannot be read as a presentation.
+        """
+        logger.info(f"Extracting PPTX: {file_path}")
+
+        try:
+            from pptx import Presentation
+        except ImportError as e:
+            raise ContentExtractionError(
+                "python-pptx not installed. Install with: pip install python-pptx"
+            ) from e
+
+        def collect_shape_text(shapes, parts):
+            """Append text from ``shapes`` to ``parts``; return True if a table was seen."""
+            found_table = False
+            for shape in shapes:
+                # Text-bearing shapes (titles, placeholders, text boxes)
+                if hasattr(shape, "text") and shape.text.strip():
+                    parts.append(shape.text.strip())
+
+                # Tables: one output line per row
+                if shape.has_table:
+                    found_table = True
+                    rows = [
+                        " | ".join(cell.text.strip() for cell in row.cells)
+                        for row in shape.table.rows
+                    ]
+                    parts.append("\n".join(rows))
+
+                # Group shapes carry no text of their own; descend into their members
+                members = getattr(shape, "shapes", None)
+                if members is not None:
+                    found_table = collect_shape_text(members, parts) or found_table
+            return found_table
+
+        try:
+            prs = Presentation(str(file_path))
+
+            all_text = []
+            slide_count = 0  # stays 0 for a presentation without slides
+            total_shapes = 0
+            has_tables = False
+
+            for slide_count, slide in enumerate(prs.slides, start=1):
+                total_shapes += len(slide.shapes)
+
+                slide_text = []
+                if collect_shape_text(slide.shapes, slide_text):
+                    has_tables = True
+
+                # Add slide text with separator
+                if slide_text:
+                    all_text.append(f"--- Slide {slide_count} ---\n" + "\n\n".join(slide_text))
+
+                # Extract notes if present
+                if slide.has_notes_slide:
+                    # notes_text_frame is None when the notes slide has no notes
+                    # placeholder; treat that as "no notes" instead of failing
+                    notes_frame = slide.notes_slide.notes_text_frame
+                    notes_text = notes_frame.text.strip() if notes_frame is not None else ""
+                    if notes_text:
+                        all_text.append(f"[Notes for Slide {slide_count}]\n{notes_text}")
+
+            text = "\n\n".join(all_text)
+
+            return {
+                "text": text,
+                "page_count": slide_count,  # Use slide count as "pages"
+                "metadata": {
+                    "format": "pptx",
+                    "extraction_method": "python-pptx",
+                    "slide_count": slide_count,
+                    "shape_count": total_shapes,
+                    "has_tables": has_tables,
+                },
+            }
+        except Exception as e:
+            logger.error(f"Error extracting PPTX: {e}")
+            raise ContentExtractionError(f"Failed to extract PPTX: {str(e)}") from e
+
     def _extract_image(self, file_path: Path) -> Dict[str, Any]:
         """Extract text from images using OCR."""
         logger.info(f"Extracting image with OCR: {file_path}")
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "docprocessor"
-version = "1.0.0"
+version = "1.1.0"
 description = "Intelligent document processing with OCR, chunking, and AI summarization"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/requirements.txt b/requirements.txt
@@ -4,6 +4,7 @@ pytesseract>=0.3.10
 opencv-python>=4.9.0
 Pillow>=10.2.0
 python-docx>=1.1.0
+python-pptx>=0.6.21
 langchain-text-splitters>=0.2.0
 tiktoken>=0.7.0
 numpy>=1.24.0
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="docprocessor",
-    version="1.0.0",
+    version="1.1.0",
     author="Knowledge Innovation Centre",
     author_email="info@knowledgeinnovation.eu",
     description="A Python library for document processing with OCR, chunking, and summarization",
@@ -32,6 +32,7 @@
         "opencv-python>=4.9.0",
         "Pillow>=10.2.0",
         "python-docx>=1.1.0",
+        "python-pptx>=0.6.21",
         "langchain-text-splitters>=0.2.0",
         "tiktoken>=0.7.0",
         "numpy>=1.24.0",
diff --git a/tests/test_extractor.py b/tests/test_extractor.py
@@ -81,6 +81,7 @@ def test_is_supported(self):
         assert extractor.is_supported(".pdf") is True
         assert extractor.is_supported(".txt") is True
         assert extractor.is_supported(".docx") is True
+        assert extractor.is_supported(".pptx") is True
         assert extractor.is_supported(".xyz") is False
 
     def test_extract_unsupported_file(self, tmp_path):
@@ -269,6 +270,176 @@ def test_extract_docx_corruption(self, tmp_path):
         with pytest.raises(ContentExtractionError, match="Failed to extract"):
             extractor.extract(bad_docx)
 
+    def test_extract_pptx_not_installed(self, tmp_path, monkeypatch):
+        """A missing python-pptx package must surface as ContentExtractionError."""
+        import builtins
+
+        real_import = builtins.__import__
+
+        def failing_import(name, *args, **kwargs):
+            # Only the top-level "pptx" import fails; every other import passes through
+            if name != "pptx":
+                return real_import(name, *args, **kwargs)
+            raise ImportError("No module named 'pptx'")
+
+        target = tmp_path / "test.pptx"
+        target.touch()
+
+        extractor = ContentExtractor()
+
+        # Patch after the extractor exists so its construction is unaffected
+        monkeypatch.setattr(builtins, "__import__", failing_import)
+
+        with pytest.raises(ContentExtractionError, match="python-pptx not installed"):
+            extractor._extract_pptx(target)
+
+    def test_extract_pptx_with_content(self, tmp_path):
+        """Two title-and-content slides yield their text and slide metadata."""
+        try:
+            from pptx import Presentation
+        except ImportError:
+            pytest.skip("python-pptx not installed")
+
+        deck = Presentation()
+        title_and_content = deck.slide_layouts[1]  # Title and content layout
+
+        # One (title, body) pair per slide, added in order
+        slide_specs = [
+            ("First Slide Title", "This is the content of the first slide."),
+            ("Second Slide Title", "This is the content of the second slide."),
+        ]
+        for title, body in slide_specs:
+            slide = deck.slides.add_slide(title_and_content)
+            slide.shapes.title.text = title
+            slide.placeholders[1].text = body
+
+        deck_path = tmp_path / "test.pptx"
+        deck.save(str(deck_path))
+
+        result = ContentExtractor().extract(deck_path)
+
+        assert "text" in result
+        extracted = result["text"]
+        for fragment in ("First Slide Title", "Second Slide Title", "first slide", "second slide"):
+            assert fragment in extracted
+        assert result["page_count"] == 2
+
+        metadata = result["metadata"]
+        assert metadata["format"] == "pptx"
+        assert metadata["extraction_method"] == "python-pptx"
+        assert metadata["slide_count"] == 2
+        assert metadata["shape_count"] > 0
+
+    def test_extract_pptx_with_tables(self, tmp_path):
+        """Table cell text is extracted and the has_tables flag is set."""
+        try:
+            from pptx import Presentation
+            from pptx.util import Inches
+        except ImportError:
+            pytest.skip("python-pptx not installed")
+
+        deck = Presentation()
+        slide = deck.slides.add_slide(deck.slide_layouts[6])  # Blank layout
+
+        # Table contents, row by row
+        grid = [
+            ["Header 1", "Header 2"],
+            ["Row 1 Col 1", "Row 1 Col 2"],
+            ["Row 2 Col 1", "Row 2 Col 2"],
+        ]
+        frame = slide.shapes.add_table(
+            len(grid), len(grid[0]), Inches(2), Inches(2), Inches(4), Inches(2)
+        )
+        for row_idx, row_values in enumerate(grid):
+            for col_idx, value in enumerate(row_values):
+                frame.table.cell(row_idx, col_idx).text = value
+
+        deck_path = tmp_path / "table_test.pptx"
+        deck.save(str(deck_path))
+
+        result = ContentExtractor().extract(deck_path)
+
+        assert "text" in result
+        for fragment in ("Header 1", "Header 2", "Row 1 Col 1"):
+            assert fragment in result["text"]
+        assert result["metadata"]["has_tables"] is True
+
+    def test_extract_pptx_with_notes(self, tmp_path):
+        """Speaker notes appear in the output under a per-slide notes label."""
+        try:
+            from pptx import Presentation
+        except ImportError:
+            pytest.skip("python-pptx not installed")
+
+        deck = Presentation()
+        slide = deck.slides.add_slide(deck.slide_layouts[1])
+        slide.shapes.title.text = "Slide with Notes"
+
+        # Accessing notes_slide creates the notes page; then fill its text frame
+        slide.notes_slide.notes_text_frame.text = (
+            "These are important speaker notes for the presentation."
+        )
+
+        deck_path = tmp_path / "notes_test.pptx"
+        deck.save(str(deck_path))
+
+        result = ContentExtractor().extract(deck_path)
+
+        assert "text" in result
+        extracted = result["text"]
+        assert "Slide with Notes" in extracted
+        assert "speaker notes" in extracted
+        assert "[Notes for Slide 1]" in extracted
+
+    def test_extract_pptx_empty(self, tmp_path):
+        """A presentation without slides extracts cleanly with zero counts."""
+        try:
+            from pptx import Presentation
+        except ImportError:
+            pytest.skip("python-pptx not installed")
+
+        # Save a freshly created presentation without adding any slides
+        deck_path = tmp_path / "empty.pptx"
+        Presentation().save(str(deck_path))
+
+        result = ContentExtractor().extract(deck_path)
+
+        assert "text" in result
+        assert result["page_count"] == 0
+        assert result["metadata"]["slide_count"] == 0
+
+    def test_extract_pptx_corruption(self, tmp_path):
+        """Bytes that are not a valid PPTX package raise ContentExtractionError."""
+        try:
+            import pptx  # noqa: F401
+        except ImportError:
+            pytest.skip("python-pptx not installed")
+
+        # Correct extension, but the payload is not a PPTX package
+        broken_path = tmp_path / "corrupt.pptx"
+        broken_path.write_bytes(b"not a real pptx file")
+
+        with pytest.raises(ContentExtractionError, match="Failed to extract"):
+            ContentExtractor().extract(broken_path)
+
 
 class TestContentExtractionError:
     """Tests for ContentExtractionError exception."""