Skip to content

Commit cc84cb5

Browse files
feat: Add PowerPoint (PPTX) extraction support
This commit adds comprehensive support for extracting text from PowerPoint presentations (.pptx files) to the docprocessor library. Changes: - Add python-pptx>=0.6.21 dependency - Implement _extract_pptx() method in ContentExtractor - Extract text from slides, tables, and speaker notes - Add slide separators and notes labels for clarity - Include metadata: slide_count, shape_count, has_tables - Add 6 comprehensive tests for PPTX extraction - Update README.md to document PPTX support - Update CHANGELOG.md for v1.1.0 release - Bump version from 1.0.0 to 1.1.0 Test results: - All 6 new PPTX tests pass - All 23 extractor tests pass (2 skipped) - Code passes Black, isort, and flake8 linting 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 8f3c1e9 commit cc84cb5

7 files changed

Lines changed: 275 additions & 4 deletions

File tree

CHANGELOG.md

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [1.1.0] - 2025-10-30
11+
12+
### Added
13+
- **PowerPoint (PPTX) support**: Extract text from PowerPoint presentations
14+
- Text extraction from all slides with slide separators
15+
- Table content extraction within slides
16+
- Speaker notes extraction
17+
- Metadata including slide count, shape count, and table detection
18+
- `python-pptx>=0.6.21` dependency for PPTX processing
19+
- Comprehensive test suite for PPTX extraction (6 new tests):
20+
- Test for missing dependency handling
21+
- Test for slide content extraction
22+
- Test for table extraction
23+
- Test for speaker notes extraction
24+
- Test for empty presentations
25+
- Test for corrupted file handling
26+
- Updated documentation to include PPTX in supported formats
27+
28+
### Changed
29+
- Updated `ContentExtractor` to support `.pptx` file extension
30+
- Enhanced README.md to list PPTX in multi-format support and dependencies
31+
32+
## [Earlier Unreleased]
33+
1034
### Added
1135
- Production-grade test suite with 144 tests and 81% coverage
1236
- Configuration management module with ProcessorConfig and MeiliSearchConfig
@@ -60,5 +84,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6084
- **Flexible Integration**: BYOC pattern allows any LLM provider
6185
- **Environment Isolation**: Index prefixing for multi-environment deployments
6286

63-
[Unreleased]: https://github.com/Knowledge-Innovation-Centre/doc-processor/compare/v1.0.0...HEAD
87+
[Unreleased]: https://github.com/Knowledge-Innovation-Centre/doc-processor/compare/v1.1.0...HEAD
88+
[1.1.0]: https://github.com/Knowledge-Innovation-Centre/doc-processor/compare/v1.0.0...v1.1.0
6489
[1.0.0]: https://github.com/Knowledge-Innovation-Centre/doc-processor/releases/tag/v1.0.0

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ A Python library for processing documents with OCR, semantic chunking, and LLM-b
2020

2121
## Features
2222

23-
- **Multi-format Support**: PDF, DOCX, TXT, MD, and images (PNG, JPG, GIF, BMP)
23+
- **Multi-format Support**: PDF, DOCX, PPTX, TXT, MD, and images (PNG, JPG, GIF, BMP)
2424
- **Intelligent OCR**: Layout-aware PDF text extraction with OCR fallback for images
2525
- **Semantic Chunking**: Smart text segmentation using LangChain's RecursiveCharacterTextSplitter
2626
- **LLM Summarization**: Generate concise document summaries (with fallback)
@@ -279,6 +279,7 @@ Each component can be used independently or through the unified `DocumentProcess
279279
- opencv-python - Image preprocessing
280280
- Pillow - Image handling
281281
- python-docx - DOCX extraction
282+
- python-pptx - PPTX extraction
282283
- langchain-text-splitters - Semantic chunking
283284
- tiktoken - Token counting
284285

docprocessor/core/extractor.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class ContentExtractor:
2929
- PDF: Using OCR pipeline (extract_pdf_for_llm)
3030
- TXT/MD: Direct text read
3131
- DOCX: python-docx extraction
32+
- PPTX: python-pptx extraction
3233
- Images: OCR fallback
3334
"""
3435

@@ -38,6 +39,7 @@ def __init__(self):
3839
".txt",
3940
".md",
4041
".docx",
42+
".pptx",
4143
".png",
4244
".jpg",
4345
".jpeg",
@@ -77,6 +79,8 @@ def extract(self, file_path: Path) -> Dict[str, Any]:
7779
return self._extract_text(file_path)
7880
elif extension == ".docx":
7981
return self._extract_docx(file_path)
82+
elif extension == ".pptx":
83+
return self._extract_pptx(file_path)
8084
elif extension in {".png", ".jpg", ".jpeg", ".gif", ".bmp"}:
8185
return self._extract_image(file_path)
8286
else:
@@ -165,6 +169,74 @@ def _extract_docx(self, file_path: Path) -> Dict[str, Any]:
165169
logger.error(f"Error extracting DOCX: {e}")
166170
raise ContentExtractionError(f"Failed to extract DOCX: {str(e)}")
167171

172+
def _extract_pptx(self, file_path: Path) -> Dict[str, Any]:
    """Extract text from a PowerPoint (.pptx) presentation.

    Every slide that yields text contributes a ``--- Slide N ---`` section
    containing its shape text and tables (one line per row, cells joined
    with " | "). Non-empty speaker notes follow as a
    ``[Notes for Slide N]`` section. Shapes nested inside group shapes are
    visited as well, so grouped text boxes are not lost.

    Args:
        file_path: Path to the PPTX file.

    Returns:
        Dict with ``text``, ``page_count`` (the slide count) and
        ``metadata`` holding ``format``, ``extraction_method``,
        ``slide_count``, ``shape_count`` (top-level shapes only) and
        ``has_tables``.

    Raises:
        ContentExtractionError: If python-pptx is not installed or the
            presentation cannot be opened or parsed.
    """
    logger.info(f"Extracting PPTX: {file_path}")

    try:
        from pptx import Presentation
    except ImportError as e:
        raise ContentExtractionError(
            "python-pptx not installed. Install with: pip install python-pptx"
        ) from e

    def walk(shape):
        """Yield ``shape`` followed by every shape nested below it."""
        yield shape
        # Only group shapes expose a ``shapes`` collection; for any other
        # shape the default empty tuple ends the recursion.
        for child in getattr(shape, "shapes", ()):
            yield from walk(child)

    try:
        prs = Presentation(str(file_path))

        all_text = []
        slide_count = 0
        total_shapes = 0
        has_tables = False

        for slide in prs.slides:
            slide_count += 1
            slide_text = []

            for top_level_shape in slide.shapes:
                # Count only top-level shapes so the reported shape_count
                # keeps its existing meaning for decks with groups.
                total_shapes += 1

                for shape in walk(top_level_shape):
                    # Text-bearing shapes (text boxes, placeholders, ...)
                    shape_text = getattr(shape, "text", "").strip()
                    if shape_text:
                        slide_text.append(shape_text)

                    # Tables live in graphic frames, which carry no .text
                    if getattr(shape, "has_table", False):
                        has_tables = True
                        table_rows = [
                            " | ".join(cell.text.strip() for cell in row.cells)
                            for row in shape.table.rows
                        ]
                        slide_text.append("\n".join(table_rows))

            if slide_text:
                all_text.append(f"--- Slide {slide_count} ---\n" + "\n\n".join(slide_text))

            if slide.has_notes_slide:
                # notes_text_frame is None when the notes slide has no
                # notes placeholder; treat that the same as empty notes
                # instead of failing the whole extraction.
                notes_frame = slide.notes_slide.notes_text_frame
                notes_text = notes_frame.text.strip() if notes_frame is not None else ""
                if notes_text:
                    all_text.append(f"[Notes for Slide {slide_count}]\n{notes_text}")

        return {
            "text": "\n\n".join(all_text),
            "page_count": slide_count,  # Use slide count as "pages"
            "metadata": {
                "format": "pptx",
                "extraction_method": "python-pptx",
                "slide_count": slide_count,
                "shape_count": total_shapes,
                "has_tables": has_tables,
            },
        }
    except Exception as e:
        logger.error(f"Error extracting PPTX: {e}")
        # Chain the cause so the original traceback stays available.
        raise ContentExtractionError(f"Failed to extract PPTX: {str(e)}") from e
239+
168240
def _extract_image(self, file_path: Path) -> Dict[str, Any]:
169241
"""Extract text from images using OCR."""
170242
logger.info(f"Extracting image with OCR: {file_path}")

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "docprocessor"
7-
version = "1.0.0"
7+
version = "1.1.0"
88
description = "Intelligent document processing with OCR, chunking, and AI summarization"
99
readme = "README.md"
1010
requires-python = ">=3.10"

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ pytesseract>=0.3.10
44
opencv-python>=4.9.0
55
Pillow>=10.2.0
66
python-docx>=1.1.0
7+
python-pptx>=0.6.21
78
langchain-text-splitters>=0.2.0
89
tiktoken>=0.7.0
910
numpy>=1.24.0

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setup(
77
name="docprocessor",
8-
version="1.0.0",
8+
version="1.1.0",
99
author="Knowledge Innovation Centre",
1010
author_email="info@knowledgeinnovation.eu",
1111
description="A Python library for document processing with OCR, chunking, and summarization",
@@ -32,6 +32,7 @@
3232
"opencv-python>=4.9.0",
3333
"Pillow>=10.2.0",
3434
"python-docx>=1.1.0",
35+
"python-pptx>=0.6.21",
3536
"langchain-text-splitters>=0.2.0",
3637
"tiktoken>=0.7.0",
3738
"numpy>=1.24.0",

tests/test_extractor.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ def test_is_supported(self):
8181
assert extractor.is_supported(".pdf") is True
8282
assert extractor.is_supported(".txt") is True
8383
assert extractor.is_supported(".docx") is True
84+
assert extractor.is_supported(".pptx") is True
8485
assert extractor.is_supported(".xyz") is False
8586

8687
def test_extract_unsupported_file(self, tmp_path):
@@ -269,6 +270,176 @@ def test_extract_docx_corruption(self, tmp_path):
269270
with pytest.raises(ContentExtractionError, match="Failed to extract"):
270271
extractor.extract(bad_docx)
271272

273+
def test_extract_pptx_not_installed(self, tmp_path, monkeypatch):
    """A missing python-pptx package must surface as ContentExtractionError."""
    import builtins

    target = tmp_path / "test.pptx"
    target.touch()

    extractor = ContentExtractor()

    real_import = builtins.__import__

    def fake_import(name, *args, **kwargs):
        # Sabotage only the top-level ``pptx`` import; every other import
        # is delegated to the genuine import machinery.
        if name != "pptx":
            return real_import(name, *args, **kwargs)
        raise ImportError("No module named 'pptx'")

    monkeypatch.setattr(builtins, "__import__", fake_import)

    with pytest.raises(ContentExtractionError, match="python-pptx not installed"):
        extractor._extract_pptx(target)
294+
295+
def test_extract_pptx_with_content(self, tmp_path):
    """Slide titles and body text of a two-slide deck are extracted."""
    try:
        from pptx import Presentation
    except ImportError:
        pytest.skip("python-pptx not installed")

    deck = Presentation()
    title_and_content = deck.slide_layouts[1]  # Title and content layout

    # Build both slides from (title, body) pairs
    slide_specs = (
        ("First Slide Title", "This is the content of the first slide."),
        ("Second Slide Title", "This is the content of the second slide."),
    )
    for title, body in slide_specs:
        new_slide = deck.slides.add_slide(title_and_content)
        new_slide.shapes.title.text = title
        new_slide.placeholders[1].text = body

    deck_path = tmp_path / "test.pptx"
    deck.save(str(deck_path))

    result = ContentExtractor().extract(deck_path)

    assert "text" in result
    extracted = result["text"]
    for fragment in (
        "First Slide Title",
        "Second Slide Title",
        "first slide",
        "second slide",
    ):
        assert fragment in extracted

    assert result["page_count"] == 2

    metadata = result["metadata"]
    assert metadata["format"] == "pptx"
    assert metadata["extraction_method"] == "python-pptx"
    assert metadata["slide_count"] == 2
    assert metadata["shape_count"] > 0
333+
def test_extract_pptx_with_tables(self, tmp_path):
    """Table cells are extracted row by row with " | " between cells.

    Checks every cell of the table plus the row formatting and the slide
    separator, so a regression in table rendering cannot slip through.
    """
    try:
        from pptx import Presentation
        from pptx.util import Inches
    except ImportError:
        pytest.skip("python-pptx not installed")

    pptx_file = tmp_path / "table_test.pptx"
    prs = Presentation()

    # A blank layout keeps the table as the slide's only shape
    blank_layout = prs.slide_layouts[6]
    slide = prs.slides.add_slide(blank_layout)

    cell_values = [
        ["Header 1", "Header 2"],
        ["Row 1 Col 1", "Row 1 Col 2"],
        ["Row 2 Col 1", "Row 2 Col 2"],
    ]
    rows, cols = len(cell_values), len(cell_values[0])

    table_shape = slide.shapes.add_table(
        rows, cols, Inches(2), Inches(2), Inches(4), Inches(2)
    )
    table = table_shape.table

    for row_idx, row_values in enumerate(cell_values):
        for col_idx, value in enumerate(row_values):
            table.cell(row_idx, col_idx).text = value

    prs.save(str(pptx_file))

    extractor = ContentExtractor()
    result = extractor.extract(pptx_file)

    assert "text" in result
    text = result["text"]

    # Every cell must be present, including the last row
    for row_values in cell_values:
        for value in row_values:
            assert value in text

    # Each table row is rendered on one line with " | " between cells
    for row_values in cell_values:
        assert " | ".join(row_values) in text

    assert "--- Slide 1 ---" in text
    assert result["metadata"]["has_tables"] is True
377+
378+
def test_extract_pptx_with_notes(self, tmp_path):
    """Speaker notes appear in the output under a per-slide notes label."""
    try:
        from pptx import Presentation
    except ImportError:
        pytest.skip("python-pptx not installed")

    deck = Presentation()
    noted_slide = deck.slides.add_slide(deck.slide_layouts[1])
    noted_slide.shapes.title.text = "Slide with Notes"

    # Accessing notes_slide creates the notes page on demand
    noted_slide.notes_slide.notes_text_frame.text = (
        "These are important speaker notes for the presentation."
    )

    deck_path = tmp_path / "notes_test.pptx"
    deck.save(str(deck_path))

    result = ContentExtractor().extract(deck_path)

    assert "text" in result
    extracted = result["text"]
    assert "Slide with Notes" in extracted
    assert "speaker notes" in extracted
    assert "[Notes for Slide 1]" in extracted
408+
def test_extract_pptx_empty(self, tmp_path):
    """A presentation without slides yields empty text and zeroed metadata.

    Besides the slide count, this pins the extracted text to the empty
    string and checks that no shapes or tables are reported.
    """
    try:
        from pptx import Presentation
    except ImportError:
        pytest.skip("python-pptx not installed")

    # A freshly created Presentation has no slides
    pptx_file = tmp_path / "empty.pptx"
    prs = Presentation()
    prs.save(str(pptx_file))

    extractor = ContentExtractor()
    result = extractor.extract(pptx_file)

    assert "text" in result
    assert result["text"] == ""
    assert result["page_count"] == 0

    metadata = result["metadata"]
    assert metadata["slide_count"] == 0
    assert metadata["shape_count"] == 0
    assert metadata["has_tables"] is False
426+
427+
def test_extract_pptx_corruption(self, tmp_path):
    """Bytes that are not a valid PPTX raise ContentExtractionError."""
    try:
        import pptx  # noqa: F401
    except ImportError:
        pytest.skip("python-pptx not installed")

    # Arbitrary bytes: not a PPTX container
    garbage_file = tmp_path / "corrupt.pptx"
    garbage_file.write_bytes(b"not a real pptx file")

    extractor = ContentExtractor()

    with pytest.raises(ContentExtractionError, match="Failed to extract"):
        extractor.extract(garbage_file)
442+
272443

273444
class TestContentExtractionError:
274445
"""Tests for ContentExtractionError exception."""

0 commit comments

Comments
 (0)