@@ -81,6 +81,7 @@ def test_is_supported(self):
8181 assert extractor .is_supported (".pdf" ) is True
8282 assert extractor .is_supported (".txt" ) is True
8383 assert extractor .is_supported (".docx" ) is True
84+ assert extractor .is_supported (".pptx" ) is True
8485 assert extractor .is_supported (".xyz" ) is False
8586
8687 def test_extract_unsupported_file (self , tmp_path ):
@@ -269,6 +270,176 @@ def test_extract_docx_corruption(self, tmp_path):
269270 with pytest .raises (ContentExtractionError , match = "Failed to extract" ):
270271 extractor .extract (bad_docx )
271272
def test_extract_pptx_not_installed(self, tmp_path, monkeypatch):
    """Test PPTX extraction when python-pptx is not installed.

    Simulates the missing dependency by making any import of ``pptx`` or
    one of its submodules raise ``ImportError``, then checks that
    ``_extract_pptx`` surfaces it as a ``ContentExtractionError``.
    """
    import builtins

    pptx_file = tmp_path / "test.pptx"
    pptx_file.touch()

    extractor = ContentExtractor()

    original_import = builtins.__import__

    def mock_import(name, *args, **kwargs):
        # Block the package and its submodules (e.g. "pptx.util"); matching
        # only the bare name would let submodule imports slip through.
        if name == "pptx" or name.startswith("pptx."):
            raise ImportError("No module named 'pptx'")
        return original_import(name, *args, **kwargs)

    # monkeypatch restores the real __import__ when the test finishes.
    monkeypatch.setattr(builtins, "__import__", mock_import)

    with pytest.raises(ContentExtractionError, match="python-pptx not installed"):
        extractor._extract_pptx(pptx_file)
294+
def test_extract_pptx_with_content(self, tmp_path):
    """Test extracting text from PPTX with slides and content.

    Builds a two-slide deck (title + body on each slide), runs it through
    ``ContentExtractor.extract`` and checks the extracted text, the page
    count and the PPTX-specific metadata.
    """
    try:
        from pptx import Presentation
    except ImportError:
        pytest.skip("python-pptx not installed")

    # (title, body) for each slide, in presentation order
    slide_specs = (
        ("First Slide Title", "This is the content of the first slide."),
        ("Second Slide Title", "This is the content of the second slide."),
    )

    deck_path = tmp_path / "test.pptx"
    deck = Presentation()
    title_and_content = deck.slide_layouts[1]  # Title and content layout

    for title_text, body_text in slide_specs:
        new_slide = deck.slides.add_slide(title_and_content)
        new_slide.shapes.title.text = title_text
        new_slide.placeholders[1].text = body_text

    deck.save(str(deck_path))

    extracted = ContentExtractor().extract(deck_path)

    assert "text" in extracted
    text = extracted["text"]
    for fragment in (
        "First Slide Title",
        "Second Slide Title",
        "first slide",
        "second slide",
    ):
        assert fragment in text
    assert extracted["page_count"] == 2

    metadata = extracted["metadata"]
    assert metadata["format"] == "pptx"
    assert metadata["extraction_method"] == "python-pptx"
    assert metadata["slide_count"] == 2
    assert metadata["shape_count"] > 0
332+
def test_extract_pptx_with_tables(self, tmp_path):
    """Test extracting text from PPTX with tables.

    Builds a single-slide deck holding a 3x2 table and checks that the text
    of every cell shows up in the extracted output and that the metadata
    reports the presence of tables.
    """
    try:
        from pptx import Presentation
        from pptx.util import Inches
    except ImportError:
        pytest.skip("python-pptx not installed")

    # Table contents, row by row; drives both the fill and the assertions
    # so no cell can be populated without also being verified.
    cell_texts = [
        ["Header 1", "Header 2"],
        ["Row 1 Col 1", "Row 1 Col 2"],
        ["Row 2 Col 1", "Row 2 Col 2"],
    ]

    # Create PPTX with table
    pptx_file = tmp_path / "table_test.pptx"
    prs = Presentation()

    # Add slide with blank layout
    blank_layout = prs.slide_layouts[6]  # Blank layout
    slide = prs.slides.add_slide(blank_layout)

    # Add a table sized to the expected contents
    rows, cols = len(cell_texts), len(cell_texts[0])
    left = Inches(2)
    top = Inches(2)
    width = Inches(4)
    height = Inches(2)

    table_shape = slide.shapes.add_table(rows, cols, left, top, width, height)
    table = table_shape.table

    # Fill table with data
    for row_idx, row in enumerate(cell_texts):
        for col_idx, cell_text in enumerate(row):
            table.cell(row_idx, col_idx).text = cell_text

    prs.save(str(pptx_file))

    extractor = ContentExtractor()
    result = extractor.extract(pptx_file)

    assert "text" in result
    # Check every cell, including the last row and column, so an extractor
    # that drops trailing rows/columns is caught.
    for row in cell_texts:
        for cell_text in row:
            assert cell_text in result["text"]
    assert result["metadata"]["has_tables"] is True
377+
def test_extract_pptx_with_notes(self, tmp_path):
    """Test extracting text from PPTX with speaker notes.

    Creates one titled slide carrying speaker notes and checks that the
    title, the notes text and the ``[Notes for Slide 1]`` marker all appear
    in the extracted text.
    """
    try:
        from pptx import Presentation
    except ImportError:
        pytest.skip("python-pptx not installed")

    deck_path = tmp_path / "notes_test.pptx"
    deck = Presentation()

    noted_slide = deck.slides.add_slide(deck.slide_layouts[1])
    noted_slide.shapes.title.text = "Slide with Notes"

    # Attach speaker notes to the slide
    noted_slide.notes_slide.notes_text_frame.text = (
        "These are important speaker notes for the presentation."
    )

    deck.save(str(deck_path))

    extracted = ContentExtractor().extract(deck_path)

    assert "text" in extracted
    text = extracted["text"]
    for fragment in ("Slide with Notes", "speaker notes", "[Notes for Slide 1]"):
        assert fragment in text
407+
def test_extract_pptx_empty(self, tmp_path):
    """Test extracting empty PPTX file.

    A presentation saved without any slides must still yield a ``text``
    entry and report zero for both the page count and the slide count.
    """
    try:
        from pptx import Presentation
    except ImportError:
        pytest.skip("python-pptx not installed")

    # Save a presentation that has no slides at all
    empty_deck = tmp_path / "empty.pptx"
    Presentation().save(str(empty_deck))

    outcome = ContentExtractor().extract(empty_deck)

    assert "text" in outcome
    assert outcome["page_count"] == 0
    assert outcome["metadata"]["slide_count"] == 0
426+
def test_extract_pptx_corruption(self, tmp_path):
    """Test PPTX extraction handles corrupted files.

    Writes bytes that are not a valid PPTX archive to a ``.pptx`` path and
    checks that ``extract`` raises ``ContentExtractionError`` with a
    "Failed to extract" message.
    """
    # The module itself is not used here; this is only a presence check, so
    # importorskip replaces the try/import/skip dance and the unused import.
    pytest.importorskip("pptx", reason="python-pptx not installed")

    # Create a corrupted PPTX file
    bad_pptx = tmp_path / "corrupt.pptx"
    bad_pptx.write_bytes(b"not a real pptx file")

    extractor = ContentExtractor()

    with pytest.raises(ContentExtractionError, match="Failed to extract"):
        extractor.extract(bad_pptx)
442+
272443
273444class TestContentExtractionError :
274445 """Tests for ContentExtractionError exception."""
0 commit comments