diff --git a/actions/snowflake-document-ai/CHANGELOG.md b/actions/snowflake-document-ai/CHANGELOG.md index 78775e49..71df9311 100644 --- a/actions/snowflake-document-ai/CHANGELOG.md +++ b/actions/snowflake-document-ai/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/) and this project adheres to [Semantic Versioning](https://semver.org/). +## [0.0.2] - 2025-10-11 + +### Changed +- Restricted supported file types to PDF, PPTX, and DOCX +- Reduced logging + ## [0.0.1] - 2025-10-01 ### Changed diff --git a/actions/snowflake-document-ai/README.md b/actions/snowflake-document-ai/README.md index ad49b08b..c839a6f8 100644 --- a/actions/snowflake-document-ai/README.md +++ b/actions/snowflake-document-ai/README.md @@ -1,13 +1,13 @@ # Snowflake Document AI -This action package enables you to upload PDF documents to Snowflake and extract their content using [Snowflake Document AI](https://docs.snowflake.com/en/user-guide/snowflake-cortex/document-ai/overview). It uses one-shot parsing with no training required, making it easy to extract structured data from PDFs and image files. +This action package enables you to upload PDF, DOCX, or PPTX documents to Snowflake and extract their content using [Snowflake Document AI](https://docs.snowflake.com/en/user-guide/snowflake-cortex/document-ai/overview). It uses one-shot parsing with no training required, making it easy to extract structured data from digital documents. ## What it does This action package provides two main capabilities: 1. **List Stage Files**: List all files currently stored in a Snowflake stage -2. **Parse Document**: Upload PDF or image file files and extract their content using AI-powered parsing +2. 
**Parse Document**: Upload PDF, DOCX, or PPTX files and extract their content using AI-powered parsing The parsing uses Snowflake's `AI_PARSE_DOCUMENT` function which can extract text, tables, and other structured content from PDFs in layout-aware mode. @@ -16,7 +16,7 @@ The parsing uses Snowflake's `AI_PARSE_DOCUMENT` function which can extract text Before using this action, you need: 1. **Snowflake Account**: An active Snowflake account with Document AI enabled -2. **Database, Schema, and Stage**: A Snowflake stage configured to store your PDF files +2. **Database, Schema, and Stage**: A Snowflake stage configured to store your document files 3. **Authentication**: One of the following authentication methods: - Username and password - Key pair authentication @@ -39,7 +39,7 @@ Lists the most recently modified files in a specific Snowflake stage. ### 2. Parse Document -Uploads a PDF file from chat and parses its content using Snowflake Document AI. +Uploads a PDF, DOCX, or PPTX file from chat and parses its content using Snowflake Document AI. **Parameters:** - `filename` (string): The name of the file to upload from chat @@ -91,9 +91,9 @@ Uploads a PDF file from chat and parses its content using Snowflake Document AI. Details up to date [here](https://docs.snowflake.com/en/user-guide/snowflake-cortex/parse-document#input-requirements). 
-- File types: PDF, PPTX, DOCX, JPEG, JPG, PNG, TIFF, TIF, HTML, TXT +- File types: PDF, PPTX, DOCX (images/HTML/TXT are not supported by this action) - Maximum file size: 100 MB - Maximum number of pages: 500 - Stage encryption: Server-side encryption -- Requires Snowflake Document AI to be enabled and set upin your account +- Requires Snowflake Document AI to be enabled and set up in your account - Processing time depends on document complexity and size - timeouts are possible with larger documents \ No newline at end of file diff --git a/actions/snowflake-document-ai/actions.py b/actions/snowflake-document-ai/actions.py index 80e19a9e..2834b6b0 100644 --- a/actions/snowflake-document-ai/actions.py +++ b/actions/snowflake-document-ai/actions.py @@ -70,7 +70,7 @@ def parse_document( stage_path: str = "" ) -> Response[dict]: """ - Uploads a file (PDF, PPTX, DOCX, JPEG, JPG, PNG, TIFF, TIF, HTML, TXT) from the chat to a specified Snowflake stage and parses it's content using Snowflake Document AI. + Uploads a file (PDF, PPTX, DOCX) from the chat to a specified Snowflake stage and parses its content using Snowflake Document AI. Args: filename: The name of the file to upload from chat @@ -83,45 +83,46 @@ def parse_document( Details of the uploaded file, and the json string of the document ai processing results. 
""" try: - print(f"Starting process_document for file: {filename}") + print("Starting document processing") # Get the file from chat - print("Getting file from chat...") chat_file = chat.get_file(filename) temp_file_path = str(chat_file) - print(f"Got file: {temp_file_path}") # Extract just the original filename without path original_filename = os.path.basename(filename) - print(f"Original filename: {original_filename}") # Create a filename with timestamp to ensure uniqueness file_base, file_ext = os.path.splitext(original_filename) + file_ext_lower = file_ext.lower() + + # Validate supported file types (Snowflake AI_PARSE_DOCUMENT supports .pdf, .docx, .pptx) + supported_extensions = {'.pdf', '.docx', '.pptx'} + if file_ext_lower not in supported_extensions: + unsupported_msg = ( + f"Unsupported file type '{file_ext}'. Only PDF, DOCX, and PPTX are supported " + f"by AI_PARSE_DOCUMENT for this action. See docs: https://docs.snowflake.com/en/user-guide/snowflake-cortex/parse-document" + ) + return Response(error=unsupported_msg) timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') unique_filename = f"{file_base}_{timestamp}{file_ext}" - print(f"Using unique filename: {unique_filename}") # Create a temporary directory to hold our renamed file - print("Creating temporary directory...") with tempfile.TemporaryDirectory() as temp_dir: # Create a path for our correctly named file correct_name_path = os.path.join(temp_dir, unique_filename) - print(f"Correct name path: {correct_name_path}") # Copy the temporary file to our new path with the correct name - print("Copying file...") shutil.copy2(temp_file_path, correct_name_path) - print("Establishing Snowflake connection...") + print("Connecting to Snowflake...") with get_snowflake_connection() as conn: cursor = conn.cursor() - print("Connected to Snowflake") # Access Secret values db_name = database_name.value.upper() schema = schema_name.value.upper() stage = stage_name.value.upper() - print(f"Using stage: 
{db_name}.{schema}.{stage}") # Construct fully qualified stage name fully_qualified_stage = f'@"{db_name}"."{schema}"."{stage}"' @@ -133,27 +134,24 @@ def parse_document( clean_path = stage_path.strip('/') if clean_path: stage_location = f'{fully_qualified_stage}/{clean_path}' - print(f"Stage location: {stage_location}") # Record the upload time to use for polling upload_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') - print(f"Upload time: {upload_time}") # Execute PUT command with the correctly named file put_command = f"PUT 'file://{correct_name_path}' '{stage_location}' OVERWRITE=TRUE AUTO_COMPRESS=FALSE SOURCE_COMPRESSION=NONE" - print(f"[{datetime.datetime.now().strftime('%H:%M:%S.%f')}] Executing PUT command to upload file...") + print("Uploading file to stage...") try: cursor.execute(put_command) - print(f"[{datetime.datetime.now().strftime('%H:%M:%S.%f')}] PUT command executed successfully") + print("Upload completed") except Exception as put_error: - print(f"[{datetime.datetime.now().strftime('%H:%M:%S.%f')}] Error executing PUT command: {str(put_error)}") + print("Upload failed") raise # Get results of the upload result_rows = cursor.fetchall() status = cursor.sfqid - print(f"[{datetime.datetime.now().strftime('%H:%M:%S.%f')}] Upload completed with status ID: {status}") - print(f"[{datetime.datetime.now().strftime('%H:%M:%S.%f')}] File upload status: {result_rows}") + print("Upload acknowledged by Snowflake") # Create file paths for the response if stage_path: @@ -164,7 +162,6 @@ def parse_document( # Create the fully qualified path including the file (like @DB.SCHEMA.STAGE/path/file.ext) fully_qualified_path = f"@{db_name}.{schema}.{stage}/{stage_file_path}" - print(f"[{datetime.datetime.now().strftime('%H:%M:%S.%f')}] File uploaded to: {fully_qualified_path}") # Prepare the upload result upload_result = { @@ -177,7 +174,6 @@ def parse_document( "upload_time": upload_time, "query_id": status } - print("Upload completed successfully") # 
TODO: handle image (OCR) files differently than others - page splitting does not work for them @@ -187,37 +183,31 @@ def parse_document( TO_FILE('{fully_qualified_path}'), {{'mode': 'LAYOUT' , 'page_split': true}}) AS parsed_document; """ - print(f"Executing query: {query}") + print("Parsing document via AI_PARSE_DOCUMENT...") cursor.execute(query) - print("Fetching results...") rows = cursor.fetchall() - print(f"Got {len(rows) if rows else 0} results") processing_result = None if rows and len(rows) > 0: # Convert row to dict columns = [desc[0] for desc in cursor.description] processing_result = dict(zip(columns, rows[0])) - print(f"Available columns: {columns}") - print(f"Found processing result with {len(processing_result)} fields") - else: - print("No processing results found") - print("Query completed") + print("Parsing completed") # Combine upload result with parsed result combined_result = { "upload": upload_result, "parsed": processing_result } - print("Returning combined result") + print("Returning result") return Response(result=combined_result) except Exception as e: error_msg = f"Error processing document {filename}: {str(e)}" - print(f"ERROR: {error_msg}") + print("ERROR during document processing") import traceback print(f"Traceback: {traceback.format_exc()}") return Response(error=error_msg) diff --git a/actions/snowflake-document-ai/package.yaml b/actions/snowflake-document-ai/package.yaml index c61766aa..4ac41ba9 100644 --- a/actions/snowflake-document-ai/package.yaml +++ b/actions/snowflake-document-ai/package.yaml @@ -4,10 +4,10 @@ spec-version: v2 name: Snowflake Document AI # Required: A description of what's in the action package. -description: Upload PDF files to Snowflake and read their content with one-shot parse. No training required. +description: Upload PDF, PPTX, DOCX files to Snowflake and read their content with one-shot parse. No training required. 
# Package version number, recommend using semver.org -version: 0.0.1 +version: 0.0.2 dependencies: conda-forge: diff --git a/bin/publisher/action_packages_whitelist.json b/bin/publisher/action_packages_whitelist.json index 7e4c582b..f2687197 100644 --- a/bin/publisher/action_packages_whitelist.json +++ b/bin/publisher/action_packages_whitelist.json @@ -67,6 +67,7 @@ "snowflake-cortex-analyst", "snowflake-cortex-search", "snowflake-data", + "snowflake-document-ai", "serper", "perplexity", "robocorp"