From 3ad0b300b18103f88b303e9f82a5bde1c40afd96 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 21:35:12 +0000 Subject: [PATCH 1/5] Initial plan From 5d93d4492f7bd2d58a157e4516cbe81e8590586e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 21:37:47 +0000 Subject: [PATCH 2/5] Remove duplicate ID fallback logic and enforce creator_{type}_{id} pattern Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- example_traject_config_eac_cpf.rb | 83 +++++++++---------------------- 1 file changed, 24 insertions(+), 59 deletions(-) diff --git a/example_traject_config_eac_cpf.rb b/example_traject_config_eac_cpf.rb index 84a956d..4ba2111 100644 --- a/example_traject_config_eac_cpf.rb +++ b/example_traject_config_eac_cpf.rb @@ -42,80 +42,45 @@ # CRITICAL: The 'id' field is required by Solr's schema (uniqueKey) # Must ensure this field is never empty or indexing will fail # -# IMPORTANT: Real EAC-CPF from ArchivesSpace has empty element! -# Cannot rely on recordId being present. Must extract from filename or generate. +# ID FORMAT: Always use 'creator_{entity_type}_{id}' pattern +# This matches the format used by arcflow when creating EAC-CPF files to_field 'id' do |record, accumulator, context| # Try 1: Extract from control/recordId (if present) record_id = record.xpath('//eac:control/eac:recordId', EAC_NS).first record_id ||= record.xpath('//control/recordId').first if record_id && !record_id.text.strip.empty? - accumulator << record_id.text.strip - else - # Try 2: Extract from source filename (most reliable for ArchivesSpace exports) - # Filename format: creator_corporate_entities_584.xml or similar + # Validate it matches expected pattern (creator_{type}_{id}) + id_value = record_id.text.strip + if id_value =~ /^creator_[a-z_]+_\d+$/ + accumulator << id_value + context.logger.info("Using recordId: #{id_value}") + else + context.logger.warn("recordId doesn't match expected pattern 'creator_{type}_{id}': #{id_value}") + # Fall through to try filename + end + end + + # Try 2: Extract from source filename if recordId not valid + # Filename format: creator_{entity_type}_{id}.xml (e.g., creator_corporate_entities_584.xml) + if accumulator.empty? source_file = context.source_record_id || context.input_name if source_file # Remove .xml extension and any path id_from_filename = File.basename(source_file, '.xml') - # Check if it looks valid (starts with creator_ or agent_) - if id_from_filename =~ /^(creator_|agent_)/ + # Validate it matches expected pattern + if id_from_filename =~ /^creator_[a-z_]+_\d+$/ accumulator << id_from_filename context.logger.info("Using filename-based ID: #{id_from_filename}") else - # Try 3: Generate from entity type and name - entity_type = record.xpath('//eac:cpfDescription/eac:identity/eac:entityType', EAC_NS).first&.text&.strip - name_entry = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS).first&.text&.strip - - if entity_type && name_entry - # Create stable ID from type and name - type_short = case entity_type - when 'corporateBody' then 'corporate' - when 'person' then 'person' - when 'family' then 'family' - else 'entity' - end - name_id = name_entry.gsub(/[^a-z0-9]/i, '_').downcase[0..50] # Limit length - generated_id = "creator_#{type_short}_#{name_id}" - accumulator << generated_id - context.logger.warn("Generated ID from name: #{generated_id}") - else - # No valid ID available - skip indexing this record - # If we reach here, something has gone wrong with the data pipeline: - # - No recordId in XML - # - Filename doesn't match expected pattern - # - No entity type or name in XML to generate from - # Skipping ensures we don't create non-deterministic IDs that break idempotent indexing - context.logger.error("Cannot generate valid ID for record - skipping indexing. Source: #{source_file}") - context.skip!("Missing required ID data") - end + # Filename doesn't match expected pattern - skip record + context.logger.error("Filename doesn't match expected pattern 'creator_{type}_{id}': #{id_from_filename}") + context.skip!("Invalid ID format in filename") end else - # No filename available, generate from name - entity_type = record.xpath('//eac:cpfDescription/eac:identity/eac:entityType', EAC_NS).first&.text&.strip - name_entry = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS).first&.text&.strip - - if entity_type && name_entry - type_short = case entity_type - when 'corporateBody' then 'corporate' - when 'person' then 'person' - when 'family' then 'family' - else 'entity' - end - name_id = name_entry.gsub(/[^a-z0-9]/i, '_').downcase[0..50] - generated_id = "creator_#{type_short}_#{name_id}" - accumulator << generated_id - context.logger.warn("Generated ID from name: #{generated_id}") - else - # No valid ID available - skip indexing this record - # If we reach here, something has gone wrong with the data pipeline: - # - No recordId in XML - # - No filename available - # - No entity type or name in XML to generate from - # Skipping ensures we don't create non-deterministic IDs that break idempotent indexing - context.logger.error("Cannot generate valid ID for record - skipping indexing. No filename or entity data available.") - context.skip!("Missing required ID data") - end + # No filename available - skip record + context.logger.error("No source filename available for record") + context.skip!("Missing source filename") end end end From 114208cea316cc7d6b9a1f32df3957206d54533f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 21:39:07 +0000 Subject: [PATCH 3/5] Extract regex to constant and use specific entity types Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- example_traject_config_eac_cpf.rb | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/example_traject_config_eac_cpf.rb b/example_traject_config_eac_cpf.rb index 4ba2111..566d9d3 100644 --- a/example_traject_config_eac_cpf.rb +++ b/example_traject_config_eac_cpf.rb @@ -22,6 +22,13 @@ # EAC-CPF namespace - used consistently throughout this config EAC_NS = { 'eac' => 'urn:isbn:1-931666-33-4' } +# Creator ID pattern - matches the format used by arcflow when creating EAC-CPF files +# Format: creator_{entity_type}_{id} where entity_type is one of: +# - corporate_entities (for organizations) +# - people (for persons) +# - families (for families) +CREATOR_ID_PATTERN = /^creator_(corporate_entities|people|families)_\d+$/ + settings do provide "solr.url", ENV['SOLR_URL'] || "http://localhost:8983/solr/blacklight-core" provide "solr_writer.commit_on_close", "true" @@ -52,7 +59,7 @@ if record_id && !record_id.text.strip.empty? # Validate it matches expected pattern (creator_{type}_{id}) id_value = record_id.text.strip - if id_value =~ /^creator_[a-z_]+_\d+$/ + if id_value =~ CREATOR_ID_PATTERN accumulator << id_value context.logger.info("Using recordId: #{id_value}") else @@ -69,7 +76,7 @@ # Remove .xml extension and any path id_from_filename = File.basename(source_file, '.xml') # Validate it matches expected pattern - if id_from_filename =~ /^creator_[a-z_]+_\d+$/ + if id_from_filename =~ CREATOR_ID_PATTERN accumulator << id_from_filename context.logger.info("Using filename-based ID: #{id_from_filename}") else From b113430977d72ba9514db2eb6408031e19f982e5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 21:47:14 +0000 Subject: [PATCH 4/5] Use filename only for ID extraction, remove recordId fallback Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- example_traject_config_eac_cpf.rb | 51 +++++++++++-------------------- 1 file changed, 17 insertions(+), 34 deletions(-) diff --git a/example_traject_config_eac_cpf.rb b/example_traject_config_eac_cpf.rb index 566d9d3..0593dc0 100644 --- a/example_traject_config_eac_cpf.rb +++ b/example_traject_config_eac_cpf.rb @@ -51,44 +51,27 @@ # # ID FORMAT: Always use 'creator_{entity_type}_{id}' pattern # This matches the format used by arcflow when creating EAC-CPF files +# We extract the ID from the filename only, as that's the source of truth to_field 'id' do |record, accumulator, context| - # Try 1: Extract from control/recordId (if present) - record_id = record.xpath('//eac:control/eac:recordId', EAC_NS).first - record_id ||= record.xpath('//control/recordId').first - - if record_id && !record_id.text.strip.empty? - # Validate it matches expected pattern (creator_{type}_{id}) - id_value = record_id.text.strip - if id_value =~ CREATOR_ID_PATTERN - accumulator << id_value - context.logger.info("Using recordId: #{id_value}") - else - context.logger.warn("recordId doesn't match expected pattern 'creator_{type}_{id}': #{id_value}") - # Fall through to try filename - end - end - - # Try 2: Extract from source filename if recordId not valid + # Extract from source filename - this is the only reliable source # Filename format: creator_{entity_type}_{id}.xml (e.g., creator_corporate_entities_584.xml) - if accumulator.empty? - source_file = context.source_record_id || context.input_name - if source_file - # Remove .xml extension and any path - id_from_filename = File.basename(source_file, '.xml') - # Validate it matches expected pattern - if id_from_filename =~ CREATOR_ID_PATTERN - accumulator << id_from_filename - context.logger.info("Using filename-based ID: #{id_from_filename}") - else - # Filename doesn't match expected pattern - skip record - context.logger.error("Filename doesn't match expected pattern 'creator_{type}_{id}': #{id_from_filename}") - context.skip!("Invalid ID format in filename") - end + source_file = context.source_record_id || context.input_name + if source_file + # Remove .xml extension and any path + id_from_filename = File.basename(source_file, '.xml') + # Validate it matches expected pattern + if id_from_filename =~ CREATOR_ID_PATTERN + accumulator << id_from_filename + context.logger.info("Using filename-based ID: #{id_from_filename}") else - # No filename available - skip record - context.logger.error("No source filename available for record") - context.skip!("Missing source filename") + # Filename doesn't match expected pattern - skip record + context.logger.error("Filename doesn't match expected pattern 'creator_{type}_{id}': #{id_from_filename}") + context.skip!("Invalid ID format in filename") end + else + # No filename available - skip record + context.logger.error("No source filename available for record") + context.skip!("Missing source filename") end end From 874dc9c237cb1d7e1ea8903c6eeb1636d4ba6258 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 22:01:02 +0000 Subject: [PATCH 5/5] Clean up comments to be concise (1-2 lines max) Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- example_traject_config_eac_cpf.rb | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/example_traject_config_eac_cpf.rb b/example_traject_config_eac_cpf.rb index 0593dc0..be544e9 100644 --- a/example_traject_config_eac_cpf.rb +++ b/example_traject_config_eac_cpf.rb @@ -22,11 +22,7 @@ # EAC-CPF namespace - used consistently throughout this config EAC_NS = { 'eac' => 'urn:isbn:1-931666-33-4' } -# Creator ID pattern - matches the format used by arcflow when creating EAC-CPF files -# Format: creator_{entity_type}_{id} where entity_type is one of: -# - corporate_entities (for organizations) -# - people (for persons) -# - families (for families) +# Pattern matching arcflow's creator file naming: creator_{entity_type}_{id} CREATOR_ID_PATTERN = /^creator_(corporate_entities|people|families)_\d+$/ settings do @@ -45,31 +41,19 @@ context.clipboard[:is_creator] = true end -# Core identity field -# CRITICAL: The 'id' field is required by Solr's schema (uniqueKey) -# Must ensure this field is never empty or indexing will fail -# -# ID FORMAT: Always use 'creator_{entity_type}_{id}' pattern -# This matches the format used by arcflow when creating EAC-CPF files -# We extract the ID from the filename only, as that's the source of truth +# Solr uniqueKey - extract ID from filename using arcflow's creator_{entity_type}_{id} pattern to_field 'id' do |record, accumulator, context| - # Extract from source filename - this is the only reliable source - # Filename format: creator_{entity_type}_{id}.xml (e.g., creator_corporate_entities_584.xml) source_file = context.source_record_id || context.input_name if source_file - # Remove .xml extension and any path id_from_filename = File.basename(source_file, '.xml') - # Validate it matches expected pattern if id_from_filename =~ CREATOR_ID_PATTERN accumulator << id_from_filename context.logger.info("Using filename-based ID: #{id_from_filename}") else - # Filename doesn't match expected pattern - skip record context.logger.error("Filename doesn't match expected pattern 'creator_{type}_{id}': #{id_from_filename}") context.skip!("Invalid ID format in filename") end else - # No filename available - skip record context.logger.error("No source filename available for record") context.skip!("Missing source filename") end