-
Notifications
You must be signed in to change notification settings - Fork 0
Enforce strict creator_{type}_{id} pattern using filename only #26
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3ad0b30
5d93d44
114208c
b113430
874dc9c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,9 @@ | |
| # EAC-CPF namespace - used consistently throughout this config | ||
| EAC_NS = { 'eac' => 'urn:isbn:1-931666-33-4' } | ||
|
|
||
| # Pattern matching arcflow's creator file naming: creator_{entity_type}_{id} | ||
| CREATOR_ID_PATTERN = /^creator_(corporate_entities|people|families)_\d+$/ | ||
|
|
||
| settings do | ||
| provide "solr.url", ENV['SOLR_URL'] || "http://localhost:8983/solr/blacklight-core" | ||
| provide "solr_writer.commit_on_close", "true" | ||
|
|
@@ -38,85 +41,21 @@ | |
| context.clipboard[:is_creator] = true | ||
| end | ||
|
|
||
| # Core identity field | ||
| # CRITICAL: The 'id' field is required by Solr's schema (uniqueKey) | ||
| # Must ensure this field is never empty or indexing will fail | ||
| # | ||
| # IMPORTANT: Real EAC-CPF from ArchivesSpace has empty <control/> element! | ||
| # Cannot rely on recordId being present. Must extract from filename or generate. | ||
| # Solr uniqueKey - extract ID from filename using arcflow's creator_{entity_type}_{id} pattern | ||
| to_field 'id' do |record, accumulator, context| | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @copilot this is wrong. If successful, this would get the id from the recordId. Because recordId can't be relied on, and we need to know exactly how ids are constructed, use ONLY the filename pattern.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed in commit b113430. Removed all recordId extraction logic - now exclusively using the filename as the single source of truth for IDs. The code now only extracts the ID from the filename and validates it against `CREATOR_ID_PATTERN`.
||
| # Try 1: Extract from control/recordId (if present) | ||
| record_id = record.xpath('//eac:control/eac:recordId', EAC_NS).first | ||
| record_id ||= record.xpath('//control/recordId').first | ||
|
|
||
| if record_id && !record_id.text.strip.empty? | ||
| accumulator << record_id.text.strip | ||
| else | ||
| # Try 2: Extract from source filename (most reliable for ArchivesSpace exports) | ||
| # Filename format: creator_corporate_entities_584.xml or similar | ||
| source_file = context.source_record_id || context.input_name | ||
| if source_file | ||
| # Remove .xml extension and any path | ||
| id_from_filename = File.basename(source_file, '.xml') | ||
| # Check if it looks valid (starts with creator_ or agent_) | ||
| if id_from_filename =~ /^(creator_|agent_)/ | ||
| accumulator << id_from_filename | ||
| context.logger.info("Using filename-based ID: #{id_from_filename}") | ||
| else | ||
| # Try 3: Generate from entity type and name | ||
| entity_type = record.xpath('//eac:cpfDescription/eac:identity/eac:entityType', EAC_NS).first&.text&.strip | ||
| name_entry = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS).first&.text&.strip | ||
|
|
||
| if entity_type && name_entry | ||
| # Create stable ID from type and name | ||
| type_short = case entity_type | ||
| when 'corporateBody' then 'corporate' | ||
| when 'person' then 'person' | ||
| when 'family' then 'family' | ||
| else 'entity' | ||
| end | ||
| name_id = name_entry.gsub(/[^a-z0-9]/i, '_').downcase[0..50] # Limit length | ||
| generated_id = "creator_#{type_short}_#{name_id}" | ||
| accumulator << generated_id | ||
| context.logger.warn("Generated ID from name: #{generated_id}") | ||
| else | ||
| # No valid ID available - skip indexing this record | ||
| # If we reach here, something has gone wrong with the data pipeline: | ||
| # - No recordId in XML | ||
| # - Filename doesn't match expected pattern | ||
| # - No entity type or name in XML to generate from | ||
| # Skipping ensures we don't create non-deterministic IDs that break idempotent indexing | ||
| context.logger.error("Cannot generate valid ID for record - skipping indexing. Source: #{source_file}") | ||
| context.skip!("Missing required ID data") | ||
| end | ||
| end | ||
| source_file = context.source_record_id || context.input_name | ||
| if source_file | ||
| id_from_filename = File.basename(source_file, '.xml') | ||
| if id_from_filename =~ CREATOR_ID_PATTERN | ||
| accumulator << id_from_filename | ||
| context.logger.info("Using filename-based ID: #{id_from_filename}") | ||
| else | ||
| # No filename available, generate from name | ||
| entity_type = record.xpath('//eac:cpfDescription/eac:identity/eac:entityType', EAC_NS).first&.text&.strip | ||
| name_entry = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS).first&.text&.strip | ||
|
|
||
| if entity_type && name_entry | ||
| type_short = case entity_type | ||
| when 'corporateBody' then 'corporate' | ||
| when 'person' then 'person' | ||
| when 'family' then 'family' | ||
| else 'entity' | ||
| end | ||
| name_id = name_entry.gsub(/[^a-z0-9]/i, '_').downcase[0..50] | ||
| generated_id = "creator_#{type_short}_#{name_id}" | ||
| accumulator << generated_id | ||
| context.logger.warn("Generated ID from name: #{generated_id}") | ||
| else | ||
| # No valid ID available - skip indexing this record | ||
| # If we reach here, something has gone wrong with the data pipeline: | ||
| # - No recordId in XML | ||
| # - No filename available | ||
| # - No entity type or name in XML to generate from | ||
| # Skipping ensures we don't create non-deterministic IDs that break idempotent indexing | ||
| context.logger.error("Cannot generate valid ID for record - skipping indexing. No filename or entity data available.") | ||
| context.skip!("Missing required ID data") | ||
| end | ||
| context.logger.error("Filename doesn't match expected pattern 'creator_{type}_{id}': #{id_from_filename}") | ||
| context.skip!("Invalid ID format in filename") | ||
| end | ||
| else | ||
| context.logger.error("No source filename available for record") | ||
| context.skip!("Missing source filename") | ||
| end | ||
| end | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@copilot clean up the comments. We need at most one comment, saying that this is the pattern arcflow uses when creating the files. Keep it to 1-2 lines.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cleaned up in commit 874dc9c. Reduced all comments to 1-2 lines max, keeping only essential information about the arcflow pattern.