tags for line in lines: paragraphs.append(f'
{line}
') - + # Create nested bioghist element if we have paragraphs if paragraphs: paragraphs_xml = '\n'.join(paragraphs) @@ -665,7 +801,7 @@ def get_creator_bioghist(self, resource, indent_size=0): bioghist_elements.append(bioghist_el) except Exception as e: self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}') - + if bioghist_elements: # Return the agent bioghist elements (unwrapped) # The caller will decide whether to wrap them based on whether @@ -673,80 +809,153 @@ def get_creator_bioghist(self, resource, indent_size=0): return '\n'.join(bioghist_elements) return None + def _get_target_agent_criteria(self, modified_since=0): + """ + Defines the Solr query criteria for "target" agents. + These are agents we want to process. + """ + criteria = [ + "linked_agent_roles:creator", + "system_generated:false", + "is_user:false", +# "is_repo_agent:false", + ] - def get_all_agents(self, agent_types=None, modified_since=0, indent_size=0): + # Add time filter if applicable + if modified_since > 0 and not self.force_update: + mtime_utc = datetime.fromtimestamp(modified_since, tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') + criteria.append(f"system_mtime:[{mtime_utc} TO *]") + + return criteria + + def _get_nontarget_agent_criteria(self, modified_since=0): """ - Fetch ALL agents from ArchivesSpace (not just creators). - Uses direct agent API endpoints for comprehensive coverage. - + Defines the Solr query criteria for "non-target" (excluded) agents. + This is the logical inverse of the target criteria. + """ + # The core logic for what makes an agent a "target" + target_logic = " AND ".join([ + "linked_agent_roles:creator", + "system_generated:false", + "is_user:false", +# "is_repo_agent:false", + ]) + + # We find non-targets by negating the entire block of target logic + criteria = [f"NOT ({target_logic})"] + + # We still apply the time filter to the overall query + if modified_since > 0 and not self.force_update: + mtime_utc = datetime.fromtimestamp(modified_since, tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') + criteria.append(f"system_mtime:[{mtime_utc} TO *]") + + return criteria + + def _execute_solr_query(self, query_parts, solr_url=None, fields=['id'], indent_size=0): + """ + A generic function to execute a query against the Solr index. + Args: - agent_types: List of agent types to fetch. Default: ['corporate_entities', 'people', 'families'] - modified_since: Unix timestamp to filter agents modified since this time (if API supports it) - indent_size: Indentation size for logging - + query_parts (list): A list of strings that will be joined with " AND ". + fields (list): A list of Solr fields to return in the response. + Returns: - set: Set of agent URIs (e.g., '/agents/corporate_entities/123') + list: A list of dictionaries, where each dictionary contains the requested fields. + Returns an empty list on failure. + """ + indent = ' ' * indent_size + if not query_parts: + self.log.error("Cannot execute Solr query with empty criteria.") + return [] + + if not solr_url: + solr_url = self.solr_url + + query_string = " AND ".join(query_parts) + self.log.info(f"{indent}Executing Solr query: {query_string}") + + try: + # First, get the total count of matching documents + count_params = {'q': query_string, 'rows': 0, 'wt': 'json'} + count_response = requests.get(f'{solr_url}/select', params=count_params) + self.log.info(f" [Solr Count Request]: {count_response.request.url}") + + count_response.raise_for_status() + num_found = count_response.json()['response']['numFound'] + + if num_found == 0: + return [] # No need to query again if nothing was found + + # Now, fetch the actual data for the documents + data_params = { + 'q': query_string, + 'rows': num_found, # Use the exact count to fetch all results + 'fl': ','.join(fields), # Join field list into a comma-separated string + 'wt': 'json' + } + response = requests.get(f'{solr_url}/select', params=data_params) + response.raise_for_status() + # Log the exact URL for the data request + self.log.info(f" [Solr Data Request]: {response.request.url}") + + return response.json()['response']['docs'] + + except requests.exceptions.RequestException as e: + self.log.error(f"Failed to execute Solr query: {e}") + self.log.error(f" Failed query string: {query_string}") + return [] + + def get_all_agents(self, agent_types=None, modified_since=0, indent_size=0): + """ + Fetch target agent URIs from the Solr index and log non-target agents. """ if agent_types is None: - agent_types = ['corporate_entities', 'people', 'families'] - + agent_types = ['agent_person', 'agent_corporate_entity', 'agent_family'] + + if self.force_update: + modified_since = 0 indent = ' ' * indent_size - all_agents = set() - - self.log.info(f'{indent}Fetching ALL agents from ArchivesSpace...') - - for agent_type in agent_types: - try: - # Try with modified_since parameter first - params = {'all_ids': True} - if modified_since > 0: - params['modified_since'] = modified_since - - response = self.client.get(f'/agents/{agent_type}', params=params) - agent_ids = response.json() - - self.log.info(f'{indent}Found {len(agent_ids)} {agent_type} agents') - - # Add agent URIs to set - for agent_id in agent_ids: - agent_uri = f'/agents/{agent_type}/{agent_id}' - all_agents.add(agent_uri) - - except Exception as e: - self.log.error(f'{indent}Error fetching {agent_type} agents: {e}') - # If modified_since fails, try without it - if modified_since > 0: - self.log.warning(f'{indent}Retrying {agent_type} without modified_since filter...') - try: - response = self.client.get(f'/agents/{agent_type}', params={'all_ids': True}) - agent_ids = response.json() - self.log.info(f'{indent}Found {len(agent_ids)} {agent_type} agents (no date filter)') - for agent_id in agent_ids: - agent_uri = f'/agents/{agent_type}/{agent_id}' - all_agents.add(agent_uri) - except Exception as e2: - self.log.error(f'{indent}Failed to fetch {agent_type} agents: {e2}') - - self.log.info(f'{indent}Found {len(all_agents)} total agents across all types.') - return all_agents + self.log.info(f'{indent}Fetching agent data from Solr...') + # Base criteria for all queries in this function + base_criteria = [f"primary_type:({' OR '.join(agent_types)})"] + + # Get and log the non-target agents + nontarget_criteria = base_criteria + self._get_nontarget_agent_criteria(modified_since) + excluded_docs = self._execute_solr_query(nontarget_criteria,self.aspace_solr_url, fields=['id']) + if excluded_docs: + excluded_ids = [doc['id'] for doc in excluded_docs] + self.log.info(f"{indent} Found {len(excluded_ids)} non-target (excluded) agents.") + # Optional: Log the actual IDs if the list isn't too long + # for agent_id in excluded_ids: + # self.log.debug(f"{indent} - Excluded: {agent_id}") + + # Get and return the target agents + target_criteria = base_criteria + self._get_target_agent_criteria(modified_since) + self.log.info('Target Criteria:') + target_docs = self._execute_solr_query(target_criteria, self.aspace_solr_url, fields=['id']) + + target_agents = [doc['id'] for doc in target_docs] + self.log.info(f"{indent} Found {len(target_agents)} target agents to process.") + + return target_agents def task_agent(self, agent_uri, agents_dir, repo_id=1, indent_size=0): """ Process a single agent and generate a creator document in EAC-CPF XML format. Retrieves EAC-CPF directly from ArchivesSpace archival_contexts endpoint. - + Args: agent_uri: Agent URI from ArchivesSpace (e.g., '/agents/corporate_entities/123') agents_dir: Directory to save agent XML files repo_id: Repository ID to use for archival_contexts endpoint (default: 1) indent_size: Indentation size for logging - + Returns: str: Creator document ID if successful, None otherwise """ indent = ' ' * indent_size - + try: # Parse agent URI to extract type and ID # URI format: /agents/{agent_type}/{id} @@ -754,25 +963,25 @@ def task_agent(self, agent_uri, agents_dir, repo_id=1, indent_size=0): if len(parts) != 3 or parts[0] != 'agents': self.log.error(f'{indent}Invalid agent URI format: {agent_uri}') return None - + agent_type = parts[1] # e.g., 'corporate_entities', 'people', 'families' agent_id = parts[2] - + # Construct EAC-CPF endpoint # Format: /repositories/{repo_id}/archival_contexts/{agent_type}/{id}.xml eac_cpf_endpoint = f'/repositories/{repo_id}/archival_contexts/{agent_type}/{agent_id}.xml' - + self.log.debug(f'{indent}Fetching EAC-CPF from: {eac_cpf_endpoint}') - + # Fetch EAC-CPF XML response = self.client.get(eac_cpf_endpoint) - + if response.status_code != 200: self.log.error(f'{indent}Failed to fetch EAC-CPF for {agent_uri}: HTTP {response.status_code}') return None - + eac_cpf_xml = response.text - + # Parse the EAC-CPF XML to validate and inspect its structure try: root = ET.fromstring(eac_cpf_xml) @@ -780,18 +989,18 @@ def task_agent(self, agent_uri, agents_dir, repo_id=1, indent_size=0): except ET.ParseError as e: self.log.error(f'{indent}Failed to parse EAC-CPF XML for {agent_uri}: {e}') return None - + # Generate creator ID creator_id = f'creator_{agent_type}_{agent_id}' - + # Save EAC-CPF XML to file filename = f'{agents_dir}/{creator_id}.xml' with open(filename, 'w', encoding='utf-8') as f: f.write(eac_cpf_xml) - + self.log.info(f'{indent}Created creator document: {creator_id}') return creator_id - + except Exception as e: self.log.error(f'{indent}Error processing agent {agent_uri}: {e}') import traceback @@ -844,14 +1053,15 @@ def process_creators(self): self.log.info(f'{indent}Indexing {len(creator_ids)} creator records to Solr...') traject_config = self.find_traject_config() if traject_config: + self.log.info(f'{indent}Using traject config: {traject_config}') indexed = self.index_creators(agents_dir, creator_ids) self.log.info(f'{indent}Creator indexing complete: {indexed}/{len(creator_ids)} indexed') else: - self.log.info(f'{indent}Skipping creator indexing (traject config not found)') + self.log.warning(f'{indent}Skipping creator indexing (traject config not found)') self.log.info(f'{indent}To index manually:') self.log.info(f'{indent} cd {self.arclight_dir}') self.log.info(f'{indent} bundle exec traject -u {self.solr_url} -i xml \\') - self.log.info(f'{indent} -c /path/to/arcuit/arcflow/traject_config_eac_cpf.rb \\') + self.log.info(f'{indent} -c /path/to/arcuit-gem/traject_config_eac_cpf.rb \\') self.log.info(f'{indent} {agents_dir}/*.xml') elif self.skip_creator_indexing: self.log.info(f'{indent}Skipping creator indexing (--skip-creator-indexing flag set)') @@ -862,16 +1072,33 @@ def process_creators(self): def find_traject_config(self): """ Find the traject config for creator indexing. - - Tries: - 1. bundle show arcuit (finds installed gem) - 2. self.arcuit_dir (explicit path) - 3. Returns None if neither works - + + Search order (follows collection records pattern): + 1. arcuit_dir if provided (most up-to-date user control) + 2. arcuit gem via bundle show (for backward compatibility) + 3. example_traject_config_eac_cpf.rb in arcflow (fallback when used as module without arcuit) + Returns: str: Path to traject config, or None if not found """ - # Try bundle show arcuit first + self.log.info('Searching for traject_config_eac_cpf.rb...') + searched_paths = [] + + # Try 1: arcuit_dir if provided (highest priority - user's explicit choice) + if self.arcuit_dir: + self.log.debug(f' Checking arcuit_dir parameter: {self.arcuit_dir}') + candidate_paths = [ + os.path.join(self.arcuit_dir, 'traject_config_eac_cpf.rb'), + os.path.join(self.arcuit_dir, 'lib', 'arcuit', 'traject', 'traject_config_eac_cpf.rb'), + ] + searched_paths.extend(candidate_paths) + for traject_config in candidate_paths: + if os.path.exists(traject_config): + self.log.info(f'✓ Using traject config from arcuit_dir: {traject_config}') + return traject_config + self.log.debug(' traject_config_eac_cpf.rb not found in arcuit_dir') + + # Try 2: bundle show arcuit (for backward compatibility when arcuit_dir not provided) try: result = subprocess.run( ['bundle', 'show', 'arcuit'], @@ -882,77 +1109,84 @@ def find_traject_config(self): ) if result.returncode == 0: arcuit_path = result.stdout.strip() - # Prefer config at gem root, fall back to legacy subdirectory layout + self.log.debug(f' Found arcuit gem at: {arcuit_path}') candidate_paths = [ os.path.join(arcuit_path, 'traject_config_eac_cpf.rb'), - os.path.join(arcuit_path, 'arcflow', 'traject_config_eac_cpf.rb'), + os.path.join(arcuit_path, 'lib', 'arcuit', 'traject', 'traject_config_eac_cpf.rb'), ] + searched_paths.extend(candidate_paths) for traject_config in candidate_paths: if os.path.exists(traject_config): - self.log.info(f'Found traject config via bundle show: {traject_config}') + self.log.info(f'✓ Using traject config from arcuit gem: {traject_config}') return traject_config - self.log.warning( - 'bundle show arcuit succeeded but traject_config_eac_cpf.rb ' - 'was not found in any expected location under the gem root' + self.log.debug( + ' traject_config_eac_cpf.rb not found in arcuit gem ' + '(checked root and lib/arcuit/traject/ subdirectory)' ) else: - self.log.debug('bundle show arcuit failed (gem not installed?)') + self.log.debug(' arcuit gem not found via bundle show') except Exception as e: - self.log.debug(f'Error running bundle show arcuit: {e}') - # Fall back to arcuit_dir if provided - if self.arcuit_dir: - candidate_paths = [ - os.path.join(self.arcuit_dir, 'traject_config_eac_cpf.rb'), - os.path.join(self.arcuit_dir, 'arcflow', 'traject_config_eac_cpf.rb'), - ] - for traject_config in candidate_paths: - if os.path.exists(traject_config): - self.log.info(f'Using traject config from arcuit_dir: {traject_config}') - return traject_config - self.log.warning( - 'arcuit_dir provided but traject_config_eac_cpf.rb was not found ' - 'in any expected location' + self.log.debug(f' Error checking for arcuit gem: {e}') + + # Try 3: example file in arcflow package (fallback for module usage without arcuit) + # We know exactly where this file is located - at the repo root + arcflow_package_dir = os.path.dirname(os.path.abspath(__file__)) + arcflow_repo_root = os.path.dirname(arcflow_package_dir) + traject_config = os.path.join(arcflow_repo_root, 'example_traject_config_eac_cpf.rb') + searched_paths.append(traject_config) + + if os.path.exists(traject_config): + self.log.info(f'✓ Using example traject config from arcflow: {traject_config}') + self.log.info( + ' Note: Using example config. For production, copy this file to your ' + 'arcuit gem or specify location with --arcuit-dir.' ) - # No config found - self.log.warning('Could not find traject config (bundle show arcuit failed and arcuit_dir not provided or invalid)') + return traject_config + + # No config found anywhere - show all paths searched + self.log.error('✗ Could not find traject_config_eac_cpf.rb in any of these locations:') + for i, path in enumerate(searched_paths, 1): + self.log.error(f' {i}. {path}') + self.log.error('') + self.log.error(' Add traject_config_eac_cpf.rb to your arcuit gem or specify with --arcuit-dir.') return None def index_creators(self, agents_dir, creator_ids, batch_size=100): """ Index creator XML files to Solr using traject. - + Args: agents_dir: Directory containing creator XML files creator_ids: List of creator IDs to index batch_size: Number of files to index per traject call (default: 100) - + Returns: int: Number of successfully indexed creators """ traject_config = self.find_traject_config() if not traject_config: return 0 - + indexed_count = 0 failed_count = 0 - + # Process in batches to avoid command line length limits total_batches = math.ceil(len(creator_ids) / batch_size) for i in range(0, len(creator_ids), batch_size): batch = creator_ids[i:i+batch_size] batch_num = (i // batch_size) + 1 - + # Build list of XML files for this batch xml_files = [f'{agents_dir}/{cid}.xml' for cid in batch] - + # Filter to only existing files existing_files = [f for f in xml_files if os.path.exists(f)] - + if not existing_files: self.log.warning(f' Batch {batch_num}/{total_batches}: No files found, skipping') continue - + try: cmd = [ 'bundle', 'exec', 'traject', @@ -960,16 +1194,16 @@ def index_creators(self, agents_dir, creator_ids, batch_size=100): '-i', 'xml', '-c', traject_config ] + existing_files - + self.log.info(f' Indexing batch {batch_num}/{total_batches}: {len(existing_files)} files') - + result = subprocess.run( cmd, cwd=self.arclight_dir, stderr=subprocess.PIPE, timeout=300 # 5 minute timeout per batch ) - + if result.returncode == 0: indexed_count += len(existing_files) self.log.info(f' Successfully indexed {len(existing_files)} creators') @@ -978,7 +1212,7 @@ def index_creators(self, agents_dir, creator_ids, batch_size=100): self.log.error(f' Traject failed with exit code {result.returncode}') if result.stderr: self.log.error(f' STDERR: {result.stderr.decode("utf-8")}') - + except subprocess.TimeoutExpired: self.log.error(f' Traject timed out for batch {batch_num}/{total_batches}') failed_count += len(existing_files) @@ -988,7 +1222,7 @@ def index_creators(self, agents_dir, creator_ids, batch_size=100): if failed_count > 0: self.log.warning(f'Creator indexing completed with errors: {indexed_count} succeeded, {failed_count} failed') - + return indexed_count @@ -1068,37 +1302,49 @@ def create_symlink(self, target_path, symlink_path, indent_size=0): self.log.info(f'{indent}{e}') return False - - def delete_ead(self, resource_id, ead_id, - xml_file_path, pdf_file_path, indent_size=0): + def delete_arclight_solr_record(self, solr_record_id, indent_size=0): indent = ' ' * indent_size - # delete from solr + try: response = requests.post( f'{self.solr_url}/update?commit=true', - json={'delete': {'id': ead_id}}, + json={'delete': {'id': solr_record_id}}, ) if response.status_code == 200: - self.log.info(f'{indent}Deleted EAD "{ead_id}" from ArcLight Solr.') - # delete related files after suscessful deletion from solr - for file_path in (xml_file_path, pdf_file_path): - try: - os.remove(file_path) - self.log.info(f'{indent}Deleted file {file_path}.') - except FileNotFoundError: - self.log.error(f'{indent}File {file_path} not found.') - - # delete symlink if exists - symlink_path = f'{os.path.dirname(xml_file_path)}/{resource_id}.xml' - try: - os.remove(symlink_path) - self.log.info(f'{indent}Deleted symlink {symlink_path}.') - except FileNotFoundError: - self.log.info(f'{indent}Symlink {symlink_path} not found.') + self.log.info(f'{indent}Deleted Solr record {solr_record_id}. from ArcLight Solr') + return True else: - self.log.error(f'{indent}Failed to delete EAD "{ead_id}" from Arclight Solr. Status code: {response.status_code}') + self.log.error( + f'{indent}Failed to delete Solr record {solr_record_id} from Arclight Solr. Status code: {response.status_code}') + return False except requests.exceptions.RequestException as e: - self.log.error(f'{indent}Error deleting EAD "{ead_id}" from ArcLight Solr: {e}') + self.log.error(f'{indent}Error deleting Solr record {solr_record_id} from ArcLight Solr: {e}') + + def delete_file(self, file_path, indent_size=0): + indent = ' ' * indent_size + + try: + os.remove(file_path) + self.log.info(f'{indent}Deleted file {file_path}.') + except FileNotFoundError: + self.log.error(f'{indent}File {file_path} not found.') + + def delete_ead(self, resource_id, ead_id, + xml_file_path, pdf_file_path, indent_size=0): + # delete from solr + deleted_solr_record = self.delete_arclight_solr_record(ead_id, indent_size=indent_size) + if deleted_solr_record: + self.delete_file(pdf_file_path, indent_size=indent_size) + self.delete_file(xml_file_path, indent_size=indent_size) + # delete symlink if exists + symlink_path = f'{os.path.dirname(xml_file_path)}/{resource_id}.xml' + self.delete_file(symlink_path, indent_size=indent_size) + + def delete_creator(self, file_path, solr_id, indent_size=0): + deleted_solr_record = self.delete_arclight_solr_record(solr_id, indent_size=indent_size) + if deleted_solr_record: + self.delete_file(file_path, indent_size=indent_size) + def save_config_file(self): @@ -1120,23 +1366,30 @@ def run(self): Run the ArcFlow process. """ self.log.info(f'ArcFlow process started (PID: {self.pid}).') - + # Update repositories (unless agents-only mode) if not self.agents_only: self.update_repositories() - + # Update collections/EADs (unless agents-only mode) if not self.agents_only: self.update_eads() - + # Update creator records (unless collections-only mode) if not self.collections_only: self.process_creators() - + + # processing deleted resources is not needed when + # force-update is set or modified_since is set to 0 + if self.force_update or int(self.last_updated.timestamp()) <= 0: + self.log.info('Skipping deleted record processing.') + else: + self.process_deleted_records() + self.save_config_file() self.log.info(f'ArcFlow process completed (PID: {self.pid}). Elapsed time: {time.strftime("%H:%M:%S", time.gmtime(int(time.time()) - self.start_time))}.') - + def main(): @@ -1156,7 +1409,11 @@ def main(): parser.add_argument( '--solr-url', required=True, - help='URL of the Solr core',) + help='URL of the ArcLight Solr core',) + parser.add_argument( + '--aspace-solr-url', + required=True, + help='URL of the ASpace Solr core',) parser.add_argument( '--traject-extra-config', default='', @@ -1177,22 +1434,35 @@ def main(): '--skip-creator-indexing', action='store_true', help='Generate creator XML files but skip Solr indexing (for testing)',) + parser.add_argument( + '--pdf-timeout-queued', + type=int, + default=300, + help='Timeout in seconds for PDF jobs stuck in "queued" status (default: 300 = 5 minutes)',) + parser.add_argument( + '--pdf-timeout-running', + type=int, + default=1800, + help='Timeout in seconds for PDF jobs in "running" status (default: 1800 = 30 minutes)',) args = parser.parse_args() - + + # Validate mutually exclusive flags if args.agents_only and args.collections_only: parser.error('Cannot use both --agents-only and --collections-only') - arcflow = ArcFlow( arclight_dir=args.arclight_dir, aspace_dir=args.aspace_dir, solr_url=args.solr_url, + aspace_solr_url=args.aspace_solr_url, traject_extra_config=args.traject_extra_config, force_update=args.force_update, agents_only=args.agents_only, collections_only=args.collections_only, arcuit_dir=args.arcuit_dir, - skip_creator_indexing=args.skip_creator_indexing) + skip_creator_indexing=args.skip_creator_indexing, + pdf_timeout_queued=args.pdf_timeout_queued, + pdf_timeout_running=args.pdf_timeout_running) arcflow.run() diff --git a/traject_config_eac_cpf.rb b/example_traject_config_eac_cpf.rb similarity index 59% rename from traject_config_eac_cpf.rb rename to example_traject_config_eac_cpf.rb index 62c9a5a..52d0050 100644 --- a/traject_config_eac_cpf.rb +++ b/example_traject_config_eac_cpf.rb @@ -4,7 +4,9 @@ # Persons, and Families) XML documents from ArchivesSpace archival_contexts endpoint. # # Usage: -# bundle exec traject -u $SOLR_URL -c traject_config_eac_cpf.rb /path/to/agents/*.xml +# bundle exec traject -u $SOLR_URL -c example_traject_config_eac_cpf.rb /path/to/agents/*.xml +# +# For production, copy this file to your arcuit gem as traject_config_eac_cpf.rb # # The EAC-CPF XML documents are retrieved directly from ArchivesSpace via: # /repositories/{repo_id}/archival_contexts/{agent_type}/{id}.xml @@ -20,6 +22,12 @@ # EAC-CPF namespace - used consistently throughout this config EAC_NS = { 'eac' => 'urn:isbn:1-931666-33-4' } +# Pattern matching arcflow's creator file naming: creator_{entity_type}_{id} +CREATOR_ID_PATTERN = /^creator_(corporate_entities|people|families)_\d+$/ + +# Entity types - SINGLE SOURCE OF TRUTH +ENTITY_TYPES = ['corporate_entities', 'people', 'families'] + settings do provide "solr.url", ENV['SOLR_URL'] || "http://localhost:8983/solr/blacklight-core" provide "solr_writer.commit_on_close", "true" @@ -36,77 +44,21 @@ context.clipboard[:is_creator] = true end -# Core identity field -# CRITICAL: The 'id' field is required by Solr's schema (uniqueKey) -# Must ensure this field is never empty or indexing will fail -# -# IMPORTANT: Real EAC-CPF from ArchivesSpace has empty#{p.text}
" }.join("\n") + # Preserve inline EAC markup inside#{p.inner_html}
" }.join("\n") accumulator << html end end @@ -210,26 +163,25 @@ accumulator << bioghist.map(&:text).join(' ') if bioghist.any? end -# Related agents (from cpfRelation elements) -to_field 'related_agents_ssim' do |record, accumulator| +# Related agents (from cpfRelation elements) for display parsing and debugging, stored as a single line +# "https://archivesspace-stage.library.illinois.edu/agents/corporate_entities/57|associative" +to_field 'related_agents_debug_ssim' do |record, accumulator| relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) relations.each do |rel| - # Get the related entity href/identifier href = rel['href'] || rel['xlink:href'] relation_type = rel['cpfRelationType'] - + if href - # Store as: "uri|type" for easy parsing later - accumulator << "#{href}|#{relation_type}" - elsif relation_entry = rel.xpath('eac:relationEntry', EAC_NS).first - # If no href, at least store the name - name = relation_entry.text - accumulator << "#{name}|#{relation_type}" if name + solr_id = aspace_uri_to_solr_id(href) + if solr_id + # Format: "solr_id|type" + accumulator << "#{solr_id}|#{relation_type || 'unknown'}" + end end end end -# Related agents - just URIs (for simpler queries) +# Related agents - ASpace URIs, in parallel array to match ids and types to_field 'related_agent_uris_ssim' do |record, accumulator| relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) relations.each do |rel| @@ -238,7 +190,31 @@ end end -# Relationship types +# Related agents - Parallel array of relationship ids to match relationship types and uris +to_field 'related_agent_ids_ssim' do |record, accumulator| + relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) + relations.each do |rel| + href = rel['href'] || rel['xlink:href'] + if href + solr_id = aspace_uri_to_solr_id(href) # CONVERT URI TO ID + accumulator << solr_id if solr_id + end + end +end + +# Related Agents - Parallel array of relationship types to match relationship ids and uris +to_field 'related_agent_relationship_types_ssim' do |record, accumulator| + relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) + relations.each do |rel| + href = rel['href'] || rel['xlink:href'] + if href + relation_type = rel['cpfRelationType'] || 'unknown' + accumulator << relation_type # NO deduplication - keeps array parallel + end + end +end + +# Relationship types used for faceting, to_field 'relationship_types_ssim' do |record, accumulator| relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) relations.each do |rel| @@ -248,7 +224,7 @@ end # Agent source URI (from original ArchivesSpace) -to_field 'agent_uri' do |record, accumulator| +to_field 'agent_uri_ssi' do |record, accumulator| # Try to extract from control section or otherRecordId other_id = record.xpath('//eac:control/eac:otherRecordId[@localType="archivesspace_uri"]', EAC_NS).first if other_id @@ -261,10 +237,10 @@ accumulator << Time.now.utc.iso8601 end -# Document type marker -to_field 'document_type' do |record, accumulator| - accumulator << 'creator' -end +# # Document type marker +# to_field 'document_type' do |record, accumulator| +# accumulator << 'creator' +# end # Log successful indexing each_record do |record, context| @@ -273,3 +249,27 @@ context.logger.info("Indexed creator: #{record_id.text}") end end + + + + +# Pattern matching arcflow's creator file naming: creator_{entity_type}_{id} +CREATOR_ID_PATTERN = /^creator_(#{ENTITY_TYPES.join('|')})_\d+$/ + +# Helper to build and validate creator IDs +def build_creator_id(entity_type, id_number) + creator_id = "creator_#{entity_type}_#{id_number}" + unless creator_id =~ CREATOR_ID_PATTERN + raise ArgumentError, "Invalid creator ID: #{creator_id} doesn't match pattern" + end + creator_id +end + +# Helper to convert ArchivesSpace URI to Solr creator ID +def aspace_uri_to_solr_id(uri) + return nil unless uri + # Match: /agents/{type}/{id} or https://.../agents/{type}/{id} + if uri =~ /agents\/(#{ENTITY_TYPES.join('|')})\/(\d+)/ + build_creator_id($1, $2) + end +end \ No newline at end of file