diff --git a/README.md b/README.md index 3eac054..710ddc4 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,22 @@ This is a **one-time setup** per Solr instance. --- -To index creator documents to Solr: +### Traject Configuration for Creator Indexing + +The `traject_config_eac_cpf.rb` file defines how EAC-CPF creator records are mapped to Solr fields. + +**Search Order**: arcflow searches for the traject config following the collection records pattern: +1. **arcuit_dir parameter** (if provided via `--arcuit-dir`) - Highest priority, most up-to-date user control +2. **arcuit gem** (via `bundle show arcuit`) - For backward compatibility when arcuit_dir not provided +3. **example_traject_config_eac_cpf.rb** in arcflow - Fallback for module usage without arcuit + +**Example File**: arcflow includes `example_traject_config_eac_cpf.rb` as a reference implementation. For production: +- Copy this file to your arcuit gem as `traject_config_eac_cpf.rb`, or +- Specify the location with `--arcuit-dir /path/to/arcuit` + +**Logging**: arcflow clearly logs which traject config file is being used when creator indexing runs. + +To index creator documents to Solr manually: ```bash bundle exec traject \ diff --git a/arcflow/main.py b/arcflow/main.py index 18875ed..50f7583 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -965,14 +965,15 @@ def process_creators(self): self.log.info(f'{indent}Indexing {len(creator_ids)} creator records to Solr...') traject_config = self.find_traject_config() if traject_config: + self.log.info(f'{indent}Using traject config: {traject_config}') indexed = self.index_creators(agents_dir, creator_ids) self.log.info(f'{indent}Creator indexing complete: {indexed}/{len(creator_ids)} indexed') else: - self.log.info(f'{indent}Skipping creator indexing (traject config not found)') + self.log.warning(f'{indent}Skipping creator indexing (traject config not found)') self.log.info(f'{indent}To index manually:') self.log.info(f'{indent} cd {self.arclight_dir}') self.log.info(f'{indent} bundle exec traject -u {self.solr_url} -i xml \\') - self.log.info(f'{indent} -c /path/to/arcuit/arcflow/traject_config_eac_cpf.rb \\') + self.log.info(f'{indent} -c /path/to/arcuit-gem/traject_config_eac_cpf.rb \\') self.log.info(f'{indent} {agents_dir}/*.xml') elif self.skip_creator_indexing: self.log.info(f'{indent}Skipping creator indexing (--skip-creator-indexing flag set)') @@ -984,15 +985,32 @@ def find_traject_config(self): """ Find the traject config for creator indexing. - Tries: - 1. bundle show arcuit (finds installed gem) - 2. self.arcuit_dir (explicit path) - 3. Returns None if neither works + Search order (follows collection records pattern): + 1. arcuit_dir if provided (most up-to-date user control) + 2. arcuit gem via bundle show (for backward compatibility) + 3. example_traject_config_eac_cpf.rb in arcflow (fallback when used as module without arcuit) Returns: str: Path to traject config, or None if not found """ - # Try bundle show arcuit first + self.log.info('Searching for traject_config_eac_cpf.rb...') + searched_paths = [] + + # Try 1: arcuit_dir if provided (highest priority - user's explicit choice) + if self.arcuit_dir: + self.log.debug(f' Checking arcuit_dir parameter: {self.arcuit_dir}') + candidate_paths = [ + os.path.join(self.arcuit_dir, 'traject_config_eac_cpf.rb'), + os.path.join(self.arcuit_dir, 'lib', 'arcuit', 'traject', 'traject_config_eac_cpf.rb'), + ] + searched_paths.extend(candidate_paths) + for traject_config in candidate_paths: + if os.path.exists(traject_config): + self.log.info(f'✓ Using traject config from arcuit_dir: {traject_config}') + return traject_config + self.log.debug(' traject_config_eac_cpf.rb not found in arcuit_dir') + + # Try 2: bundle show arcuit (for backward compatibility when arcuit_dir not provided) try: result = subprocess.run( ['bundle', 'show', 'arcuit'], @@ -1003,39 +1021,46 @@ def find_traject_config(self): ) if result.returncode == 0: arcuit_path = result.stdout.strip() - # Prefer config at gem root, fall back to legacy subdirectory layout + self.log.debug(f' Found arcuit gem at: {arcuit_path}') candidate_paths = [ os.path.join(arcuit_path, 'traject_config_eac_cpf.rb'), - os.path.join(arcuit_path, 'arcflow', 'traject_config_eac_cpf.rb'), + os.path.join(arcuit_path, 'lib', 'arcuit', 'traject', 'traject_config_eac_cpf.rb'), ] + searched_paths.extend(candidate_paths) for traject_config in candidate_paths: if os.path.exists(traject_config): - self.log.info(f'Found traject config via bundle show: {traject_config}') + self.log.info(f'✓ Using traject config from arcuit gem: {traject_config}') return traject_config - self.log.warning( - 'bundle show arcuit succeeded but traject_config_eac_cpf.rb ' - 'was not found in any expected location under the gem root' + self.log.debug( + ' traject_config_eac_cpf.rb not found in arcuit gem ' + '(checked root and lib/arcuit/traject/ subdirectory)' ) else: - self.log.debug('bundle show arcuit failed (gem not installed?)') + self.log.debug(' arcuit gem not found via bundle show') except Exception as e: - self.log.debug(f'Error running bundle show arcuit: {e}') - # Fall back to arcuit_dir if provided - if self.arcuit_dir: - candidate_paths = [ - os.path.join(self.arcuit_dir, 'traject_config_eac_cpf.rb'), - os.path.join(self.arcuit_dir, 'arcflow', 'traject_config_eac_cpf.rb'), - ] - for traject_config in candidate_paths: - if os.path.exists(traject_config): - self.log.info(f'Using traject config from arcuit_dir: {traject_config}') - return traject_config - self.log.warning( - 'arcuit_dir provided but traject_config_eac_cpf.rb was not found ' - 'in any expected location' + self.log.debug(f' Error checking for arcuit gem: {e}') + + # Try 3: example file in arcflow package (fallback for module usage without arcuit) + # We know exactly where this file is located - at the repo root + arcflow_package_dir = os.path.dirname(os.path.abspath(__file__)) + arcflow_repo_root = os.path.dirname(arcflow_package_dir) + traject_config = os.path.join(arcflow_repo_root, 'example_traject_config_eac_cpf.rb') + searched_paths.append(traject_config) + + if os.path.exists(traject_config): + self.log.info(f'✓ Using example traject config from arcflow: {traject_config}') + self.log.info( + ' Note: Using example config. For production, copy this file to your ' + 'arcuit gem or specify location with --arcuit-dir.' ) - # No config found - self.log.warning('Could not find traject config (bundle show arcuit failed and arcuit_dir not provided or invalid)') + return traject_config + + # No config found anywhere - show all paths searched + self.log.error('✗ Could not find traject_config_eac_cpf.rb in any of these locations:') + for i, path in enumerate(searched_paths, 1): + self.log.error(f' {i}. {path}') + self.log.error('') + self.log.error(' Add traject_config_eac_cpf.rb to your arcuit gem or specify with --arcuit-dir.') return None diff --git a/traject_config_eac_cpf.rb b/example_traject_config_eac_cpf.rb similarity index 98% rename from traject_config_eac_cpf.rb rename to example_traject_config_eac_cpf.rb index deb9ac5..1fe97d0 100644 --- a/traject_config_eac_cpf.rb +++ b/example_traject_config_eac_cpf.rb @@ -4,7 +4,9 @@ # Persons, and Families) XML documents from ArchivesSpace archival_contexts endpoint. # # Usage: -# bundle exec traject -u $SOLR_URL -c traject_config_eac_cpf.rb /path/to/agents/*.xml +# bundle exec traject -u $SOLR_URL -c example_traject_config_eac_cpf.rb /path/to/agents/*.xml +# +# For production, copy this file to your arcuit gem as traject_config_eac_cpf.rb # # The EAC-CPF XML documents are retrieved directly from ArchivesSpace via: # /repositories/{repo_id}/archival_contexts/{agent_type}/{id}.xml