Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,22 @@ This is a **one-time setup** per Solr instance.

---

To index creator documents to Solr:
### Traject Configuration for Creator Indexing

The `traject_config_eac_cpf.rb` file defines how EAC-CPF creator records are mapped to Solr fields.

**Search Order**: arcflow searches for the traject config following the collection records pattern:
1. **arcuit_dir parameter** (if provided via `--arcuit-dir`) - Highest priority, because it is the user's explicit choice
2. **arcuit gem** (via `bundle show arcuit`) - For backward compatibility when `arcuit_dir` is not provided
3. **example_traject_config_eac_cpf.rb** in arcflow - Fallback for module usage without arcuit

**Example File**: arcflow includes `example_traject_config_eac_cpf.rb` as a reference implementation. For production:
- Copy this file to your arcuit gem as `traject_config_eac_cpf.rb`, or
- Specify the location with `--arcuit-dir /path/to/arcuit`

**Logging**: arcflow clearly logs which traject config file is being used when creator indexing runs.

To index creator documents to Solr manually:

```bash
bundle exec traject \
Expand Down
85 changes: 55 additions & 30 deletions arcflow/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -965,14 +965,15 @@ def process_creators(self):
self.log.info(f'{indent}Indexing {len(creator_ids)} creator records to Solr...')
traject_config = self.find_traject_config()
if traject_config:
self.log.info(f'{indent}Using traject config: {traject_config}')
indexed = self.index_creators(agents_dir, creator_ids)
self.log.info(f'{indent}Creator indexing complete: {indexed}/{len(creator_ids)} indexed')
else:
self.log.info(f'{indent}Skipping creator indexing (traject config not found)')
self.log.warning(f'{indent}Skipping creator indexing (traject config not found)')
self.log.info(f'{indent}To index manually:')
self.log.info(f'{indent} cd {self.arclight_dir}')
self.log.info(f'{indent} bundle exec traject -u {self.solr_url} -i xml \\')
self.log.info(f'{indent} -c /path/to/arcuit/arcflow/traject_config_eac_cpf.rb \\')
self.log.info(f'{indent} -c /path/to/arcuit-gem/traject_config_eac_cpf.rb \\')
self.log.info(f'{indent} {agents_dir}/*.xml')
elif self.skip_creator_indexing:
self.log.info(f'{indent}Skipping creator indexing (--skip-creator-indexing flag set)')
Expand All @@ -984,15 +985,32 @@ def find_traject_config(self):
"""
Find the traject config for creator indexing.

Tries:
1. bundle show arcuit (finds installed gem)
2. self.arcuit_dir (explicit path)
3. Returns None if neither works
Search order (follows collection records pattern):
1. arcuit_dir if provided (most up-to-date user control)
2. arcuit gem via bundle show (for backward compatibility)
3. example_traject_config_eac_cpf.rb in arcflow (fallback when used as module without arcuit)

Returns:
str: Path to traject config, or None if not found
"""
# Try bundle show arcuit first
self.log.info('Searching for traject_config_eac_cpf.rb...')
searched_paths = []

# Try 1: arcuit_dir if provided (highest priority - user's explicit choice)
if self.arcuit_dir:
self.log.debug(f' Checking arcuit_dir parameter: {self.arcuit_dir}')
candidate_paths = [
os.path.join(self.arcuit_dir, 'traject_config_eac_cpf.rb'),
os.path.join(self.arcuit_dir, 'lib', 'arcuit', 'traject', 'traject_config_eac_cpf.rb'),
]
searched_paths.extend(candidate_paths)
for traject_config in candidate_paths:
if os.path.exists(traject_config):
self.log.info(f'✓ Using traject config from arcuit_dir: {traject_config}')
return traject_config
self.log.debug(' traject_config_eac_cpf.rb not found in arcuit_dir')

# Try 2: bundle show arcuit (for backward compatibility when arcuit_dir not provided)
try:
result = subprocess.run(
['bundle', 'show', 'arcuit'],
Expand All @@ -1003,39 +1021,46 @@ def find_traject_config(self):
)
if result.returncode == 0:
arcuit_path = result.stdout.strip()
# Prefer config at gem root, fall back to legacy subdirectory layout
self.log.debug(f' Found arcuit gem at: {arcuit_path}')
candidate_paths = [
os.path.join(arcuit_path, 'traject_config_eac_cpf.rb'),
os.path.join(arcuit_path, 'arcflow', 'traject_config_eac_cpf.rb'),
os.path.join(arcuit_path, 'lib', 'arcuit', 'traject', 'traject_config_eac_cpf.rb'),
]
searched_paths.extend(candidate_paths)
for traject_config in candidate_paths:
if os.path.exists(traject_config):
self.log.info(f'Found traject config via bundle show: {traject_config}')
self.log.info(f'✓ Using traject config from arcuit gem: {traject_config}')
return traject_config
self.log.warning(
'bundle show arcuit succeeded but traject_config_eac_cpf.rb '
'was not found in any expected location under the gem root'
self.log.debug(
' traject_config_eac_cpf.rb not found in arcuit gem '
'(checked root and lib/arcuit/traject/ subdirectory)'
)
else:
self.log.debug('bundle show arcuit failed (gem not installed?)')
self.log.debug(' arcuit gem not found via bundle show')
except Exception as e:
self.log.debug(f'Error running bundle show arcuit: {e}')
# Fall back to arcuit_dir if provided
if self.arcuit_dir:
candidate_paths = [
os.path.join(self.arcuit_dir, 'traject_config_eac_cpf.rb'),
os.path.join(self.arcuit_dir, 'arcflow', 'traject_config_eac_cpf.rb'),
]
for traject_config in candidate_paths:
if os.path.exists(traject_config):
self.log.info(f'Using traject config from arcuit_dir: {traject_config}')
return traject_config
self.log.warning(
'arcuit_dir provided but traject_config_eac_cpf.rb was not found '
'in any expected location'
self.log.debug(f' Error checking for arcuit gem: {e}')

# Try 3: example file in arcflow package (fallback for module usage without arcuit)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot we know exactly where we intend to put the file within arcflow, so we don't need to search for it. Let's just look where we know it to be.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simplified in commit 43fc88c. Now directly checks the one known location at repo root instead of looping through candidate paths.

# We know exactly where this file is located - at the repo root
arcflow_package_dir = os.path.dirname(os.path.abspath(__file__))
arcflow_repo_root = os.path.dirname(arcflow_package_dir)
traject_config = os.path.join(arcflow_repo_root, 'example_traject_config_eac_cpf.rb')
searched_paths.append(traject_config)

if os.path.exists(traject_config):
self.log.info(f'✓ Using example traject config from arcflow: {traject_config}')
self.log.info(
' Note: Using example config. For production, copy this file to your '
'arcuit gem or specify location with --arcuit-dir.'
)
# No config found
self.log.warning('Could not find traject config (bundle show arcuit failed and arcuit_dir not provided or invalid)')
return traject_config

# No config found anywhere - show all paths searched
self.log.error('✗ Could not find traject_config_eac_cpf.rb in any of these locations:')
for i, path in enumerate(searched_paths, 1):
self.log.error(f' {i}. {path}')
self.log.error('')
self.log.error(' Add traject_config_eac_cpf.rb to your arcuit gem or specify with --arcuit-dir.')
return None


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
# Persons, and Families) XML documents from ArchivesSpace archival_contexts endpoint.
#
# Usage:
# bundle exec traject -u $SOLR_URL -c traject_config_eac_cpf.rb /path/to/agents/*.xml
# bundle exec traject -u $SOLR_URL -c example_traject_config_eac_cpf.rb /path/to/agents/*.xml
#
# For production, copy this file to your arcuit gem as traject_config_eac_cpf.rb
#
# The EAC-CPF XML documents are retrieved directly from ArchivesSpace via:
# /repositories/{repo_id}/archival_contexts/{agent_type}/{id}.xml
Expand Down
Loading