This reference covers the main public classes and methods in DocEX. For more details and advanced usage, see the Developer Guide.
The main entry point for all document and transport operations.
from docex import DocEX
# Setup DocEX configuration (one-time)
DocEX.setup(
database={'type': 'postgresql', 'postgres': {...}},
storage={'type': 's3', 's3': {...}},
multi_tenancy={'enabled': True, ...}
)
# Check if DocEX is initialized
if DocEX.is_initialized():
print("Configuration loaded")
# Check if DocEX is properly set up (read-only, no side effects)
if DocEX.is_properly_setup():
print("✅ DocEX is ready")
else:
errors = DocEX.get_setup_errors()
print(f"Issues: {errors}")
# Get default configuration
defaults = DocEX.get_defaults()

from docex import DocEX
from docex.context import UserContext
# Create an instance (with UserContext for multi-tenant)
user_context = UserContext(user_id='user123', tenant_id='acme_corp')
docEX = DocEX(user_context=user_context)
# List baskets
baskets = docEX.list_baskets()
# Create a basket
basket = docEX.create_basket('mybasket', description='My basket')
# Get a basket by name
basket = docEX.basket('mybasket')
# Get a basket by ID
basket = docEX.get_basket('bas_123...')
# List routes
routes = docEX.list_routes()
# Get a route
route = docEX.get_route('local_backup')

A container for related documents with its own storage configuration.
from docex.docbasket import DocBasket
# Create a basket
basket = DocBasket.create(
name='invoices',
description='Invoice documents',
storage_config={'type': 's3', 's3': {...}}
)
# Get basket by ID
basket = DocBasket.get('bas_123...')
# Find basket by name
basket = DocBasket.find_by_name('invoices')
# List all baskets
baskets = DocBasket.list()

# Add a document
doc = basket.add('path/to/file.txt', document_type='file', metadata={'source': 'example'})
# List documents (returns full Document objects)
docs = basket.list_documents(limit=100, offset=0, order_by='created_at', order_desc=True)
# Efficient document listing with metadata (NEW - lightweight, avoids N+1 queries)
documents = basket.list_documents_with_metadata(
columns=['id', 'name', 'document_type', 'status', 'created_at'],
filters={'document_type': 'invoice', 'status': 'RECEIVED'},
limit=100,
offset=0,
order_by='created_at',
order_desc=True
)
# Returns: [{'id': 'doc_123', 'name': 'invoice_001.pdf', ...}, ...]
# Count documents
count = basket.count_documents(status='RECEIVED')
# Find documents by metadata
docs = basket.find_documents_by_metadata(
metadata={'category': 'invoice'},
limit=50
)
# Get document by ID
doc = basket.get_document('doc_123...')
# Update document
updated_doc = basket.update_document('doc_123...', 'path/to/new_file.txt')
# Delete document
basket.delete_document('doc_123...')
# Get basket statistics
stats = basket.get_stats()
# Returns: {'document_count': 10, 'total_size': 1024000, ...}
# Delete basket
basket.delete()

Represents an individual document and its metadata.
# Get document details
print(doc.get_details())
# Get content
text = doc.get_content(mode='text')
# Get and update metadata
meta = doc.get_metadata()
doc.update_metadata({'my_key': 'my_value'})
# Get document operations
ops = doc.get_operations()
# Get metadata as a plain dictionary
meta_dict = doc.get_metadata_dict()

Represents a transport route for uploading, downloading, and managing files.
# Upload a document
result = route.upload_document(doc)
print(result.message)
# Download a file
result = route.download('remote_file.txt', 'local_file.txt')
print(result.message)
# List files
result = route.list_files()
print(result.details)

Processors transform or extract data from documents. You can create and register your own.
from docex.processors.base import BaseProcessor, ProcessingResult
from docex.document import Document
class MyTextProcessor(BaseProcessor):
def can_process(self, document: Document) -> bool:
return document.name.lower().endswith('.txt')
def process(self, document: Document) -> ProcessingResult:
text = self.get_document_text(document)
return ProcessingResult(success=True, content=text.upper())
# Register via CLI
# docex processor register --name MyTextProcessor --type content_processor --description "Uppercases text files" --config '{}'

Service for managing document metadata.
from docex.services.metadata_service import MetadataService
# Get metadata
meta = MetadataService().get_metadata(doc.id)
# Update metadata
MetadataService().update_metadata(doc.id, {'key': 'value'})

Context object for carrying user-specific information through DocEX operations. Required when multi-tenancy is enabled.
from docex.context import UserContext
from docex import DocEX
# Create user context (user_id is required)
user_context = UserContext(
user_id="user123", # Required
user_email="user@example.com",
tenant_id="acme_corp", # Required for multi-tenant setups
roles=["admin"],
attributes={"department": "finance"}
)
# Check if user has role
if user_context.has_role("admin"):
print("User is admin")
# Get user attribute
dept = user_context.get_attribute("department", default="unknown")
# Initialize DocEX with user context
docEX = DocEX(user_context=user_context)

Attributes:
- `user_id` (str, required): Unique identifier for the user
- `user_email` (str, optional): Email address of the user
- `tenant_id` (str, optional): Tenant identifier (required for multi-tenant)
- `roles` (List[str], optional): List of user roles for RBAC
- `attributes` (Dict, optional): Custom user attributes and settings
Track file movements and history for a document.
from docex.services.document_service import DocumentService
from docex.db.connection import Database
# Get file history for a document
db = Database()
service = DocumentService(db, basket_id)
history = service.get_file_history(doc.id)
for entry in history:
print(entry.original_path, entry.internal_path)

Check for and mark duplicate documents.
from docex.services.document_service import DocumentService
from docex.db.connection import Database
# Check for duplicates
db = Database()
service = DocumentService(db, basket_id)
info = service.check_for_duplicates(source, checksum)
print(info['is_duplicate'])
# Mark a document as duplicate
service.mark_as_duplicate(dup_id, orig_id)

Track document operations and their dependencies.
# Get document operations
ops = doc.get_operations()
for op in ops:
print(op['operation_type'], op['status'])
# Get operation dependencies
from docex.db.repository import BaseRepository
deps = BaseRepository.get_dependencies(operation_id)

Retrieve extended metadata for a document.
# Get all metadata as a plain dictionary
meta_dict = doc.get_metadata_dict()

DocEX supports multiple storage backends including filesystem and S3.
Default storage backend using local filesystem:
# Configured in config.yaml
storage:
type: filesystem
filesystem:
path: /path/to/storage

Amazon S3 storage backend with support for multiple credential sources:
# Configured in config.yaml
storage:
type: s3
s3:
bucket: docex-bucket
region: us-east-1
# Optional: credentials (can use env vars or IAM roles)
access_key: your-access-key
secret_key: your-secret-key
# Optional: prefix for organizing files
prefix: docex/

S3 Storage Features:
- Automatic retry on transient errors
- Support for environment variables and IAM roles
- Configurable timeouts and retry settings
- Presigned URL generation
- Prefix support for file organization
Example: Using S3 Storage
from docex import DocEX
# Configure S3 storage
docEX = DocEX()
# Create basket with S3 storage
basket = docEX.create_basket('mybasket', storage_config={
'type': 's3',
's3': {
'bucket': 'my-bucket',
'region': 'us-east-1',
'prefix': 'mybasket/'
}
})
# Add document (stored in S3)
doc = basket.add('path/to/file.pdf')

Handles explicit tenant provisioning for DocEX 3.0 multi-tenancy.
from docex.provisioning.tenant_provisioner import TenantProvisioner, TenantExistsError, InvalidTenantIdError
provisioner = TenantProvisioner()
# Check if tenant exists (always fresh query, no stale cache)
if provisioner.tenant_exists('acme_corp'):
print("Tenant exists")
# Validate tenant ID
provisioner.validate_tenant_id('acme_corp') # Raises InvalidTenantIdError if invalid
# Check if tenant is system tenant
if TenantProvisioner.is_system_tenant('_docex_system_'):
print("System tenant")
# Provision a new tenant
try:
tenant = provisioner.create(
tenant_id='acme_corp',
display_name='Acme Corporation',
created_by='admin',
isolation_strategy='schema' # 'schema' for PostgreSQL, 'database' for SQLite (auto-detected if None)
)
print(f"✅ Tenant provisioned: {tenant.tenant_id}")
except TenantExistsError:
print("Tenant already exists")
except InvalidTenantIdError as e:
print(f"Invalid tenant ID: {e}")

Methods:
- `tenant_exists(tenant_id: str, use_cache: bool = False) -> bool`: Check if tenant exists
- `validate_tenant_id(tenant_id: str) -> None`: Validate tenant ID format
- `create(tenant_id: str, display_name: str, created_by: str, isolation_strategy: Optional[str] = None) -> TenantRegistry`: Provision a new tenant
- `is_system_tenant(tenant_id: str) -> bool`: Check if tenant ID is a system tenant
Manages the bootstrap/system tenant for DocEX 3.0.
from docex.provisioning.bootstrap import BootstrapTenantManager
bootstrap_manager = BootstrapTenantManager()
# Check if bootstrap tenant is initialized
if bootstrap_manager.is_initialized():
print("✅ Bootstrap tenant ready")
else:
# Initialize bootstrap tenant (one-time setup)
tenant = bootstrap_manager.initialize(created_by='admin')
print(f"✅ Bootstrap tenant initialized: {tenant.tenant_id}")
# Get bootstrap tenant
bootstrap_tenant = bootstrap_manager.get_bootstrap_tenant()

Methods:
- `is_initialized() -> bool`: Check if bootstrap tenant is initialized
- `initialize(created_by: str = "system") -> TenantRegistry`: Initialize bootstrap tenant
- `get_bootstrap_tenant() -> Optional[TenantRegistry]`: Get bootstrap tenant record
DocEX uses a three-part S3 path structure for organization:
- Part A (Config Prefix): `{tenant_id}/{path_namespace}/{prefix}/` - From configuration
- Part B (Basket Path): `{basket_name}_{last_4_of_basket_id}/` - Set when basket is created
- Part C (Document Path): `{document_name}_{last_6_of_document_id}.{ext}` - Set when document is added
Example:
s3://my-bucket/acme_corp/finance_dept/production/invoice_raw_2c03/invoice_001_585d29.pdf
Storage in Database:
- Part A + Part B: Stored in `basket.storage_config['s3']['config_prefix']` and `basket.storage_config['s3']['basket_path']`
- Full path (A+B+C): Stored in `document.path`
See S3_PATH_STRUCTURE.md for detailed documentation.
- See the Developer Guide for advanced topics, configuration, and extensibility.
- See the Platform Integration Guide for platform integration patterns.
- See the Design Document for architecture and design principles.
- See the S3 Storage Troubleshooting Guide for S3-specific issues.
- See the S3 Path Structure Guide for S3 path organization.
