External data provides a flexible key-value store for attaching arbitrary JSON data to a document. This is separate from the hierarchical content node structure and is ideal for storing processing results, cached computations, or any structured data that doesn’t fit into the node tree.
Overview
External data features:
- Named Keys: Organize data under different keys (default key is “default”)
- JSON Values: Store any JSON-serializable data structure
- Document-Level: Data is attached to the document, not individual nodes
- Persistence: Saved with the document in KDDB format
Setting External Data
Basic Usage
from kodexa_document import Document
with Document(inmemory=True) as doc:
    # Store data with the default key
    doc.set_external_data({
        "processed": True,
        "confidence": 0.95,
        "extracted_values": ["invoice_number", "total", "date"]
    })

    # Store data with a custom key
    doc.set_external_data({
        "model": "invoice-extractor-v2",
        "version": "1.0.0",
        "timestamp": "2024-01-15T10:30:00Z"
    }, key="processing_info")
Complex Data Structures
External data supports any JSON-serializable structure:
with Document(inmemory=True) as doc:
    # Store complex nested data
    doc.set_external_data({
        "extraction_results": {
            "invoice_number": {
                "value": "INV-2024-001",
                "confidence": 0.98,
                "source_node_uuid": "abc123"
            },
            "line_items": [
                {"description": "Widget A", "amount": 100.00},
                {"description": "Widget B", "amount": 250.00}
            ],
            "total": {
                "value": 350.00,
                "currency": "USD"
            }
        },
        "validation_errors": [],
        "processing_time_ms": 1234
    }, key="extraction")
Retrieving External Data
Get by Key
with Document.from_kddb("processed.kddb", inmemory=True) as doc:
    # Get data from the default key
    default_data = doc.get_external_data()
    print(f"Processed: {default_data.get('processed')}")

    # Get data from a specific key
    processing_info = doc.get_external_data("processing_info")
    print(f"Model: {processing_info.get('model')}")

    # Non-existent keys return an empty dict (and create the key)
    missing = doc.get_external_data("nonexistent")
    print(f"Missing data: {missing}")  # {}
List All Keys
with Document.from_kddb("processed.kddb", inmemory=True) as doc:
    # Get all external data keys
    keys = doc.get_external_data_keys()
    print(f"Available keys: {keys}")
    # e.g., ['default', 'processing_info', 'extraction']

    # Iterate through all external data
    for key in keys:
        data = doc.get_external_data(key)
        print(f"{key}: {data}")
Updating External Data
Setting external data with the same key replaces the previous value:
with Document(inmemory=True) as doc:
    # Initial data
    doc.set_external_data({"status": "pending"}, key="workflow")

    # Later, update the status
    doc.set_external_data({"status": "complete", "completed_at": "2024-01-15"}, key="workflow")

    # The old data is replaced entirely
    workflow = doc.get_external_data("workflow")
    print(workflow)  # {'status': 'complete', 'completed_at': '2024-01-15'}
External data replacement is atomic at the key level. To merge data, read the existing value first, merge in your changes, then write back.
Merging Data
with Document(inmemory=True) as doc:
    # Set initial data
    doc.set_external_data({"field1": "value1"}, key="config")

    # Read, merge, and write back
    existing = doc.get_external_data("config")
    existing["field2"] = "value2"
    existing["field3"] = "value3"
    doc.set_external_data(existing, key="config")

    # Now contains all three fields
    config = doc.get_external_data("config")
    print(config)  # {'field1': 'value1', 'field2': 'value2', 'field3': 'value3'}
Use Cases
Caching Computed Results
Store expensive computation results to avoid recalculating:
import hashlib
with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Check if we already computed embeddings
    cache = doc.get_external_data("embedding_cache")
    content_hash = hashlib.sha256(doc.content_node.get_all_content().encode()).hexdigest()

    if cache.get("content_hash") != content_hash:
        # Content changed, recompute embeddings
        embeddings = compute_embeddings(doc)  # Your function
        doc.set_external_data({
            "content_hash": content_hash,
            "embeddings": embeddings,
            "model": "text-embedding-3-small"
        }, key="embedding_cache")
    else:
        # Use cached embeddings
        embeddings = cache["embeddings"]
Storing Processing State
Track workflow state across multiple processing steps:
with Document(inmemory=True) as doc:
    # Initialize workflow state
    doc.set_external_data({
        "steps_completed": [],
        "current_step": "ocr",
        "started_at": "2024-01-15T10:00:00Z"
    }, key="workflow")

    # After OCR completes
    state = doc.get_external_data("workflow")
    state["steps_completed"].append("ocr")
    state["current_step"] = "extraction"
    doc.set_external_data(state, key="workflow")

    # After extraction completes
    state = doc.get_external_data("workflow")
    state["steps_completed"].append("extraction")
    state["current_step"] = "validation"
    doc.set_external_data(state, key="workflow")
Multi-Model Results
Store results from different ML models:
with Document(inmemory=True) as doc:
    # Store results from different models in separate keys
    doc.set_external_data({
        "classifications": [
            {"label": "invoice", "confidence": 0.95},
            {"label": "receipt", "confidence": 0.03}
        ]
    }, key="classifier_v1")

    doc.set_external_data({
        "entities": [
            {"type": "vendor", "value": "Acme Corp", "confidence": 0.92},
            {"type": "amount", "value": "$1,234.56", "confidence": 0.88}
        ]
    }, key="ner_model")

    doc.set_external_data({
        "summary": "Invoice from Acme Corp for $1,234.56",
        "key_points": ["Payment due in 30 days", "Net terms"]
    }, key="summarizer")
| Feature | External Data | Metadata |
|---|---|---|
| Purpose | Processing results, cached data | Document properties |
| Structure | Named key-value pairs | Flat key-value |
| Typical Content | Complex nested JSON | Simple values |
| Access Pattern | By key name | By property name |
| Use Cases | Model outputs, caches, workflow state | Title, author, dates |