Skip to main content
External data provides a flexible key-value store for attaching arbitrary JSON data to a document. This is separate from the hierarchical content node structure and is ideal for storing processing results, cached computations, or any structured data that doesn’t fit into the node tree.

Overview

External data features:
  • Named Keys: Organize data under different keys (default key is “default”)
  • JSON Values: Store any JSON-serializable data structure
  • Document-Level: Data is attached to the document, not individual nodes
  • Persistence: Saved with the document in KDDB format

Setting External Data

Basic Usage

from kodexa_document import Document

with Document(inmemory=True) as doc:
    # Store data with the default key
    doc.set_external_data({
        "processed": True,
        "confidence": 0.95,
        "extracted_values": ["invoice_number", "total", "date"]
    })

    # Store data with a custom key
    doc.set_external_data({
        "model": "invoice-extractor-v2",
        "version": "1.0.0",
        "timestamp": "2024-01-15T10:30:00Z"
    }, key="processing_info")

Complex Data Structures

External data supports any JSON-serializable structure:
with Document(inmemory=True) as doc:
    # Store complex nested data
    doc.set_external_data({
        "extraction_results": {
            "invoice_number": {
                "value": "INV-2024-001",
                "confidence": 0.98,
                "source_node_uuid": "abc123"
            },
            "line_items": [
                {"description": "Widget A", "amount": 100.00},
                {"description": "Widget B", "amount": 250.00}
            ],
            "total": {
                "value": 350.00,
                "currency": "USD"
            }
        },
        "validation_errors": [],
        "processing_time_ms": 1234
    }, key="extraction")

Retrieving External Data

Get by Key

with Document.from_kddb("processed.kddb", inmemory=True) as doc:
    # Get data from default key
    default_data = doc.get_external_data()
    print(f"Processed: {default_data.get('processed')}")

    # Get data from a specific key
    processing_info = doc.get_external_data("processing_info")
    print(f"Model: {processing_info.get('model')}")

    # Non-existent keys return empty dict (and create the key)
    missing = doc.get_external_data("nonexistent")
    print(f"Missing data: {missing}")  # {}

List All Keys

with Document.from_kddb("processed.kddb", inmemory=True) as doc:
    # Get all external data keys
    keys = doc.get_external_data_keys()
    print(f"Available keys: {keys}")
    # e.g., ['default', 'processing_info', 'extraction']

    # Iterate through all external data
    for key in keys:
        data = doc.get_external_data(key)
        print(f"{key}: {data}")

Updating External Data

Setting external data with the same key replaces the previous value:
with Document(inmemory=True) as doc:
    # Initial data
    doc.set_external_data({"status": "pending"}, key="workflow")

    # Later, update the status
    doc.set_external_data({"status": "complete", "completed_at": "2024-01-15"}, key="workflow")

    # The old data is replaced entirely
    workflow = doc.get_external_data("workflow")
    print(workflow)  # {'status': 'complete', 'completed_at': '2024-01-15'}
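Note: External data replacement is atomic at the key level: setting a key swaps out the entire value stored under it, and existing fields are never merged automatically. To merge data, read the existing value first, merge in your changes, then write the result back.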

Merging Data

with Document(inmemory=True) as doc:
    # Set initial data
    doc.set_external_data({"field1": "value1"}, key="config")

    # Read, merge, and write back
    existing = doc.get_external_data("config")
    existing["field2"] = "value2"
    existing["field3"] = "value3"
    doc.set_external_data(existing, key="config")

    # Now contains all three fields
    config = doc.get_external_data("config")
    print(config)  # {'field1': 'value1', 'field2': 'value2', 'field3': 'value3'}

Use Cases

Caching Computed Results

Store expensive computation results to avoid recalculating:
import hashlib

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Check if we already computed embeddings
    cache = doc.get_external_data("embedding_cache")
    content_hash = hashlib.sha256(doc.content_node.get_all_content().encode()).hexdigest()

    if cache.get("content_hash") != content_hash:
        # Content changed, recompute embeddings
        embeddings = compute_embeddings(doc)  # Your function

        doc.set_external_data({
            "content_hash": content_hash,
            "embeddings": embeddings,
            "model": "text-embedding-3-small"
        }, key="embedding_cache")
    else:
        # Use cached embeddings
        embeddings = cache["embeddings"]

Storing Processing State

Track workflow state across multiple processing steps:
with Document(inmemory=True) as doc:
    # Initialize workflow state
    doc.set_external_data({
        "steps_completed": [],
        "current_step": "ocr",
        "started_at": "2024-01-15T10:00:00Z"
    }, key="workflow")

    # After OCR completes
    state = doc.get_external_data("workflow")
    state["steps_completed"].append("ocr")
    state["current_step"] = "extraction"
    doc.set_external_data(state, key="workflow")

    # After extraction completes
    state = doc.get_external_data("workflow")
    state["steps_completed"].append("extraction")
    state["current_step"] = "validation"
    doc.set_external_data(state, key="workflow")

Multi-Model Results

Store results from different ML models:
with Document(inmemory=True) as doc:
    # Store results from different models in separate keys
    doc.set_external_data({
        "classifications": [
            {"label": "invoice", "confidence": 0.95},
            {"label": "receipt", "confidence": 0.03}
        ]
    }, key="classifier_v1")

    doc.set_external_data({
        "entities": [
            {"type": "vendor", "value": "Acme Corp", "confidence": 0.92},
            {"type": "amount", "value": "$1,234.56", "confidence": 0.88}
        ]
    }, key="ner_model")

    doc.set_external_data({
        "summary": "Invoice from Acme Corp for $1,234.56",
        "key_points": ["Payment due in 30 days", "Net terms"]
    }, key="summarizer")

External Data vs Metadata

| Feature | External Data | Metadata |
| --- | --- | --- |
| Purpose | Processing results, cached data | Document properties |
| Structure | Named key-value pairs | Flat key-value |
| Typical Content | Complex nested JSON | Simple values |
| Access Pattern | By key name | By property name |
| Use Cases | Model outputs, caches, workflow state | Title, author, dates |