Skip to main content
Document metadata stores key-value properties that describe the document itself, such as title, author, creation date, or processing status. Metadata is distinct from content node features, which are attached to specific nodes in the document tree.

Setting Metadata

Individual Values

from kodexa_document import Document

with Document(inmemory=True) as doc:
    # Set individual metadata values
    doc.set_metadata("title", "Invoice #12345")
    doc.set_metadata("author", "Accounting System")
    doc.set_metadata("created_date", "2024-01-15")
    doc.set_metadata("document_type", "invoice")

    # Values can be strings, numbers, booleans, lists, or dicts
    doc.set_metadata("page_count", 3)
    doc.set_metadata("processed", True)
    doc.set_metadata("tags", ["financial", "q1-2024", "priority"])
    doc.set_metadata("source", {
        "system": "ERP",
        "id": "DOC-2024-001"
    })

Bulk Metadata

with Document(inmemory=True, metadata={
    "title": "Invoice #12345",
    "author": "Accounting System",
    "created_date": "2024-01-15"
}) as doc:
    # Metadata is set at creation time
    print(f"Title: {doc.get_metadata('title')}")

# Or replace all metadata at once
with Document(inmemory=True) as doc:
    doc.metadata = {
        "title": "Updated Title",
        "version": "2.0"
    }

Reading Metadata

Individual Values

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Get specific metadata values
    title = doc.get_metadata("title")
    author = doc.get_metadata("author")

    print(f"Title: {title}")
    print(f"Author: {author}")

    # Non-existent keys return None
    missing = doc.get_metadata("nonexistent")
    print(f"Missing: {missing}")  # None

All Metadata

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Access all metadata as a dict-like object
    metadata = doc.metadata

    # Iterate through all metadata
    for key in metadata:
        print(f"{key}: {metadata[key]}")

    # Check if key exists
    if "title" in metadata:
        print(f"Has title: {metadata['title']}")

Document Labels

Labels provide a simple way to categorize documents with string tags:
with Document(inmemory=True) as doc:
    # Add labels
    doc.add_label("invoice")
    doc.add_label("financial")
    doc.add_label("q1-2024")
    doc.add_label("processed")

    # Get all labels
    labels = doc.labels
    print(f"Labels: {labels}")  # ['invoice', 'financial', 'q1-2024', 'processed']

    # Check for specific label
    if "invoice" in doc.labels:
        print("This is an invoice")

Document Identity

Every document has a unique identifier and version:
with Document(inmemory=True) as doc:
    # UUID is auto-generated
    print(f"Document UUID: {doc.uuid}")

    # Version tracks document changes
    print(f"Version: {doc.version}")

Source Metadata

Track the original source of the document:
with Document(inmemory=True, source={
    "connector": "file-upload",
    "original_filename": "invoice_scan.pdf",
    "mime_type": "application/pdf",
    "size": 125000
}) as doc:
    # Access source information
    source = doc.source
    print(f"Original file: {source.original_filename}")
    print(f"MIME type: {source.mime_type}")

Common Metadata Patterns

Document Classification

with Document(inmemory=True) as doc:
    doc.set_metadata("document_type", "invoice")
    doc.set_metadata("confidence", 0.95)
    doc.set_metadata("classification_model", "doc-classifier-v2")
    doc.set_metadata("classification_timestamp", "2024-01-15T10:30:00Z")

    # Alternative classifications
    doc.set_metadata("classifications", [
        {"type": "invoice", "confidence": 0.95},
        {"type": "receipt", "confidence": 0.03},
        {"type": "contract", "confidence": 0.02}
    ])

Processing Status

with Document(inmemory=True) as doc:
    # Track processing status
    doc.set_metadata("status", "pending")
    doc.set_metadata("created_at", "2024-01-15T10:00:00Z")

    # After processing
    doc.set_metadata("status", "complete")
    doc.set_metadata("processed_at", "2024-01-15T10:30:00Z")
    doc.set_metadata("processing_time_ms", 1500)
    doc.set_metadata("processor_version", "1.2.3")

Audit Trail

with Document(inmemory=True) as doc:
    doc.set_metadata("created_by", "creator@example.com")
    doc.set_metadata("created_at", "2024-01-15T10:00:00Z")
    doc.set_metadata("modified_by", "approver@example.com")
    doc.set_metadata("modified_at", "2024-01-15T14:30:00Z")
    doc.set_metadata("modification_history", [
        {
            "user": "creator@example.com",
            "action": "created",
            "timestamp": "2024-01-15T10:00:00Z"
        },
        {
            "user": "approver@example.com",
            "action": "approved",
            "timestamp": "2024-01-15T14:30:00Z"
        }
    ])

Metadata vs Node Features

| Aspect | Metadata | Node Features |
| --- | --- | --- |
| Scope | Document-level | Per-node |
| Purpose | Document properties | Node-specific data |
| Examples | Title, author, status | Font, position, OCR confidence |
| Access | `doc.metadata` / `doc.get_metadata()` | `node.get_feature()` |
| Querying | Not queryable via selectors | Queryable via selectors |