Documentation Index
Fetch the complete documentation index at: https://developer.kodexa.ai/llms.txt
Use this file to discover all available pages before exploring further.
Document metadata stores key-value properties that describe the document itself, such as title, author, creation date, or processing status. This is distinct from content node features, which are attached to specific nodes in the document tree.
Setting Individual Values
from kodexa_document import Document
# Example: attach document-level metadata one key at a time.
with Document() as doc:
    # Set individual metadata values
    doc.set_metadata("title", "Invoice #12345")
    doc.set_metadata("author", "Accounting System")
    doc.set_metadata("created_date", "2024-01-15")
    doc.set_metadata("document_type", "invoice")

    # Values can be strings, numbers, booleans, lists, or dicts
    doc.set_metadata("page_count", 3)
    doc.set_metadata("processed", True)
    doc.set_metadata("tags", ["financial", "q1-2024", "priority"])
    doc.set_metadata("source", {
        "system": "ERP",
        "id": "DOC-2024-001",
    })
# Example: pass the initial metadata as a dict when constructing the document.
with Document(metadata={
    "title": "Invoice #12345",
    "author": "Accounting System",
    "created_date": "2024-01-15",
}) as doc:
    # Metadata is set at creation time
    print(f"Title: {doc.get_metadata('title')}")
# Or replace all metadata at once
with Document() as doc:
    # Assigning to doc.metadata swaps in the whole mapping in one step.
    doc.metadata = {
        "title": "Updated Title",
        "version": "2.0",
    }
Reading Individual Values
# Example: read single metadata values from a document loaded from a KDDB file.
with Document.from_kddb("document.kddb") as doc:
    # Get specific metadata values
    title = doc.get_metadata("title")
    author = doc.get_metadata("author")
    print(f"Title: {title}")
    print(f"Author: {author}")

    # Non-existent keys return None
    missing = doc.get_metadata("nonexistent")
    print(f"Missing: {missing}")  # None
# Example: work with the complete metadata mapping of a loaded document.
with Document.from_kddb("document.kddb") as doc:
    # Access all metadata as a dict-like object
    metadata = doc.metadata

    # Iterate through all metadata
    for key in metadata:
        print(f"{key}: {metadata[key]}")

    # Check if key exists
    if "title" in metadata:
        print(f"Has title: {metadata['title']}")
Document Labels
Labels provide a simple way to categorize documents with string tags:
# Example: tag a document with string labels and query them.
with Document() as doc:
    # Add labels
    doc.add_label("invoice")
    doc.add_label("financial")
    doc.add_label("q1-2024")
    doc.add_label("processed")

    # Get all labels
    labels = doc.labels
    print(f"Labels: {labels}")  # ['invoice', 'financial', 'q1-2024', 'processed']

    # Check for specific label
    if "invoice" in doc.labels:
        print("This is an invoice")
Document Identity
Every document has a unique identifier and version:
# Example: inspect the identifier and version of a new document.
with Document() as doc:
    # UUID is auto-generated
    print(f"Document UUID: {doc.uuid}")

    # Version tracks document changes
    print(f"Version: {doc.version}")
Track the original source of the document:
# Example: record where the document came from at construction time.
with Document(source={
    "connector": "file-upload",
    "original_filename": "invoice_scan.pdf",
    "mime_type": "application/pdf",
    "size": 125000,
}) as doc:
    # Access source information (fields are read as attributes of doc.source)
    source = doc.source
    print(f"Original file: {source.original_filename}")
    print(f"MIME type: {source.mime_type}")
Document Classification
# Example: store a classification result, plus runner-up candidates, as metadata.
with Document() as doc:
    doc.set_metadata("document_type", "invoice")
    doc.set_metadata("confidence", 0.95)
    doc.set_metadata("classification_model", "doc-classifier-v2")
    doc.set_metadata("classification_timestamp", "2024-01-15T10:30:00Z")

    # Alternative classifications
    doc.set_metadata("classifications", [
        {"type": "invoice", "confidence": 0.95},
        {"type": "receipt", "confidence": 0.03},
        {"type": "contract", "confidence": 0.02},
    ])
Processing Status
# Example: record pipeline progress by overwriting a "status" metadata key.
with Document() as doc:
    # Track processing status
    doc.set_metadata("status", "pending")
    doc.set_metadata("created_at", "2024-01-15T10:00:00Z")

    # After processing
    doc.set_metadata("status", "complete")
    doc.set_metadata("processed_at", "2024-01-15T10:30:00Z")
    doc.set_metadata("processing_time_ms", 1500)
    doc.set_metadata("processor_version", "1.2.3")
Audit Trail
# Example: keep who/when audit fields and a change history in metadata.
with Document() as doc:
    doc.set_metadata("created_by", "user@example.com")
    doc.set_metadata("created_at", "2024-01-15T10:00:00Z")
    doc.set_metadata("modified_by", "admin@example.com")
    doc.set_metadata("modified_at", "2024-01-15T14:30:00Z")

    # The history is stored as a list with one dict per recorded action.
    doc.set_metadata("modification_history", [
        {
            "user": "user@example.com",
            "action": "created",
            "timestamp": "2024-01-15T10:00:00Z",
        },
        {
            "user": "admin@example.com",
            "action": "approved",
            "timestamp": "2024-01-15T14:30:00Z",
        },
    ])
| Aspect | Metadata | Node Features |
|---|---|---|
| Scope | Document-level | Per-node |
| Purpose | Document properties | Node-specific data |
| Examples | Title, author, status | Font, position, OCR confidence |
| Access | doc.metadata / doc.get_metadata() | node.get_feature() |
| Querying | Not queryable via selectors | Queryable via selectors |