Working with Kodexa Documents
This guide covers the essential operations for working with Kodexa Documents: creating, loading, manipulating, and saving documents.

Creating Documents

Empty Document

Create a new document and build its structure:
from kodexa_document import Document

# Opening the document in a `with` block gives automatic cleanup on exit.
with Document(inmemory=True) as doc:
    # The root node is created first and installed as the content node.
    root_node = doc.create_node("document", "My Document")
    doc.content_node = root_node

    # One section under the root, holding two paragraphs.
    intro = doc.create_node("section", "Introduction", parent=root_node)
    for paragraph_text in (
        "This is the first paragraph.",
        "This is the second paragraph.",
    ):
        doc.create_node("paragraph", paragraph_text, parent=intro)

    section_count = len(root_node.get_children())
    print(f"Created document with {section_count} sections")

From Text

Automatically parse text into paragraphs:
# Three lines of text; separator="\n" below is what from_text is given to split on.
source_text = (
    "First paragraph of content.\n"
    "Second paragraph with more details.\n"
    "Third paragraph to conclude."
)

with Document.from_text(source_text, separator="\n", inmemory=True) as doc:
    paragraph_count = len(doc.select("//paragraph"))
    print(f"Created {paragraph_count} paragraphs from text")

With Metadata

Initialize documents with metadata:
# Metadata handed to the constructor up front.
initial_metadata = {
    "title": "Invoice Analysis",
    "author": "Processing System",
    "created": "2024-01-15",
}

with Document(inmemory=True, metadata=initial_metadata) as doc:
    # Read a single metadata value back by key
    doc_title = doc.get_metadata("title")
    print(f"Document title: {doc_title}")

Loading Documents

From KDDB File

Load an existing document:
kddb_path = "document.kddb"

# Detached, in-memory load: fast processing on a copy of the file.
with Document.from_kddb(kddb_path, inmemory=True, detached=True) as doc:
    print(f"Loaded document: {doc.uuid}")
    node_count = len(doc.select("//*"))
    print(f"Total nodes: {node_count}")

# Attached load (detached=False): edits go to the original file.
with Document.from_kddb(kddb_path, detached=False) as doc:
    doc.set_metadata("last_accessed", "2024-01-15")

From Bytes

Load from API responses or downloads:
import requests

# Example: Load from an API response
# A timeout keeps the call from hanging indefinitely on an unresponsive server.
response = requests.get("https://api.example.com/documents/123", timeout=30)
# Fail fast on HTTP errors so an error page is never parsed as KDDB data.
response.raise_for_status()
kddb_bytes = response.content

with Document.from_kddb(kddb_bytes, inmemory=True) as doc:
    print(f"Loaded document from API: {doc.uuid}")

From JSON

Load from JSON representation:
# JSON text for the document (the uuid here is a placeholder).
serialized_doc = '{"uuid": "...", "metadata": {"title": "Test"}}'

with Document.from_json(serialized_doc, inmemory=True) as doc:
    loaded_uuid = doc.uuid
    print(f"Loaded from JSON: {loaded_uuid}")

Working with Content Nodes

Traverse the document tree:
with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Start at the document's content node and walk its direct children.
    for node in doc.content_node.get_children():
        # Each node can reach its neighbours in the tree.
        parent_node = node.get_parent()       # back to the root
        sibling_nodes = node.get_siblings()   # the other children
        following = node.next_node()          # next sibling
        level = node.get_depth()              # depth in the tree

        print(f"Node type: {node.type}, depth: {level}")

Content Access

Read and modify node content:
with Document(inmemory=True) as doc:
    doc_root = doc.create_node("document")
    doc.content_node = doc_root
    paragraph = doc.create_node("paragraph", "Initial content", parent=doc_root)

    # A node's text is read through its `content` attribute ...
    current_text = paragraph.content
    print(f"Content: {current_text}")

    # ... and replaced by assigning to it.
    paragraph.content = "Updated content"

    # Content can also be set and read back as a list of parts.
    new_parts = ["Part 1", "Part 2", "Part 3"]
    paragraph.set_content_parts(new_parts)
    stored_parts = paragraph.get_content_parts()

    # Content of the root and its descendants, using a space as separator.
    combined_text = doc_root.get_all_content(separator=" ")

Querying with Selectors

Use XPath-like selectors to find nodes:
with Document.from_text("Para 1\nPara 2\nPara 3", separator="\n", inmemory=True) as doc:
    # Every node of a given type
    paragraph_nodes = doc.select("//paragraph")

    # Only the first match
    first_match = doc.select_first("//paragraph")

    # Nodes whose content contains a literal string
    containing_para_2 = doc.select("//paragraph[contains(@content, 'Para 2')]")

    # Nodes carrying a given tag
    important_nodes = doc.select("//*[@tag='important']")

    # Selector parameterised through a variables mapping ($search_term)
    containing_term = doc.select(
        "//paragraph[contains(@content, $search_term)]",
        {"search_term": "Para 1"},
    )

    paragraph_total = len(paragraph_nodes)
    print(f"Found {paragraph_total} paragraphs")

Common Selector Patterns

| Selector | Description |
| --- | --- |
| `//*` | All nodes |
| `//paragraph` | All paragraphs |
| `//section/paragraph` | Direct child paragraphs of sections |
| `//paragraph[1]` | First paragraph |
| `//*[@tag='important']` | Nodes with the 'important' tag |
| `//paragraph[contains(@content, 'text')]` | Paragraphs containing 'text' |

Adding Features

Attach features (typed name/value metadata) to nodes:
with Document(inmemory=True) as doc:
    root_node = doc.create_node("document")
    doc.content_node = root_node
    styled = doc.create_node("paragraph", "Styled text", parent=root_node)

    # Each feature is a (type, name, value) triple; values are not limited to strings.
    features_to_add = (
        ("style", "font-family", "Arial"),
        ("style", "font-size", "12pt"),
        ("analysis", "word-count", 2),
        ("position", "bbox", {"x": 100, "y": 200, "w": 300, "h": 50}),
    )
    for feature_type, feature_name, feature_value in features_to_add:
        styled.add_feature(feature_type, feature_name, feature_value)

    # Look up a single feature by type and name
    font_feature = styled.get_feature("style", "font-family")
    if font_feature:
        print(f"Font: {font_feature.get_value()}")

    # All features sharing one type
    style_only = styled.get_features_of_type("style")

    # Every feature on the node
    every_feature = styled.get_features()

Adding Tags

Annotate nodes with tags:
with Document(inmemory=True) as doc:
    root_node = doc.create_node("document")
    doc.content_node = root_node
    total_para = doc.create_node("paragraph", "Important invoice total: $1,234.56", parent=root_node)

    # A bare tag first, then one carrying a confidence and a value
    total_para.tag("important")
    total_para.tag("invoice-total", confidence=0.95, value="$1,234.56")

    # Presence check
    if total_para.has_tag("important"):
        print("This paragraph is marked as important")

    # Tag details are read from the returned object with .get() by key
    total_tag = total_para.get_tag("invoice-total")
    if total_tag:
        tag_confidence = total_tag.get("Confidence")
        tag_value = total_tag.get("Value")
        print(f"Invoice total: {tag_value} (confidence: {tag_confidence})")

    # Everything tagged on this node
    node_tags = total_para.get_tags()
    print(f"Tags: {node_tags}")

Saving Documents

To KDDB File

Save to the native format:
output_file = "output.kddb"

with Document(inmemory=True) as doc:
    content_root = doc.create_node("document", "Content to save")
    doc.content_node = content_root

    # Write the document out in the native KDDB format
    doc.save(output_file)

To Bytes

Export for API responses:
with Document(inmemory=True) as doc:
    content_root = doc.create_node("document", "API response")
    doc.content_node = content_root

    # Export the document as KDDB bytes
    kddb_bytes = doc.to_kddb()

    # The bytes can then be sent back from an API handler, for example:
    #     return Response(content=kddb_bytes, media_type="application/octet-stream")

To JSON

Export for debugging or interoperability:
with Document(inmemory=True) as doc:
    content_root = doc.create_node("document", "Debug output")
    doc.content_node = content_root

    # JSON text, indented by two spaces for readability
    pretty_json = doc.to_json(indent=2)
    print(pretty_json)

    # The same document as a Python dictionary
    as_dict = doc.to_dict()

Document Metadata

Setting Metadata

with Document(inmemory=True) as doc:
    # Values set one key at a time; strings, lists and dictionaries are used here.
    entries = {
        "title": "My Document",
        "author": "John Doe",
        "tags": ["invoice", "2024", "processed"],
        "config": {"threshold": 0.8, "model": "v2"},
    }
    for key, value in entries.items():
        doc.set_metadata(key, value)

    # The full metadata collection
    all_metadata = doc.metadata

Labels

Categorize documents:
with Document(inmemory=True) as doc:
    # Labels are added one at a time
    for label in ("invoice", "financial", "q1-2024"):
        doc.add_label(label)

    # Read back every label on the document
    current_labels = doc.labels
    print(f"Document labels: {current_labels}")

Error Handling

Handle common errors gracefully:
from kodexa_document import Document
from kodexa_document.errors import DocumentError, DocumentNotFoundError

try:
    with Document.from_kddb("missing.kddb", inmemory=True) as doc:
        paragraphs = doc.select("//paragraph")
except DocumentNotFoundError:
    # NOTE(review): presumably a subclass of DocumentError; if so it must stay
    # ahead of the DocumentError handler to be reachable. Confirm.
    print("Document file not found")
except DocumentError as err:
    print(f"Document error: {err}")
except RuntimeError as err:
    print(f"Runtime error (possibly closed document): {err}")

Complete Example

Here’s a full workflow combining the concepts:
from kodexa_document import Document

def process_document(output_path="processed_invoice.kddb"):
    """Build, tag, query, and save a small example invoice document.

    Creates an in-memory document with a header section, a line-items
    section and a total paragraph, tags the line items and the total,
    prints what the tag selectors find, then saves the document.

    Args:
        output_path: Destination passed to ``doc.save``. Defaults to
            "processed_invoice.kddb".
    """
    # (description, amount) pairs. The invoice total is derived from these so
    # the total paragraph cannot drift out of sync with the line items.
    line_item_data = [
        ("Widget A", 100.00),
        ("Widget B", 250.00),
        ("Service Fee", 50.00),
    ]
    invoice_total = sum(amount for _, amount in line_item_data)

    # Create a new document
    with Document(inmemory=True) as doc:
        # Set document metadata
        doc.set_metadata("title", "Invoice Processing Result")
        doc.set_metadata("processor", "kodexa-document-example")
        doc.add_label("invoice")

        # Build document structure
        root = doc.create_node("document", "Invoice #12345")
        doc.content_node = root

        # Add header section
        header = doc.create_node("section", "Header", parent=root)
        doc.create_node("paragraph", "Vendor: Acme Corp", parent=header)
        doc.create_node("paragraph", "Date: 2024-01-15", parent=header)

        # Add line items, each with features and a tag carrying its amount
        items = doc.create_node("section", "Line Items", parent=root)

        for i, (desc, amount) in enumerate(line_item_data):
            item = doc.create_node("paragraph", f"{desc}: ${amount:.2f}", parent=items)
            item.add_feature("line-item", "amount", amount)
            item.add_feature("line-item", "index", i)
            item.tag("line-item", value=str(amount))

        # Add the total, computed from the line items above
        total = doc.create_node("paragraph", f"Total: ${invoice_total:.2f}", parent=root)
        total.tag("invoice-total", confidence=1.0, value=f"{invoice_total:.2f}")
        total.add_feature("summary", "calculated", True)

        # Query the document by tag
        line_items = doc.select("//*[@tag='line-item']")
        print(f"Found {len(line_items)} line items")

        total_node = doc.select_first("//*[@tag='invoice-total']")
        if total_node:
            print(f"Invoice total: {total_node.content}")

        # Save the result
        doc.save(output_path)
        print("Document saved successfully")


if __name__ == "__main__":
    process_document()

Next Steps

  • Explore advanced selectors for complex queries
  • Learn about processing steps for workflow tracking
  • Integrate with Kodexa Platform for cloud processing