This guide covers the essential operations for working with Kodexa Documents: creating, loading, manipulating, and saving documents.

Installation

pip install kodexa-document

Initialization

from kodexa_document import Document

# No separate initialization call is needed: the SDK is usable right after import.
# A context manager is used so the document is closed automatically when the block exits.
with Document(inmemory=True) as doc:
    # Your code here
    pass

Creating Documents

Empty Document

Create a new document and build its structure:
from kodexa_document import Document

with Document(inmemory=True) as doc:
    # The top-level node is assigned as the document's content node
    root = doc.create_node("document", "My Document")
    doc.content_node = root

    # One section under the root, holding two paragraphs created in order
    intro = doc.create_node("section", "Introduction", parent=root)
    for paragraph_text in ("First paragraph.", "Second paragraph."):
        doc.create_node("paragraph", paragraph_text, parent=intro)

    print(f"Created document with {len(root.get_children())} sections")

From Text

Automatically parse text into paragraphs:
# Three lines of input; each line is expected to become its own paragraph
text = """First paragraph of content.
Second paragraph with more details.
Third paragraph to conclude."""

# separator="\n" is passed so the text is split on newlines
with Document.from_text(text, separator="\n", inmemory=True) as doc:
    # Select the paragraph nodes that were produced from the text
    paragraphs = doc.select("//paragraph")
    print(f"Created {len(paragraphs)} paragraphs from text")

With Metadata

Initialize documents with metadata:
# Metadata handed to the constructor can be read back from the document
initial_metadata = {
    "title": "Invoice Analysis",
    "author": "Processing System",
    "created": "2024-01-15",
}

with Document(inmemory=True, metadata=initial_metadata) as doc:
    title = doc.get_metadata("title")
    print(f"Document title: {title}")

Loading Documents

From KDDB File / Blob

# Load into memory for fast processing (creates a copy)
# NOTE(review): presumably detached=True is what keeps the on-disk file untouched — confirm
with Document.from_kddb("document.kddb", inmemory=True, detached=True) as doc:
    print(f"Loaded document: {doc.uuid}")
    # "//*" selects every node, so this counts the whole tree
    nodes = doc.select("//*")
    print(f"Total nodes: {len(nodes)}")

# Load from bytes (e.g., API response)
import requests

# requests applies no timeout by default, so bound the wait explicitly
# instead of letting a stalled server hang the script.
response = requests.get("https://api.example.com/documents/123", timeout=30)
# Raise on 4xx/5xx here rather than handing an error body to the KDDB loader
response.raise_for_status()
with Document.from_kddb(response.content, inmemory=True) as doc:
    print(f"Loaded from API: {doc.uuid}")

From JSON

# Placeholder payload: the "..." uuid stands in for a real document's JSON
json_data = '{"uuid": "...", "metadata": {"title": "Test"}}'
with Document.from_json(json_data, inmemory=True) as doc:
    print(f"Loaded from JSON: {doc.uuid}")

Working with Content Nodes

Traverse the document tree:
with Document.from_kddb("document.kddb", inmemory=True) as doc:
    root = doc.content_node
    top_level_nodes = root.get_children()

    # Each child can be navigated from in every direction
    for child in top_level_nodes:
        parent_node = child.get_parent()       # Back to root
        sibling_nodes = child.get_siblings()   # Other children
        following = child.next_node()          # Next sibling
        depth = child.get_depth()              # Depth in tree

        print(f"Node type: {child.type}, depth: {depth}")

Content Access

Read and modify node content:
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root

    para = doc.create_node("paragraph", "Initial content", parent=root)

    # Reading: the node's text is exposed as an attribute
    print(f"Content: {para.content}")

    # Writing: assign to the same attribute to replace the text
    para.content = "Updated content"

    # Content can also be set and read back as a list of parts
    content_parts = ["Part 1", "Part 2", "Part 3"]
    para.set_content_parts(content_parts)
    stored_parts = para.get_content_parts()

    # Gather the text of the root together with all of its descendants
    combined_text = root.get_all_content(separator=" ")

Querying with Selectors

Use XPath-like selectors to find nodes:
with Document.from_text("Para 1\nPara 2\nPara 3", separator="\n", inmemory=True) as doc:
    # Every node of one type
    all_paragraphs = doc.select("//paragraph")

    # Only the first match
    first_match = doc.select_first("//paragraph")

    # Nodes whose content contains a literal string
    content_matches = doc.select("//paragraph[contains(@content, 'Para 2')]")

    # Nodes carrying a given tag
    tagged_nodes = doc.select("//*[@tag='important']")

    # $search_term is supplied through a variables mapping
    variable_matches = doc.select(
        "//paragraph[contains(@content, $search_term)]",
        {"search_term": "Para 1"},
    )

    print(f"Found {len(all_paragraphs)} paragraphs")

Common Selector Patterns

| Selector | Description |
| --- | --- |
| `//*` | All nodes |
| `//paragraph` | All paragraphs |
| `//section/paragraph` | Direct child paragraphs of sections |
| `//paragraph[1]` | First paragraph |
| `//*[@tag='important']` | Nodes with ‘important’ tag |
| `//paragraph[contains(@content, 'text')]` | Paragraphs containing ‘text’ |

Adding Features

Attach metadata to nodes:
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Styled text", parent=root)

    # Each entry is (feature type, name, value); the values here are
    # two strings, a number, and a dict
    features_to_add = (
        ("style", "font-family", "Arial"),
        ("style", "font-size", "12pt"),
        ("analysis", "word-count", 2),
        ("position", "bbox", {"x": 100, "y": 200, "w": 300, "h": 50}),
    )
    for feature_type, feature_name, feature_value in features_to_add:
        para.add_feature(feature_type, feature_name, feature_value)

    # Look up a single feature by type and name
    font = para.get_feature("style", "font-family")
    if font:
        print(f"Font: {font.get_value()}")

    # Look up every feature that shares a type
    style_features = para.get_features_of_type("style")

Adding Tags

Annotate nodes with tags:
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Invoice total: $1,234.56", parent=root)

    # Simple tag: a name only
    para.tag("important")

    # Tag with a confidence score and a value attached to it
    para.tag("invoice-total", confidence=0.95, value="$1,234.56")

    # Check for a tag by name
    if para.has_tag("important"):
        print("This paragraph is marked as important")

    # Get tag details; the truthiness check skips the block when nothing is returned
    tag = para.get_tag("invoice-total")
    if tag:
        # NOTE(review): the keys are capitalized here while tag() takes lowercase
        # keyword arguments — confirm the key casing against the SDK
        confidence = tag.get("Confidence")
        value = tag.get("Value")
        print(f"Invoice total: {value} (confidence: {confidence})")

    # List all tags on this node
    all_tags = para.get_tags()

Saving Documents

To KDDB File / Blob

with Document(inmemory=True) as doc:
    root = doc.create_node("document", "Content to save")
    doc.content_node = root

    # Save to a KDDB file at the given path
    doc.save("output.kddb")

    # Get the same document as bytes instead (for API responses)
    kddb_bytes = doc.to_kddb()

To JSON

with Document(inmemory=True) as doc:
    root = doc.create_node("document", "Debug output")
    doc.content_node = root

    # Pretty-printed JSON string (indent=2), printed for inspection
    json_str = doc.to_json(indent=2)
    print(json_str)

    # As a dictionary rather than a string
    doc_dict = doc.to_dict()

Memory Management

Both the Python and TypeScript SDKs run on native code (Go via CFFI for Python, WebAssembly for TypeScript), so documents must be closed properly to prevent memory leaks.
# Python: Use context managers (recommended)
with Document(inmemory=True) as doc:
    # Document automatically closed when exiting the block
    root = doc.create_node("document", "Safe content")

# Manual cleanup if needed (when a with-block does not fit the code's structure)
doc = Document(inmemory=True)
try:
    # Work with document
    pass
finally:
    # finally guarantees close() runs even if the work above raises
    doc.close()

Complete Example

A full invoice processing workflow:
from kodexa_document import Document

def process_document() -> None:
    """Build, tag, query, and save a sample invoice document.

    Creates an in-memory document with a header section, a line-items
    section, and a total paragraph. Each line item is tagged ``line-item``
    and the total is tagged ``invoice-total``. The function then prints what
    the tag selectors find and saves the document to
    ``processed_invoice.kddb``.
    """
    # Single source of truth for the line items; the total below is derived
    # from it so the two cannot drift apart.
    line_item_rows = (
        ("Widget A", 100.00),
        ("Widget B", 250.00),
        ("Service Fee", 50.00),
    )
    invoice_total = sum(amount for _, amount in line_item_rows)

    with Document(inmemory=True) as doc:
        # Set document metadata
        doc.set_metadata("title", "Invoice Processing Result")
        doc.add_label("invoice")

        # Build document structure
        root = doc.create_node("document", "Invoice #12345")
        doc.content_node = root

        # Add header section
        header = doc.create_node("section", "Header", parent=root)
        doc.create_node("paragraph", "Vendor: Acme Corp", parent=header)
        doc.create_node("paragraph", "Date: 2024-01-15", parent=header)

        # Add line items: one paragraph each, with the amount stored both as
        # a feature (numeric) and as the tag value (string)
        items = doc.create_node("section", "Line Items", parent=root)
        for desc, amount in line_item_rows:
            item = doc.create_node("paragraph", f"{desc}: ${amount:.2f}", parent=items)
            item.add_feature("line-item", "amount", amount)
            item.tag("line-item", value=str(amount))

        # Add total, attached directly to the root rather than to a section
        total = doc.create_node("paragraph", f"Total: ${invoice_total:.2f}", parent=root)
        total.tag("invoice-total", confidence=1.0, value=f"{invoice_total:.2f}")

        # Query the document by tag
        line_items = doc.select("//*[@tag='line-item']")
        print(f"Found {len(line_items)} line items")

        total_node = doc.select_first("//*[@tag='invoice-total']")
        if total_node:
            print(f"Invoice total: {total_node.content}")

        # Save the result
        doc.save("processed_invoice.kddb")


if __name__ == "__main__":
    process_document()

Next Steps

As you become more familiar with the SDK, explore:
  • Advanced XPath selectors for complex queries
  • Processing steps for workflow tracking
  • Integration with Kodexa Platform for cloud processing