Getting Started

This guide covers the essential operations for working with Kodexa Documents: creating, loading, manipulating, and saving documents.

Installation

pip install kodexa-document

Initialization

from kodexa_document import Document

# Python SDK is ready to use immediately after import
# Use context managers for automatic cleanup
with Document() as doc:
    # Your code here
    pass

Creating Documents

Empty Document

Create a new document and build its structure:

from kodexa_document import Document

with Document() as doc:
    # Create the root node
    root = doc.create_node("document", "My Document")
    doc.content_node = root

    # Add child nodes
    section = doc.create_node("section", "Introduction", parent=root)
    para1 = doc.create_node("paragraph", "First paragraph.", parent=section)
    para2 = doc.create_node("paragraph", "Second paragraph.", parent=section)

    print(f"Created document with {len(root.get_children())} sections")

From Text

Automatically parse text into paragraphs:

text = """First paragraph of content.
Second paragraph with more details.
Third paragraph to conclude."""

with Document.from_text(text, separator="\n") as doc:
    paragraphs = doc.select("//paragraph")
    print(f"Created {len(paragraphs)} paragraphs from text")

With Metadata

Initialize documents with metadata:

with Document(metadata={
    "title": "Invoice Analysis",
    "author": "Processing System",
    "created": "2024-01-15"
}) as doc:
    title = doc.get_metadata("title")
    print(f"Document title: {title}")

Loading Documents

From KDDB File / Blob

# Load into memory for fast processing (creates a copy)
with Document.from_kddb("document.kddb", detached=True) as doc:
    print(f"Loaded document: {doc.uuid}")
    nodes = doc.select("//*")
    print(f"Total nodes: {len(nodes)}")

# Load from bytes (e.g., API response)
import requests

# Bound the wait and fail fast on HTTP errors so that an error response
# body is never handed to from_kddb() as if it were KDDB bytes
response = requests.get("https://api.example.com/documents/123", timeout=30)
response.raise_for_status()
with Document.from_kddb(response.content) as doc:
    print(f"Loaded from API: {doc.uuid}")

From JSON

json_data = '{"uuid": "...", "metadata": {"title": "Test"}}'
with Document.from_json(json_data) as doc:
    print(f"Loaded from JSON: {doc.uuid}")

Working with Content Nodes

Navigation

Traverse the document tree:

with Document.from_kddb("document.kddb") as doc:
    root = doc.content_node

    for child in root.get_children():
        parent = child.get_parent()       # Back to root
        siblings = child.get_siblings()   # Other children
        next_node = child.next_node()     # Next sibling
        depth = child.get_depth()         # Depth in tree

        print(f"Node type: {child.type}, depth: {depth}")

Content Access

Read and modify node content:

with Document() as doc:
    root = doc.create_node("document")
    doc.content_node = root

    para = doc.create_node("paragraph", "Initial content", parent=root)

    # Read content
    print(f"Content: {para.content}")

    # Update content
    para.content = "Updated content"

    # Multi-part content
    para.set_content_parts(["Part 1", "Part 2", "Part 3"])
    parts = para.get_content_parts()

    # Get all content from node and descendants
    all_text = root.get_all_content(separator=" ")

Querying with Selectors

Use XPath-like selectors to find nodes:

with Document.from_text("Para 1\nPara 2\nPara 3", separator="\n") as doc:
    # Select all nodes of a type
    all_paragraphs = doc.select("//paragraph")

    # Select first match only
    first_para = doc.select_first("//paragraph")

    # Filter by content
    matching = doc.select("//paragraph[contains(@content, 'Para 2')]")

    # Select tagged nodes
    tagged = doc.select("//*[@tag='important']")

    # Select with variables
    variables = {"search_term": "Para 1"}
    results = doc.select("//paragraph[contains(@content, $search_term)]", variables)

    print(f"Found {len(all_paragraphs)} paragraphs")

Common Selector Patterns

Selector	Description
`//*`	All nodes
`//paragraph`	All paragraphs
`//section/paragraph`	Direct child paragraphs of sections
`//paragraph[1]`	First paragraph
`//*[@tag='important']`	Nodes with ‘important’ tag
`//paragraph[contains(@content, 'text')]`	Paragraphs containing ‘text’

Adding Features

Attach metadata to nodes:

with Document() as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Styled text", parent=root)

    # Add features (type, name, value)
    para.add_feature("style", "font-family", "Arial")
    para.add_feature("style", "font-size", "12pt")
    para.add_feature("analysis", "word-count", 2)
    para.add_feature("position", "bbox", {"x": 100, "y": 200, "w": 300, "h": 50})

    # Retrieve features
    font = para.get_feature("style", "font-family")
    if font:
        print(f"Font: {font.get_value()}")

    # Get all features of a type
    style_features = para.get_features_of_type("style")

Adding Tags

Annotate nodes with tags:

with Document() as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Invoice total: $1,234.56", parent=root)

    # Simple tag
    para.tag("important")

    # Tag with confidence and value
    para.tag("invoice-total", confidence=0.95, value="$1,234.56")

    # Check for tags
    if para.has_tag("important"):
        print("This paragraph is marked as important")

    # Get tag details
    tag = para.get_tag("invoice-total")
    if tag:
        confidence = tag.get("Confidence")
        value = tag.get("Value")
        print(f"Invoice total: {value} (confidence: {confidence})")

    # List all tags
    all_tags = para.get_tags()

Saving Documents

To KDDB File / Blob

with Document() as doc:
    root = doc.create_node("document", "Content to save")
    doc.content_node = root

    # Save to file
    doc.save("output.kddb")

    # Get as bytes (for API responses)
    kddb_bytes = doc.to_kddb()

To JSON

with Document() as doc:
    root = doc.create_node("document", "Debug output")
    doc.content_node = root

    # Pretty-printed JSON
    json_str = doc.to_json(indent=2)
    print(json_str)

    # As dictionary
    doc_dict = doc.to_dict()

Memory Management

Both the Python and TypeScript SDKs are backed by native code (Go via CFFI for Python, WebAssembly for TypeScript). Closing documents properly prevents memory leaks.

# Python: Use context managers (recommended)
with Document() as doc:
    # Document automatically closed when exiting the block
    root = doc.create_node("document", "Safe content")

# Manual cleanup if needed
doc = Document()
try:
    # Work with document
    pass
finally:
    doc.close()

Complete Example

A full invoice processing workflow:

from kodexa_document import Document

def process_document():
    """Build, tag, query, and save a sample invoice document.

    Creates a document with a header section and a line-item section,
    tags every line item and the invoice total, prints the results of two
    selector queries, and saves the document to ``processed_invoice.kddb``.
    """
    # Single source of truth for the line items; the total below is derived
    # from this list so the two can never disagree
    invoice_lines = [
        ("Widget A", 100.00),
        ("Widget B", 250.00),
        ("Service Fee", 50.00),
    ]

    with Document() as doc:
        # Set document metadata
        doc.set_metadata("title", "Invoice Processing Result")
        doc.add_label("invoice")

        # Build document structure
        root = doc.create_node("document", "Invoice #12345")
        doc.content_node = root

        # Add header section
        header = doc.create_node("section", "Header", parent=root)
        doc.create_node("paragraph", "Vendor: Acme Corp", parent=header)
        doc.create_node("paragraph", "Date: 2024-01-15", parent=header)

        # Add line items
        items = doc.create_node("section", "Line Items", parent=root)

        for desc, amount in invoice_lines:
            item = doc.create_node("paragraph", f"{desc}: ${amount:.2f}", parent=items)
            item.add_feature("line-item", "amount", amount)
            item.tag("line-item", value=str(amount))

        # Add total, computed from the line items rather than hard-coded
        invoice_total = sum(amount for _, amount in invoice_lines)
        total = doc.create_node("paragraph", f"Total: ${invoice_total:.2f}", parent=root)
        total.tag("invoice-total", confidence=1.0, value=f"{invoice_total:.2f}")

        # Query the document
        line_items = doc.select("//*[@tag='line-item']")
        print(f"Found {len(line_items)} line items")

        total_node = doc.select_first("//*[@tag='invoice-total']")
        if total_node:
            print(f"Invoice total: {total_node.content}")

        # Save the result
        doc.save("processed_invoice.kddb")

if __name__ == "__main__":
    process_document()

Next Steps

As you become more familiar with the SDK, explore:

- Advanced XPath selectors for complex queries
- Processing steps for workflow tracking
- Integration with Kodexa Platform for cloud processing

Overview

Document Data

Document Structure

Structured Data

Change Management

Python SDK

Installation

Initialization

Creating Documents

Empty Document

From Text

With Metadata

Loading Documents

From KDDB File / Blob

From JSON

Working with Content Nodes

Navigation

Content Access

Querying with Selectors

Common Selector Patterns

Adding Features

Adding Tags

Saving Documents

To KDDB File / Blob

To JSON

Memory Management

Complete Example

Next Steps

Overview

Document Data

Document Structure

Structured Data

Change Management

Python SDK

​Installation

​Initialization

​Creating Documents

​Empty Document

​From Text

​With Metadata

​Loading Documents

​From KDDB File / Blob

​From JSON

​Working with Content Nodes

​Navigation

​Content Access

​Querying with Selectors

​Common Selector Patterns

​Adding Features

​Adding Tags

​Saving Documents

​To KDDB File / Blob

​To JSON

​Memory Management

​Complete Example

​Next Steps

Installation

Initialization

Creating Documents

Empty Document

From Text

With Metadata

Loading Documents

From KDDB File / Blob

From JSON

Working with Content Nodes

Navigation

Content Access

Querying with Selectors

Common Selector Patterns

Adding Features

Adding Tags

Saving Documents

To KDDB File / Blob

To JSON

Memory Management

Complete Example

Next Steps