Skip to main content
Content nodes form the hierarchical structure of a Kodexa document. Each node represents a piece of content (text, a section, a page, etc.) and can have children, features, and tags.

Node Basics

Creating Nodes

from kodexa_document import Document

with Document(inmemory=True) as doc:
    # Create the root node: create_node(<node type>, <optional content>)
    root = doc.create_node("document", "My Document")
    # Register the root as the document's top-level content node
    doc.content_node = root

    # Create child nodes with content; `parent=` names the node each new
    # node is created under
    section = doc.create_node("section", "Introduction", parent=root)
    para1 = doc.create_node("paragraph", "First paragraph.", parent=section)
    para2 = doc.create_node("paragraph", "Second paragraph.", parent=section)

    # Create nodes without initial content (the content argument is omitted)
    table = doc.create_node("table", parent=root)
    row = doc.create_node("row", parent=table)
    cell = doc.create_node("cell", "Cell content", parent=row)

Node Types

Common node types used in Kodexa documents:
| Type | Description |
| --- | --- |
| document | Root node of the document |
| page | A page in the document |
| section | A logical section |
| paragraph | A paragraph of text |
| line | A line of text |
| word | An individual word |
| table | A table structure |
| row | A table row |
| cell | A table cell |
| image | An image element |

Node Content

Reading and Writing Content

with Document(inmemory=True) as doc:
    # Minimal tree: a root node with a single paragraph beneath it
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Initial content", parent=root)

    # Read content through the node's `content` attribute
    print(f"Content: {para.content}")

    # Update content by assigning to the same attribute
    para.content = "Updated content"

    # Content can be None: this node is created without a content argument
    empty_node = doc.create_node("section", parent=root)
    print(f"Empty content: {empty_node.content}")  # None

Content Parts

Nodes can have multiple content parts for complex text:
with Document(inmemory=True) as doc:
    # Minimal tree: a root node with one paragraph created without content
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", parent=root)

    # Set multiple content parts (passed as a list of strings)
    para.set_content_parts(["Hello ", "world", "!"])

    # Get content parts back as they were set
    parts = para.get_content_parts()
    print(f"Parts: {parts}")  # ['Hello ', 'world', '!']

    # Full content joins parts, in order, with nothing inserted between them
    print(f"Full: {para.content}")  # 'Hello world!'

Aggregated Content

Get all content from a node and its descendants:
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root

    # One section holding three paragraphs, created in reading order
    section = doc.create_node("section", parent=root)
    for text in ("First paragraph.", "Second paragraph.", "Third paragraph."):
        doc.create_node("paragraph", text, parent=section)

    # Collect the section's content together with its descendants' content,
    # joined with a single space
    all_text = section.get_all_content(separator=" ")
    print(f"All content: {all_text}")
    # "First paragraph. Second paragraph. Third paragraph."

Tree Navigation

Parent and Children

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Start from the document's top-level content node
    root = doc.content_node

    # Get all children (a sized collection; len() is applied below)
    children = root.get_children()
    print(f"Root has {len(children)} children")

    for child in children:
        # Get parent (back to root)
        parent = child.get_parent()
        print(f"Node type: {child.type}, parent type: {parent.type}")

        # Get this node's children (the root's grandchildren)
        grandchildren = child.get_children()
        print(f"  Has {len(grandchildren)} children")

Siblings

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Select the nodes to inspect; presumably "//paragraph" matches every
    # paragraph node in the document - confirm against the selector syntax
    paragraphs = doc.select("//paragraph")

    for para in paragraphs:
        # Get all siblings (nodes with same parent)
        siblings = para.get_siblings()
        print(f"Has {len(siblings)} siblings")

        # Navigate to next/previous sibling
        next_node = para.next_node()
        prev_node = para.previous_node()

        # NOTE(review): these truthiness checks assume a missing neighbour is
        # returned as a falsy value such as None - confirm against the API
        if next_node:
            print(f"Next: {next_node.type}")
        if prev_node:
            print(f"Previous: {prev_node.type}")

Depth and Position

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Walk every node matched by the "//*" selector
    for node in doc.select("//*"):
        depth = node.get_depth()  # depth in tree (root = 0)
        index = node.index  # index among siblings

        print(f"Type: {node.type}, Depth: {depth}, Index: {index}")

Node Features

Features are typed key-value metadata attached to individual nodes:
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Styled text", parent=root)

    # Add features (type, name, value). Values shown here are strings,
    # a dictionary, and a float.
    para.add_feature("style", "font-family", "Arial")
    para.add_feature("style", "font-size", "12pt")
    para.add_feature("style", "font-weight", "bold")
    # Bounding box keys match the "Spatial Data (Bounding Boxes)" example
    # (x / y / width / height) so both examples read the same way
    para.add_feature(
        "spatial",
        "bbox",
        {"x": 100, "y": 200, "width": 300, "height": 50},
    )
    para.add_feature("ocr", "confidence", 0.95)

    # Retrieve a specific feature by type and name
    font = para.get_feature("style", "font-family")
    if font:
        print(f"Font: {font.get_value()}")

    # Get all features of a type
    style_features = para.get_features_of_type("style")
    for f in style_features:
        print(f"  {f.name}: {f.get_value()}")

    # Get all features
    all_features = para.get_features()
    print(f"Total features: {len(all_features)}")

    # Check if feature exists
    has_bbox = para.has_feature("spatial", "bbox")
    print(f"Has bbox: {has_bbox}")

Node Tags

Tags annotate nodes with labels, optional confidence scores, and values:
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Invoice total: $1,234.56", parent=root)

    # Simple tag: just a label, no confidence or value
    para.tag("important")

    # Tag with confidence and value
    para.tag("invoice-total", confidence=0.95, value="$1,234.56")

    # Tag with additional data (here an explicit tag_uuid)
    para.tag("extracted-field",
        confidence=0.92,
        value="1234.56",
        tag_uuid="field-uuid-123"
    )

    # Check for tag
    if para.has_tag("important"):
        print("Node is marked important")

    # Get tag details
    # NOTE(review): details are read with capitalised keys ('Value',
    # 'Confidence') although they were set through lowercase keyword
    # arguments - confirm the key casing against the API
    tag = para.get_tag("invoice-total")
    if tag:
        print(f"Value: {tag.get('Value')}")
        print(f"Confidence: {tag.get('Confidence')}")

    # Get all tags
    all_tags = para.get_tags()
    print(f"Tags: {all_tags}")

    # Remove a tag by name
    para.remove_tag("important")

Spatial Data (Bounding Boxes)

Nodes can have spatial information for document layout:
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Text on page", parent=root)

    # Describe the bounding box first, then attach it as a spatial feature
    bounds = {
        "x": 100,  # left position
        "y": 200,  # top position
        "width": 300,
        "height": 50,
    }
    para.add_feature("spatial", "bbox", bounds)

    # Record which page the node sits on
    para.add_feature("spatial", "page", 0)

    # Read the bounding box back; skip the report if the lookup is falsy
    bbox = para.get_feature("spatial", "bbox")
    if bbox:
        box = bbox.get_value()
        print(f"Position: ({box['x']}, {box['y']})")
        print(f"Size: {box['width']} x {box['height']}")

Building Document Structures

Invoice Example

from kodexa_document import Document

def build_invoice_structure():
    """Build and return an in-memory example invoice document.

    The document has a root node, a header section with tagged vendor and
    date paragraphs, a table with one row per line item (description and
    amount cells, both tagged), and a tagged total paragraph.

    Returns:
        The populated ``Document``. It is handed back without its context
        manager having exited, so the caller owns it and must release it
        when finished (presumably via the document's close method or its
        context-manager exit - confirm against the Document API).
    """
    # Function-scope import keeps this example self-contained.
    from contextlib import ExitStack

    with ExitStack() as stack:
        # Enter the document through the stack rather than a plain `with`:
        # returning from inside `with Document(...)` would exit the context
        # manager before the caller ever received the document.
        doc = stack.enter_context(Document(inmemory=True))

        # Document root
        root = doc.create_node("document", "Invoice #12345")
        doc.content_node = root

        # Header section
        header = doc.create_node("section", "Header", parent=root)

        vendor = doc.create_node("paragraph", "Vendor: Acme Corp", parent=header)
        vendor.tag("vendor-name", value="Acme Corp", confidence=0.98)

        date = doc.create_node("paragraph", "Date: 2024-01-15", parent=header)
        date.tag("invoice-date", value="2024-01-15", confidence=0.95)

        # Line items table: one row per (description, amount) pair
        table = doc.create_node("table", parent=root)

        items = [
            ("Widget A", "100.00"),
            ("Widget B", "250.00"),
            ("Service Fee", "50.00"),
        ]

        for desc, amount in items:
            row = doc.create_node("row", parent=table)

            desc_cell = doc.create_node("cell", desc, parent=row)
            desc_cell.tag("line-item-description")

            # The cell shows the amount with a currency sign; the tag keeps
            # the bare numeric string
            amt_cell = doc.create_node("cell", f"${amount}", parent=row)
            amt_cell.tag("line-item-amount", value=amount, confidence=0.92)

        # Total
        total = doc.create_node("paragraph", "Total: $400.00", parent=root)
        total.tag("invoice-total", value="400.00", confidence=0.99)

        # Success: drop the pending exit callback so the document is not
        # released here. If anything above raised, the stack still exits
        # the document's context manager on the way out.
        stack.pop_all()
        return doc