Selectors

Selectors provide a powerful XPath-like syntax for querying nodes within a Kodexa document. Use selectors to find specific content, filter by attributes, and navigate the document structure.

Basic Selectors

Select All Nodes

from kodexa_document import Document

with Document.from_kddb("document.kddb") as doc:
    # Select all nodes in the document
    all_nodes = doc.select("//*")
    print(f"Total nodes: {len(all_nodes)}")

    # Select the first match only
    first_node = doc.select_first("//*")
    if first_node:
        print(f"First node type: {first_node.type}")

Select by Node Type

with Document.from_kddb("document.kddb") as doc:
    # All paragraphs
    paragraphs = doc.select("//paragraph")

    # All tables
    tables = doc.select("//table")

    # All pages
    pages = doc.select("//page")

    # First paragraph only
    first_para = doc.select_first("//paragraph")

    print(f"Found {len(paragraphs)} paragraphs")
    print(f"Found {len(tables)} tables")
    print(f"Found {len(pages)} pages")

Path Expressions

Absolute vs Descendant Paths

Selector	Description
`//paragraph`	All paragraphs anywhere in document
`/document/paragraph`	Direct child paragraphs of root
`//section/paragraph`	Paragraphs that are direct children of any section
`//section//paragraph`	Paragraphs anywhere under any section

with Document.from_kddb("document.kddb") as doc:
    # Direct children of root
    root_children = doc.select("/document/*")

    # Paragraphs directly under sections
    section_paragraphs = doc.select("//section/paragraph")

    # Paragraphs anywhere under sections (including nested)
    all_section_paras = doc.select("//section//paragraph")

    # Cells in tables
    cells = doc.select("//table/row/cell")

Content Filtering

Contains Function

with Document.from_kddb("document.kddb") as doc:
    # Paragraphs containing specific text
    invoices = doc.select("//paragraph[contains(@content, 'invoice')]")

    # Case-sensitive search
    total_nodes = doc.select("//paragraph[contains(@content, 'Total:')]")

    # Search in any node type
    matches = doc.select("//*[contains(@content, 'important')]")

    print(f"Found {len(invoices)} paragraphs mentioning invoice")

Starts-With and Ends-With

with Document.from_kddb("document.kddb") as doc:
    # Content starting with specific text
    headers = doc.select("//paragraph[starts-with(@content, 'Section')]")

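    # Content ending with specific text (if supported)
    # Note: Check your SDK version for ends-with support before relying on it, e.g.:
    # footers = doc.select("//paragraph[ends-with(@content, 'Total')]")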

Tag Filtering

Select Tagged Nodes

with Document.from_kddb("document.kddb") as doc:
    # Nodes with any tag
    tagged = doc.select("//*[hasTag()]")

    # Nodes with a specific tag
    important = doc.select("//*[@tag='important']")

    # Nodes with tag matching a prefix
    invoice_fields = doc.select("//*[hasTag('invoice-')]")

    # Combine tag and type
    invoice_totals = doc.select("//paragraph[@tag='invoice-total']")

    print(f"Found {len(tagged)} tagged nodes")
    print(f"Found {len(important)} important nodes")

Get Tagged Node Details

with Document.from_kddb("document.kddb") as doc:
    # Find all nodes with invoice-total tag
    totals = doc.select("//*[@tag='invoice-total']")

    for node in totals:
        # Get the tag details
        tag = node.get_tag("invoice-total")
        if tag:
            value = tag.get("Value")
            confidence = tag.get("Confidence")
            print(f"Total: {value} (confidence: {confidence})")

Position Filtering

Index-Based Selection

with Document.from_kddb("document.kddb") as doc:
    # First paragraph (1-indexed)
    first = doc.select("//paragraph[1]")

    # Second paragraph
    second = doc.select("//paragraph[2]")

    # Last paragraph
    last = doc.select("//paragraph[last()]")

    # First three paragraphs
    first_three = doc.select("//paragraph[position() <= 3]")

Variables in Selectors

Parameterized Queries

with Document.from_kddb("document.kddb") as doc:
    # Use variables in selectors
    variables = {"search_term": "invoice"}
    matches = doc.select(
        "//paragraph[contains(@content, $search_term)]",
        variables=variables
    )

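    # Multiple variables (note: only $tag_name is referenced by the selector below;
    # min_confidence is defined here but is not used by it)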
    variables = {
        "tag_name": "invoice-total",
        "min_confidence": 0.9
    }
    high_confidence = doc.select(
        "//*[@tag=$tag_name]",
        variables=variables
    )

    print(f"Found {len(matches)} matches")

Common Selector Patterns

Quick Reference

Pattern	Selector	Description
All nodes	`//*`	Every node in document
By type	`//paragraph`	All paragraphs
Direct children	`//section/paragraph`	Paragraphs directly under sections
Any depth	`//section//paragraph`	Paragraphs anywhere in sections
First of type	`//paragraph[1]`	First paragraph
By tag	`//*[@tag='important']`	Tagged nodes
By content	`//paragraph[contains(@content, 'text')]`	Content search
Multiple conditions	`//paragraph[@tag='x'][contains(@content, 'y')]`	Combined filters

Real-World Examples

with Document.from_kddb("document.kddb") as doc:
    # Extract all invoice line items
    line_items = doc.select("//row[@tag='line-item']")

    # Find vendor information
    vendor = doc.select_first("//paragraph[@tag='vendor-name']")

    # Get all amounts in a table
    amounts = doc.select("//table//cell[contains(@content, '$')]")

    # Find headers (paragraphs at start of sections)
    headers = doc.select("//section/paragraph[1]")

    # Get all nodes on page 1 (if page structure exists)
    page1_nodes = doc.select("//page[1]//*")

    # Find paragraphs with high-confidence tags
    # (filter in Python after selecting; note that a paragraph is listed once
    # for each of its tags that passes the confidence check)
    tagged_paras = doc.select("//paragraph[hasTag()]")
    high_conf = [
        p for p in tagged_paras
        for tag in [p.get_tag(t) for t in p.get_tags()]
        if tag and tag.get("Confidence", 0) > 0.9
    ]

Working with Results

Iterating Results

with Document.from_kddb("document.kddb") as doc:
    paragraphs = doc.select("//paragraph")

    for para in paragraphs:
        # Access node properties
        print(f"Type: {para.type}")
        print(f"Content: {para.content}")
        print(f"UUID: {para.uuid}")

        # Check tags
        if para.has_tag("important"):
            print("  -> Important!")

        # Get features
        font = para.get_feature("style", "font-family")
        if font:
            print(f"  Font: {font.get_value()}")

Chaining Selections

with Document.from_kddb("document.kddb") as doc:
    # Find all tables, then get cells from each
    tables = doc.select("//table")

    for table in tables:
        # Select within the context of this table node
        cells = table.select(".//cell")
        print(f"Table has {len(cells)} cells")

        # Get just the first row's cells
        first_row_cells = table.select("./row[1]/cell")

Performance Tips

For large documents, optimize your selectors:

- Be specific with paths (`//table/row/cell` rather than `//*`)
- Use `select_first()` when you only need one result
- Combine conditions in one selector rather than running multiple queries
- Cache selector results when iterating multiple times

with Document.from_kddb("large_document.kddb") as doc:
    # Good: Specific path
    cells = doc.select("//table/row/cell")

    # Less efficient: Broad search
    # all_nodes = doc.select("//*")

    # Good: Single result
    first = doc.select_first("//paragraph[@tag='title']")

    # Good: Combined conditions
    matches = doc.select("//paragraph[@tag='important'][contains(@content, 'urgent')]")

    # Less efficient: broader query, then filtering in Python
    # tagged = doc.select("//paragraph[@tag='important']")
    # matches = [p for p in tagged if 'urgent' in p.content]

Overview

Document Data

Document Structure

Structured Data

Change Management

Python SDK

Basic Selectors

Select All Nodes

Select by Node Type

Path Expressions

Absolute vs Descendant Paths

Content Filtering

Contains Function

Starts-With and Ends-With

Tag Filtering

Select Tagged Nodes

Get Tagged Node Details

Position Filtering

Index-Based Selection

Variables in Selectors

Parameterized Queries

Common Selector Patterns

Quick Reference

Real-World Examples

Working with Results

Iterating Results

Chaining Selections

Performance Tips

Overview

Document Data

Document Structure

Structured Data

Change Management

Python SDK

​Basic Selectors

​Select All Nodes

​Select by Node Type

​Path Expressions

​Absolute vs Descendant Paths

​Content Filtering

​Contains Function

​Starts-With and Ends-With

​Tag Filtering

​Select Tagged Nodes

​Get Tagged Node Details

​Position Filtering

​Index-Based Selection

​Variables in Selectors

​Parameterized Queries

​Common Selector Patterns

​Quick Reference

​Real-World Examples

​Working with Results

​Iterating Results

​Chaining Selections

​Performance Tips

Basic Selectors

Select All Nodes

Select by Node Type

Path Expressions

Absolute vs Descendant Paths

Content Filtering

Contains Function

Starts-With and Ends-With

Tag Filtering

Select Tagged Nodes

Get Tagged Node Details

Position Filtering

Index-Based Selection

Variables in Selectors

Parameterized Queries

Common Selector Patterns

Quick Reference

Real-World Examples

Working with Results

Iterating Results

Chaining Selections

Performance Tips