Selectors provide a powerful XPath-like syntax for querying nodes within a Kodexa document. Use selectors to find specific content, filter by attributes, and navigate the document structure.

Basic Selectors

Select All Nodes

from kodexa_document import Document

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Select all nodes in the document
    all_nodes = doc.select("//*")
    print(f"Total nodes: {len(all_nodes)}")

    # Select the first match only
    first_node = doc.select_first("//*")
    if first_node:
        print(f"First node type: {first_node.type}")

Select by Node Type

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # All paragraphs
    paragraphs = doc.select("//paragraph")

    # All tables
    tables = doc.select("//table")

    # All pages
    pages = doc.select("//page")

    # First paragraph only
    first_para = doc.select_first("//paragraph")

    print(f"Found {len(paragraphs)} paragraphs")
    print(f"Found {len(tables)} tables")
    print(f"Found {len(pages)} pages")

Path Expressions

Absolute vs Descendant Paths

| Selector | Description |
| --- | --- |
| `//paragraph` | All paragraphs anywhere in the document |
| `/document/paragraph` | Paragraphs that are direct children of the root |
| `//section/paragraph` | Paragraphs that are direct children of any section |
| `//section//paragraph` | Paragraphs anywhere under any section |
with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Direct children of root
    root_children = doc.select("/document/*")

    # Paragraphs directly under sections
    section_paragraphs = doc.select("//section/paragraph")

    # Paragraphs anywhere under sections (including nested)
    all_section_paras = doc.select("//section//paragraph")

    # Cells in tables
    cells = doc.select("//table/row/cell")

Content Filtering

Contains Function

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Paragraphs containing specific text
    invoices = doc.select("//paragraph[contains(@content, 'invoice')]")

    # Case-sensitive search
    total_nodes = doc.select("//paragraph[contains(@content, 'Total:')]")

    # Search in any node type
    matches = doc.select("//*[contains(@content, 'important')]")

    print(f"Found {len(invoices)} paragraphs mentioning invoice")

Starts-With and Ends-With

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Content starting with specific text
    headers = doc.select("//paragraph[starts-with(@content, 'Section')]")

    # Content ending with specific text (if supported)
    # Note: Check SDK version for ends-with support

Tag Filtering

Select Tagged Nodes

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Nodes with any tag
    tagged = doc.select("//*[hasTag()]")

    # Nodes with a specific tag
    important = doc.select("//*[@tag='important']")

    # Nodes with tag matching a prefix
    invoice_fields = doc.select("//*[hasTag('invoice-')]")

    # Combine tag and type
    invoice_totals = doc.select("//paragraph[@tag='invoice-total']")

    print(f"Found {len(tagged)} tagged nodes")
    print(f"Found {len(important)} important nodes")

Get Tagged Node Details

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Find all nodes with invoice-total tag
    totals = doc.select("//*[@tag='invoice-total']")

    for node in totals:
        # Get the tag details
        tag = node.get_tag("invoice-total")
        if tag:
            value = tag.get("Value")
            confidence = tag.get("Confidence")
            print(f"Total: {value} (confidence: {confidence})")

Position Filtering

Index-Based Selection

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # First paragraph (1-indexed)
    first = doc.select("//paragraph[1]")

    # Second paragraph
    second = doc.select("//paragraph[2]")

    # Last paragraph
    last = doc.select("//paragraph[last()]")

    # First three paragraphs
    first_three = doc.select("//paragraph[position() <= 3]")

Variables in Selectors

Parameterized Queries

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Use variables in selectors
    variables = {"search_term": "invoice"}
    matches = doc.select(
        "//paragraph[contains(@content, $search_term)]",
        variables=variables
    )

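    # Multiple variables can be supplied in one dict. Note that the selector
    # below only references $tag_name; min_confidence is defined here but is
    # not referenced by the query.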
    variables = {
        "tag_name": "invoice-total",
        "min_confidence": 0.9
    }
    high_confidence = doc.select(
        "//*[@tag=$tag_name]",
        variables=variables
    )

    print(f"Found {len(matches)} matches")

Common Selector Patterns

Quick Reference

| Pattern | Selector | Description |
| --- | --- | --- |
| All nodes | `//*` | Every node in the document |
| By type | `//paragraph` | All paragraphs |
| Direct children | `//section/paragraph` | Paragraphs directly under sections |
| Any depth | `//section//paragraph` | Paragraphs anywhere in sections |
| First of type | `//paragraph[1]` | First paragraph |
| By tag | `//*[@tag='important']` | Tagged nodes |
| By content | `//paragraph[contains(@content, 'text')]` | Content search |
| Multiple conditions | `//paragraph[@tag='x'][contains(@content, 'y')]` | Combined filters |

Real-World Examples

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Extract all invoice line items
    line_items = doc.select("//row[@tag='line-item']")

    # Find vendor information
    vendor = doc.select_first("//paragraph[@tag='vendor-name']")

    # Get all amounts in a table
    amounts = doc.select("//table//cell[contains(@content, '$')]")

    # Find headers (paragraphs at start of sections)
    headers = doc.select("//section/paragraph[1]")

    # Get all nodes on page 1 (if page structure exists)
    page1_nodes = doc.select("//page[1]//*")

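    # Find paragraphs with high-confidence tags
    # (filter in Python after selecting). Note: a paragraph is added once for
    # each of its tags whose Confidence exceeds 0.9, so a paragraph with
    # several such tags appears more than once in the result.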
    tagged_paras = doc.select("//paragraph[hasTag()]")
    high_conf = [
        p for p in tagged_paras
        for tag in [p.get_tag(t) for t in p.get_tags()]
        if tag and tag.get("Confidence", 0) > 0.9
    ]

Working with Results

Iterating Results

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    paragraphs = doc.select("//paragraph")

    for para in paragraphs:
        # Access node properties
        print(f"Type: {para.type}")
        print(f"Content: {para.content}")
        print(f"UUID: {para.uuid}")

        # Check tags
        if para.has_tag("important"):
            print("  -> Important!")

        # Get features
        font = para.get_feature("style", "font-family")
        if font:
            print(f"  Font: {font.get_value()}")

Chaining Selections

with Document.from_kddb("document.kddb", inmemory=True) as doc:
    # Find all tables, then get cells from each
    tables = doc.select("//table")

    for table in tables:
        # Select within the context of this table node
        cells = table.select(".//cell")
        print(f"Table has {len(cells)} cells")

        # Get just the first row's cells
        first_row_cells = table.select("./row[1]/cell")

Performance Tips

For large documents, optimize your selectors:
  1. Be specific with paths (prefer `//table/row/cell` over `//*`).
  2. Use `select_first()` when you only need one result.
  3. Combine conditions in a single selector instead of running multiple queries.
  4. Cache selector results when you need to iterate over them more than once.
with Document.from_kddb("large_document.kddb", inmemory=True) as doc:
    # Good: Specific path
    cells = doc.select("//table/row/cell")

    # Less efficient: Broad search
    # all_nodes = doc.select("//*")

    # Good: Single result
    first = doc.select_first("//paragraph[@tag='title']")

    # Good: Combined conditions
    matches = doc.select("//paragraph[@tag='important'][contains(@content, 'urgent')]")

    # Less efficient: Multiple queries
    # tagged = doc.select("//paragraph[@tag='important']")
    # matches = [p for p in tagged if 'urgent' in p.content]