Documentation Index
Fetch the complete documentation index at: https://developer.kodexa.ai/llms.txt
Use this file to discover all available pages before exploring further.
Selectors provide a powerful XPath-like syntax for querying nodes within a Kodexa document. Use selectors to find specific content, filter by attributes, and navigate the document structure.
Basic Selectors
Select All Nodes
from kodexa_document import Document
with Document.from_kddb("document.kddb") as doc:
# Select all nodes in the document
all_nodes = doc.select("//*")
print(f"Total nodes: {len(all_nodes)}")
# Select the first match only
first_node = doc.select_first("//*")
if first_node:
print(f"First node type: {first_node.type}")
Select by Node Type
with Document.from_kddb("document.kddb") as doc:
# All paragraphs
paragraphs = doc.select("//paragraph")
# All tables
tables = doc.select("//table")
# All pages
pages = doc.select("//page")
# First paragraph only
first_para = doc.select_first("//paragraph")
print(f"Found {len(paragraphs)} paragraphs")
print(f"Found {len(tables)} tables")
print(f"Found {len(pages)} pages")
Path Expressions
Absolute vs Descendant Paths
| Selector | Description |
|---|---|
| //paragraph | All paragraphs anywhere in the document |
| /document/paragraph | Paragraphs that are direct children of the root |
| //section/paragraph | Paragraphs that are direct children of any section |
| //section//paragraph | Paragraphs anywhere under any section |
with Document.from_kddb("document.kddb") as doc:
# Direct children of root
root_children = doc.select("/document/*")
# Paragraphs directly under sections
section_paragraphs = doc.select("//section/paragraph")
# Paragraphs anywhere under sections (including nested)
all_section_paras = doc.select("//section//paragraph")
# Cells in tables
cells = doc.select("//table/row/cell")
Content Filtering
Contains Function
with Document.from_kddb("document.kddb") as doc:
# Paragraphs containing specific text
invoices = doc.select("//paragraph[contains(@content, 'invoice')]")
# Case-sensitive search
total_nodes = doc.select("//paragraph[contains(@content, 'Total:')]")
# Search in any node type
matches = doc.select("//*[contains(@content, 'important')]")
print(f"Found {len(invoices)} paragraphs mentioning invoice")
Starts-With and Ends-With
with Document.from_kddb("document.kddb") as doc:
# Content starting with specific text
headers = doc.select("//paragraph[starts-with(@content, 'Section')]")
# Content ending with specific text (if supported)
# Note: Check SDK version for ends-with support
Tag Filtering
Select Tagged Nodes
with Document.from_kddb("document.kddb") as doc:
# Nodes with any tag
tagged = doc.select("//*[hasTag()]")
# Nodes with a specific tag
important = doc.select("//*[@tag='important']")
# Nodes with tag matching a prefix
invoice_fields = doc.select("//*[hasTag('invoice-')]")
# Combine tag and type
invoice_totals = doc.select("//paragraph[@tag='invoice-total']")
print(f"Found {len(tagged)} tagged nodes")
print(f"Found {len(important)} important nodes")
Get Tagged Node Details
with Document.from_kddb("document.kddb") as doc:
# Find all nodes with invoice-total tag
totals = doc.select("//*[@tag='invoice-total']")
for node in totals:
# Get the tag details
tag = node.get_tag("invoice-total")
if tag:
value = tag.get("Value")
confidence = tag.get("Confidence")
print(f"Total: {value} (confidence: {confidence})")
Position Filtering
Index-Based Selection
with Document.from_kddb("document.kddb") as doc:
# First paragraph (1-indexed)
first = doc.select("//paragraph[1]")
# Second paragraph
second = doc.select("//paragraph[2]")
# Last paragraph
last = doc.select("//paragraph[last()]")
# First three paragraphs
first_three = doc.select("//paragraph[position() <= 3]")
Variables in Selectors
Parameterized Queries
with Document.from_kddb("document.kddb") as doc:
# Use variables in selectors
variables = {"search_term": "invoice"}
matches = doc.select(
"//paragraph[contains(@content, $search_term)]",
variables=variables
)
# Multiple variables
variables = {
"tag_name": "invoice-total",
"min_confidence": 0.9
}
high_confidence = doc.select(
"//*[@tag=$tag_name]",
variables=variables
)
print(f"Found {len(matches)} matches")
Common Selector Patterns
Quick Reference
| Pattern | Selector | Description |
|---|---|---|
| All nodes | //* | Every node in document |
| By type | //paragraph | All paragraphs |
| Direct children | //section/paragraph | Paragraphs directly under sections |
| Any depth | //section//paragraph | Paragraphs anywhere in sections |
| First of type | //paragraph[1] | First paragraph |
| By tag | //*[@tag='important'] | Tagged nodes |
| By content | //paragraph[contains(@content, 'text')] | Content search |
| Multiple conditions | //paragraph[@tag='x'][contains(@content, 'y')] | Combined filters |
Real-World Examples
with Document.from_kddb("document.kddb") as doc:
# Extract all invoice line items
line_items = doc.select("//row[@tag='line-item']")
# Find vendor information
vendor = doc.select_first("//paragraph[@tag='vendor-name']")
# Get all amounts in a table
amounts = doc.select("//table//cell[contains(@content, '$')]")
# Find headers (paragraphs at start of sections)
headers = doc.select("//section/paragraph[1]")
# Get all nodes on page 1 (if page structure exists)
page1_nodes = doc.select("//page[1]//*")
# Find paragraphs with high-confidence tags
# (filter in Python after selecting)
tagged_paras = doc.select("//paragraph[hasTag()]")
high_conf = [
p for p in tagged_paras
for tag in [p.get_tag(t) for t in p.get_tags()]
if tag and tag.get("Confidence", 0) > 0.9
]
Working with Results
Iterating Results
with Document.from_kddb("document.kddb") as doc:
paragraphs = doc.select("//paragraph")
for para in paragraphs:
# Access node properties
print(f"Type: {para.type}")
print(f"Content: {para.content}")
print(f"UUID: {para.uuid}")
# Check tags
if para.has_tag("important"):
print(" -> Important!")
# Get features
font = para.get_feature("style", "font-family")
if font:
print(f" Font: {font.get_value()}")
Chaining Selections
with Document.from_kddb("document.kddb") as doc:
# Find all tables, then get cells from each
tables = doc.select("//table")
for table in tables:
# Select within the context of this table node
cells = table.select(".//cell")
print(f"Table has {len(cells)} cells")
# Get just the first row's cells
first_row_cells = table.select("./row[1]/cell")
For large documents, optimize your selectors:
- Be specific with paths (//table/row/cell vs //*)
- Use select_first() when you only need one result
- Combine conditions in one selector vs multiple queries
- Cache selector results when iterating multiple times
with Document.from_kddb("large_document.kddb") as doc:
# Good: Specific path
cells = doc.select("//table/row/cell")
# Less efficient: Broad search
# all_nodes = doc.select("//*")
# Good: Single result
first = doc.select_first("//paragraph[@tag='title']")
# Good: Combined conditions
matches = doc.select("//paragraph[@tag='important'][contains(@content, 'urgent')]")
# Less efficient: Multiple queries
# tagged = doc.select("//paragraph[@tag='important']")
# matches = [p for p in tagged if 'urgent' in p.content]