Node Basics
Creating Nodes
Copy
Ask AI
from kodexa_document import Document

# Build a small tree: a root node, a section holding two paragraphs, and a
# table holding one row with one cell.
with Document(inmemory=True) as doc:
    # Create the root node
    root = doc.create_node("document", "My Document")
    doc.content_node = root

    # Create child nodes with content
    section = doc.create_node("section", "Introduction", parent=root)
    para1 = doc.create_node("paragraph", "First paragraph.", parent=section)
    para2 = doc.create_node("paragraph", "Second paragraph.", parent=section)

    # Create nodes without initial content
    table = doc.create_node("table", parent=root)
    row = doc.create_node("row", parent=table)
    cell = doc.create_node("cell", "Cell content", parent=row)
Node Types
Common node types used in Kodexa documents:

| Type | Description |
|---|---|
| document | Root node of the document |
| page | A page in the document |
| section | A logical section |
| paragraph | A paragraph of text |
| line | A line of text |
| word | An individual word |
| table | A table structure |
| row | A table row |
| cell | A table cell |
| image | An image element |
Node Content
Reading and Writing Content
Copy
Ask AI
# Read a node's content, overwrite it, and show a node created without content.
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Initial content", parent=root)

    # Read content
    print(f"Content: {para.content}")

    # Update content
    para.content = "Updated content"

    # Content can be None
    empty_node = doc.create_node("section", parent=root)
    print(f"Empty content: {empty_node.content}")  # None
Content Parts
Nodes can have multiple content parts for complex text:
Copy
Ask AI
# Store a paragraph's text as several parts, then read the parts back and
# the joined content.
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", parent=root)

    # Set multiple content parts
    para.set_content_parts(["Hello ", "world", "!"])

    # Get content parts
    parts = para.get_content_parts()
    print(f"Parts: {parts}")  # ['Hello ', 'world', '!']

    # Full content joins parts
    print(f"Full: {para.content}")  # 'Hello world!'
Aggregated Content
Get all content from a node and its descendants:
Copy
Ask AI
# Collect the text of a section and its three child paragraphs into one string.
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root
    section = doc.create_node("section", parent=root)
    doc.create_node("paragraph", "First paragraph.", parent=section)
    doc.create_node("paragraph", "Second paragraph.", parent=section)
    doc.create_node("paragraph", "Third paragraph.", parent=section)

    # Get all content from section and children
    all_text = section.get_all_content(separator=" ")
    print(f"All content: {all_text}")
    # "First paragraph. Second paragraph. Third paragraph."
Tree Navigation
Parent and Children
Copy
Ask AI
# Walk one level down from the root, reporting each child's parent and how
# many children it has itself.
with Document.from_kddb("document.kddb", inmemory=True) as doc:
    root = doc.content_node

    # Get all children
    children = root.get_children()
    print(f"Root has {len(children)} children")

    for child in children:
        # Get parent (back to root)
        parent = child.get_parent()
        print(f"Node type: {child.type}, parent type: {parent.type}")

        # Get this node's children
        grandchildren = child.get_children()
        print(f" Has {len(grandchildren)} children")
Siblings
Copy
Ask AI
# For every paragraph, count its siblings and look at its neighbours.
with Document.from_kddb("document.kddb", inmemory=True) as doc:
    paragraphs = doc.select("//paragraph")

    for para in paragraphs:
        # Get all siblings (nodes with same parent)
        siblings = para.get_siblings()
        print(f"Has {len(siblings)} siblings")

        # Navigate to next/previous sibling
        next_node = para.next_node()
        prev_node = para.previous_node()

        # Either neighbour may be falsy, so guard before reading .type
        if next_node:
            print(f"Next: {next_node.type}")
        if prev_node:
            print(f"Previous: {prev_node.type}")
Depth and Position
Copy
Ask AI
# Print the type, tree depth and sibling index of every node in the document.
with Document.from_kddb("document.kddb", inmemory=True) as doc:
    nodes = doc.select("//*")

    for node in nodes:
        # Depth in tree (root = 0)
        depth = node.get_depth()

        # Index among siblings
        index = node.index

        print(f"Type: {node.type}, Depth: {depth}, Index: {index}")
Node Features
Features are typed key-value metadata attached to individual nodes:
Copy
Ask AI
# Attach several features to a paragraph, then look them up individually,
# by feature type, and all at once.
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Styled text", parent=root)

    # Add features (type, name, value)
    para.add_feature("style", "font-family", "Arial")
    para.add_feature("style", "font-size", "12pt")
    para.add_feature("style", "font-weight", "bold")
    # Same bbox keys as the "Spatial Data (Bounding Boxes)" example
    para.add_feature(
        "spatial", "bbox", {"x": 100, "y": 200, "width": 300, "height": 50}
    )
    para.add_feature("ocr", "confidence", 0.95)

    # Retrieve a specific feature
    font = para.get_feature("style", "font-family")
    if font:
        print(f"Font: {font.get_value()}")

    # Get all features of a type
    style_features = para.get_features_of_type("style")
    for feature in style_features:
        print(f" {feature.name}: {feature.get_value()}")

    # Get all features
    all_features = para.get_features()
    print(f"Total features: {len(all_features)}")

    # Check if feature exists
    has_bbox = para.has_feature("spatial", "bbox")
Node Tags
Tags annotate nodes with labels, optional confidence scores, and values:
Copy
Ask AI
# Tag a paragraph in three ways, then query, list and remove tags.
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Invoice total: $1,234.56", parent=root)

    # Simple tag
    para.tag("important")

    # Tag with confidence and value
    para.tag("invoice-total", confidence=0.95, value="$1,234.56")

    # Tag with additional data
    para.tag(
        "extracted-field",
        confidence=0.92,
        value="1234.56",
        tag_uuid="field-uuid-123",
    )

    # Check for tag
    if para.has_tag("important"):
        print("Node is marked important")

    # Get tag details
    tag = para.get_tag("invoice-total")
    if tag:
        print(f"Value: {tag.get('Value')}")
        print(f"Confidence: {tag.get('Confidence')}")

    # Get all tags
    all_tags = para.get_tags()
    print(f"Tags: {all_tags}")

    # Remove a tag
    para.remove_tag("important")
Spatial Data (Bounding Boxes)
Nodes can have spatial information for document layout:
Copy
Ask AI
# Store a bounding box and a page reference as "spatial" features, then read
# the bounding box back.
with Document(inmemory=True) as doc:
    root = doc.create_node("document")
    doc.content_node = root
    para = doc.create_node("paragraph", "Text on page", parent=root)

    # Set bounding box via feature
    para.add_feature("spatial", "bbox", {
        "x": 100,       # Left position
        "y": 200,       # Top position
        "width": 300,   # Width
        "height": 50,   # Height
    })

    # Set page reference
    para.add_feature("spatial", "page", 0)

    # Get bounding box
    bbox = para.get_feature("spatial", "bbox")
    if bbox:
        box = bbox.get_value()
        print(f"Position: ({box['x']}, {box['y']})")
        print(f"Size: {box['width']} x {box['height']}")
Building Document Structures
Invoice Example
Copy
Ask AI
from kodexa_document import Document
def build_invoice_structure():
    """Build and return a sample invoice document tree.

    The tree consists of a root ``document`` node, a header ``section`` with
    vendor and date paragraphs, a ``table`` with one ``row`` per line item
    (a description cell and an amount cell), and a final total paragraph.
    The vendor, date, amount and total nodes are tagged with a value and a
    confidence; description cells are tagged with a label only.

    Returns:
        The populated in-memory ``Document``. It is returned open: the
        caller owns it and is responsible for releasing it when done.
    """
    # Deliberately not a ``with`` block: leaving a ``with`` body -- including
    # via ``return`` -- runs the context manager's exit, so the caller would
    # presumably receive a document that has already been closed.
    doc = Document(inmemory=True)
    try:
        # Document root
        root = doc.create_node("document", "Invoice #12345")
        doc.content_node = root

        # Header section
        header = doc.create_node("section", "Header", parent=root)
        vendor = doc.create_node("paragraph", "Vendor: Acme Corp", parent=header)
        vendor.tag("vendor-name", value="Acme Corp", confidence=0.98)
        date = doc.create_node("paragraph", "Date: 2024-01-15", parent=header)
        date.tag("invoice-date", value="2024-01-15", confidence=0.95)

        # Line items table
        table = doc.create_node("table", parent=root)
        items = [
            ("Widget A", "100.00"),
            ("Widget B", "250.00"),
            ("Service Fee", "50.00"),
        ]
        for desc, amount in items:
            row = doc.create_node("row", parent=table)
            desc_cell = doc.create_node("cell", desc, parent=row)
            desc_cell.tag("line-item-description")
            amt_cell = doc.create_node("cell", f"${amount}", parent=row)
            amt_cell.tag("line-item-amount", value=amount, confidence=0.92)

        # Total
        total = doc.create_node("paragraph", "Total: $400.00", parent=root)
        total.tag("invoice-total", value="400.00", confidence=0.99)
    except Exception:
        # Do not leak a half-built document if construction fails.
        # NOTE(review): assumes Document exposes close() as the explicit
        # counterpart to the ``with`` form -- confirm against the SDK.
        doc.close()
        raise
    return doc
