from kodexa_document import Document
from kodexa_document.accessors import DataObjectInput, DataAttributeInput
def extract_invoice_data():
    """Build a sample invoice document, print its root data, and save it.

    Creates a document with a single content node, attaches an ``/invoice``
    data object carrying header attributes, adds one ``/invoice/line-item``
    child data object per line item, records the invoice total, prints the
    attributes of every root data object, and finally writes the document
    to ``extracted_invoice.kddb``.
    """
    with Document() as doc:
        doc.content_node = doc.create_node("document", "Invoice #12345")

        # Top-level data object that every attribute and line item hangs off.
        invoice = doc.data_objects.create(
            DataObjectInput(
                path="/invoice",
                taxonomy_ref="acme/invoice:1.0.0",
            )
        )
        invoice_id = invoice['id']

        # Header attributes, created in the order listed.
        header_fields = (
            {"tag": "invoice-number", "string_value": "INV-2024-001", "confidence": 0.99},
            {"tag": "vendor-name", "string_value": "Acme Corp", "confidence": 0.95},
            {"tag": "invoice-date", "date_value": "2024-01-15", "confidence": 0.98},
        )
        for field in header_fields:
            doc.data_attributes.create(invoice_id, DataAttributeInput(**field))

        # One child data object per (description, amount) pair.
        line_items = (
            ("Widget A", 100.00),
            ("Widget B", 250.00),
            ("Service Fee", 50.00),
        )
        for description, amount in line_items:
            child = doc.data_objects.create(
                DataObjectInput(
                    parent_id=invoice_id,
                    path="/invoice/line-item",
                )
            )
            child_id = child['id']
            doc.data_attributes.create(
                child_id,
                DataAttributeInput(
                    tag="description",
                    string_value=description,
                    confidence=0.92,
                ),
            )
            doc.data_attributes.create(
                child_id,
                DataAttributeInput(
                    tag="amount",
                    decimal_value=amount,
                    confidence=0.90,
                ),
            )

        # Invoice total, recorded on the invoice object after the line items.
        doc.data_attributes.create(
            invoice_id,
            DataAttributeInput(
                tag="total-amount",
                decimal_value=400.00,
                confidence=0.95,
            ),
        )

        # Report what was stored on each root data object.
        # NOTE(review): only 'stringValue' and 'decimalValue' are read below,
        # so the date-valued 'invoice-date' attribute presumably prints as
        # None -- confirm the result key for date values before relying on it.
        for data_object in doc.data_objects.get_roots():
            attributes = doc.data_attributes.get_for_data_object(data_object['id'])
            print(f"Data Object: {data_object['path']}")
            for attribute in attributes:
                print(f" {attribute['tag']}: {attribute.get('stringValue') or attribute.get('decimalValue')}")

        # NOTE(review): saved while the document context is still open; the
        # original indentation was lost, so confirm this placement.
        doc.save("extracted_invoice.kddb")