Skip to main content
Data objects and data attributes provide a structured data layer within KDDB documents. Data objects represent extracted entities (e.g., an invoice line item, a person, an address), while data attributes store the individual fields and values associated with those entities.

Overview

The data layer is separate from the content node tree and designed for storing extraction results:
  • Data Objects: Entities with parent-child relationships and taxonomy references
  • Data Attributes: Typed fields attached to data objects (string, decimal, date, boolean)
  • Confidence Scores: Each attribute can have an extraction confidence score
  • Taxonomy Integration: Objects and attributes can reference taxonomy definitions

Data Objects

Creating Data Objects

from kodexa_document import Document
from kodexa_document.accessors import DataObjectInput

with Document() as doc:
    root = doc.create_node("document", "Invoice #12345")
    doc.content_node = root

    # The top-level data object is created without a parent_id
    invoice_input = DataObjectInput(path="/invoice")
    invoice = doc.data_objects.create(invoice_input)
    print(f"Created data object: {invoice}")

    # Both line items point at the invoice via parent_id and share one path
    line_item_path = "/invoice/line-item"

    line_item_1 = doc.data_objects.create(
        DataObjectInput(parent_id=invoice['id'], path=line_item_path)
    )

    line_item_2 = doc.data_objects.create(
        DataObjectInput(parent_id=invoice['id'], path=line_item_path)
    )

Retrieving Data Objects

with Document.from_kddb("processed.kddb") as doc:
    # Fetch every data object stored in the document
    all_objects = doc.data_objects.get_all()
    print(f"Total data objects: {len(all_objects)}")

    # Restrict the listing to root-level objects
    roots = doc.data_objects.get_roots()

    # Walk each root and look up its children
    for root_obj in roots:
        # NOTE(review): children are created with parent_id=<object id>;
        # confirm that get_children expects the group UUID rather than the id
        group_uuid = root_obj['groupUuid']
        children = doc.data_objects.get_children(group_uuid)
        print(f"Object {root_obj['path']} has {len(children)} children")

    # Look up a single object by its UUID
    obj = doc.data_objects.get_by_uuid("some-uuid")

    # Look up by group UUID
    group = doc.data_objects.get_by_group_uuid("group-uuid")

Updating and Deleting Data Objects

with Document.from_kddb("processed.kddb") as doc:
    # Describe the new field values, then apply them to the object with this UUID
    new_values = DataObjectInput(path="/invoice/updated-path")
    doc.data_objects.update("some-uuid", new_values)

    # Remove the data object with this UUID
    doc.data_objects.delete("some-uuid")

Data Attributes

Creating Data Attributes

from kodexa_document.accessors import DataAttributeInput

with Document() as doc:
    root = doc.create_node("document", "Invoice")
    doc.content_node = root

    # Attributes are attached to a data object, so create that object first
    invoice = doc.data_objects.create(DataObjectInput(path="/invoice"))
    invoice_id = invoice['id']

    # String attribute, with owner_uri identifying its source
    vendor_name = DataAttributeInput(
        tag="vendor-name",
        string_value="Acme Corp",
        confidence=0.95,
        owner_uri="model://kodexa/invoice-extractor:1.0.0",
    )
    doc.data_attributes.create(invoice_id, vendor_name)

    # Decimal attribute
    total_amount = DataAttributeInput(
        tag="total-amount",
        decimal_value=1234.56,
        confidence=0.92,
    )
    doc.data_attributes.create(invoice_id, total_amount)

    # Date attribute (the date is passed as a string)
    invoice_date = DataAttributeInput(
        tag="invoice-date",
        date_value="2024-01-15",
        confidence=0.98,
    )
    doc.data_attributes.create(invoice_id, invoice_date)

    # Boolean attribute
    is_paid = DataAttributeInput(
        tag="is-paid",
        boolean_value=False,
        confidence=1.0,
    )
    doc.data_attributes.create(invoice_id, is_paid)

Retrieving Data Attributes

with Document.from_kddb("processed.kddb") as doc:
    # Get all data objects
    objects = doc.data_objects.get_all()

    # Typed value fields an attribute may carry, checked in this order.
    # Key names follow the camelCase names in the DataAttributeInput reference;
    # confirm 'dateValue' / 'booleanValue' against the returned attribute dict.
    value_keys = ('stringValue', 'decimalValue', 'dateValue', 'booleanValue')

    for obj in objects:
        # Get attributes for a data object
        attrs = doc.data_attributes.get_for_data_object(obj['id'])

        for attr in attrs:
            # Test against None rather than truthiness so date attributes and
            # legitimate falsy values (False, 0, "") are shown instead of None
            value = next(
                (attr[key] for key in value_keys if attr.get(key) is not None),
                None,
            )
            print(f"  {attr['tag']}: {value}")
            print(f"    Confidence: {attr.get('confidence')}")

    # Get a specific attribute by ID
    attr = doc.data_attributes.get_by_id(1)

Updating and Deleting Data Attributes

with Document.from_kddb("processed.kddb") as doc:
    # Both calls below target the attribute with this ID
    attribute_id = 1

    # Apply new field values to the attribute
    changes = DataAttributeInput(
        string_value="Updated Vendor Name",
        confidence=0.99,
    )
    doc.data_attributes.update(attribute_id, changes)

    # Remove the attribute
    doc.data_attributes.delete(attribute_id)

DataObjectInput Reference

| Field | Python | TypeScript | Description |
|-------|--------|------------|-------------|
| Parent | parent_id | parentId | ID of the parent data object |
| Taxonomy | taxonomy_ref | taxonomyRef | Reference to a taxonomy definition |
| Path | path | path | Hierarchical path identifier |

DataAttributeInput Reference

| Field | Python | TypeScript | Description |
|-------|--------|------------|-------------|
| Tag | tag | tag | Tag name linking to the content node tag |
| Tag ID | tag_id | tagId | Direct tag ID reference |
| Value | value | value | Generic value |
| String Value | string_value | stringValue | String-typed value |
| Decimal Value | decimal_value | decimalValue | Numeric value |
| Date Value | date_value | dateValue | Date string value |
| Boolean Value | boolean_value | booleanValue | Boolean value |
| Confidence | confidence | confidence | Extraction confidence (0-1) |
| Type | type_at_creation | typeAtCreation | Type classification |
| Path | path | path | Hierarchical path identifier |
| Owner URI | owner_uri | ownerUri | Source identifier (e.g., model URI) |
| Data Features | data_features | dataFeatures | Additional metadata dictionary |

Complete Example

from kodexa_document import Document
from kodexa_document.accessors import DataObjectInput, DataAttributeInput

def extract_invoice_data(output_path="extracted_invoice.kddb"):
    """Build a sample invoice document with data objects and attributes, then save it.

    Creates an ``/invoice`` data object with header attributes, one
    ``/invoice/line-item`` child object per sample line item, and a
    ``total-amount`` attribute derived from those items. It then prints each
    root data object with its attribute values and saves the document.

    Args:
        output_path: Path the document is saved to. Defaults to
            ``"extracted_invoice.kddb"``.
    """
    with Document() as doc:
        root = doc.create_node("document", "Invoice #12345")
        doc.content_node = root

        # Create the invoice data object
        invoice = doc.data_objects.create(DataObjectInput(
            path="/invoice",
            taxonomy_ref="acme/invoice:1.0.0"
        ))

        # Add header attributes
        doc.data_attributes.create(invoice['id'], DataAttributeInput(
            tag="invoice-number",
            string_value="INV-2024-001",
            confidence=0.99
        ))
        doc.data_attributes.create(invoice['id'], DataAttributeInput(
            tag="vendor-name",
            string_value="Acme Corp",
            confidence=0.95
        ))
        doc.data_attributes.create(invoice['id'], DataAttributeInput(
            tag="invoice-date",
            date_value="2024-01-15",
            confidence=0.98
        ))

        # Create line items
        items = [
            ("Widget A", 100.00),
            ("Widget B", 250.00),
            ("Service Fee", 50.00)
        ]

        for desc, amount in items:
            line_item = doc.data_objects.create(DataObjectInput(
                parent_id=invoice['id'],
                path="/invoice/line-item"
            ))
            doc.data_attributes.create(line_item['id'], DataAttributeInput(
                tag="description",
                string_value=desc,
                confidence=0.92
            ))
            doc.data_attributes.create(line_item['id'], DataAttributeInput(
                tag="amount",
                decimal_value=amount,
                confidence=0.90
            ))

        # Add total, derived from the line items so the two cannot drift apart
        doc.data_attributes.create(invoice['id'], DataAttributeInput(
            tag="total-amount",
            decimal_value=sum(amount for _, amount in items),
            confidence=0.95
        ))

        # Typed value fields an attribute may carry, checked in this order.
        # Key names follow the camelCase names in the DataAttributeInput
        # reference; confirm 'dateValue' / 'booleanValue' against the returned dict.
        value_keys = ('stringValue', 'decimalValue', 'dateValue', 'booleanValue')

        # Query the results
        roots = doc.data_objects.get_roots()
        for obj in roots:
            attrs = doc.data_attributes.get_for_data_object(obj['id'])
            print(f"Data Object: {obj['path']}")
            for attr in attrs:
                # Test against None rather than truthiness so the invoice-date
                # attribute (and any falsy value) is shown instead of None
                value = next(
                    (attr[key] for key in value_keys if attr.get(key) is not None),
                    None,
                )
                print(f"  {attr['tag']}: {value}")

        doc.save(output_path)