Skip to main content
Native documents allow you to embed binary files (PDFs, images, Word documents, spreadsheets, etc.) directly within a KDDB document. This is useful for preserving the original source files alongside the extracted content.

Overview

Each native document stores:
  • filename: The original filename
  • mimeType: The MIME type (e.g., application/pdf); spelled mime_type in the Python API shown below
  • data: The raw binary content
  • checksum: Optional integrity hash
  • size: File size in bytes

Creating Native Documents

Store a binary file within your document:
from kodexa_document import Document

with Document(inmemory=True) as doc:
    # Load the raw bytes of the source PDF from disk
    with open("invoice.pdf", "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # Embed the bytes; the returned value identifies the new native document
    doc_id = doc.create_native_document(
        data=pdf_bytes,
        filename="invoice.pdf",
        mime_type="application/pdf",
        checksum="sha256:abc123def...",  # Optional
    )

    print(f"Created native document with ID: {doc_id}")

Retrieving Native Documents

Get All Native Documents

with Document.from_kddb("processed.kddb", inmemory=True) as doc:
    # Iterate the metadata entries directly; binary content is not included here
    for native_doc in doc.get_native_documents():
        print(f"ID: {native_doc['id']}")
        print(f"  Filename: {native_doc['filename']}")
        print(f"  MIME Type: {native_doc['mime_type']}")
        print(f"  Size: {native_doc['size']} bytes")

Get by ID or Filename

with Document.from_kddb("processed.kddb", inmemory=True) as doc:
    # Keys used for the two targeted lookups below
    target_id = 1
    target_filename = "invoice.pdf"

    # Look up by ID; the result is checked before it is used
    native_doc = doc.get_native_document_by_id(target_id)
    if native_doc:
        print(f"Found: {native_doc['filename']}")

    # Look up by the filename the document was stored under
    native_doc = doc.get_native_document_by_filename(target_filename)
    if native_doc:
        print(f"Found: {native_doc['filename']}")

    # Fetch the first native document without naming one
    first = doc.get_first_native_document()
    if first:
        print(f"First document: {first['filename']}")

Retrieve Binary Data

The binary content is retrieved separately from metadata for efficiency:
with Document.from_kddb("processed.kddb", inmemory=True) as doc:
    # Metadata lookup first; the bytes are fetched with a separate call
    native_doc = doc.get_native_document_by_filename("invoice.pdf")

    if native_doc:
        # The metadata's ID is the key for retrieving the binary content
        native_id = native_doc['id']
        data = doc.get_native_document_data(native_id)

        # Write the bytes back out as a standalone file
        with open("extracted_invoice.pdf", "wb") as out_file:
            out_file.write(data)

        print(f"Extracted {len(data)} bytes")

Deleting Native Documents

with Document.from_kddb("processed.kddb", inmemory=False) as doc:
    # Remove one native document, identified by its ID
    target_id = 1
    success = doc.delete_native_document(doc_id=target_id)
    print(f"Deleted: {success}")

    # Remove every native document stored in this document
    success = doc.delete_all_native_documents()
    print(f"Deleted all: {success}")

Common MIME Types

| File Type | MIME Type |
| --- | --- |
| PDF | application/pdf |
| Word (docx) | application/vnd.openxmlformats-officedocument.wordprocessingml.document |
| Excel (xlsx) | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet |
| PNG | image/png |
| JPEG | image/jpeg |
| JSON | application/json |
| HTML | text/html |
| Plain Text | text/plain |
| CSV | text/csv |
| ZIP | application/zip |

Use Cases

Preserving Original Files

Store the original document alongside extracted content:
from kodexa_document import Document

def process_invoice(pdf_path: str) -> Document:
    """Process an invoice and preserve the original PDF.

    The PDF is embedded as a native document and a small content tree is
    built next to it.

    Args:
        pdf_path: Path to the invoice PDF on disk.

    Returns:
        The document holding the embedded PDF and the extracted content.
        It is handed back still open; the caller must release it when done.

    Raises:
        OSError: If the PDF cannot be opened or read.
    """
    import os
    from contextlib import ExitStack

    with ExitStack() as cleanup:
        # Enter the document through the stack rather than a plain `with`:
        # returning from inside `with Document(...)` would run the document's
        # exit handler before the caller ever receives it.
        doc = cleanup.enter_context(Document(inmemory=True))

        # Store the original PDF
        with open(pdf_path, "rb") as f:
            doc.create_native_document(
                # basename handles the platform's path separators
                filename=os.path.basename(pdf_path),
                mime_type="application/pdf",
                data=f.read(),
            )

        # Build extracted content structure
        root = doc.create_node("document", "Invoice")
        doc.content_node = root

        # Add extracted data...
        vendor = doc.create_node("paragraph", "Vendor: Acme Corp", parent=root)
        vendor.tag("vendor-name")

        # Success: detach the pending exit handler so the document is not
        # closed on return. If anything above raised, the stack still
        # cleans the document up.
        cleanup.pop_all()
        return doc

Multiple Source Files

Store multiple related files:
with Document(inmemory=True) as doc:
    # (filename, MIME type) pairs for the related files to embed
    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    files = (
        ("contract.pdf", "application/pdf"),
        ("schedule_a.xlsx", xlsx_mime),
        ("signature.png", "image/png"),
    )

    for filename, mime_type in files:
        # Read each file's bytes, then embed them under the same filename
        with open(filename, "rb") as handle:
            payload = handle.read()
        doc.create_native_document(
            filename=filename,
            mime_type=mime_type,
            data=payload,
        )

    print(f"Stored {len(doc.get_native_documents())} files")