Documentation Index
Fetch the complete documentation index at: https://developer.kodexa.ai/llms.txt
Use this file to discover all available pages before exploring further.
Native documents allow you to embed binary files (PDFs, images, Word documents, spreadsheets, etc.) directly within a KDDB document. This is useful for preserving the original source files alongside the extracted content.
Overview
Each native document stores:
- filename: The original filename
- mimeType: The MIME type (e.g., application/pdf)
- data: The raw binary content
- checksum: Optional integrity hash
- size: File size in bytes
Creating Native Documents
Store a binary file within your document:
from kodexa_document import Document
# Embed invoice.pdf in a new document as a native document.
with Document() as doc:
    # Read a PDF file
    with open("invoice.pdf", "rb") as f:
        pdf_data = f.read()

    # Store it in the document
    doc_id = doc.create_native_document(
        filename="invoice.pdf",
        mime_type="application/pdf",
        data=pdf_data,
        checksum="sha256:abc123def...",  # Optional
    )
    print(f"Created native document with ID: {doc_id}")
Retrieving Native Documents
Get All Native Documents
# Print the metadata of every native document stored in processed.kddb.
with Document.from_kddb("processed.kddb") as doc:
    # Get all native documents (metadata only)
    native_docs = doc.get_native_documents()
    for native_doc in native_docs:
        print(f"ID: {native_doc['id']}")
        print(f" Filename: {native_doc['filename']}")
        print(f" MIME Type: {native_doc['mime_type']}")
        print(f" Size: {native_doc['size']} bytes")
Get by ID or Filename
# Each lookup result is checked for truthiness before it is used.
with Document.from_kddb("processed.kddb") as doc:
    # Get by ID
    native_doc = doc.get_native_document_by_id(1)
    if native_doc:
        print(f"Found: {native_doc['filename']}")

    # Get by filename
    native_doc = doc.get_native_document_by_filename("invoice.pdf")
    if native_doc:
        print(f"Found: {native_doc['filename']}")

    # Get the first native document
    first = doc.get_first_native_document()
    if first:
        print(f"First document: {first['filename']}")
Retrieve Binary Data
The binary content is retrieved separately from metadata for efficiency:
# Look up invoice.pdf by filename, then write its bytes to a new file.
with Document.from_kddb("processed.kddb") as doc:
    # Get the native document metadata
    native_doc = doc.get_native_document_by_filename("invoice.pdf")
    if native_doc:
        # Get the actual binary data
        data = doc.get_native_document_data(native_doc['id'])

        # Save to file
        with open("extracted_invoice.pdf", "wb") as f:
            f.write(data)
        print(f"Extracted {len(data)} bytes")
Deleting Native Documents
# Remove one native document by ID, then remove all of them.
with Document.from_kddb("processed.kddb") as doc:
    # Delete a specific native document
    success = doc.delete_native_document(doc_id=1)
    print(f"Deleted: {success}")

    # Delete all native documents
    success = doc.delete_all_native_documents()
    print(f"Deleted all: {success}")
Common MIME Types
| File Type | MIME Type |
|---|---|
| PDF | application/pdf |
| Word (docx) | application/vnd.openxmlformats-officedocument.wordprocessingml.document |
| Excel (xlsx) | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet |
| PNG | image/png |
| JPEG | image/jpeg |
| JSON | application/json |
| HTML | text/html |
| Plain Text | text/plain |
| CSV | text/csv |
| ZIP | application/zip |
Use Cases
Preserving Original Files
Store the original document alongside extracted content:
from kodexa_document import Document
def process_invoice(pdf_path: str) -> Document:
    """Process an invoice and preserve the original PDF.

    Args:
        pdf_path: Path of the PDF file to read and embed.

    Returns:
        The Document containing the embedded PDF and the content nodes
        built below.
    """
    import os.path  # local import keeps the example self-contained

    with Document() as doc:
        # Store the original PDF
        with open(pdf_path, "rb") as f:
            doc.create_native_document(
                # basename also handles Windows separators, unlike a
                # manual split on "/"
                filename=os.path.basename(pdf_path),
                mime_type="application/pdf",
                data=f.read(),
            )

        # Build extracted content structure
        root = doc.create_node("document", "Invoice")
        doc.content_node = root

        # Add extracted data...
        vendor = doc.create_node("paragraph", "Vendor: Acme Corp", parent=root)
        vendor.tag("vendor-name")

        # NOTE(review): the with-block's __exit__ runs before the caller
        # receives doc -- confirm a Document stays usable after exit.
        return doc
Multiple Source Files
Store multiple related files:
# Embed several files, each with its own MIME type, in one document.
with Document() as doc:
    # Store multiple related documents
    files = [
        ("contract.pdf", "application/pdf"),
        ("schedule_a.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
        ("signature.png", "image/png"),
    ]
    for filename, mime_type in files:
        with open(filename, "rb") as f:
            doc.create_native_document(
                filename=filename,
                mime_type=mime_type,
                data=f.read(),
            )
    print(f"Stored {len(doc.get_native_documents())} files")