> ## Documentation Index
> Fetch the complete documentation index at: https://docs.mixpeek.com/docs/llms.txt
> Use this file to discover all available pages before exploring further.

# Create a document.

> Create a document by ID.



## OpenAPI

````yaml post /v1/collections/{collection_identifier}/documents
# OpenAPI document header: spec version, API metadata, server list, and the
# document-wide security requirement.
openapi: 3.1.0
info:
  title: Mixpeek API
  version: "0.82"  # quoted so the version stays a string, not a float
  description: >-
    This is the Mixpeek API, providing access to various endpoints
    for data processing and retrieval.
  termsOfService: https://mixpeek.com/terms
  contact:
    name: Mixpeek Support
    email: info@mixpeek.com
    url: https://mixpeek.com/contact
servers:
  - description: Production
    url: https://api.mixpeek.com
security: []  # empty list: no security requirement declared at document level
paths:
  /v1/collections/{collection_identifier}/documents:
    # POST: create a document; the target collection is named in the path.
    post:
      operationId: create_document_v1_collections__collection_identifier__documents_post
      summary: Create a document.
      description: Create a document by ID.
      tags:
        - Collection Documents
      parameters:
        - in: path
          name: collection_identifier
          description: The ID of the collection.
          required: true
          schema:
            title: Collection Identifier
            description: The ID of the collection.
            type: string
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/DocumentCreateRequest"
      responses:
        # Success payload uses the DocumentResponse schema.
        "200":
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/DocumentResponse"
        # Every error status except 422 shares the ErrorResponse envelope.
        "400":
          description: Bad Request
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"
        "401":
          description: Unauthorized
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"
        "403":
          description: Forbidden
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"
        "404":
          description: Not Found
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"
        # 422 uses the HTTPValidationError schema instead of ErrorResponse.
        "422":
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/HTTPValidationError"
        "500":
          description: Internal Server Error
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"
components:
  schemas:
    # Request body for document creation. Only `collection_id` is required;
    # keys not listed under `properties` are accepted as well
    # (`additionalProperties: true`).
    DocumentCreateRequest:
      title: DocumentCreateRequest
      description: Request model for creating a document.
      type: object
      required:
        - collection_id
      additionalProperties: true
      properties:
        collection_id:
          title: Collection Id
          description: ID of the collection the document belongs to.
          type: string
          example: collection_123
        # --- Optional lineage fields (string-or-null unless noted) ---
        root_object_id:
          title: Root Object Id
          description: >-
            Optional denormalized root object identifier provided
            during creation.
          anyOf:
            - type: string
            - type: "null"
        root_bucket_id:
          title: Root Bucket Id
          description: >-
            Optional denormalized bucket identifier provided
            during creation.
          anyOf:
            - type: string
            - type: "null"
        source_type:
          title: Source Type
          description: Optional immediate parent type for the document.
          anyOf:
            - type: string
              enum:
                - bucket
                - collection
                - direct_upsert
            - type: "null"
        source_collection_id:
          title: Source Collection Id
          description: >-
            Optional parent collection identifier when sourced
            from a collection.
          anyOf:
            - type: string
            - type: "null"
        source_document_id:
          title: Source Document Id
          description: >-
            Optional parent document identifier when sourced
            from a collection.
          anyOf:
            - type: string
            - type: "null"
        source_object_id:
          title: Source Object Id
          description: >-
            Optional parent object identifier when sourced
            directly from a bucket.
          anyOf:
            - type: string
            - type: "null"
        lineage_path:
          title: Lineage Path
          description: >-
            Optional materialized lineage path to set
            during creation.
          anyOf:
            - type: string
            - type: "null"
        lineage_chain:
          title: Lineage Chain
          description: >-
            Processing steps from root object to this document.
            Recommended for decomposition trees.
          type: array
          items:
            $ref: "#/components/schemas/LineageStep"
        document_schema_version:
          title: Document Schema Version
          description: >-
            Optional document schema version (v1 or v2).
            If not provided, uses system default.
          anyOf:
            - type: string
            - type: "null"
        # --- Payload: user metadata, extractor features, raw vectors ---
        metadata:
          title: Metadata
          description: >-
            Optional metadata dictionary for user-defined fields
            and custom attributes.
          type: object
          additionalProperties: true
        features:
          title: Features
          description: Features to associate with the document
          type: array
          items:
            $ref: "#/components/schemas/FeatureModel"
        vectors:
          title: Vectors
          description: >-
            Optional pre-computed vectors to store with the document.
            Keys are vector index names
            (e.g. 'text_extractor_v1_embedding'),
            values are float arrays matching the index dimensions.
          anyOf:
            # Map of name -> array of numbers.
            - type: object
              additionalProperties:
                type: array
                items:
                  type: number
            - type: "null"
    # Response envelope for a single document. `document_id` and
    # `collection_id` are the only required keys; `additionalProperties: true`
    # lets user-defined fields sit at the root next to them.
    DocumentResponse:
      properties:
        document_id:
          type: string
          title: Document Id
          description: >-
            REQUIRED. Unique identifier for the document. Format: 'doc_' prefix
            + alphanumeric characters. Use for: API queries, references,
            filtering.
          examples:
            - doc_f8966ff29c18e20c6b45e053
            - doc_abc123
        collection_id:
          type: string
          title: Collection Id
          description: >-
            REQUIRED. ID of the collection this document belongs to. Format:
            'col_' prefix + alphanumeric characters. Use for: Collection-scoped
            queries, filtering.
          examples:
            - col_articles
            - col_video_frames
        document_blobs:
          items:
            $ref: '#/components/schemas/BlobURLRef'
          type: array
          title: Document Blobs
          description: Document blobs with presigned URLs when requested
        # System-managed section; nullable reference to InternalPayloadModel.
        _internal:
          anyOf:
            - $ref: '#/components/schemas/InternalPayloadModel'
            - type: 'null'
          description: >-
            System-managed internal fields. Contains all Mixpeek-managed
            metadata including lineage, processing info, timestamps, and blob
            references. User-defined fields appear at root level alongside
            document_id and collection_id.
      additionalProperties: true
      type: object
      required:
        - document_id
        - collection_id
      title: DocumentResponse
      description: >-
        Response model for a single document.


        This is the standard response format when fetching documents via API
        endpoints.

        Contains all document data plus optional presigned URLs for S3 blobs.


        The document payload structure follows native Qdrant format:
            - System fields are stored in `_internal` (lineage, metadata, blobs, etc.)
            - User fields are at root level (brand_name, thumbnail_url, etc.)
            - Only document_id and collection_id are Mixpeek IDs at root level
            - No duplication between root and _internal

        Query Parameters Affecting Response:
            - return_url=true: Adds presigned_url to each document_blobs entry
            - return_vectors=true: Includes embedding arrays in response

        Use Cases:
            - Display document details in UI
            - Download source files or generated artifacts
            - Understand document provenance and processing
            - Access enrichment fields (flat) for filtering/display
      examples:
        # Example 1: text document sourced directly from a bucket object.
        - _internal:
            collection_id: col_articles
            created_at: '2025-10-31T10:00:00Z'
            document_id: doc_f8966ff29c18e20c6b45e053
            internal_id: org_abc123
            lineage:
              chain:
                - collection_id: col_articles
                  feature_extractor_id: text_extractor_v1
                  timestamp: '2025-10-31T10:00:00Z'
              path: bkt_content/col_articles
              root_bucket_id: bkt_content
              root_object_id: obj_article_001
              source_object_id: obj_article_001
              source_type: bucket
            metadata:
              ingestion_status: COMPLETED
            modality: text
            namespace_id: ns_xyz789
            source_blobs:
              - blob_id: blob_text_001
                blob_property: content
                blob_type: text
            updated_at: '2025-10-31T10:00:00Z'
          author: Dr. Smith
          collection_id: col_articles
          description: Text document with _internal structure
          document_id: doc_f8966ff29c18e20c6b45e053
          title: AI in Healthcare
        # Example 2: video frame document with user fields at the root.
        - _internal:
            collection_id: col_video_frames
            created_at: '2025-10-31T10:00:00Z'
            document_blobs:
              - field: thumbnail
                role: thumbnail
                type: image
                url: s3://bucket/thumbnails/frame_050.jpg
            document_id: doc_frame_050
            internal_id: org_abc123
            lineage:
              path: bkt_marketing/col_video_frames
              root_bucket_id: bkt_marketing
              root_object_id: obj_video_123
              source_object_id: obj_video_123
              source_type: bucket
            mime_type: video/mp4
            modality: video
            namespace_id: ns_xyz789
            source_blobs:
              - blob_id: blob_video_001
                blob_property: video
                blob_type: video
            updated_at: '2025-10-31T10:00:00Z'
          campaign_id: Q4_2025
          cluster_distance: 0.15
          cluster_id: cl_marketing
          collection_id: col_video_frames
          description: Video document with user fields and enrichments
          document_id: doc_frame_050
          duration: 120
          taxonomy_products_label: Electronics
          taxonomy_products_score: 0.87
        # Example 3: collection-to-collection document. The first chain step
        # names the parent collection (col_video_frames) so that it agrees
        # with `path` and `source_collection_id` in the same example.
        - _internal:
            collection_id: col_scenes
            created_at: '2025-10-31T10:05:00Z'
            document_id: doc_scene_042
            internal_id: org_abc123
            lineage:
              chain:
                - collection_id: col_video_frames
                  feature_extractor_id: multimodal_extractor_v1
                  timestamp: '2025-10-31T10:00:00Z'
                - collection_id: col_scenes
                  feature_extractor_id: scene_detector_v1
                  timestamp: '2025-10-31T10:05:00Z'
              path: bkt_marketing/col_video_frames/col_scenes
              root_bucket_id: bkt_marketing
              root_object_id: obj_video_123
              source_collection_id: col_video_frames
              source_document_id: doc_frame_050
              source_type: collection
            modality: video
            namespace_id: ns_xyz789
            updated_at: '2025-10-31T10:05:00Z'
          collection_id: col_scenes
          description: Multi-tier document (collection→collection)
          document_id: doc_scene_042
          end_time: 62.3
          scene_type: action
          start_time: 45.5
    # Envelope shared by the non-422 error responses: a status code plus an
    # ErrorDetail payload. `success` is optional and defaults to false.
    ErrorResponse:
      title: ErrorResponse
      description: Error response model.
      type: object
      required:
        - status
        - error
      properties:
        success:
          title: Success
          description: Always false for error responses
          type: boolean
          default: false
        status:
          title: Status
          description: HTTP status code for this error
          type: integer
        error:
          description: Error details payload
          $ref: "#/components/schemas/ErrorDetail"
      examples:
        - success: false
          status: 404
          error:
            type: NotFoundError
            message: Namespace not found
            details:
              resource: namespace
              id: ns_123
    # Body used for 422 responses: an optional list of ValidationError items.
    HTTPValidationError:
      title: HTTPValidationError
      type: object
      properties:
        detail:
          title: Detail
          type: array
          items:
            $ref: "#/components/schemas/ValidationError"
    # One entry of a lineage chain. `collection_id` and `feature_extractor_id`
    # are required; `document_id` (nullable) and `timestamp` are optional.
    LineageStep:
      title: LineageStep
      description: |-
        Single processing step in a document's lineage chain.

        Each step represents one transformation in the decomposition tree,
        tracking which collection and feature extractor produced the document.

        Example:
            ```python
            step = LineageStep(
                collection_id="col_video_frames",
                feature_extractor_id="multimodal_extractor_v1",
                document_id="doc_frame123",
                timestamp=datetime.now()
            )
            ```
      type: object
      required:
        - collection_id
        - feature_extractor_id
      properties:
        collection_id:
          title: Collection Id
          description: Collection ID where this processing step occurred
          type: string
          examples:
            - col_video_frames
            - col_scenes
            - col_embeddings
        feature_extractor_id:
          title: Feature Extractor Id
          description: Feature extractor that processed the data in this step
          type: string
          examples:
            - multimodal_extractor_v1
            - scene_detector_v1
            - text_extractor_v1
        document_id:
          title: Document Id
          description: >-
            Document ID from this step (for intermediate steps).
            Allows tracing back through the decomposition tree.
          anyOf:
            - type: string
            - type: "null"
          examples:
            - doc_frame001
            - doc_scene005
        timestamp:
          title: Timestamp
          description: When this processing step occurred
          type: string
          format: date-time
          examples:
            - "2025-10-18T10:30:00Z"
      examples:
        - collection_id: col_video_frames
          feature_extractor_id: multimodal_extractor_v1
          document_id: doc_frame123
          timestamp: "2025-10-18T10:30:00Z"
    # A feature produced by an extractor. Only `feature_extractor_id` is
    # required; extra keys are allowed.
    FeatureModel:
      title: FeatureModel
      description: Response from a feature extractor.
      type: object
      required:
        - feature_extractor_id
      additionalProperties: true
      properties:
        feature_extractor_id:
          title: Feature Extractor Id
          description: ID of the feature extractor that produced this response
          type: string
        payload:
          title: Payload
          description: Metadata of the feature
          type: object
          additionalProperties: true
        vectors:
          title: Vectors
          description: >-
            Vector representation of the feature.
            Can be any supported vector type.
          # Any one of the four vector schemas, or null.
          anyOf:
            - $ref: "#/components/schemas/DenseVector"
            - $ref: "#/components/schemas/SparseVector"
            - $ref: "#/components/schemas/MultiDenseVector"
            - $ref: "#/components/schemas/NamedDenseVectors"
            - type: "null"
    # Reference to a blob attached to a document. Only `field` and `url` are
    # listed under `required`; `role` and `type` carry defaults ('source' and
    # 'other'), and their descriptions are worded to match that.
    BlobURLRef:
      properties:
        field:
          type: string
          minLength: 1
          title: Field
          description: >-
            REQUIRED. Stable semantic label for this blob. Use descriptive names
            like 'video_segment', 'thumbnail', 'source_video', etc. Avoid
            internal implementation details like array indices. This field is
            used for programmatic access and should be consistent across
            documents.
          examples:
            - video_segment
            - thumbnail
            - source_video
            - processed_frame
        # Not in `required`; falls back to `default: source`.
        role:
          type: string
          enum:
            - source
            - processed
            - thumbnail
            - artifact
            - aux
          title: Role
          description: >-
            Semantic role determining how this blob should be treated. Defaults
            to 'source'. 'source': Original input media from buckets or uploads.
            'processed': Derived content created by feature extractors
            (segments, frames). 'thumbnail': Preview images for UI display.
            'artifact': Generated outputs (reports, analysis results). 'aux':
            Supporting or auxiliary files. Used by UI for grouping and
            displaying blobs appropriately.
          default: source
        # Not in `required`; falls back to `default: other`.
        type:
          type: string
          enum:
            - video
            - image
            - audio
            - text
            - pdf
            - other
          title: Type
          description: >-
            Media type of the blob content. Defaults to 'other'. Determines how
            the blob should be rendered or processed. Use 'other' for custom or
            unknown types.
          default: other
        url:
          type: string
          minLength: 1
          title: Url
          description: >-
            REQUIRED. Permanent URL to the blob content. S3 URLs
            (s3://bucket/key) are automatically converted to presigned HTTPS
            URLs by the API. HTTP/HTTPS URLs are returned as-is. This is the
            canonical reference to the blob that persists across API calls.
          examples:
            - s3://mixpeek-storage/namespace_123/objects/obj_456/segment_0.mp4
            - https://example.com/media/video.mp4
        filename:
          anyOf:
            - type: string
            - type: 'null'
          title: Filename
          description: >-
            OPTIONAL. Original filename or leaf name of the blob. Useful for
            downloads and display purposes. Example: 'segment_0.mp4',
            'thumbnail.jpg'
          examples:
            - segment_0.mp4
            - thumbnail.jpg
            - analysis_report.pdf
        size_bytes:
          anyOf:
            - type: integer
              minimum: 0
            - type: 'null'
          title: Size Bytes
          description: >-
            OPTIONAL. Size of the blob in bytes. Useful for progress indicators
            and storage tracking.
          examples:
            - 1048576
            - 524288
        content_type:
          anyOf:
            - type: string
            - type: 'null'
          title: Content Type
          description: >-
            OPTIONAL. MIME type of the blob content. Used for proper
            content-type headers when serving files. Example: 'video/mp4',
            'image/jpeg', 'application/pdf'
          examples:
            - video/mp4
            - image/jpeg
            - image/png
            - application/pdf
        checksum:
          anyOf:
            - type: string
            - type: 'null'
          title: Checksum
          description: >-
            OPTIONAL. Checksum or hash of the blob content for integrity
            verification. Format: 'algorithm:hash' (e.g., 'sha256:abc123...')
          examples:
            - sha256:a1b2c3d4e5f6...
            - md5:9e107d9d372bb6826bd81d3542a419d6
        created_at:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Created At
          description: >-
            OPTIONAL. Timestamp when the blob was created or uploaded. ISO 8601
            format. Useful for tracking blob lifecycle and cleanup.
        source_blob_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Source Blob Id
          description: >-
            OPTIONAL. Cross-reference to the source blob ID from the bucket
            object. Used for lineage tracking: connects processed blobs back to
            their original sources. Example: A video segment references the
            original video blob that it was extracted from.
          examples:
            - blob_abc123
            - blob_xyz789
        presigned_url:
          anyOf:
            - type: string
              minLength: 1
              format: uri
            - type: 'null'
          title: Presigned Url
          description: >-
            RESPONSE ONLY. Time-limited HTTPS URL for direct access to the blob.
            Automatically generated by the API when documents are retrieved. NOT
            REQUIRED when creating blobs - leave empty, API will populate.
            Typically expires after 1 hour. Use for immediate media playback or
            download.
      type: object
      required:
        - field
        - url
      title: BlobURLRef
      description: >-
        Reference to a document-related blob with automatic presigned URL
        generation.


        Represents any file or asset associated with a document, including
        source media,

        processed outputs, thumbnails, and artifacts. All blobs with S3 URLs
        automatically

        receive presigned URLs when documents are retrieved from the API.


        Use Cases:
            - Track source media: Original videos, images, or files from buckets
            - Reference processed outputs: Video segments, extracted frames, thumbnails
            - Link artifacts: Generated reports, analysis results, derived media
            - Maintain lineage: Connect processed blobs back to their source blobs

        Role Definitions:
            - source: Original input media (e.g., uploaded video, bucket object)
            - processed: Derived/transformed content (e.g., video segments, extracted frames)
            - thumbnail: Preview images for visual content
            - artifact: Generated files (reports, analysis outputs)
            - aux: Supporting/auxiliary files

        Presigned URL Flow:
            1. During processing: Store S3 URLs in document_blobs (no presigned URL yet)
            2. During retrieval: API automatically adds presigned_url field to each blob
            3. Client receives: Both permanent S3 URL and time-limited HTTPS presigned URL

        Requirements:
            - field: REQUIRED. Semantic label for the blob (e.g., 'video_segment', 'thumbnail')
            - url: REQUIRED. S3 or HTTP URL to the blob content
            - role: OPTIONAL (defaults to 'source'). Semantic role for UX grouping and behavior
            - All other fields: OPTIONAL but recommended for better tracking
      examples:
        - content_type: video/mp4
          description: Original source video from bucket
          field: source_video
          role: source
          size_bytes: 10485760
          source_blob_id: blob_original123
          type: video
          url: s3://mixpeek-storage/ns_123/obj_456/original.mp4
        - content_type: video/mp4
          description: Processed video segment
          field: video_segment
          filename: segment_0.mp4
          object_key: ns_123/obj_456/segments/segment_0.mp4
          role: processed
          size_bytes: 524288
          source_blob_id: blob_original123
          type: video
          url: s3://mixpeek-storage/ns_123/obj_456/segments/segment_0.mp4
        - content_type: image/jpeg
          description: Thumbnail image
          field: thumbnail
          filename: thumb_0.jpg
          role: thumbnail
          size_bytes: 51200
          type: image
          url: s3://mixpeek-storage/ns_123/obj_456/thumbs/thumb_0.jpg
    # Shape of the `_internal` object referenced by DocumentResponse. No
    # `required` list is declared, every property accepts null, and
    # `additionalProperties: true` admits keys not listed here.
    InternalPayloadModel:
      properties:
        # --- Identity ---
        internal_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Internal Id
          description: Organization/tenant identifier for multi-tenancy isolation.
          examples:
            - org_abc123
        namespace_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Namespace Id
          description: Namespace identifier within the organization.
          examples:
            - ns_xyz789
        document_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Document Id
          description: Document identifier (also at root level for convenience).
          examples:
            - doc_f8966ff29c18e20c6b45e053
        collection_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Collection Id
          description: Collection identifier (also at root level for convenience).
          examples:
            - col_articles
            - col_video_frames
        # --- Timestamps ---
        # NOTE(review): both timestamps are typed as plain strings with no
        # `format: date-time`, although the descriptions say ISO 8601 and other
        # schemas in this file (LineageStep.timestamp, BlobURLRef.created_at)
        # do declare the format — confirm whether the omission is intentional.
        created_at:
          anyOf:
            - type: string
            - type: 'null'
          title: Created At
          description: ISO 8601 timestamp when document was created.
          examples:
            - '2025-10-31T10:00:00Z'
        updated_at:
          anyOf:
            - type: string
            - type: 'null'
          title: Updated At
          description: ISO 8601 timestamp when document was last updated.
          examples:
            - '2025-10-31T10:05:00Z'
        # --- Provenance (nullable references to other component schemas) ---
        lineage:
          anyOf:
            - $ref: '#/components/schemas/InternalLineageModel'
            - type: 'null'
          description: Document lineage and provenance tracking.
        processing:
          anyOf:
            - $ref: '#/components/schemas/InternalProcessingModel'
            - type: 'null'
          description: Processing history and provenance metadata.
        # --- Blob lists: arrays of free-form objects, or null ---
        source_blobs:
          anyOf:
            - items:
                additionalProperties: true
                type: object
              type: array
            - type: 'null'
          title: Source Blobs
          description: Blobs that constituted the original source object.
          examples:
            - - blob_id: blob_video_001
                blob_property: video
                blob_type: video
        document_blobs:
          anyOf:
            - items:
                additionalProperties: true
                type: object
              type: array
            - type: 'null'
          title: Document Blobs
          description: Blobs generated during document processing (thumbnails, etc.).
          examples:
            - - field: thumbnail
                role: thumbnail
                type: image
                url: s3://...
        source_details:
          anyOf:
            - items:
                additionalProperties: true
                type: object
              type: array
            - type: 'null'
          title: Source Details
          description: Enrichment tracking and source detail entries.
        # --- Content descriptors ---
        modality:
          anyOf:
            - type: string
            - type: 'null'
          title: Modality
          description: Content modality (text, image, video, audio, etc.).
          examples:
            - text
            - image
            - video
            - audio
        metadata:
          anyOf:
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Metadata
          description: >-
            System metadata including ingestion_status,
            feature_extractor_config_hash, and other processing-related
            information.
          examples:
            - feature_extractor_config_hash: abc123
              ingestion_status: COMPLETED
        mime_type:
          anyOf:
            - type: string
            - type: 'null'
          title: Mime Type
          description: MIME type of the source content.
          examples:
            - video/mp4
            - image/jpeg
            - text/plain
        size_bytes:
          anyOf:
            - type: integer
            - type: 'null'
          title: Size Bytes
          description: Size of the source content in bytes.
          examples:
            - 1024000
            - 5242880
        content_hash:
          anyOf:
            - type: string
            - type: 'null'
          title: Content Hash
          description: SHA256 hash of the source content for deduplication.
          examples:
            - e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
        # --- Access control (nullable reference to InternalACLModel) ---
        _acl:
          anyOf:
            - $ref: '#/components/schemas/InternalACLModel'
            - type: 'null'
          description: >-
            Access control list for document-level security. Controls which
            user-scoped API keys can read/write this document. Org-scoped keys
            bypass ACL entirely.
      additionalProperties: true
      type: object
      title: InternalPayloadModel
      description: >-
        Complete _internal field structure for Qdrant document payloads.


        All Mixpeek-managed system fields are namespaced under this structure
        to:

        - Prevent collision with user-defined fields

        - Provide clear separation of system vs user data

        - Enable filtering on internal fields via _internal.field_name paths


        This structure is stored in Qdrant and returned in API responses.
      # Sample payload for a text document sourced from a bucket.
      examples:
        - collection_id: col_articles
          created_at: '2025-10-31T10:00:00Z'
          document_id: doc_f8966ff29c
          internal_id: org_abc123
          lineage:
            path: bkt_content/col_articles
            root_bucket_id: bkt_content
            root_object_id: obj_article_001
            source_object_id: obj_article_001
            source_type: bucket
          metadata:
            ingestion_status: COMPLETED
          modality: text
          namespace_id: ns_xyz789
          updated_at: '2025-10-31T10:00:00Z'
    # Payload carried in ErrorResponse.error. `message` and `type` are
    # required; `code` and `details` are optional and nullable.
    ErrorDetail:
      title: ErrorDetail
      description: Error detail model.
      type: object
      required:
        - message
        - type
      properties:
        message:
          title: Message
          description: Human-readable error message
          type: string
        type:
          title: Type
          description: Stable error type identifier (machine-readable)
          type: string
        code:
          title: Code
          description: >-
            Fine-grained error code for programmatic handling
            (e.g., namespace_name_taken, feature_extractor_not_found).
            Present only when consumers may need to branch
            on a specific error condition.
          anyOf:
            - type: string
            - type: "null"
        details:
          title: Details
          description: >-
            Optional structured details to help debugging
            (validation errors, IDs, etc.)
          anyOf:
            - type: object
              additionalProperties: true
            - type: "null"
    # Single validation failure entry; all three fields are required.
    ValidationError:
      title: ValidationError
      type: object
      required:
        - loc
        - msg
        - type
      properties:
        loc:
          title: Location
          type: array
          # Each element is either a string or an integer.
          items:
            anyOf:
              - type: string
              - type: integer
        msg:
          title: Message
          type: string
        type:
          title: Error Type
          type: string
    # Dense embedding stored under a caller-chosen key.
    # Values are typed as arrays of numbers so the schema enforces what the
    # description states ("the value must be a list of floats"); this is the
    # same `additionalProperties` shape NamedDenseVectors uses.
    # NOTE(review): the description also says "a single key-value pair". No
    # minProperties/maxProperties bound is added here — confirm the server
    # behaviour before constraining the key count.
    DenseVector:
      properties: {}
      additionalProperties:
        items:
          type: number
        type: array
      type: object
      title: DenseVector
      description: |-
        Basic dense vector representation with flexible key naming.

        Accepts a single key-value pair where the key can be any string
        and the value must be a list of floats.

        Example:
        ```json
        {
            "embedding": [0.1, 0.2, 0.3, 0.4, 0.5]
        }
        ```
        or
        ```json
        {
            "my_custom_vector": [0.1, 0.2, 0.3, 0.4, 0.5]
        }
        ```
    # Sparse vector as two required arrays: `indices` and `values`.
    SparseVector:
      title: SparseVector
      description: |-
        Sparse vector representation with indices and values.

        Only non-zero elements are stored for efficiency.

        Example:
        ```json
        {
            "indices": [0, 2, 4],
            "values": [0.1, 0.3, 0.5]
        }
        ```
      type: object
      required:
        - indices
        - values
      properties:
        # NOTE(review): `anyOf: integer | number` admits any number, including
        # non-integral ones, as an index — confirm whether that is intended.
        indices:
          title: Indices
          description: Indices of non-zero elements
          type: array
          items:
            anyOf:
              - type: integer
              - type: number
        values:
          title: Values
          description: Values of non-zero elements
          type: array
          items:
            type: number
    # Multi-vector embedding stored under a caller-chosen key.
    # Values are typed as arrays of number arrays so the schema enforces what
    # the description states ("a list of lists of floats").
    # The description is written as a literal block (`|-`); it parses to the
    # same string as the previous folded form but keeps the JSON samples
    # readable in the source.
    # NOTE(review): the description also says "a single key-value pair". No
    # minProperties/maxProperties bound is added here — confirm the server
    # behaviour before constraining the key count.
    MultiDenseVector:
      properties: {}
      additionalProperties:
        items:
          items:
            type: number
          type: array
        type: array
      type: object
      title: MultiDenseVector
      description: |-
        Multi-dimensional dense vector representation with flexible key naming.

        Accepts a single key-value pair where the key can be any string
        and the value must be a list of lists of floats.
        Useful for late interaction models and other multi-dimensional embeddings.

        Example:
        ```json
        {
            "embeddings": [
                [0.1, 0.2, 0.3],
                [0.4, 0.5, 0.6],
                [0.7, 0.8, 0.9]
            ]
        }
        ```
        or
        ```json
        {
            "multi_vectors": [
                [0.1, 0.2, 0.3],
                [0.4, 0.5, 0.6],
                [0.7, 0.8, 0.9]
            ]
        }
        ```
    # Object whose every value is an array of numbers; keys are not fixed.
    NamedDenseVectors:
      title: NamedDenseVectors
      description: |-
        Root model mapping vector names → dense float lists.

        Accepts JSON like:
        ```json
        {
            "vector_a": [0.1, 0.2, 0.3],
            "vector_b": [0.4, 0.5, 0.6]
        }
        ```
      type: object
      additionalProperties:
        type: array
        items:
          type: number
    # Provenance record for a document. Every declared property is nullable,
    # none is listed as required, and undeclared keys are accepted.
    InternalLineageModel:
      properties:
        root_object_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Root Object Id
          description: >-
            Original object ID from bucket (root of decomposition tree). All
            documents derived from the same object share this ID.
          examples:
            - obj_video123
            - obj_document456
        root_bucket_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Root Bucket Id
          description: Bucket ID containing the root object.
          examples:
            - bkt_marketing
            - bkt_documents
        source_type:
          anyOf:
            - type: string
              enum:
                - bucket
                - collection
                - direct_upsert
            - type: 'null'
          title: Source Type
          # The description covers all three enum values so the prose matches
          # the enum above.
          # NOTE(review): the 'direct_upsert' wording is inferred from the value
          # name — confirm it against the service behaviour.
          description: >-
            Type of immediate parent source. 'bucket': Document created directly
            from bucket object (tier 1). 'collection': Document created from
            another collection (tier 2+). 'direct_upsert': Document written by a
            direct upsert rather than derived from a bucket object or another
            collection.
          examples:
            - bucket
            - collection
        # The next three properties identify the immediate parent; which ones
        # apply depends on source_type, as each description states.
        source_object_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Source Object Id
          description: Object ID of immediate parent when source_type='bucket'.
          examples:
            - obj_video123
        source_document_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Source Document Id
          description: Document ID of immediate parent when source_type='collection'.
          examples:
            - doc_frame050
        source_collection_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Source Collection Id
          description: Collection ID of immediate parent when source_type='collection'.
          examples:
            - col_video_frames
        path:
          anyOf:
            - type: string
            - type: 'null'
          title: Path
          description: Materialized lineage path string (e.g., 'bkt_123/col_456/col_789').
          examples:
            - bkt_marketing/col_video_frames
            - bkt_docs/col_chapters/col_paragraphs
        chain:
          anyOf:
            - items:
                additionalProperties: true
                type: object
              type: array
            - type: 'null'
          title: Chain
          description: >-
            Ordered list of processing steps from root object to this document.
            Each step contains: collection_id, feature_extractor_id,
            document_id, timestamp.
          # The example step carries every key the description names,
          # including document_id.
          examples:
            - - collection_id: col_frames
                feature_extractor_id: multimodal_extractor_v1
                document_id: doc_frame050
                timestamp: '2025-10-31T10:00:00Z'
      additionalProperties: true
      type: object
      title: InternalLineageModel
      description: |-
        Lineage tracking information for document provenance.

        Tracks the complete processing history from the original bucket object
        through all transformation stages in the decomposition tree.
    # Processing/provenance metadata. Every declared property is nullable,
    # none is listed as required, and undeclared keys are accepted.
    InternalProcessingModel:
      title: InternalProcessingModel
      description: |-
        Processing and provenance tracking information.

        Consolidates all processing-related metadata including source URLs,
        processing history, and taxonomy enrichment lineage.
      type: object
      additionalProperties: true
      properties:
        source_url:
          title: Source Url
          description: Original URL before S3 mirroring (for URL-based ingestion).
          anyOf:
            - type: string
            - type: 'null'
          examples:
            - https://example.com/video.mp4
        object_key_source:
          title: Object Key Source
          description: S3 key source identifier.
          anyOf:
            - type: string
            - type: 'null'
        detected_mime_type:
          title: Detected Mime Type
          description: MIME type detected during canonicalization.
          anyOf:
            - type: string
            - type: 'null'
          examples:
            - video/mp4
            - image/jpeg
            - application/pdf
        # Array of free-form objects, or null.
        history:
          title: History
          description: Processing steps history with timestamps and operations.
          anyOf:
            - type: array
              items:
                type: object
                additionalProperties: true
            - type: 'null'
          examples:
            - - operation: taxonomy_join
                taxonomy_ids_applied:
                  - tax_industries
                timestamp: '2025-10-22T11:03:22Z'
        # Array of free-form objects, or null.
        taxonomy_lineage:
          title: Taxonomy Lineage
          description: Taxonomy enrichment entries applied to this document.
          anyOf:
            - type: array
              items:
                type: object
                additionalProperties: true
            - type: 'null'
        # Single free-form object, or null.
        last_health_check:
          title: Last Health Check
          description: Last health check result (for batch processing).
          anyOf:
            - type: object
              additionalProperties: true
            - type: 'null'
    # Document-level ACL. Every declared property is nullable, none is listed
    # as required, and undeclared keys are accepted.
    InternalACLModel:
      title: InternalACLModel
      description: |-
        Access control list for document-level security (row-level security).

        Controls which end-user principals can read/write a document.
        Used with user-scoped API keys (keys with principal_id set).
        Org-scoped keys bypass ACL entirely.
      type: object
      additionalProperties: true
      properties:
        owner:
          title: Owner
          description: Principal ID of the document owner (the creator).
          anyOf:
            - type: string
            - type: 'null'
          examples:
            - user_alice
            - usr_12345
        # Array of strings, or null.
        read:
          title: Read
          description: List of principal IDs that can read this document.
          anyOf:
            - type: array
              items:
                type: string
            - type: 'null'
          examples:
            - - user_alice
              - user_bob
        # Array of strings, or null.
        write:
          title: Write
          description: List of principal IDs that can write/update this document.
          anyOf:
            - type: array
              items:
                type: string
            - type: 'null'
          examples:
            - - user_alice
        # Boolean or null; the declared default is false.
        public:
          title: Public
          description: If true, any user-scoped key can read this document.
          default: false
          anyOf:
            - type: boolean
            - type: 'null'

````