> ## Documentation Index
> Fetch the complete documentation index at: https://docs.mixpeek.com/docs/llms.txt
> Use this file to discover all available pages before exploring further.

# List Collections

> This endpoint allows you to list collections.



## OpenAPI

````yaml post /v1/collections/list
# OpenAPI 3.1 document describing the Mixpeek "List Collections" operation.
openapi: 3.1.0
info:
  title: Mixpeek API
  description: >-
    This is the Mixpeek API, providing access to various endpoints for data
    processing and retrieval.
  termsOfService: https://mixpeek.com/terms
  contact:
    name: Mixpeek Support
    url: https://mixpeek.com/contact
    email: info@mixpeek.com
  # Quoted so the version is read as the string "0.82", not as a float.
  version: '0.82'
servers:
  - url: https://api.mixpeek.com
    description: Production
# NOTE(review): an empty security requirement list means no authentication is
# declared, yet the operation below documents 401 and 403 responses - confirm
# whether a security scheme should be declared and referenced here.
security: []
paths:
  # POST /v1/collections/list: lists collections. Filter, sort and search
  # criteria travel in the JSON body; paging controls are query parameters.
  /v1/collections/list:
    post:
      tags:
        - Collections
      summary: List Collections
      description: This endpoint allows you to list collections.
      operationId: list_collections_v1_collections_list_post
      parameters:
        # NOTE(review): `limit` and `page_size` have identical constraints
        # (nullable integer, 1-1000) - presumably aliases; confirm which one is
        # canonical and how the server resolves both being set.
        - name: limit
          in: query
          required: false
          schema:
            anyOf:
              - type: integer
                maximum: 1000
                minimum: 1
              - type: 'null'
            title: Limit
        - name: page_size
          in: query
          required: false
          schema:
            anyOf:
              - type: integer
                maximum: 1000
                minimum: 1
              - type: 'null'
            title: Page Size
        # NOTE(review): `offset`, `page` and `cursor` all address a position in
        # the result set; precedence between them is not stated here - confirm.
        - name: offset
          in: query
          required: false
          schema:
            anyOf:
              - type: integer
                maximum: 10000
                minimum: 0
              - type: 'null'
            title: Offset
        - name: page
          in: query
          required: false
          schema:
            anyOf:
              - type: integer
                minimum: 1
              - type: 'null'
            title: Page
        - name: cursor
          in: query
          required: false
          description: >-
            Pagination cursor. Use the `next_cursor` value from the `pagination`
            object of a previous response to navigate through results.
          schema:
            anyOf:
              - type: string
              - type: 'null'
            title: Cursor
        - name: include_total
          in: query
          required: false
          description: >-
            When true, the total count fields of the `pagination` object are
            populated. Defaults to false.
          schema:
            type: boolean
            default: false
            title: Include Total
      # `required` is not set on the request body, so the body is optional.
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ListCollectionsRequest'
      # Every error status except 422 shares the ErrorResponse envelope; 422
      # uses HTTPValidationError.
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListCollectionsResponse'
        '400':
          description: Bad Request
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
        '401':
          description: Unauthorized
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
        '403':
          description: Forbidden
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
        '404':
          description: Not Found
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
        '422':
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
        '500':
          description: Internal Server Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
components:
  schemas:
    # JSON body accepted by the list operation. No property is listed as
    # required, so an empty object satisfies this schema.
    ListCollectionsRequest:
      title: ListCollectionsRequest
      type: object
      description: >-
        Request model for listing collections.


        To filter by taxonomy, use dot notation in filters:

        filters.AND = [{"field": "taxonomy_applications.taxonomy_id",
        "operator": "eq", "value": "tax_123"}]
      properties:
        # Free-form filter object (any keys allowed) or null.
        filters:
          title: Filters
          description: >-
            Filters to apply when listing collections. Supports nested field
            filtering like 'taxonomy_applications.taxonomy_id'. Format: {"AND":
            [{"field": "field_name", "operator": "eq", "value": "value"}]}
          anyOf:
            - type: object
              additionalProperties: true
            - type: 'null'
        # A single SortOption or null.
        sort:
          description: Sort options for the results
          anyOf:
            - $ref: '#/components/schemas/SortOption'
            - type: 'null'
        search:
          title: Search
          description: >-
            Search term for wildcard search across collection_id,
            collection_name, description, and other text fields
          anyOf:
            - type: string
            - type: 'null'
        case_sensitive:
          title: Case Sensitive
          description: If True, filters and search will be case-sensitive
          type: boolean
          default: false
    # Envelope returned by the 200 response of the list operation.
    ListCollectionsResponse:
      title: ListCollectionsResponse
      type: object
      description: Response model for listing collections.
      # NOTE(review): `total_count` is required here, while PaginationResponse
      # states that total count fields are only populated when
      # include_total=true - confirm this field is always computed.
      required:
        - results
        - pagination
        - total_count
      properties:
        results:
          title: Results
          description: List of collections
          type: array
          items:
            $ref: '#/components/schemas/CollectionResponse'
        pagination:
          description: Pagination information
          $ref: '#/components/schemas/PaginationResponse'
        total_count:
          title: Total Count
          description: Total number of collections matching the query
          type: integer
        # Optional: either a CollectionListStats object or null.
        stats:
          description: Aggregate statistics across all collections in the result
          anyOf:
            - $ref: '#/components/schemas/CollectionListStats'
            - type: 'null'
    # Error envelope referenced by the 400/401/403/404/500 responses of the
    # list operation.
    ErrorResponse:
      title: ErrorResponse
      type: object
      description: Error response model.
      required:
        - status
        - error
      properties:
        # Not in `required`; defaults to false.
        success:
          title: Success
          description: Always false for error responses
          type: boolean
          default: false
        status:
          title: Status
          description: HTTP status code for this error
          type: integer
        error:
          description: Error details payload
          $ref: '#/components/schemas/ErrorDetail'
      examples:
        - success: false
          status: 404
          error:
            type: NotFoundError
            message: Namespace not found
            details:
              resource: namespace
              id: ns_123
    # Body of the 422 response: an optional `detail` array of ValidationError
    # entries.
    HTTPValidationError:
      title: HTTPValidationError
      type: object
      properties:
        detail:
          title: Detail
          type: array
          items:
            $ref: '#/components/schemas/ValidationError'
    # Sort instruction used by ListCollectionsRequest.sort. Only `field` is
    # required; `direction` falls back to `asc`.
    SortOption:
      properties:
        field:
          type: string
          title: Field
          description: Field to sort by, supports dot notation for nested fields
          # `examples` (array) replaces the singular `example` keyword, which is
          # deprecated for Schema Objects in OpenAPI 3.1 and is not used by any
          # other schema in this document.
          examples:
            - created_at
        direction:
          $ref: '#/components/schemas/SortDirection'
          description: Sort direction
          default: asc
          examples:
            - desc
      type: object
      required:
        - field
      title: SortOption
      description: |-
        Specifies how to sort query results.

        Attributes:
            field: Field to sort by
            direction: Sort direction (ascending or descending)
    CollectionResponse:
      properties:
        # Identifier of the collection. Every example below follows the format
        # stated in the description: 'col_' followed by exactly 10 alphanumeric
        # characters.
        collection_id:
          type: string
          title: Collection Id
          description: >-
            NOT REQUIRED (auto-generated). Unique identifier for this
            collection. Used for: API paths, document queries, pipeline
            references. Format: 'col_' prefix + 10 random alphanumeric
            characters. Stable after creation - use for all collection
            references.
          examples:
            - col_a1b2c3d4e5
            - col_x7y8z9abc1
            - col_9f8e7d6c5b
        collection_name:
          type: string
          maxLength: 100
          minLength: 3
          title: Collection Name
          description: >-
            REQUIRED. Human-readable name for the collection. Must be unique
            within the namespace. Used for: Display, lookups (can query by name
            or ID), organization. Format: Alphanumeric with underscores/hyphens,
            3-100 characters. Examples: 'product_embeddings', 'video_frames',
            'customer_documents'.
          examples:
            - video_frames
            - product_embeddings
            - customer_documents
        description:
          anyOf:
            - type: string
            - type: 'null'
          title: Description
          description: >-
            NOT REQUIRED. Human-readable description of the collection's
            purpose. Use for: Documentation, team communication, UI display.
            Common pattern: Describe what the collection contains and what
            processing is applied.
          examples:
            - Video frames extracted at 1 FPS with CLIP embeddings
            - Product catalog with image embeddings and metadata
        input_schema:
          anyOf:
            - $ref: '#/components/schemas/BucketSchema-Output'
            - type: 'null'
          description: >-
            Auto-computed from source. Input schema defining fields available to
            the feature extractor. Source: bucket.bucket_schema (if
            source.type='bucket') OR upstream_collection.output_schema (if
            source.type='collection'). Determines: Which fields can be used in
            input_mappings and field_passthrough. This is the 'left side' of the
            transformation - what data goes IN. Format: BucketSchema with
            properties dict. Use for: Validating input_mappings, configuring
            field_passthrough.
          examples:
            - properties:
                title:
                  type: string
                content:
                  type: text
            - properties:
                video:
                  type: video
                campaign_id:
                  type: string
        output_schema:
          anyOf:
            - $ref: '#/components/schemas/BucketSchema-Output'
            - type: 'null'
          description: >-
            Auto-computed at creation. Output schema defining fields in final
            documents. Computed as: field_passthrough fields + extractor output
            fields (deterministic). Known IMMEDIATELY when collection is created
            - no waiting for documents! This is the 'right side' of the
            transformation - what data comes OUT. Use for: Understanding
            document structure, building queries, schema validation. Example:
            {title (passthrough), embedding (extractor output)} = output_schema.
          examples:
            - properties:
                title:
                  type: string
                text_extractor_v1_embedding:
                  type: array
            - properties:
                campaign_id:
                  type: string
                multimodal_extractor_v1_embedding:
                  type: array
        # The single extractor configured for this collection; one of the
        # required properties of CollectionResponse.
        feature_extractor:
          $ref: >-
            #/components/schemas/shared__collection__features__extractors__models__FeatureExtractorConfig-Output
          description: >-
            REQUIRED. Single feature extractor configuration for this
            collection. Defines: extractor name/version, input_mappings,
            field_passthrough, parameters. Each collection has exactly ONE
            extractor (multiple extractors per collection were previously
            supported). For multiple extractors: Create multiple collections and
            use collection-to-collection pipelines. Use field_passthrough to
            include additional source fields beyond extractor outputs.
          examples:
            - feature_extractor_name: text_extractor
              field_passthrough:
                - source_path: title
              input_mappings:
                text: content
              version: v1
        source:
          $ref: '#/components/schemas/SourceConfig-Output'
          description: >-
            REQUIRED. Source configuration defining where data comes from. Type
            'bucket': Process objects from one or more buckets (tier 1). Type
            'collection': Process documents from upstream collection(s) (tier
            2+). For multi-bucket sources, all buckets must have compatible
            schemas. For multi-collection sources, all collections must have
            compatible schemas. Determines input_schema and enables
            decomposition trees.
          examples:
            - bucket_ids:
                - bkt_articles
              type: bucket
            - bucket_ids:
                - bkt_us_products
                - bkt_eu_products
              type: bucket
            - collection_id: col_video_frames
              type: collection
            - collection_ids:
                - col_products_2023
                - col_products_2024
              type: collection
        source_bucket_schemas:
          anyOf:
            - additionalProperties:
                $ref: '#/components/schemas/BucketSchema-Output'
              type: object
            - type: 'null'
          title: Source Bucket Schemas
          description: >-
            NOT REQUIRED (auto-computed). Snapshot of bucket schemas at
            collection creation. Only populated for multi-bucket collections
            (source.type='bucket' with multiple bucket_ids). Key: bucket_id,
            Value: BucketSchema at time of collection creation. Used for: Schema
            compatibility validation, document lineage, debugging. Schema
            snapshot is immutable - bucket schema changes after collection
            creation do not affect this. Single-bucket collections may omit this
            field (schema in input_schema is sufficient).
          examples:
            - null
            - bkt_eu_products:
                properties:
                  image:
                    type: image
                  title:
                    type: string
                  price:
                    type: number
                required:
                  - image
                  - title
              bkt_us_products:
                properties:
                  image:
                    type: image
                  title:
                    type: string
                  price:
                    type: number
                required:
                  - image
                  - title
        # Array of SingleLineageEntry objects describing the processing chain.
        # NOTE(review): the example entry uses `bucket_id` (singular), whereas
        # the examples on the `source` property use `bucket_ids` (list) -
        # confirm against SingleLineageEntry / SourceConfig-Output.
        source_lineage:
          items:
            $ref: '#/components/schemas/SingleLineageEntry'
          type: array
          title: Source Lineage
          description: >-
            NOT REQUIRED (auto-computed). Lineage chain showing complete
            processing history. Each entry contains: source_config,
            feature_extractor, output_schema for one tier. Length indicates
            processing depth (1 = tier 1, 2 = tier 2, etc.). Use for:
            Understanding multi-tier pipelines, visualizing decomposition trees.
          examples:
            - []
            - - output_schema: {}
                source_config:
                  bucket_id: bkt_videos
                  type: bucket
        # `items: {}` places no constraint on array entries; the second example
        # shows objects with `distance`, `name` and `size` keys.
        # NOTE(review): consider referencing a typed item schema if one exists.
        vector_indexes:
          items: {}
          type: array
          title: Vector Indexes
          description: >-
            NOT REQUIRED (auto-computed from extractor). Vector indexes for
            semantic search. Populated from
            feature_extractor.required_vector_indexes. Defines: Which embeddings
            are indexed, dimensions, distance metrics. Use for: Understanding
            search capabilities, debugging vector queries.
          examples:
            - []
            - - distance: cosine
                name: text_extractor_v1_embedding
                size: 1024
        # `items: {}` places no constraint on array entries; the second example
        # shows objects with `field_name` and `type` keys.
        # NOTE(review): consider referencing a typed item schema if one exists.
        payload_indexes:
          items: {}
          type: array
          title: Payload Indexes
          description: >-
            NOT REQUIRED (auto-computed from extractor + namespace). Payload
            indexes for filtering. Enables efficient filtering on metadata
            fields, timestamps, IDs. Populated from: extractor requirements +
            namespace defaults. Use for: Understanding which fields support fast
            filtering.
          examples:
            - []
            - - field_name: collection_id
                type: keyword
        # Nullable free-form string.
        # NOTE(review): the description lists five accepted values but the
        # schema declares no `enum` - confirm whether the list is exhaustive.
        embedding_task:
          anyOf:
            - type: string
            - type: 'null'
          title: Embedding Task
          description: >-
            Override the embedding task hint for instruction-aware models (E5,
            Gemini). Defaults to 'retrieval_document' for indexing pipelines.
            Values: retrieval_document, retrieval_query, semantic_similarity,
            classification, clustering. Applied to all task-aware embedding
            models in this collection's extractor pipeline.
        enabled:
          type: boolean
          title: Enabled
          description: >-
            NOT REQUIRED (defaults to True). Whether the collection accepts new
            documents. False: Collection exists but won't process new objects.
            True: Active and processing. Use for: Temporarily disabling
            collections without deletion.
          default: true
          examples:
            - true
            - false
        metadata:
          anyOf:
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Metadata
          description: >-
            NOT REQUIRED. Additional user-defined metadata for the collection.
            Arbitrary key-value pairs for custom organization, tracking,
            configuration. Not used by the platform - purely for user purposes.
            Common uses: team ownership, project tags, deployment environment.
          examples:
            - environment: production
              project: Q4_campaign
              team: data-science
            - null
        schedule:
          anyOf:
            - $ref: '#/components/schemas/CollectionScheduleConfig'
            - type: 'null'
          description: >-
            NOT REQUIRED. Schedule configuration for automatic re-processing.
            When set, a COLLECTION_TRIGGER trigger is created and linked to this
            collection.
        trigger_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Trigger Id
          description: >-
            NOT REQUIRED. ID of the linked trigger for scheduled re-processing.
            Automatically set when a schedule is configured.
        created_at:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Created At
          description: >-
            Timestamp when the collection was created. Automatically set by the
            system when the collection is first saved to the database.
        updated_at:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Updated At
          description: >-
            Timestamp when the collection was last updated. Automatically
            updated by the system whenever the collection is modified.
        # Plain string defaulting to `active`.
        # NOTE(review): the description names three states ('active', 'cold',
        # 'archived') but no `enum` is declared, unlike `schema_validation` -
        # confirm whether other values can occur.
        lifecycle_state:
          type: string
          title: Lifecycle State
          description: >-
            Storage lifecycle state: 'active' (Qdrant + S3), 'cold' (S3 only),
            'archived' (metadata only). Managed via lifecycle API.
          default: active
        s3_vector_index:
          anyOf:
            - type: string
            - type: 'null'
          title: S3 Vector Index
          description: >-
            S3 Vectors index name for this collection (e.g.
            'col_{collection_id}').
        last_lifecycle_transition:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Last Lifecycle Transition
          description: Timestamp of the most recent lifecycle state change.
        tiering_rules:
          anyOf:
            - items:
                $ref: '#/components/schemas/TieringRule'
              type: array
            - type: 'null'
          title: Tiering Rules
          description: Automatic storage tiering rules (not enforced in V1).
        document_count:
          anyOf:
            - type: integer
            - type: 'null'
          title: Document Count
          description: Number of documents in the collection
        schema_version:
          type: integer
          title: Schema Version
          description: >-
            Version number for the output_schema. Increments automatically when
            schema is updated via document sampling. Used to track schema
            evolution and trigger downstream collection schema updates.
          default: 1
        last_schema_sync:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Last Schema Sync
          description: >-
            Timestamp of last automatic schema sync from document sampling. Used
            to debounce schema updates (prevents thrashing).
        schema_sync_enabled:
          type: boolean
          title: Schema Sync Enabled
          description: >-
            Whether automatic schema discovery and sync is enabled for this
            collection. When True, schema is periodically updated by sampling
            documents. When False, schema remains fixed at creation time.
          default: true
        document_schema:
          anyOf:
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Document Schema
          description: >-
            NOT REQUIRED. JSON Schema for validating documents on create/update.
            When set with schema_validation='strict', non-conforming documents
            are rejected (422). When set with schema_validation='warn',
            violations are recorded but document is accepted. Schema follows
            JSON Schema draft-07 format. Only validates user-defined fields
            (system fields like _internal, collection_id, document_id are
            excluded from validation).
          examples:
            - null
            - additionalProperties: true
              properties:
                title:
                  maxLength: 200
                  type: string
                price:
                  minimum: 0
                  type: number
                status:
                  enum:
                    - active
                    - draft
                    - archived
                  type: string
              required:
                - title
                - price
              type: object
        # Three-valued validation mode. 'off' is quoted wherever it appears
        # because an unquoted `off` is read as boolean false by YAML 1.1 parsers.
        schema_validation:
          type: string
          enum:
            - strict
            - warn
            - 'off'
          title: Schema Validation
          description: >-
            Document schema validation mode. 'strict': reject non-conforming
            documents with 422. 'warn': accept but attach _schema_violations
            field to the document. 'off': no validation (default, preserves
            current behavior).
          default: 'off'
          examples:
            - 'off'
            - strict
            - warn
        taxonomy_applications:
          anyOf:
            - items:
                $ref: '#/components/schemas/TaxonomyApplicationConfig-Output'
              type: array
            - type: 'null'
          title: Taxonomy Applications
          description: >-
            NOT REQUIRED. List of taxonomies to apply to documents in this
            collection. Each entry specifies: taxonomy_id, optional
            target_collection_id, optional filters. Enrichments are materialized
            (persisted to documents) during ingestion. Empty/null if no
            taxonomies attached. Use for: Categorization, hierarchical
            classification.
          examples:
            - null
            - - execution_mode: materialize
                taxonomy_id: tax_categories
              - execution_mode: materialize
                taxonomy_id: tax_brands
        cluster_applications:
          anyOf:
            - items:
                $ref: '#/components/schemas/ClusterApplicationConfig'
              type: array
            - type: 'null'
          title: Cluster Applications
          description: >-
            NOT REQUIRED. List of clusters to automatically execute when batch
            processing completes. Each entry specifies: cluster_id,
            auto_execute_on_batch, min_document_threshold, cooldown_seconds.
            Clusters enrich source documents with cluster assignments
            (cluster_id, cluster_label, etc.). Empty/null if no clusters
            attached. Use for: Segmentation, grouping, pattern discovery.
          examples:
            - null
            - - auto_execute_on_batch: true
                cluster_id: clust_product_categories
                cooldown_seconds: 3600
                min_document_threshold: 100
        alert_applications:
          anyOf:
            - items:
                $ref: '#/components/schemas/AlertApplicationConfig-Output'
              type: array
            - type: 'null'
          title: Alert Applications
          description: >-
            NOT REQUIRED. List of alerts to automatically execute when documents
            are ingested. Each entry specifies: alert_id, execution_mode,
            input_mappings, execution_phase, priority. Alerts run retrievers on
            ingested documents and send notifications when matches are found.
            Empty/null if no alerts attached. Use for: Content monitoring,
            safety detection, compliance alerts.
          examples:
            - null
            - - alert_id: alt_safety_001
                execution_mode: on_ingest
                execution_phase: 3
                input_mappings:
                  - input_key: query_embedding
                    source:
                      path: features.video_embedding
                      source_type: document_field
                priority: 100
        retriever_enrichments:
          anyOf:
            - items:
                $ref: '#/components/schemas/RetrieverEnrichmentConfig-Output'
              type: array
            - type: 'null'
          title: Retriever Enrichments
          description: >-
            NOT REQUIRED. List of retriever enrichments to run on documents
            during post-processing. Each entry specifies: retriever_id,
            input_mappings, write_back_fields, execution_phase, priority.
            Retriever enrichments execute a retriever pipeline and write
            selected result fields back to documents. Empty/null if no
            enrichments attached. Use for: LLM classification, cross-collection
            joins, enrichment.
          examples:
            - null
            - - execution_phase: 4
                input_mappings:
                  - input_key: query
                    source:
                      path: title
                      source_type: document_field
                priority: 0
                retriever_id: ret_classifier_001
                write_back_fields:
                  - mode: first
                    source_field: category
                    target_field: _enrichment_category
        vector_count:
          anyOf:
            - type: integer
            - type: 'null'
          title: Vector Count
          description: >-
            Total number of vector entries across all documents in this
            collection. Computed as document_count * number_of_vector_indexes.
            Each document stores one vector per configured vector index (e.g. a
            collection with 1 embedding produces 1 vector per document).
        taxonomy_count:
          anyOf:
            - type: integer
            - type: 'null'
          title: Taxonomy Count
          description: Number of taxonomies connected to this collection
        retriever_count:
          anyOf:
            - type: integer
            - type: 'null'
          title: Retriever Count
          description: Number of retrievers connected to this collection
      type: object
      required:
        - collection_name
        - feature_extractor
        - source
      title: CollectionResponse
      description: Response model for collection endpoints.
      # Three illustrative CollectionResponse payloads: a text collection, a
      # video-frame collection, and a collection sourced from another collection.
      # NOTE(review): the bucket-sourced examples use `source.bucket_id`
      # (singular), whereas the examples on the `source` property use
      # `bucket_ids` (list) - confirm against SourceConfig-Output.
      # NOTE(review): the third example reads from `col_video_frames`, but the
      # `video_frames` example before it has collection_id `col_xyz789abc` -
      # confirm whether the two are meant to be linked.
      examples:
        - collection_id: col_a1b2c3d4e5
          collection_name: article_embeddings
          description: >-
            Simple text collection: News articles with text embeddings from
            bucket source
          enabled: true
          feature_extractor:
            feature_extractor_name: text_extractor
            field_passthrough:
              - source_path: title
            input_mappings:
              text: content
            version: v1
          input_schema:
            properties:
              title:
                type: string
              content:
                type: text
          output_schema:
            properties:
              title:
                type: string
              text_extractor_v1_embedding:
                type: array
          source:
            bucket_id: bkt_articles
            type: bucket
        - collection_id: col_xyz789abc
          collection_name: video_frames
          description: >-
            Video frames: Extracted at 1 FPS with CLIP embeddings and
            campaign_id passthrough
          enabled: true
          feature_extractor:
            feature_extractor_name: multimodal_extractor
            field_passthrough:
              - source_path: campaign_id
            input_mappings:
              video: video
            parameters:
              fps: 1
            version: v1
          input_schema:
            properties:
              video:
                type: video
              campaign_id:
                type: string
          output_schema:
            properties:
              campaign_id:
                type: string
              multimodal_extractor_v1_embedding:
                type: array
          source:
            bucket_id: bkt_marketing_videos
            type: bucket
        - collection_id: col_scenes_def
          collection_name: detected_scenes
          description: >-
            Tier 2 decomposition: Scenes detected from frame embeddings
            (collection source)
          enabled: true
          feature_extractor:
            feature_extractor_name: scene_extractor
            field_passthrough:
              - source_path: campaign_id
            input_mappings:
              embedding: multimodal_extractor_v1_embedding
            version: v1
          input_schema:
            properties:
              campaign_id:
                type: string
              multimodal_extractor_v1_embedding:
                type: array
          output_schema:
            properties:
              campaign_id:
                type: string
              scene_extractor_v1_embedding:
                type: array
          source:
            collection_id: col_video_frames
            type: collection
    # Paging metadata referenced by ListCollectionsResponse.pagination. Every
    # property is nullable and none is required.
    PaginationResponse:
      title: PaginationResponse
      type: object
      description: |-
        PaginationResponse.

        Cursor-based pagination response:
        - Use next_cursor for navigation
        - Total count fields only populated when include_total=true
      properties:
        total:
          title: Total
          anyOf:
            - type: integer
            - type: 'null'
        page:
          title: Page
          anyOf:
            - type: integer
            - type: 'null'
        page_size:
          title: Page Size
          anyOf:
            - type: integer
            - type: 'null'
        total_pages:
          title: Total Pages
          anyOf:
            - type: integer
            - type: 'null'
        # Unlike `page`, the next/previous page values are typed as strings.
        next_page:
          title: Next Page
          anyOf:
            - type: string
            - type: 'null'
        previous_page:
          title: Previous Page
          anyOf:
            - type: string
            - type: 'null'
        next_cursor:
          title: Next Cursor
          anyOf:
            - type: string
            - type: 'null'
    # Aggregate counters for a list of collections. Every property declares a
    # default of 0 and nothing is required.
    CollectionListStats:
      title: CollectionListStats
      type: object
      description: Aggregate statistics for a list of collections.
      properties:
        total_documents:
          title: Total Documents
          type: integer
          default: 0
          description: Total number of documents across all collections
        # The only non-integer counter in this schema (type: number).
        avg_documents_per_collection:
          title: Avg Documents Per Collection
          type: number
          default: 0
          description: Average number of documents per collection
        collections_with_taxonomies:
          title: Collections With Taxonomies
          type: integer
          default: 0
          description: Number of collections with taxonomy applications
        total_feature_extractors:
          title: Total Feature Extractors
          type: integer
          default: 0
          description: Total number of feature extractors across all collections
        total_taxonomies:
          title: Total Taxonomies
          type: integer
          default: 0
          description: Total number of taxonomy connections across all collections
        total_retrievers:
          title: Total Retrievers
          type: integer
          default: 0
          description: Total number of retriever connections across all collections
    # Error payload schema. `message` and `type` are required; `code` and
    # `details` are nullable extras.
    ErrorDetail:
      title: ErrorDetail
      type: object
      description: Error detail model.
      required: [message, type]
      properties:
        message:
          title: Message
          type: string
          description: Human-readable error message
        type:
          title: Type
          type: string
          description: Stable error type identifier (machine-readable)
        code:
          title: Code
          anyOf:
            - type: string
            - type: 'null'
          description: >-
            Fine-grained error code for programmatic handling
            (e.g., namespace_name_taken, feature_extractor_not_found).
            Present only when consumers may need to branch on a specific
            error condition.
        details:
          title: Details
          anyOf:
            # Free-form object: any keys are accepted.
            - type: object
              additionalProperties: true
            - type: 'null'
          description: >-
            Optional structured details to help debugging
            (validation errors, IDs, etc.)
    # Validation error entry. All three properties are required.
    ValidationError:
      title: ValidationError
      type: object
      required: [loc, msg, type]
      properties:
        # Array whose items may be strings or integers.
        loc:
          title: Location
          type: array
          items:
            anyOf:
              - type: string
              - type: integer
        msg:
          title: Message
          type: string
        type:
          title: Error Type
          type: string
    # String enum with exactly two members.
    SortDirection:
      title: SortDirection
      description: Sort direction options.
      type: string
      enum: [asc, desc]
    # Bucket schema wrapper. The nested `properties` map is the only required
    # key; additional top-level keys are allowed (additionalProperties: true).
    BucketSchema-Output:
      title: BucketSchema
      type: object
      required: [properties]
      additionalProperties: true
      description: >-
        Schema definition for bucket objects.


        IMPORTANT: The bucket schema defines what fields your bucket objects
        will have.

        This schema is REQUIRED if you want to:

        1. Create collections that use input_mappings to process your bucket
        data

        2. Validate object structure before ingestion

        3. Enable type-safe data pipelines


        The schema defines the custom fields that will be used in:

        - Blob properties (e.g., "content", "thumbnail", "transcript")

        - Object metadata structure

        - Blob data structures


        Example workflow:

        1. Create bucket WITH schema defining your data structure

        2. Upload objects that conform to that schema

        3. Create collections that map schema fields to feature extractors


        Without a bucket_schema, collections cannot use input_mappings.
      properties:
        # Map of field name -> BucketSchemaField-Output definition.
        properties:
          title: Properties
          type: object
          additionalProperties:
            $ref: '#/components/schemas/BucketSchemaField-Output'
          description: >-
            REQUIRED. Map of field names to their type definitions. Each field
            must have a 'type' from the supported types: metadata types (string,
            number, integer, float, boolean, object, array, date, datetime) or
            file/blob types (text, image, audio, video, pdf, excel). NOTE: Use
            Mixpeek types, NOT JSON Schema types — e.g. use 'string' not
            'keyword', 'text' for text blobs, 'image' for image blobs. Example:
            {"title": {"type": "string"}, "photo": {"type": "image"}}
    # Feature-extractor configuration schema. Undeclared keys are rejected
    # (additionalProperties: false), so the schema-level examples at the bottom
    # must only use the properties declared here.
    shared__collection__features__extractors__models__FeatureExtractorConfig-Output:
      properties:
        feature_extractor_name:
          type: string
          title: Feature Extractor Name
          description: Name of the feature extractor
        version:
          type: string
          title: Version
          description: Version of the feature extractor (e.g., 'v1', 'v2')
        # Two separate parameter bags are declared: the free-form `params`
        # object here and the typed `parameters` union below.
        params:
          anyOf:
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Params
          description: >-
            Optional extractor parameters that affect vector index
            configuration. Parameters set here are locked at namespace creation
            and determine vector dimensions in Qdrant. Collections using this
            extractor must use compatible params. Example: {'model':
            'siglip_base'}
        parameters:
          # One of the per-extractor parameter schemas, or null.
          anyOf:
            - $ref: '#/components/schemas/AudioFingerprintExtractorParams'
            - $ref: '#/components/schemas/DocumentGraphExtractorParams'
            - $ref: '#/components/schemas/FaceIdentityExtractorParams'
            - $ref: '#/components/schemas/GeminiMultifileExtractorParams'
            - $ref: '#/components/schemas/ImageExtractorParams'
            - $ref: '#/components/schemas/MultimodalExtractorParams'
            - $ref: '#/components/schemas/PassthroughExtractorParams'
            - $ref: '#/components/schemas/ScrollingTextExtractorParams'
            - $ref: '#/components/schemas/TextExtractorParams'
            - $ref: '#/components/schemas/UniversalExtractorParams'
            - $ref: '#/components/schemas/WebScraperExtractorParams'
            - $ref: '#/components/schemas/CustomPluginParams'
            - type: 'null'
          title: Parameters
          description: >-
            Parameters for the feature extractor. Each extractor type has
            specific parameters. See the schema for your chosen extractor (e.g.,
            MultimodalExtractorParams for multimodal_extractor).
          examples:
            - split_method: time
              time_split_interval: 10
            - chunk_size: 512
              split_by: sentences
        input_mappings:
          # Either a name -> path (or list of paths) map, or a list of
          # InputMapping objects.
          anyOf:
            - additionalProperties:
                anyOf:
                  - type: string
                  - items:
                      type: string
                    type: array
              type: object
            - items:
                $ref: '#/components/schemas/InputMapping'
              type: array
          title: Input Mappings
          description: >-
            Mapping from extractor input names to source field paths. Tells the
            extractor which source fields to process. Single value: {'image':
            'thumbnail_url'} maps one blob field to one extractor input. Array
            value: {'files': ['image', 'spec_pdf', 'description']} maps multiple
            blob fields to a single extractor input as a list — used by
            multi-file extractors like gemini_multifile_extractor that embed all
            blobs of an object into one embedding.
          examples:
            - image: product_image
              text: title
            - video: video_url
            - text: content
            - files:
                - hero_image
                - spec_sheet
                - description
        field_passthrough:
          items:
            $ref: '#/components/schemas/FieldPassthrough'
          type: array
          title: Field Passthrough
          description: >-
            NOT REQUIRED. List of specific fields to pass through from source to
            output documents. These fields are included alongside
            extractor-computed features (embeddings, detections, etc.). Empty
            list = only extractor outputs in documents (default behavior). With
            entries = specified fields + extractor outputs in documents.


            How It Works:

            1. During processing, fields are extracted from source
            object/document

            2. They appear in output documents at the root level

            3. Field filtering happens automatically (only listed fields
            included)

            4. Use target_path to rename fields for cleaner schemas


            Common Use Cases:

            - Preserve identifiers: campaign_id, product_sku, order_id

            - Keep metadata: category, tags, author, created_at

            - Enable filtering: department, status, priority, region

            - Maintain context: title, description, source_url


            Behavior:

            - Works with include_all_source_fields=False (default): ONLY these
            fields included

            - Works with include_all_source_fields=True: These configs used for
            renaming/defaults

            - Fields must exist in source bucket_schema or upstream collection
            output_schema

            - Missing optional fields are omitted (unless default provided)

            - Missing required fields cause processing errors


            Output Schema:

            output_schema = field_passthrough fields + extractor output fields

            Example: ['title', 'category', 'text_extractor_v1_embedding']
          examples:
            - - required: true
                source_path: title
              - required: true
                source_path: campaign_id
            - - default: uncategorized
                source_path: category
              - source_path: metadata.author
                target_path: author
        include_all_source_fields:
          type: boolean
          title: Include All Source Fields
          description: >-
            NOT REQUIRED. Whether to include ALL fields from source
            object/document in output. Default: False (only field_passthrough
            fields included).


            When False (RECOMMENDED):

            - Only fields listed in field_passthrough are included in output

            - Creates clean, predictable output schemas

            - Prevents data leakage of unwanted fields

            - Output = field_passthrough fields + extractor outputs


            When True (USE WITH CAUTION):

            - ALL source fields are included in output documents

            - field_passthrough still used for renaming/defaults/requirements

            - Can result in large documents if source has many fields

            - Can leak sensitive or unnecessary data

            - Output = all source fields + extractor outputs


            Use True When:

            - You want to preserve complete source data

            - Source has limited, well-defined fields

            - Downstream processing needs all context


            Use False When (MOST CASES):

            - You want clean, controlled output schemas

            - Source has many fields you don't need

            - You want explicit field selection

            - You're concerned about document size


            Examples:

            False: source={a,b,c,d} + passthrough=[a,b] → output={a,b,embedding}

            True:  source={a,b,c,d} + passthrough=[a→x] →
            output={x,b,c,d,embedding}
          default: false
        # Read-only, yet also listed under `required` below.
        feature_extractor_id:
          type: string
          title: Feature Extractor Id
          description: >-
            Unique identifier for the feature extractor instance, constructed
            from name + version.
          readOnly: true
      additionalProperties: false
      type: object
      required:
        - feature_extractor_name
        - version
        - feature_extractor_id
      title: FeatureExtractorConfig
      description: >-
        Configuration for a feature extractor with field passthrough support.


        A feature extractor processes source data (from buckets or collections)
        and

        produces features (embeddings, extracted text, detected objects, etc.).


        With field passthrough, you can also include selected source fields in
        the

        output documents alongside the computed features.


        Core Concepts:
            1. **Feature Extraction**: Extractors compute features from input data
               (e.g., text → embeddings, image → detections, video → scenes)
            2. **Field Passthrough**: Selectively preserve source fields in output
               (e.g., title, category, campaign_id from source → output documents)
            3. **Output Schema**: Combination of passed-through fields + extractor outputs
               (e.g., {title, category, text_embedding} all in one document)

        How Field Passthrough Works:
            1. Define which source fields to include via field_passthrough list
            2. During processing, these fields are extracted from source
            3. They appear in output documents at root level
            4. Combine with extractor outputs for complete documents
            5. Use target_path to rename fields for cleaner schemas

        Field Selection Modes:
            - **Explicit** (field_passthrough + include_all_source_fields=False):
              Only listed fields pass through. Clean, controlled output.
              Example: passthrough=[title, category] → output has ONLY title, category, embedding

            - **Inclusive** (include_all_source_fields=True):
              All source fields pass through, field_passthrough for renaming.
              Example: source has 10 fields → output has all 10 + embedding

            - **None** (no field_passthrough):
              Only extractor outputs in documents.
              Example: → output has ONLY embedding (no source fields)

        Use Cases:
            - **Preserve Identifiers**: Keep campaign_id, product_sku, order_id for tracking
            - **Enable Filtering**: Pass category, status, department for query filters
            - **Maintain Context**: Include title, description for display
            - **Track Metadata**: Preserve author, created_at, source for lineage
            - **Business Logic**: Keep priority, region, type for application logic

        Common Patterns:
            1. **Minimal Passthrough** (recommended):
               field_passthrough=[{"source_path": "id"}], include_all_source_fields=False
               → Clean output, only ID + extractor features

            2. **Metadata Preservation**:
               field_passthrough=[
                   {"source_path": "title"},
                   {"source_path": "category"},
                   {"source_path": "created_at"}
               ]
               → Document has context for display and filtering

            3. **Field Renaming**:
               field_passthrough=[
                   {"source_path": "doc_title", "target_path": "title"},
                   {"source_path": "metadata.author", "target_path": "author"}
               ]
               → Cleaner output schema with flattened fields

            4. **Required Fields**:
               field_passthrough=[
                   {"source_path": "campaign_id", "required": True},
                   {"source_path": "priority", "default": 0}
               ]
               → Ensures critical fields always present

        Requirements:
            - feature_extractor_name: REQUIRED - name of the extractor
            - version: REQUIRED - extractor version (e.g., "v1")
            - parameters: NOT REQUIRED - extractor-specific config (model, thresholds, etc.)
            - input_mappings: NOT REQUIRED - maps extractor inputs to source fields
            - field_passthrough: NOT REQUIRED - which source fields to preserve (default: none)
            - include_all_source_fields: NOT REQUIRED - preserve all fields (default: false)
      # Example labels live in comments rather than in a `description` key:
      # this schema sets additionalProperties: false, so an undeclared
      # `description` key would make each example invalid against it.
      # NOTE(review): the examples omit the read-only feature_extractor_id even
      # though it is listed as required — confirm whether they should carry it.
      examples:
        # Text extractor with field passthrough
        - feature_extractor_name: text_extractor
          field_passthrough:
            - required: true
              source_path: title
            - source_path: author
          input_mappings:
            text: content
          parameters:
            model: text-embedding-3-small
          version: v1
        # Video extractor with campaign metadata
        - feature_extractor_name: multimodal_extractor
          field_passthrough:
            - source_path: campaign_id
            - source_path: duration_seconds
          input_mappings:
            video: video_url
          parameters:
            fps: 1
          version: v1
        # Image extractor with all source fields
        - feature_extractor_name: image_extractor
          include_all_source_fields: true
          input_mappings:
            image: product_image
          parameters:
            model: clip-vit-base-patch32
          version: v1
        # Text extractor with field renaming
        - feature_extractor_name: text_extractor
          field_passthrough:
            - source_path: metadata.author
              target_path: author
            - source_path: metadata.created_at
              target_path: created_at
          input_mappings:
            text: content
          version: v1
    # Source selector for a collection. Only `type` is schema-required; every
    # other property is nullable. The property descriptions state which ones
    # apply to the bucket variant and which to the collection variant.
    SourceConfig-Output:
      properties:
        # Discriminator; the descriptions and examples use 'bucket' and
        # 'collection' (the SourceType schema itself is defined elsewhere).
        type:
          $ref: '#/components/schemas/SourceType'
          description: >-
            REQUIRED. Type of source for this collection. 'bucket': Process
            objects from one or more buckets (first-stage processing).
            'collection': Process documents from another collection (downstream
            processing). Use 'bucket' for initial data ingestion, 'collection'
            for decomposition trees.
          examples:
            - bucket
            - collection
        # Bucket variant: a non-empty list (minItems: 1) or null.
        bucket_ids:
          anyOf:
            - items:
                type: string
              type: array
              minItems: 1
            - type: 'null'
          title: Bucket Ids
          description: >-
            List of bucket IDs when type='bucket'. REQUIRED when type='bucket'.
            NOT ALLOWED when type='collection'. Can specify one or more buckets
            to process. Single bucket: Use array with one element ['bkt_id'].
            Multiple buckets: All buckets MUST have compatible schemas. Schema
            compatibility validated at collection creation. Compatible schemas
            have: 1) Same field names, 2) Same field types, 3) Same required
            status. Documents will include root_bucket_id to track which bucket
            they came from. Use cases: multi-region data, multi-team
            consolidation, environment aggregation.
          examples:
            - - bkt_marketing_videos
            - - bkt_us_products
              - bkt_eu_products
              - bkt_asia_products
            - - bkt_staging_data
              - bkt_production_data
        source_namespace_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Source Namespace Id
          description: >-
            Namespace ID where the source buckets reside. Use this to process
            buckets from a different namespace within the same organization.
            When omitted, buckets are looked up in the current (collection's)
            namespace. Only valid when type='bucket'.
        # Collection variant, single upstream collection.
        collection_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Collection Id
          description: >-
            Collection ID when type='collection' (single collection). Use this
            OR collection_ids (not both). REQUIRED when type='collection' and
            processing single collection. NOT ALLOWED when type='bucket'. The
            collection will process documents from this upstream collection. The
            upstream collection's output_schema becomes this collection's
            input_schema. This enables decomposition trees (multi-stage
            pipelines). Example: Process frames collection → create scenes
            collection.
          examples:
            - col_video_frames
            - col_book_chapters
            - col_product_images
        # Collection variant, several upstream collections: a non-empty list
        # (minItems: 1) or null.
        collection_ids:
          anyOf:
            - items:
                type: string
              type: array
              minItems: 1
            - type: 'null'
          title: Collection Ids
          description: >-
            List of collection IDs when type='collection' (multiple
            collections). Use this OR collection_id (not both). REQUIRED when
            type='collection' and processing multiple collections. NOT ALLOWED
            when type='bucket'. Used for operations that consolidate multiple
            upstream collections. Example: Clustering across multiple
            collections → cluster output collection. All collections must have
            compatible schemas for consolidation operations.
          examples:
            - - col_us_products
              - col_eu_products
            - - col_images_2023
              - col_images_2024
        # Unlike bucket_ids and collection_ids, this list has no minItems bound.
        inherited_bucket_ids:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Inherited Bucket Ids
          description: >-
            List of original bucket IDs that source collections originated from.
            OPTIONAL. Only used when type='collection'. Tracks the complete
            lineage chain: buckets → collections → derived collections.
            Extracted from upstream collection metadata at collection creation
            time. Enables tracing derived collections (like cluster outputs)
            back to original data sources. Example: Cluster output collection
            inherits bucket IDs from its source collections. Format: List of
            bucket IDs with 'bkt_' prefix.
          examples:
            - - bkt_marketing_videos
            - - bkt_us_products
              - bkt_eu_products
            - null
        # Optional filter block; its shape is defined by SourceFilters-Output.
        source_filters:
          anyOf:
            - $ref: '#/components/schemas/SourceFilters-Output'
            - type: 'null'
          description: >-
            Optional filters to apply to source data. When specified, only
            objects/documents matching these filters will be processed by this
            collection. Filters are evaluated at batch creation time. Uses same
            LogicalOperator model as list APIs for consistency.
          examples:
            - filters:
                AND:
                  - field: blobs.type
                    operator: eq
                    value: video
            - filters:
                AND:
                  - field: metadata.status
                    operator: eq
                    value: active
            - {}
      type: object
      # The conditional "REQUIRED when ..." / "NOT ALLOWED when ..." rules in
      # the property descriptions are not encoded in this schema; only `type`
      # is listed as required.
      required:
        - type
      title: SourceConfig
      description: >-
        Configuration for collection source (bucket(s) or collection).


        Collections can process data from two types of sources:


        1. **Bucket Source**: Process raw objects from one or more buckets
        (first-stage processing)
           - Use this to create your initial collections from uploaded data
           - Can specify multiple buckets to consolidate data from different sources
           - All buckets must have compatible schemas (validated at creation)
           - Example: Videos from multiple regions → Frame extraction collection

        2. **Collection Source**: Process documents from another collection
        (decomposition trees)
           - Use this to create multi-stage processing pipelines
           - Example: Frames collection → Scene detection collection

        Multi-Bucket Requirements:

        - All buckets must have compatible schemas (same fields, types, and
        required status)

        - Schema compatibility is validated when the collection is created

        - Documents track which specific bucket they came from via
        root_bucket_id

        - Useful for consolidating data from multiple regions, teams, or
        environments


        The source determines:

        - What data the feature extractor receives as input

        - The input_schema available for input_mappings and field_passthrough

        - The lineage tracking in output documents


        Examples:
            Single bucket: {"type": "bucket", "bucket_ids": ["bkt_products"]}
            Multi-bucket: {"type": "bucket", "bucket_ids": ["bkt_us", "bkt_eu", "bkt_asia"]}
            Collection: {"type": "collection", "collection_id": "col_frames"}
      # Each example carries a `description` label next to the declared
      # properties; this schema does not set additionalProperties: false.
      examples:
        - bucket_ids:
            - bkt_marketing_videos
          description: Single bucket source
          type: bucket
        - bucket_ids:
            - bkt_marketing_ads
          description: Bucket source with video-only filter
          source_filters:
            filters:
              AND:
                - field: blobs.type
                  operator: eq
                  value: video
          type: bucket
        - bucket_ids:
            - bkt_us_products
            - bkt_eu_products
            - bkt_asia_products
          description: Multi-bucket source (region consolidation)
          type: bucket
        - bucket_ids:
            - bkt_us_ads
            - bkt_eu_ads
            - bkt_asia_ads
          description: Multi-bucket source with brand filter
          source_filters:
            filters:
              OR:
                - field: brand_name
                  operator: in
                  value:
                    - Acme
                    - TechCo
                - field: category
                  operator: eq
                  value: premium
          type: bucket
        - collection_id: col_video_frames
          description: Collection source (decomposition tree)
          type: collection
        - collection_id: col_raw_frames
          description: Collection source with active documents only
          source_filters:
            filters:
              AND:
                - field: __fully_enriched
                  operator: eq
                  value: true
                - field: quality_score
                  operator: gte
                  value: 0.8
          type: collection
        - collection_ids:
            - col_us_products
            - col_eu_products
            - col_asia_products
          description: Multi-collection source (cluster output)
          type: collection
    # One stage of a collection's lineage. The source config, the feature
    # extractor and the output schema are all required.
    SingleLineageEntry:
      title: SingleLineageEntry
      type: object
      description: |-
        Single entry in the lineage chain of a collection.

        Each lineage entry represents one processing stage with one feature extractor.
      required: [source_config, feature_extractor, output_schema]
      properties:
        source_config:
          description: Configuration of the source for this lineage entry
          $ref: '#/components/schemas/SourceConfig-Output'
        feature_extractor:
          description: Single feature extractor applied at this stage
          # Folded only to keep the long reference off the key line.
          $ref: >-
            #/components/schemas/shared__collection__features__extractors__models__FeatureExtractorConfig-Output
        output_schema:
          description: Output schema produced by this processing stage
          $ref: '#/components/schemas/BucketSchema-Output'
    # Schedule attached to a collection. `trigger_type` and `schedule_config`
    # are required; `description` is nullable.
    CollectionScheduleConfig:
      title: CollectionScheduleConfig
      type: object
      description: |-
        Schedule configuration for automatic collection re-processing.

        Attaches a cron or interval schedule to a collection, which creates
        a COLLECTION_TRIGGER trigger behind the scenes. This is a DX convenience
        so users don't need to create triggers manually.

        Examples:
            Daily re-crawl at 2am UTC:
                {"trigger_type": "cron", "schedule_config": {"cron_expression": "0 2 * * *"}}

            Every 6 hours:
                {"trigger_type": "interval", "schedule_config": {"interval_seconds": 21600}}
      required: [trigger_type, schedule_config]
      properties:
        # Plain string in the schema; no enum is declared for it.
        trigger_type:
          title: Trigger Type
          type: string
          description: "Schedule type: 'cron' or 'interval'"
          examples: [cron, interval]
        # Free-form object whose expected keys depend on trigger_type.
        schedule_config:
          title: Schedule Config
          type: object
          additionalProperties: true
          description: >-
            Schedule configuration.
            For cron: {cron_expression, timezone}.
            For interval: {interval_seconds, start_immediately}.
          examples:
            - cron_expression: '0 2 * * *'
              timezone: UTC
            - interval_seconds: 21600
              start_immediately: false
        description:
          title: Description
          anyOf:
            - type: string
            - type: 'null'
          description: Human-readable description of the schedule.
    # Storage tiering rule entry. Only `rule_type` is required.
    TieringRule:
      title: TieringRule
      type: object
      description: >-
        A single automatic storage tiering rule (V1: stored but not enforced).
      required: [rule_type]
      properties:
        rule_type:
          title: Rule Type
          type: string
          enum: [auto_evict, auto_archive, auto_rehydrate]
        # Rules default to disabled.
        enabled:
          title: Enabled
          type: boolean
          default: false
        threshold_days:
          title: Threshold Days
          anyOf:
            - type: integer
            - type: 'null'
    # Attaches a taxonomy to a collection. Only `taxonomy_id` is required and
    # undeclared keys are rejected (additionalProperties: false).
    TaxonomyApplicationConfig-Output:
      title: TaxonomyApplicationConfig
      type: object
      additionalProperties: false
      description: |-
        Configuration block that attaches a taxonomy to a collection.

        Supports execution phase ordering for unified post-processing with
        taxonomies, clusters, and alerts.
      required: [taxonomy_id]
      properties:
        taxonomy_id:
          title: Taxonomy Id
          type: string
          description: ID of the `TaxonomyModel` to execute.
        execution_mode:
          $ref: '#/components/schemas/TaxonomyExecutionMode'
          default: materialize
          description: >-
            Execution mode for taxonomy enrichment.
            Materializes results during ingestion.
        target_collection_id:
          title: Target Collection Id
          anyOf:
            - type: string
            - type: 'null'
          description: >-
            Optional collection to persist results into
            when `execution_mode` is 'materialize'.
            If omitted, the source collection is updated in-place.
        scroll_filters:
          anyOf:
            - $ref: '#/components/schemas/LogicalOperator-Output'
            - type: 'null'
          description: >-
            Additional filters applied when scrolling the source
            collection before enrichment.
        # Ordering knobs: phase is bounded to 1..3, priority to 0..1000.
        execution_phase:
          title: Execution Phase
          type: integer
          minimum: 1
          maximum: 3
          default: 1
          description: >-
            Which phase this taxonomy runs in.
            Default: 1 (TAXONOMY phase, runs first).
            Valid values: 1=TAXONOMY, 2=CLUSTER, 3=ALERT.
            Lower phases run earlier.
        priority:
          title: Priority
          type: integer
          minimum: 0
          maximum: 1000
          default: 0
          description: Priority within the execution phase (higher = runs first)
        hierarchical_enrichment_style:
          $ref: '#/components/schemas/HierarchicalEnrichmentStyle'
          default: combined
          description: >-
            For hierarchical taxonomies, controls how enrichment fields are
            structured.
            'full_chain': Each tier writes to tier-prefixed fields
            (tier1_id, tier2_id, etc.).
            'best_match': Only deepest match stored
            (category_id, category_name, path[]).
            'combined' (default): Both full chain AND best match summary
            fields.
      examples:
        # Minimal: taxonomy plus execution mode.
        - taxonomy_id: tax_abc123
          execution_mode: materialize
        # With explicit ordering and a separate target collection.
        - taxonomy_id: tax_abc123
          execution_mode: materialize
          execution_phase: 1
          priority: 10
          target_collection_id: col_enriched_v1
        # With an explicit hierarchical enrichment style.
        - taxonomy_id: tax_iab_content
          execution_mode: materialize
          hierarchical_enrichment_style: full_chain
    # ClusterApplicationConfig: attaches a cluster to a collection and controls
    # when it is executed automatically. Only cluster_id is required.
    ClusterApplicationConfig:
      title: ClusterApplicationConfig
      type: object
      # Literal block scalar; line breaks below are part of the string value.
      description: |-
        Configuration for automatic cluster execution on collection.

        Similar to TaxonomyApplicationConfig, this attaches a cluster to a collection
        and defines when/how it should be automatically executed.

        Used in CollectionModel.cluster_applications field.

        Supports execution phase ordering for unified post-processing with
        taxonomies, clusters, and alerts.
      required: [cluster_id]
      properties:
        cluster_id:
          title: Cluster Id
          type: string
          description: >-
            ID of the cluster to execute (must exist and use this collection as
            input)
        # Defaults to true.
        auto_execute_on_batch:
          title: Auto Execute On Batch
          type: boolean
          default: true
          description: >-
            Automatically execute cluster when batch processing completes for
            this collection. If False, cluster must be executed manually via
            API.
        # Nullable integer; no default and no bounds are declared.
        min_document_threshold:
          title: Min Document Threshold
          anyOf:
            - type: integer
            - type: 'null'
          description: >-
            Minimum number of documents required before executing cluster. If
            document_count < threshold, clustering is skipped. Useful to avoid
            clustering on small datasets.
        # Integer seconds; defaults to 3600. No bounds are declared.
        cooldown_seconds:
          title: Cooldown Seconds
          type: integer
          default: 3600
          description: >-
            Minimum time (in seconds) between automatic cluster executions.
            Prevents excessive re-clustering on frequent batch completions.
            Default: 3600 seconds (1 hour).
        # Plain integer constrained to 1..3 in this schema; defaults to 2.
        execution_phase:
          title: Execution Phase
          type: integer
          minimum: 1
          maximum: 3
          default: 2
          description: >-
            Which phase this cluster runs in. Default: 2 (CLUSTER phase, after
            taxonomies). Valid values: 1=TAXONOMY, 2=CLUSTER, 3=ALERT. Lower
            phases run earlier.
        # Integer in 0..1000; defaults to 0.
        priority:
          title: Priority
          type: integer
          minimum: 0
          maximum: 1000
          default: 0
          description: Priority within the execution phase (higher = runs first)
      examples:
        - cluster_id: clust_product_categories
          auto_execute_on_batch: true
          min_document_threshold: 100
          cooldown_seconds: 3600
        - cluster_id: clust_user_segments
          auto_execute_on_batch: true
          min_document_threshold: 500
          cooldown_seconds: 7200
          execution_phase: 2
          priority: 10
    # AlertApplicationConfig (output shape): attaches an alert to a collection.
    # alert_id and input_mappings are required; unknown keys are rejected.
    AlertApplicationConfig-Output:
      title: AlertApplicationConfig
      type: object
      description: |-
        Configuration for attaching an alert to a collection.

        The key responsibility here is INPUT MAPPING: connecting document
        fields (or constants) to the retriever's expected inputs.

        Note: Filtering logic (scroll_filters, min_score, etc.) belongs in
        the retriever, not here. The retriever owns all query semantics.

        Use Cases:
            - Attach safety alert to video upload collection
            - Configure different field mappings for different collections
            - Set execution priority for multiple alerts

        Attributes:
            alert_id: ID of the alert to execute
            execution_mode: When this alert should execute
            input_mappings: Map document fields or constants to retriever inputs
            execution_phase: Which phase this alert runs in (default: ALERT)
            priority: Priority within the execution phase (higher = runs first)
      additionalProperties: false
      required: [alert_id, input_mappings]
      properties:
        alert_id:
          title: Alert Id
          type: string
          description: ID of the alert to execute
          examples: [alt_safety_001, alt_content_moderation]
        execution_mode:
          $ref: '#/components/schemas/AlertExecutionMode'
          default: on_ingest
          description: When this alert should execute
        # Non-empty list (minItems: 1) of AlertInputMapping entries.
        input_mappings:
          title: Input Mappings
          type: array
          minItems: 1
          items:
            $ref: '#/components/schemas/AlertInputMapping'
          description: Map document fields or constants to retriever input parameters
        # Typed via the shared PostProcessingPhase schema; defaults to 3.
        execution_phase:
          $ref: '#/components/schemas/PostProcessingPhase'
          default: 3
          description: >-
            Which phase this alert runs in. Default: ALERT (phase 3, after
            taxonomies and clusters)
        # Integer in 0..1000; defaults to 0.
        priority:
          title: Priority
          type: integer
          minimum: 0
          maximum: 1000
          default: 0
          description: Priority within the execution phase (higher = runs first)
      examples:
        # One mapping sourced from a document field, one from a constant.
        - alert_id: alt_safety_001
          execution_mode: on_ingest
          execution_phase: 3
          priority: 100
          input_mappings:
            - input_key: query_embedding
              source:
                source_type: document_field
                path: features.video_embedding
            - input_key: target_collection
              source:
                source_type: constant
                value: col_known_incidents
    # RetrieverEnrichmentConfig (output shape): attaches a retriever enrichment
    # to a collection. retriever_id, input_mappings and write_back_fields are
    # required.
    RetrieverEnrichmentConfig-Output:
      title: RetrieverEnrichmentConfig
      type: object
      description: |-
        Configuration for attaching a retriever enrichment to a collection.

        Retriever enrichments run a retriever pipeline on each document during
        post-processing and write selected result fields back to the document.

        Attributes:
            retriever_id: ID of the retriever to execute
            input_mappings: How to map document fields to retriever inputs
            write_back_fields: Which result fields to write back to the document
            execution_phase: Which post-processing phase to run in (default: RETRIEVER_ENRICHMENT)
            priority: Priority within the execution phase (higher = runs first)
            scroll_filters: Optional filters to select which documents to enrich
            enabled: Whether this enrichment is active
      required: [retriever_id, input_mappings, write_back_fields]
      properties:
        retriever_id:
          title: Retriever Id
          type: string
          description: ID of the retriever to execute
          examples: [ret_classifier_001, ret_cross_reference]
        # Non-empty list (minItems: 1) of EnrichmentInputMapping entries.
        input_mappings:
          title: Input Mappings
          type: array
          minItems: 1
          items:
            $ref: '#/components/schemas/EnrichmentInputMapping'
          description: Map document fields or constants to retriever input parameters
        # Non-empty list (minItems: 1) of WriteBackFieldMapping entries.
        write_back_fields:
          title: Write Back Fields
          type: array
          minItems: 1
          items:
            $ref: '#/components/schemas/WriteBackFieldMapping'
          description: Which retriever result fields to write back to the document
        # Typed via the shared PostProcessingPhase schema; defaults to 4.
        execution_phase:
          $ref: '#/components/schemas/PostProcessingPhase'
          default: 4
          description: >-
            Which phase this enrichment runs in. Default: RETRIEVER_ENRICHMENT
            (phase 4, after taxonomies, clusters, and alerts)
        # Integer in 0..1000; defaults to 0.
        priority:
          title: Priority
          type: integer
          minimum: 0
          maximum: 1000
          default: 0
          description: Priority within the execution phase (higher = runs first)
        # Nullable filter expression.
        scroll_filters:
          anyOf:
            - $ref: '#/components/schemas/LogicalOperator-Output'
            - type: 'null'
          description: Optional filters to select which documents to enrich
        # Defaults to true.
        enabled:
          title: Enabled
          type: boolean
          default: true
          description: Whether this enrichment is active
      examples:
        - retriever_id: ret_classifier_001
          enabled: true
          input_mappings:
            - input_key: query
              source:
                source_type: document_field
                path: title
          write_back_fields:
            - source_field: category
              target_field: _enrichment_category
              mode: first
    # BucketSchemaField (output shape): one field definition of a bucket schema.
    # The names listed under `properties` (type, default, items, properties,
    # examples, description, enum, required) are attributes of the described
    # field; they merely share names with JSON Schema keywords. Only `type` is
    # required, and extra keys are allowed.
    BucketSchemaField-Output:
      title: BucketSchemaField
      type: object
      description: Schema field definition for bucket objects.
      additionalProperties: true
      required: [type]
      properties:
        type:
          $ref: '#/components/schemas/BucketSchemaFieldType'
        # Any value, or null.
        default:
          title: Default
          anyOf:
            - {}
            - type: 'null'
        # Recursive: a nested field definition, or null.
        items:
          anyOf:
            - $ref: '#/components/schemas/BucketSchemaField-Output'
            - type: 'null'
        # Recursive: an object whose values are nested field definitions, or null.
        properties:
          title: Properties
          anyOf:
            - type: object
              additionalProperties:
                $ref: '#/components/schemas/BucketSchemaField-Output'
            - type: 'null'
        # Array of arbitrary values, or null.
        examples:
          title: Examples
          anyOf:
            - type: array
              items: {}
            - type: 'null'
          description: >-
            OPTIONAL. List of example values for this field. Used by Apps to
            show example inputs in the UI. Provide multiple diverse examples
            when possible.
        description:
          title: Description
          anyOf:
            - type: string
            - type: 'null'
        # Array of arbitrary values, or null.
        enum:
          title: Enum
          anyOf:
            - type: array
              items: {}
            - type: 'null'
        # Nullable boolean attribute of the field; defaults to false.
        required:
          title: Required
          anyOf:
            - type: boolean
            - type: 'null'
          default: false
    # AudioFingerprintExtractorParams: tuning knobs for the audio fingerprint
    # extractor. No property is required; each one declares a default.
    AudioFingerprintExtractorParams:
      title: AudioFingerprintExtractorParams
      type: object
      description: |-
        Parameters for the Audio Fingerprint Extractor.

        Processes audio files (or audio extracted from video) through CLAP
        (Contrastive Language-Audio Pretraining) to produce 512-d embeddings
        suitable for audio fingerprint matching.

        Core Pipeline:
        1. Audio extraction (if video input, via FFmpeg)
        2. Segmentation into fixed-length windows
        3. CLAP embedding (laion/clap-htsat-tiny, 512-d)
        4. L2 normalization

        Use Cases:
            - Sound mark detection (IP safety)
            - Audio similarity search
            - Music/jingle identification
            - Audio deduplication
      properties:
        # Constant discriminator value.
        extractor_type:
          title: Extractor Type
          type: string
          const: audio_fingerprint_extractor
          default: audio_fingerprint_extractor
          description: Discriminator field. Must be 'audio_fingerprint_extractor'.
        # Number in 1..30; defaults to 5.
        segment_duration_sec:
          title: Segment Duration Sec
          type: number
          minimum: 1
          maximum: 30
          default: 5
          description: >-
            Duration of each audio segment in seconds. 5.0: Recommended for
            sound mark matching. Shorter segments increase recall but reduce
            per-segment context.
        # Number in 0.5..15; defaults to 2.5.
        segment_hop_sec:
          title: Segment Hop Sec
          type: number
          minimum: 0.5
          maximum: 15
          default: 2.5
          description: >-
            Hop size between segments in seconds. 2.5: 50% overlap
            (recommended). Set equal to segment_duration_sec for no overlap.
        # Integer with no declared bounds; defaults to 48000.
        sample_rate:
          title: Sample Rate
          type: integer
          default: 48000
          description: >-
            Target sample rate for audio. 48000: CLAP default (recommended).
            Audio is resampled to this rate before embedding.
        normalize_embeddings:
          title: Normalize Embeddings
          type: boolean
          default: true
          description: >-
            L2-normalize embeddings to unit vectors (recommended for cosine
            similarity).
        # Number in 1..600; defaults to 120.
        max_audio_length_sec:
          title: Max Audio Length Sec
          type: number
          minimum: 1
          maximum: 600
          default: 120
          description: >-
            Maximum audio length to process in seconds. 120: Default (2
            minutes). Audio beyond this is truncated.
      examples:
        - description: Sound mark detection (IP safety)
          extractor_type: audio_fingerprint_extractor
          segment_duration_sec: 5
          segment_hop_sec: 2.5
          sample_rate: 48000
          normalize_embeddings: true
        - description: Music identification (longer segments)
          extractor_type: audio_fingerprint_extractor
          segment_duration_sec: 10
          segment_hop_sec: 5
          sample_rate: 48000
    # DocumentGraphExtractorParams: parameters for the document graph extractor.
    # No `required` list is declared; every property except llm_api_key carries
    # a default. The properties fall into groups: layout detection, block
    # grouping thresholds, confidence scoring / VLM correction, text embedding,
    # and page/thumbnail rendering.
    DocumentGraphExtractorParams:
      properties:
        # Constant discriminator value.
        extractor_type:
          type: string
          const: document_graph_extractor
          title: Extractor Type
          description: >-
            Discriminator field for parameter type identification. Must be
            'document_graph_extractor'.
          default: document_graph_extractor
        # --- Layout detection ---
        use_layout_detection:
          type: boolean
          title: Use Layout Detection
          description: >-
            Enable ML-based layout detection to find ALL document elements
            (text, images, tables, figures). When enabled, uses the configured
            layout_detector to detect and extract both text regions AND non-text
            elements (scanned images, figures, charts) as separate documents.
            **Recommended for**: Scanned documents, image-heavy PDFs, mixed
            content documents. **When disabled**: Falls back to text-only
            extraction (faster but misses images). Default: True (detects all
            elements including images).
          default: true
        # Closed set of two engines; defaults to pymupdf.
        layout_detector:
          type: string
          enum:
            - pymupdf
            - docling
          title: Layout Detector
          description: >-
            Layout detection engine to use when use_layout_detection=True.
            'pymupdf': Fast, rule-based detection using PyMuPDF heuristics (~15
            pages/sec). 'docling': SOTA ML-based detection using IBM Docling
            with DiT model (~3-8 sec/doc). **Docling advantages**: Better
            semantic type detection (section_header vs paragraph), true table
            structure extraction (rows/cols), more accurate figure detection.
            **PyMuPDF advantages**: Much faster, lower memory usage, simpler
            dependencies. Default: 'pymupdf' for speed. Use 'docling' for
            accuracy-critical applications.
          default: pymupdf
        # --- Block grouping thresholds ---
        # Number in 1..100; defaults to 15.
        vertical_threshold:
          type: number
          maximum: 100
          minimum: 1
          title: Vertical Threshold
          description: >-
            Maximum vertical gap (in points) between lines to be grouped in same
            block. Increase for looser grouping, decrease for tighter blocks.
            Default 15pt works well for standard documents.
          default: 15
        # Number in 1..200; defaults to 50.
        horizontal_threshold:
          type: number
          maximum: 200
          minimum: 1
          title: Horizontal Threshold
          description: >-
            Maximum horizontal distance (in points) for overlap detection.
            Affects column detection and block merging. Increase for wider
            columns, decrease for narrow layouts.
          default: 50
        # Integer in 1..500; defaults to 20.
        min_text_length:
          type: integer
          maximum: 500
          minimum: 1
          title: Min Text Length
          description: >-
            Minimum text length (characters) to keep a block. Blocks with less
            text are filtered out. Helps remove noise and tiny fragments.
          default: 20
        # --- Confidence scoring and VLM correction ---
        # Number in 0..1; defaults to 0.85.
        base_confidence:
          type: number
          maximum: 1
          minimum: 0
          title: Base Confidence
          description: >-
            Base confidence score for embedded (native) text. Penalties are
            subtracted for OCR artifacts, encoding issues, etc.
          default: 0.85
        # Number in 0..1; defaults to 0.6.
        min_confidence_for_vlm:
          type: number
          maximum: 1
          minimum: 0
          title: Min Confidence For Vlm
          description: >-
            Confidence threshold below which VLM correction is triggered. Blocks
            with confidence < this value get sent to VLM for correction. Only
            applies when use_vlm_correction=True.
          default: 0.6
        # Defaults to true; the fast_mode description says fast_mode overrides it.
        use_vlm_correction:
          type: boolean
          title: Use Vlm Correction
          description: >-
            Enable VLM (Vision Language Model) correction for low-confidence
            blocks. Uses Gemini/GPT-4V to correct OCR errors by analyzing the
            page image. Significantly slower (~1 page/sec) but improves accuracy
            for degraded docs.
          default: true
        # Defaults to false.
        fast_mode:
          type: boolean
          title: Fast Mode
          description: >-
            Skip VLM correction entirely for maximum throughput (~15 pages/sec).
            Overrides use_vlm_correction. Use when speed is more important than
            accuracy.
          default: false
        # NOTE(review): the description lists three options, but the schema is a
        # free-form string with no enum — confirm whether other values are valid.
        vlm_provider:
          type: string
          title: Vlm Provider
          description: >-
            LLM provider for VLM correction. Options: 'google' (Gemini),
            'openai' (GPT-4V), 'anthropic' (Claude). Google recommended for best
            vision quality.
          default: google
        # Free-form model name; defaults to gemini-2.5-flash.
        vlm_model:
          type: string
          title: Vlm Model
          description: >-
            Specific model for VLM correction. Examples: 'gemini-2.5-flash',
            'gpt-4o', 'claude-3-5-sonnet'.
          default: gemini-2.5-flash
        # Nullable string with no declared default. The folded scalar below uses
        # blank lines to encode line breaks in the rendered description.
        llm_api_key:
          anyOf:
            - type: string
            - type: 'null'
          title: Llm Api Key
          description: >-
            API key for VLM correction (BYOK - Bring Your Own Key). Supports:

            - Direct key: 'sk-proj-abc123...'

            - Secret reference: '{{SECRET.openai_api_key}}'


            When using secret reference, the key is loaded from your
            organization's secrets vault at runtime. Store secrets via POST
            /v1/organizations/secrets.


            If not provided, uses Mixpeek's default API keys.
        # --- Text embedding ---
        run_text_embedding:
          type: boolean
          title: Run Text Embedding
          description: >-
            Generate text embeddings for semantic search over block content.
            Uses E5-Large (1024-dim) for multilingual support.
          default: true
        # --- Rendering and thumbnails ---
        # Integer in 72..300; defaults to 150.
        render_dpi:
          type: integer
          maximum: 300
          minimum: 72
          title: Render Dpi
          description: >-
            DPI for page rendering (used for VLM correction). 72: Fast, lower
            quality. 150: Balanced (recommended). 300: High quality, slower.
          default: 150
        generate_thumbnails:
          type: boolean
          title: Generate Thumbnails
          description: >-
            Generate thumbnail images for blocks. Useful for visual previews and
            UI display.
          default: true
        # NOTE(review): the description lists three modes, but the schema is a
        # free-form string with no enum — confirm whether that is intended.
        thumbnail_mode:
          type: string
          title: Thumbnail Mode
          description: >-
            Thumbnail generation mode. 'full_page': Low-res thumbnail of entire
            page. 'segment': Cropped thumbnail of just the block's bounding box.
            'both': Generate both types (recommended for flexibility).
          default: both
        # Integer in 36..150; defaults to 72.
        thumbnail_dpi:
          type: integer
          maximum: 150
          minimum: 36
          title: Thumbnail Dpi
          description: >-
            DPI for thumbnail generation. Lower DPI = smaller files. 72:
            Standard web quality. 36: Very small thumbnails.
          default: 72
      type: object
      title: DocumentGraphExtractorParams
      # Folded scalar: blank lines encode paragraph/line breaks, and the
      # more-indented bullet lines keep their own line breaks.
      description: >-
        Parameters for the document graph extractor.


        This extractor decomposes PDFs into spatial blocks with layout
        classification,

        confidence scoring, and optional VLM correction for degraded documents.


        **When to Use**:
            - Historical/archival document processing (FBI files, old records)
            - Scanned documents with mixed quality
            - Documents requiring spatial understanding (forms, tables, multi-column)
            - When you need block-level granularity with bounding boxes
            - When confidence scoring is needed for downstream filtering

        **When NOT to Use**:
            - Simple text-only documents -> Use text_extractor instead
            - When page-level granularity is sufficient -> Use pdf_extractor instead
            - Real-time processing requirements -> VLM correction adds latency
      # The examples carry `description` and `use_case` keys that are not declared
      # under `properties`; this schema does not set additionalProperties: false.
      examples:
        - description: Fast processing mode (no VLM, maximum throughput)
          extractor_type: document_graph_extractor
          fast_mode: true
          generate_thumbnails: true
          layout_detector: pymupdf
          run_text_embedding: true
          use_case: >-
            High-volume document ingestion where speed matters more than perfect
            accuracy
          use_layout_detection: true
        - description: Archival documents with VLM correction (recommended for old scans)
          extractor_type: document_graph_extractor
          layout_detector: pymupdf
          min_confidence_for_vlm: 0.6
          render_dpi: 150
          run_text_embedding: true
          use_case: >-
            Historical archives, FBI files, old scanned documents with degraded
            quality
          use_layout_detection: true
          use_vlm_correction: true
          vlm_model: gemini-2.5-flash
          vlm_provider: google
        # NOTE(review): this "SOTA accuracy" example sets fast_mode: true, which
        # the fast_mode description says skips VLM correction — confirm intended.
        - description: SOTA accuracy mode with Docling (best for tables/figures)
          extractor_type: document_graph_extractor
          fast_mode: true
          generate_thumbnails: true
          layout_detector: docling
          run_text_embedding: true
          use_case: >-
            Documents with complex tables, figures, or requiring accurate
            semantic typing
          use_layout_detection: true
    # FaceIdentityExtractorParams: parameters for the face identity extractor.
    # No property is required. max_faces_per_image and quality_threshold are
    # nullable with no declared default; every other property has a default.
    FaceIdentityExtractorParams:
      title: FaceIdentityExtractorParams
      type: object
      # Literal block scalar; line breaks below are part of the string value.
      description: |-
        Parameters for the Face Identity Extractor.

        The Face Identity Extractor processes images or video frames to detect, align,
        and embed faces using production-grade SOTA models (SCRFD + ArcFace).

        Core Pipeline:
        1. SCRFD Detection → Bounding boxes + 5 landmarks
        2. 5-Point Affine Alignment → 112×112 canonical face
        3. ArcFace Embedding → 512-d L2-normalized vector
        4. Optional Quality Scoring → Filter low-quality faces

        Use Cases:
            - Face verification (1:1 matching)
            - Face identification (1:N search)
            - Face clustering (group photos by person)
            - Duplicate face detection
      properties:
        # Constant discriminator value.
        extractor_type:
          title: Extractor Type
          type: string
          const: face_identity_extractor
          default: face_identity_extractor
          description: >-
            Discriminator field for parameter type identification. Must be
            'face_identity_extractor'.
        # --- Detection ---
        # Closed set of three SCRFD variants; defaults to scrfd_2.5g.
        detection_model:
          title: Detection Model
          type: string
          enum: [scrfd_500m, scrfd_2.5g, scrfd_10g]
          default: scrfd_2.5g
          description: >-
            SCRFD model for face detection. 'scrfd_500m': Fastest (2-3ms).
            'scrfd_2.5g': Balanced (5-7ms), recommended. 'scrfd_10g': Highest
            accuracy (10-15ms).
        # Integer in 10..200; defaults to 20.
        min_face_size:
          title: Min Face Size
          type: integer
          minimum: 10
          maximum: 200
          default: 20
          description: >-
            Minimum face size in pixels to detect. 20px: Balanced. 40px: Higher
            quality. 10px: Maximum recall.
        # Number in 0..1; defaults to 0.5.
        detection_threshold:
          title: Detection Threshold
          type: number
          minimum: 0
          maximum: 1
          default: 0.5
          description: Confidence threshold for face detection (0.0-1.0).
        # Integer >= 1, or null.
        max_faces_per_image:
          title: Max Faces Per Image
          anyOf:
            - type: integer
              minimum: 1
            - type: 'null'
          description: 'Maximum number of faces to process per image. None: Process all.'
        # --- Embedding and quality ---
        normalize_embeddings:
          title: Normalize Embeddings
          type: boolean
          default: true
          description: L2-normalize embeddings to unit vectors (recommended).
        enable_quality_scoring:
          title: Enable Quality Scoring
          type: boolean
          default: true
          description: Compute quality scores (blur, size, landmarks). Adds ~5ms per face.
        # Number in 0..1, or null.
        quality_threshold:
          title: Quality Threshold
          anyOf:
            - type: number
              minimum: 0
              maximum: 1
            - type: 'null'
          description: >-
            Minimum quality score to index faces. None: Index all faces. 0.5:
            Moderate filtering. 0.7: High quality only.
        # --- Video handling ---
        # Integer in 1..300; defaults to 60.
        max_video_length:
          title: Max Video Length
          type: integer
          minimum: 1
          maximum: 300
          default: 60
          description: >-
            Maximum video length in seconds. 60: Default. 10: Recommended for
            retrieval. 300: Maximum (extraction only).
        # Number in 0.1..60, or null; defaults to 1.
        video_sampling_fps:
          title: Video Sampling Fps
          anyOf:
            - type: number
              minimum: 0.1
              maximum: 60
            - type: 'null'
          default: 1
          description: >-
            Frames per second to sample from video. 1.0: One frame per second
            (recommended).
        video_deduplication:
          title: Video Deduplication
          type: boolean
          default: true
          description: >-
            Remove duplicate faces across video frames (extraction only).
            Reduces 90-95% redundancy. NOT used in retrieval.
        # Number in 0..1; defaults to 0.8.
        video_deduplication_threshold:
          title: Video Deduplication Threshold
          type: number
          minimum: 0
          maximum: 1
          default: 0.8
          description: >-
            Cosine similarity threshold for deduplication. 0.8: Conservative
            (default).
        # --- Output shaping ---
        # Closed set of two modes; defaults to per_face.
        output_mode:
          title: Output Mode
          type: string
          enum: [per_face, per_image]
          default: per_face
          description: >-
            'per_face': One document per face (recommended). 'per_image': One
            doc per image with faces array.
        include_face_crops:
          title: Include Face Crops
          type: boolean
          default: true
          description: >-
            Include aligned 112×112 face crops as base64. Adds ~5KB per face.
            Required for LLM cluster labeling to see actual faces instead of
            hallucinating.
        include_source_frame_thumbnail:
          title: Include Source Frame Thumbnail
          type: boolean
          default: false
          description: >-
            Include resized source frame/image as base64 thumbnail (~15-30KB per
            face). Used for display with bounding box overlay.
        store_detection_metadata:
          title: Store Detection Metadata
          type: boolean
          default: true
          description: Store bbox, landmarks, detection scores. Recommended for debugging.
      # The examples carry `description` and `use_case` keys that are not declared
      # under `properties`; this schema does not set additionalProperties: false.
      examples:
        - description: Employee verification (high quality, 1:1 matching)
          use_case: Corporate access control, employee ID photos for badge matching
          extractor_type: face_identity_extractor
          detection_model: scrfd_2.5g
          detection_threshold: 0.7
          min_face_size: 40
          max_faces_per_image: 1
          normalize_embeddings: true
          enable_quality_scoring: true
          quality_threshold: 0.5
          output_mode: per_face
        - description: Photo library organization (multiple faces)
          use_case: 'Personal photo management: group photos by person'
          extractor_type: face_identity_extractor
          detection_model: scrfd_2.5g
          detection_threshold: 0.6
          min_face_size: 30
          enable_quality_scoring: true
          output_mode: per_face
          store_detection_metadata: true
    # GeminiMultifileExtractorParams: parameters for the Gemini multifile
    # extractor. No property is required; each one declares a default.
    GeminiMultifileExtractorParams:
      title: GeminiMultifileExtractorParams
      type: object
      # Literal block scalar; line breaks below are part of the string value.
      description: |-
        Parameters for the Gemini Multifile Extractor.

        Uses Gemini Embedding 2 (gemini-embedding-2) to embed all files
        of an object into a single 3072-d vector in one API call. Supports images,
        video, audio, PDF, and text blobs.
      properties:
        # Constant discriminator value.
        extractor_type:
          title: Extractor Type
          type: string
          const: gemini_multifile_extractor
          default: gemini_multifile_extractor
          description: Discriminator field for parameter type identification.
        # Integer in 256..3072; defaults to 3072.
        output_dimensionality:
          title: Output Dimensionality
          type: integer
          minimum: 256
          maximum: 3072
          default: 3072
          description: >-
            Output embedding dimensions. Gemini Embedding 2 supports 3072
            (default), 768, or 256 via truncation. Lower dimensions reduce
            storage cost at slight quality loss.
        # Free-form string (no enum declared); defaults to RETRIEVAL_DOCUMENT.
        task_type:
          title: Task Type
          type: string
          default: RETRIEVAL_DOCUMENT
          description: >-
            Embedding intent used as a text instruction for Gemini Embedding 2.
            Common values: RETRIEVAL_DOCUMENT, RETRIEVAL_QUERY,
            SEMANTIC_SIMILARITY, CLASSIFICATION.
        # Defaults to 'files'.
        input_key:
          title: Input Key
          type: string
          default: files
          description: >-
            The input_mappings key whose value is the list of blob fields to
            embed together. Must match the key used in input_mappings (e.g.,
            'files'). Default: 'files'.
      examples:
        - extractor_type: gemini_multifile_extractor
          input_key: files
          output_dimensionality: 3072
          task_type: RETRIEVAL_DOCUMENT
    # Image extractor parameter schema: the discriminator plus two boolean
    # switches, all with defaults.
    ImageExtractorParams:
      title: ImageExtractorParams
      type: object
      description: Parameters for the Image Extractor.
      properties:
        extractor_type:
          title: Extractor Type
          type: string
          const: image_extractor
          default: image_extractor
          description: Discriminator field for parameter type identification.
        enable_thumbnails:
          title: Enable Thumbnails
          type: boolean
          default: true
          description: Whether to generate thumbnail images.
        use_cdn:
          title: Use Cdn
          type: boolean
          default: false
          description: Whether to use CloudFront CDN for thumbnail delivery.
      examples:
        - extractor_type: image_extractor
          enable_thumbnails: true
          use_cdn: false
    # Parameter schema for the multimodal extractor (v2 per its description).
    # No `required` list is declared, so every property is optional.
    MultimodalExtractorParams:
      properties:
        extractor_type:
          type: string
          const: multimodal_extractor
          title: Extractor Type
          description: Discriminator field. Must be 'multimodal_extractor'.
          default: multimodal_extractor
        split_method:
          $ref: '#/components/schemas/SplitMethod'
          description: Video splitting strategy.
          default: time
        description_prompt:
          type: string
          title: Description Prompt
          description: Prompt for description generation.
          default: >-
            Watch this video segment carefully and describe exactly what you
            see. Do not make up or infer details that are not visible in the
            footage. Include: who is shown (gender, appearance, actions), what
            they are doing, the setting/location, and any products, text, or
            branding visible on screen.
        time_split_interval:
          anyOf:
            - type: integer
            - type: 'null'
          title: Time Split Interval
          description: Interval in seconds for 'time' splitting.
          default: 10
        # Nullable with no declared default; the description only recommends
        # a value.
        silence_db_threshold:
          anyOf:
            - type: integer
            - type: 'null'
          title: Silence Db Threshold
          description: 'Decibel threshold for silence detection. Recommended: -40.'
        # Nullable with no declared default; the description only recommends
        # a value.
        scene_detection_threshold:
          anyOf:
            - type: number
            - type: 'null'
          title: Scene Detection Threshold
          description: 'Scene detection threshold (0.0-1.0). Recommended: 0.5.'
        run_transcription:
          type: boolean
          title: Run Transcription
          description: Run Whisper transcription on segments.
          default: false
        transcription_language:
          type: string
          title: Transcription Language
          description: Transcription language code.
          default: en
        run_video_description:
          type: boolean
          title: Run Video Description
          description: Generate Gemini descriptions for segments.
          default: false
        run_transcription_embedding:
          type: boolean
          title: Run Transcription Embedding
          description: Generate E5 embeddings for transcriptions (1024D).
          default: false
        run_ocr_embedding:
          type: boolean
          title: Run Ocr Embedding
          description: Generate E5 embeddings for OCR text (1024D). Requires run_ocr.
          default: false
        run_description_embedding:
          type: boolean
          title: Run Description Embedding
          description: >-
            Generate E5 embeddings for descriptions (1024D). Requires
            run_video_description.
          default: false
        run_multimodal_embedding:
          type: boolean
          title: Run Multimodal Embedding
          description: >-
            Generate Gemini Embedding 2 multimodal embeddings (3072D). Creates
            unified embeddings across video, image, text, audio, and GIF
            content.
          default: true
        run_ocr:
          type: boolean
          title: Run Ocr
          description: Extract text from video frames via Gemini OCR.
          default: false
        # The disabling value is JSON `null` (the schema's second anyOf
        # branch), which is what the description now names.
        max_segment_duration:
          anyOf:
            - type: number
            - type: 'null'
          title: Max Segment Duration
          description: >-
            Maximum duration in seconds for any single segment. Scene/silence
            segments longer than this are subdivided. Set to null to disable.
            Default: 30s.
          default: 30
        sensitivity:
          type: string
          title: Sensitivity
          description: Scene detection sensitivity.
          default: low
        enable_thumbnails:
          type: boolean
          title: Enable Thumbnails
          description: Generate thumbnail images for segments.
          default: true
        use_cdn:
          type: boolean
          title: Use Cdn
          description: Use CloudFront CDN for thumbnail delivery.
          default: false
        generation_config:
          $ref: '#/components/schemas/GenerationConfig'
        # NOTE(review): no minimum/maximum is declared here, whereas the
        # Gemini multifile and universal extractor schemas bound the same
        # field to 256..3072 -- confirm whether the omission is intentional.
        output_dimensionality:
          type: integer
          title: Output Dimensionality
          description: >-
            Output embedding dimensions. Gemini Embedding 2 supports Matryoshka
            dimension reduction: 3072 (full), 1536, or 768.
          default: 3072
        task_type:
          type: string
          title: Task Type
          description: >-
            Embedding task type hint. Options: RETRIEVAL_DOCUMENT,
            RETRIEVAL_QUERY, SEMANTIC_SIMILARITY, CLASSIFICATION.
          default: RETRIEVAL_DOCUMENT
        response_shape:
          anyOf:
            - type: string
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Response Shape
          description: >-
            Custom structured output schema for Gemini extraction. String for
            natural language prompt, dict for explicit JSON schema.
        # NOTE(review): the text below refers to "Vertex AI multimodal
        # embeddings", while the rest of this schema describes the multimodal
        # step as Gemini Embedding 2 -- presumably stale wording; confirm.
        embedding_task:
          anyOf:
            - type: string
            - type: 'null'
          title: Embedding Task
          description: >-
            Embedding task hint for instruction-aware models (E5). Prefer
            setting this at collection level (embedding_task on the collection)
            rather than here. Collection-level overrides this value. Defaults to
            'retrieval_document'. Values: retrieval_document, retrieval_query,
            semantic_similarity, classification, clustering. Note: Vertex AI
            multimodal embeddings ignore this — only E5 transcription embeddings
            use it.
      type: object
      title: MultimodalExtractorParams
      description: |-
        Parameters for multimodal extractor v2.

        Same pipeline as v1 but uses Gemini Embedding 2 (3072D) for the
        multimodal embedding step. Supports configurable output dimensions
        via Matryoshka representation learning (3072/1536/768).
    # Passthrough extractor parameter schema: the discriminator and a single
    # boolean flag, both with defaults.
    PassthroughExtractorParams:
      title: PassthroughExtractorParams
      type: object
      description: |-
        Parameters for passthrough extractor.

        Minimal configuration - just passes data through with canonicalization.
      properties:
        extractor_type:
          title: Extractor Type
          type: string
          const: passthrough_extractor
          default: passthrough_extractor
          description: Discriminator field for parameter type identification.
        preserve_metadata:
          title: Preserve Metadata
          type: boolean
          default: true
          description: Preserve source object metadata in output document.
    # Parameter schema for the scrolling text extractor. Every tunable is
    # range-bounded and has a default; no `required` list is declared.
    ScrollingTextExtractorParams:
      properties:
        extractor_type:
          type: string
          const: scrolling_text_extractor
          title: Extractor Type
          description: Discriminator field. Must be 'scrolling_text_extractor'.
          default: scrolling_text_extractor
        fps:
          type: number
          maximum: 30
          minimum: 1
          title: Fps
          description: >-
            Frame sampling rate for analysis. Higher values improve detection
            accuracy for fast-scrolling text but increase processing time. 5 FPS
            works well for most video ads and tickers.
          default: 5
        strip_height:
          type: integer
          maximum: 200
          minimum: 10
          title: Strip Height
          description: >-
            Height (in pixels) of each scanning strip used for phase
            correlation. Should roughly match the height of the scrolling text
            band. Smaller values detect narrower text bands; larger values are
            more robust but may miss thin tickers. 40px works for most standard
            video ads.
          default: 40
        min_shift_px:
          type: number
          maximum: 20
          minimum: 0.5
          title: Min Shift Px
          description: >-
            Minimum pixel shift per frame to consider a strip as 'scrolling'.
            Lower values detect slower-moving text; higher values filter out
            noise. 2.0px is a good default for 5 FPS sampling.
          default: 2
        consistency_ratio:
          type: number
          maximum: 1
          minimum: 0.3
          title: Consistency Ratio
          description: >-
            Fraction of frame pairs that must show consistent shift for a band
            to be classified as scrolling. 0.6 means 60% of frames must agree.
            Lower values detect intermittent scrolling; higher values reduce
            false positives.
          default: 0.6
        pad:
          type: integer
          maximum: 50
          minimum: 0
          title: Pad
          description: Pixel padding above/below detected band when cropping for stitching.
          default: 8
      type: object
      title: ScrollingTextExtractorParams
      # Literal block so the Markdown is stored exactly as written. Each list
      # is preceded by a blank line and starts at column 0 of the scalar:
      # CommonMark does not recognise a list marker indented 4+ spaces, and
      # such a line directly after a paragraph is folded into that paragraph.
      description: |-
        Parameters for the scrolling text extractor.

        Detects and extracts scrolling/marquee text from video using computer
        vision (phase-correlation band detection + panoramic stitching) and
        VLM-based OCR.

        **When to Use**:

        - Video ads with scrolling promotional banners or tickers
        - News broadcasts with scrolling chyrons or tickers
        - Videos with scrolling terms & conditions or disclaimers
        - Social media content with horizontally scrolling text overlays
        - Credits sequences in film/TV content
        - Live event streams with scrolling info bars
        - Any video where important text scrolls across the screen

        **When NOT to Use**:

        - Static text overlays → use multimodal_extractor with run_ocr=True
        - Spoken content → use multimodal_extractor with run_transcription=True
        - Text documents/PDFs → use text_extractor
        - Animated text that fades/scales (not linear scroll) → use multimodal_extractor

        **How It Works**:

        1. Sample frames from video at configurable FPS
        2. Split each frame into horizontal and vertical strips
        3. Phase-correlate consecutive frames to measure per-strip pixel shift
        4. Strips with consistent shift in one direction = scrolling band
        5. Stitch the band across frames into a single wide/tall panorama image
        6. OCR the panorama using a vision language model (Gemini)
        7. Deduplicate repeated marquee loops (e.g. "SALE • SALE • SALE •" → "SALE")

        **Performance**:

        - Processing speed: ~2-5x realtime (depends on video resolution and FPS)
        - Accuracy: Best with consistent scroll speed; handles variable speed with degradation
        - Minimum video length: ~2 seconds (needs 3+ frames for correlation)

        **Supported Scroll Directions**:

        - Horizontal: Right-to-left (most common), Left-to-right
        - Vertical: Bottom-to-top (credits), Top-to-bottom
      examples:
        - description: Standard video ad with scrolling banner
          extractor_type: scrolling_text_extractor
          fps: 5
          strip_height: 40
          use_case: Extract promotional text from video ad tickers
        - description: Fast-scrolling news ticker
          extractor_type: scrolling_text_extractor
          fps: 10
          min_shift_px: 3
          strip_height: 30
          use_case: Capture rapidly scrolling chyron text from news broadcasts
        - consistency_ratio: 0.5
          description: Credits / disclaimer scroll
          extractor_type: scrolling_text_extractor
          fps: 5
          min_shift_px: 1.5
          strip_height: 60
          use_case: Extract vertically scrolling credits or legal disclaimers
    # Text extractor parameter schema. No `required` list is declared, so
    # every property below is optional.
    TextExtractorParams:
      title: TextExtractorParams
      type: object
      description: >-
        Parameters for the text extractor.


        The text extractor generates dense vector embeddings optimized for
        semantic similarity search.

        It uses the E5-Large multilingual model to convert text into
        1024-dimensional vectors.


        When ``source_type`` is ``"youtube"``, the extractor first resolves
        YouTube URLs

        to caption text via yt-dlp before chunking and embedding. Use
        ``split_by="time_segments"``

        with ``segment_length_seconds`` to segment captions by time window.
      properties:
        extractor_type:
          title: Extractor Type
          type: string
          const: text_extractor
          default: text_extractor
          description: Discriminator field for parameter type identification.
        source_type:
          title: Source Type
          type: string
          enum:
            - text
            - youtube
          default: text
          description: >-
            Source content type. Use 'youtube' to resolve YouTube URLs to
            caption text before embedding. Default: 'text' (plain text input).
        split_by:
          $ref: '#/components/schemas/TextSplitStrategy'
          default: none
          description: Strategy for splitting text into multiple documents.
        chunk_size:
          title: Chunk Size
          type: integer
          minimum: 1
          maximum: 10000
          default: 1000
          description: Target size for each chunk.
        chunk_overlap:
          title: Chunk Overlap
          type: integer
          minimum: 0
          maximum: 5000
          default: 0
          description: Number of units to overlap between consecutive chunks.
        segment_length_seconds:
          title: Segment Length Seconds
          type: integer
          minimum: 30
          maximum: 600
          default: 120
          description: >-
            Length of each transcript segment in seconds (for time_segments
            split strategy). Shorter segments give more precise search results
            but more documents.
        language:
          title: Language
          type: string
          default: en
          description: >-
            Preferred language code for YouTube captions (when
            source_type='youtube').
        extract_captions:
          title: Extract Captions
          type: boolean
          default: true
          description: >-
            Extract auto-captions or manual subtitles from YouTube videos (when
            source_type='youtube'). Falls back to video description if False.
        # The remaining properties are nullable and declare no default.
        response_shape:
          title: Response Shape
          anyOf:
            - type: string
            - type: object
              additionalProperties: true
            - type: 'null'
          description: Define custom structured output using LLM extraction.
        llm_provider:
          title: Llm Provider
          anyOf:
            - type: string
            - type: 'null'
          description: LLM provider for structured extraction (openai, google, anthropic).
        llm_model:
          title: Llm Model
          anyOf:
            - type: string
            - type: 'null'
          description: Specific LLM model for structured extraction.
        llm_api_key:
          title: Llm Api Key
          anyOf:
            - type: string
            - type: 'null'
          description: >-
            API key for LLM operations (BYOK - Bring Your Own Key). Supports:

            - Direct key: 'sk-proj-abc123...'

            - Secret reference: '{{SECRET.openai_api_key}}'


            When using secret reference, the key is loaded from your
            organization's secrets vault at runtime. Store secrets via POST
            /v1/organizations/secrets.


            If not provided, uses Mixpeek's default API keys.
        embedding_model:
          anyOf:
            - $ref: '#/components/schemas/EmbeddingModel'
            - type: 'null'
          description: >-
            Embedding model to use. Defaults to the current TEXT modality
            default in the central embedding registry. Changing this on an
            existing namespace requires a migration — dimensions are fixed at
            namespace creation.
        embedding_task:
          title: Embedding Task
          anyOf:
            - type: string
            - type: 'null'
          description: >-
            Embedding task hint for instruction-aware models (E5, Gemini).
            Prefer setting this at collection level (embedding_task on the
            collection) rather than here. Collection-level overrides this value.
            Defaults to 'retrieval_document'. Values: retrieval_document,
            retrieval_query, semantic_similarity, classification, clustering.
      examples:
        - extractor_type: text_extractor
          split_by: none
          chunk_size: 1000
          chunk_overlap: 0
        - extractor_type: text_extractor
          split_by: sentences
          chunk_size: 5
          chunk_overlap: 1
        - extractor_type: text_extractor
          source_type: youtube
          split_by: time_segments
          segment_length_seconds: 120
          language: en
    # Universal extractor parameter schema. Every property declares a
    # default and no `required` list is present.
    UniversalExtractorParams:
      title: UniversalExtractorParams
      type: object
      description: Parameters for the Universal Extractor.
      properties:
        extractor_type:
          title: Extractor Type
          type: string
          const: universal_extractor
          default: universal_extractor
          description: Discriminator field for parameter type identification.
        output_dimensionality:
          title: Output Dimensionality
          type: integer
          minimum: 256
          maximum: 3072
          default: 3072
          description: Output embedding dimensions (Gemini Embedding 2 supports 256-3072).
        task_type:
          title: Task Type
          type: string
          default: RETRIEVAL_DOCUMENT
          description: >-
            Embedding intent used as a text instruction for Gemini Embedding 2.
            Common values: RETRIEVAL_DOCUMENT, RETRIEVAL_QUERY,
            SEMANTIC_SIMILARITY.
        generate_description:
          title: Generate Description
          type: boolean
          default: true
          description: >-
            Generate a text description of the content via Gemini
            vision/understanding.
        extract_text:
          title: Extract Text
          type: boolean
          default: true
          description: >-
            Extract text content (OCR for images/docs, transcription for
            audio/video).
        # Integer limits, each bounded below by 1.
        max_video_segments:
          title: Max Video Segments
          type: integer
          minimum: 1
          maximum: 50
          default: 10
          description: Maximum number of 30s segments to process for video files.
        max_document_pages:
          title: Max Document Pages
          type: integer
          minimum: 1
          maximum: 200
          default: 50
          description: Maximum number of pages to process for document files.
        max_file_download_mb:
          title: Max File Download Mb
          type: integer
          minimum: 1
          maximum: 1024
          default: 500
          description: Maximum file download size in MB for Celery fast-path processing.
        max_concurrency:
          title: Max Concurrency
          type: integer
          minimum: 1
          maximum: 32
          default: 4
          description: Maximum per-task object concurrency for Celery fast-path processing.
      examples:
        - extractor_type: universal_extractor
          output_dimensionality: 3072
          task_type: RETRIEVAL_DOCUMENT
          generate_description: true
          extract_text: true
          max_file_download_mb: 500
          max_concurrency: 4
    WebScraperExtractorParams:
      properties:
        extractor_type:
          type: string
          const: web_scraper
          title: Extractor Type
          description: Discriminator field for parameter type identification.
          default: web_scraper
        max_depth:
          type: integer
          maximum: 10
          minimum: 0
          title: Max Depth
          description: >-
            Maximum link depth to crawl. 0=seed page only, 1=seed+direct links,
            etc. Default: 2. Max: 10.
          default: 2
        max_pages:
          type: integer
          maximum: 500
          minimum: 1
          title: Max Pages
          description: 'Maximum pages to crawl. Default: 50. Max: 500.'
          default: 50
        crawl_timeout:
          type: integer
          maximum: 3600
          minimum: 10
          title: Crawl Timeout
          description: >-
            Maximum total time for crawling in seconds. Default: 300 (5
            minutes). Increase for large sites with many pages. Max: 3600 (1
            hour).
          default: 300
        crawl_mode:
          $ref: '#/components/schemas/CrawlMode'
          description: >-
            Crawl strategy. DETERMINISTIC: BFS all links (predictable).
            SEMANTIC: LLM-guided, prioritizes relevant pages (requires
            crawl_goal).
          default: deterministic
        crawl_goal:
          anyOf:
            - type: string
            - type: 'null'
          title: Crawl Goal
          description: >-
            Goal for semantic crawling. Only used when crawl_mode=SEMANTIC.
            Example: 'Find all S3 API documentation and examples'
        render_strategy:
          $ref: '#/components/schemas/RenderStrategy'
          description: >-
            How to render pages. AUTO (default): tries static, falls back to JS.
            STATIC: fast HTTP fetch. JAVASCRIPT: Playwright browser for SPAs.
          default: auto
        include_patterns:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Include Patterns
          description: 'Regex patterns for URLs to include. Example: [''/docs/'', ''/api/'']'
        exclude_patterns:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Exclude Patterns
          description: 'Regex patterns for URLs to exclude. Example: [''/blog/'', ''\.pdf$'']'
        chunk_strategy:
          $ref: '#/components/schemas/ChunkStrategy'
          description: >-
            How to split page content. NONE: one chunk per page.
            SENTENCES/PARAGRAPHS: semantic boundaries. WORDS/CHARACTERS: fixed
            size chunks.
          default: none
        chunk_size:
          type: integer
          maximum: 10000
          minimum: 1
          title: Chunk Size
          description: Target size for each chunk (in units of chunk_strategy).
          default: 500
        chunk_overlap:
          type: integer
          maximum: 5000
          minimum: 0
          title: Chunk Overlap
          description: Overlap between chunks to preserve context.
          default: 50
        document_id_strategy:
          $ref: '#/components/schemas/DocumentIdStrategy'
          description: >-
            How to generate document IDs. URL (default): stable across
            re-crawls. POSITION: order-based. CONTENT: deduplicates identical
            content.
          default: url
        generate_text_embeddings:
          type: boolean
          title: Generate Text Embeddings
          description: Generate E5 embeddings for text content.
          default: true
        generate_code_embeddings:
          type: boolean
          title: Generate Code Embeddings
          description: Generate Jina code embeddings for code blocks.
          default: true
        generate_image_embeddings:
          type: boolean
          title: Generate Image Embeddings
          description: Generate SigLIP embeddings for images/figures.
          default: true
        generate_structure_embeddings:
          type: boolean
          title: Generate Structure Embeddings
          description: Generate DINOv2 visual structure embeddings for layout comparison.
          default: true
        response_shape:
          anyOf:
            - type: string
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Response Shape
          description: >-
            Optional structured extraction schema. Natural language or JSON
            schema. Example: 'Extract API version, deprecated methods, and
            example code'
        llm_provider:
          anyOf:
            - type: string
            - type: 'null'
          title: Llm Provider
          description: 'LLM provider for structured extraction: openai, google, anthropic'
        llm_model:
          anyOf:
            - type: string
            - type: 'null'
          title: Llm Model
          description: LLM model for structured extraction.
        llm_api_key:
          anyOf:
            - type: string
            - type: 'null'
          title: Llm Api Key
          description: >-
            API key for LLM operations (BYOK - Bring Your Own Key). Supports:

            - Direct key: 'sk-proj-abc123...'

            - Secret reference: '{{SECRET.openai_api_key}}'


            When using secret reference, the key is loaded from your
            organization's secrets vault at runtime. Store secrets via POST
            /v1/organizations/secrets.


            If not provided, uses Mixpeek's default API keys.
        max_retries:
          type: integer
          maximum: 10
          minimum: 0
          title: Max Retries
          description: >-
            Maximum retry attempts for failed HTTP requests. Uses exponential
            backoff with jitter. Default: 3.
          default: 3
        retry_base_delay:
          type: number
          maximum: 30
          minimum: 0.1
          title: Retry Base Delay
          description: >-
            Base delay in seconds for retry backoff. Actual delay = base *
            2^attempt + jitter. Default: 1.0.
          default: 1
        retry_max_delay:
          type: number
          maximum: 300
          minimum: 1
          title: Retry Max Delay
          description: 'Maximum delay in seconds between retries. Default: 30.'
          default: 30
        respect_retry_after:
          type: boolean
          title: Respect Retry After
          description: >-
            Respect Retry-After header from 429/503 responses. If False, uses
            exponential backoff instead. Default: True.
          default: true
        proxies:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Proxies
          description: >-
            List of proxy URLs for rotation. Supports formats:
            'http://host:port', 'http://user:pass@host:port',
            'socks5://host:port'. Proxies rotate on errors or every N requests.
        rotate_proxy_on_error:
          type: boolean
          title: Rotate Proxy On Error
          description: 'Rotate to next proxy when request fails. Default: True.'
          default: true
        rotate_proxy_every_n_requests:
          type: integer
          maximum: 1000
          minimum: 0
          title: Rotate Proxy Every N Requests
          description: >-
            Rotate proxy every N requests (0 = disabled). Useful for avoiding
            IP-based rate limits. Default: 0 (disabled).
          default: 0
        captcha_service_provider:
          anyOf:
            - type: string
            - type: 'null'
          title: Captcha Service Provider
          description: >-
            Captcha solving service provider: '2captcha', 'anti-captcha',
            'capsolver'. If not set, captcha pages are skipped gracefully.
        captcha_service_api_key:
          anyOf:
            - type: string
            - type: 'null'
          title: Captcha Service Api Key
          description: >-
            API key for captcha solving service. Supports secret reference:
            '{{SECRET.captcha_api_key}}'. Required if captcha_service_provider
            is set.
        detect_captcha:
          type: boolean
          title: Detect Captcha
          description: >-
            Detect captcha challenges (Cloudflare, reCAPTCHA, hCaptcha). If
            detected and no solver configured, page is skipped. Default: True.
          default: true
        persist_cookies:
          type: boolean
          title: Persist Cookies
          description: >-
            Persist cookies across requests within a crawl session. Useful for
            sites requiring authentication. Default: True.
          default: true
        custom_headers:
          anyOf:
            - additionalProperties:
                type: string
              type: object
            - type: 'null'
          title: Custom Headers
          description: >-
            Custom HTTP headers to include in all requests. Example:
            {'Authorization': 'Bearer token', 'X-Custom': 'value'}
        youtube_mode:
          type: string
          enum:
            - auto
            - 'off'
            - force
          title: Youtube Mode
          description: >-
            YouTube channel fast path. 'auto' (default): detect YouTube channel
            URLs and hand off to yt-dlp; other URLs run normal BFS. 'off': never
            hand off. 'force': treat every URL as a YouTube channel (useful for
            explicit channel buckets).
          default: auto
        youtube_max_videos:
          type: integer
          maximum: 5000
          minimum: 1
          title: Youtube Max Videos
          description: 'Max videos to pull per channel enumeration. Default: 50.'
          default: 50
        youtube_backfill_months:
          type: integer
          maximum: 60
          minimum: 1
          title: Youtube Backfill Months
          description: 'Skip videos older than this many months. Default: 6.'
          default: 6
        youtube_show_filter:
          anyOf:
            - type: string
            - type: 'null'
          title: Youtube Show Filter
          description: >-
            Optional case-insensitive regex applied to video titles. Used when a
            channel hosts multiple shows and you only want one.
        youtube_download_videos:
          type: boolean
          title: Youtube Download Videos
          description: >-
            If true (default), download each video file so downstream processors
            can read it from the `video_path` field. If false, only metadata +
            captions are emitted.
          default: true
        youtube_format_ladder:
          type: string
          title: Youtube Format Ladder
          description: >-
            yt-dlp format selector. Defaults to a muxed-first ladder that avoids
            breaking on YouTube's DASH n-challenge when the current EJS
            signature solver is stale. See BUILD_LOG.md in the greenroom folder
            for the rationale.
          default: b[height<=720]/bv*[height<=720]+ba/best
        youtube_cookies_path:
          anyOf:
            - type: string
            - type: 'null'
          title: Youtube Cookies Path
          description: >-
            Optional path to a Netscape cookies file. Required for age-gated or
            members-only content.
        youtube_request_sleep:
          type: number
          maximum: 30
          minimum: 0
          title: Youtube Request Sleep
          description: 'Seconds to sleep between yt-dlp requests. Default: 1.0.'
          default: 1
        delay_between_requests:
          type: number
          maximum: 60
          minimum: 0
          title: Delay Between Requests
          description: >-
            Delay in seconds between consecutive requests. Useful for polite
            crawling and avoiding rate limits. Default: 0 (no delay).
          default: 0
      type: object
      title: WebScraperExtractorParams
      description: >-
        Parameters for the web scraper extractor.


        The web scraper extractor crawls websites and extracts content with
        three types

        of embeddings for comprehensive multimodal search:


        **Embedding Types:**

        - Text (E5-Large): 1024D embeddings for page content

        - Code (Jina Code): 768D embeddings for code blocks

        - Images (SigLIP): 768D semantic embeddings for figures/screenshots

        - Images (DINOv2): 768D structure embeddings for visual layout
        comparison


        **Crawl Modes:**

        - DETERMINISTIC: BFS following all links (default, predictable)

        - SEMANTIC: LLM-guided, prioritizes pages matching crawl_goal


        **Rendering Strategies:**

        - STATIC: Fast HTTP fetch (default, works for most sites)

        - JAVASCRIPT: Playwright browser for SPAs (React/Vue/Angular)

        - AUTO: Tries static, falls back to JS if content too short


        **Use Cases:**

        - Documentation freshness: Crawl docs, compare against course content

        - Job board ingestion: Extract job listings with structured data

        - Knowledge base building: Convert websites to searchable collections

        - Code example indexing: Find API usage patterns across docs
      examples:
        - chunk_size: 3
          chunk_strategy: paragraphs
          description: Documentation site crawl
          extractor_type: web_scraper
          max_depth: 3
          max_pages: 100
        - description: Job board extraction
          extractor_type: web_scraper
          max_depth: 1
          max_pages: 50
          render_strategy: auto
          response_shape: Extract job title, department, location, and requirements
        - crawl_goal: Find all S3 upload examples and API documentation
          crawl_mode: semantic
          description: Semantic crawl for API docs
          extractor_type: web_scraper
          generate_code_embeddings: true
          max_pages: 200
        - delay_between_requests: 0.5
          description: Large-scale catalogue with resilience
          extractor_type: web_scraper
          max_depth: 5
          max_pages: 10000
          max_retries: 5
          respect_retry_after: true
        - description: Protected site with proxy rotation
          extractor_type: web_scraper
          max_pages: 5000
          persist_cookies: true
          proxies:
            - http://proxy1.example.com:8080
            - http://proxy2.example.com:8080
          rotate_proxy_every_n_requests: 50
          rotate_proxy_on_error: true
    # Open-ended parameter object for plugin-defined extractors: only
    # `extractor_type` is declared and required; extra keys are allowed.
    CustomPluginParams:
      title: CustomPluginParams
      description: |-
        Parameters for custom plugin extractors.

        This model accepts any extractor_type that doesn't match the builtin extractors,
        allowing custom plugins to define their own parameters.
      type: object
      required:
        - extractor_type
      additionalProperties: true
      properties:
        extractor_type:
          title: Extractor Type
          description: Custom plugin extractor type (plugin name)
          type: string
    # Declarative rule for filling one key of a constructed inputs payload.
    # Only `input_key` is required. The source types named in the descriptions
    # are kept in sync with the InputSourceType enum
    # (payload, literal, vector, blob).
    InputMapping:
      properties:
        input_key:
          type: string
          title: Input Key
          description: Key used in the constructed inputs payload.
        source_type:
          anyOf:
            - $ref: '#/components/schemas/InputSourceType'
            - type: 'null'
          description: Source of the value (payload, literal, vector, blob).
        path:
          anyOf:
            - type: string
            - type: 'null'
          title: Path
          description: >-
            Dot-notation path inside payload/vector when source_type is PAYLOAD
            or VECTOR.
        override:
          anyOf:
            - {}
            - type: 'null'
          title: Override
          description: Static value used when source_type is LITERAL. Overrides any path.
      type: object
      required:
        - input_key
      title: InputMapping
      description: |-
        Declarative mapping for building inputs from various sources.

        - input_key: The key used in the constructed inputs payload
        - source_type: Where to fetch the value (payload, literal, vector, blob)
        - path: Dot-notation path when source_type is PAYLOAD or VECTOR
        - override: Static value when source_type is LITERAL
      examples:
        - input_key: query_text
          path: content.title
          source_type: payload
        - input_key: lang
          override: en
          source_type: literal
        - input_key: image_vector
          path: features.clip_vit_l_14
          source_type: vector
    # Describes one field to copy from the source (bucket object or upstream
    # collection document) into output documents. Only `source_path` is
    # required; `target_path`, `default` and `required` are optional.
    FieldPassthrough:
      properties:
        source_path:
          type: string
          title: Source Path
          description: >-
            REQUIRED. Path to the source field to copy. Simple fields: Use field
            name directly (e.g., 'title', 'campaign_id'). Nested fields: Use dot
            notation (e.g., 'metadata.author', 'config.model.version'). The
            field must exist in the source bucket schema or upstream collection
            schema. Without target_path, nested fields are flattened:
            'metadata.author' becomes 'author' in output.
          examples:
            - title
            - campaign_id
            - metadata.author
            - config.model_version
            - category
        target_path:
          anyOf:
            - type: string
            - type: 'null'
          title: Target Path
          # Literal block (|-) so each bullet stays on its own line; the folded
          # form collapsed the lists into a single run-on paragraph.
          description: |-
            OPTIONAL. Target field name in output document.

            If NOT PROVIDED: Uses source_path name (or last component for nested paths).
            - 'title' → 'title'
            - 'metadata.author' → 'author'

            If PROVIDED: Uses this exact name in output.
            - source_path='doc_title', target_path='title' → 'title'
            - source_path='metadata.author', target_path='contributor' → 'contributor'

            Use cases:
            - Rename fields for cleaner API schemas
            - Avoid name conflicts with extractor outputs
            - Standardize field names across different sources

            Constraints:
            - Must not conflict with system fields (document_id, collection_id, etc.)
            - Must not conflict with extractor output fields
            - Must be a valid field name (alphanumeric, underscores, hyphens)
          examples:
            - title
            - author
            - campaign_name
            - product_id
            - user_email
        default:
          anyOf:
            - {}
            - type: 'null'
          title: Default
          description: >-
            OPTIONAL. Default value if source field doesn't exist or is None. If
            NOT PROVIDED and field missing: Field is omitted from output
            document. If PROVIDED and field missing: Field is included with this
            default value. Type should match expected field type (string, int,
            list, dict, etc.).
          examples:
            - Unknown
            - N/A
            - 0
            - []
            - {}
            - false
        required:
          type: boolean
          title: Required
          description: >-
            OPTIONAL. Whether this field MUST exist in source. If True and field
            missing: Raises validation error, processing fails. If False and
            field missing: Field omitted (or default used if provided). Use True
            for: Critical identifiers, required business fields. Use False for:
            Optional metadata, nice-to-have fields. Default: False (field is
            optional).
          default: false
      type: object
      required:
        - source_path
      title: FieldPassthrough
      description: >-
        Configuration for passing fields from source to output documents.


        Simple field passthrough: specify which fields to copy from source
        (bucket object

        or upstream collection document) to the output documents alongside
        extractor outputs.


        Use Cases:
            - Preserve identifiers: campaign_id, product_sku, user_id
            - Keep metadata: category, tags, author, timestamp
            - Maintain business context: priority, status, region
            - Extract nested values: metadata.author, config.model_version
            - Rename fields for cleaner schemas: doc_title → title

        Field Selection:
            - WITHOUT field_passthrough: Only extractor outputs appear in documents
            - WITH field_passthrough: Specified fields + extractor outputs
            - WITH include_all_source_fields=True: All source fields + extractor outputs

        Field Naming:
            - WITHOUT target_path: Output uses source name (or last component for nested)
              - "title" → "title"
              - "metadata.author" → "author"
            - WITH target_path: Output uses specified name
              - source_path="doc_title", target_path="title" → "title"
              - source_path="metadata.author", target_path="contributor" → "contributor"

        Requirements:
            - source_path is REQUIRED - specifies which field to copy (supports dot notation)
            - target_path is OPTIONAL - rename field in output (default: auto-derived name)
            - default is OPTIONAL - provides fallback if field missing (default: omit field)
            - required is OPTIONAL - errors if field missing (default: false, omit field)
      examples:
        - description: Simple field passthrough
          required: true
          source_path: title
        - default: uncategorized
          description: Field with default value
          source_path: category
        - description: Field renaming for cleaner schema
          source_path: doc_title
          target_path: title
        - description: Nested field extraction with custom name
          source_path: metadata.author
          target_path: contributor
        - description: Required field with renaming
          required: true
          source_path: campaign_id
          target_path: campaign
        - default: normal
          description: Optional field with default and renaming
          required: false
          source_path: metadata.priority
          target_path: priority
    # String enum naming where a document/point came from.
    SourceType:
      title: SourceType
      description: Source types for any document/point.
      type: string
      enum:
        - bucket
        - collection
        - taxonomy
        - cluster
        - direct_upsert
        # Quoted so the literal string "none" is never mistaken for a null.
        - 'none'
    # Schema key is `SourceFilters-Output`; its `title` is `SourceFilters`.
    # It declares a single optional property, `filters`, which is either a
    # LogicalOperator-Output tree or null (null = no filtering, per the
    # property description below).
    SourceFilters-Output:
      properties:
        filters:
          anyOf:
            - $ref: '#/components/schemas/LogicalOperator-Output'
            - type: 'null'
          description: >-
            Optional logical filters to apply to source data. Uses
            LogicalOperator model with AND/OR/NOT support. When specified, only
            objects/documents matching these filters will be processed by this
            collection. When null, all source data is processed (no filtering).
            Filters are consistent across all batch runs for this collection.
          examples:
            - AND:
                - field: blobs.type
                  operator: eq
                  value: video
            - AND:
                - field: metadata.status
                  operator: eq
                  value: active
                - field: created_at
                  operator: gte
                  value: '2025-10-01T00:00:00Z'
            - OR:
                - field: brand_name
                  operator: in
                  value:
                    - Acme
                    - TechCo
                - field: category
                  operator: eq
                  value: premium
      type: object
      title: SourceFilters
      # The dates inside this description are fixed illustrative values, not
      # computed relative to "now".
      description: >-
        Filters applied to source data when processing collections.


        Source filters determine which objects (from buckets) or documents (from
        collections)

        are processed by this collection. Filters use the same LogicalOperator
        model as

        list APIs throughout the system, supporting complex AND/OR/NOT logic.


        Use Cases:
            - Process only specific content types from mixed-content buckets
            - Filter by metadata fields (status, category, tags, dates)
            - Create specialized collections from broader sources
            - Exclude certain objects or documents from processing

        Examples:
            Process only video content:
                {
                    "AND": [
                        {"field": "blobs.type", "operator": "eq", "value": "video"}
                    ]
                }

            Process only active, published content:
                {
                    "AND": [
                        {"field": "metadata.status", "operator": "eq", "value": "active"},
                        {"field": "metadata.published", "operator": "eq", "value": true}
                    ]
                }

            Process content from last 30 days:
                {
                    "AND": [
                        {"field": "created_at", "operator": "gte", "value": "2025-10-08T00:00:00Z"}
                    ]
                }

            Process specific brands OR categories:
                {
                    "OR": [
                        {"field": "brand_name", "operator": "in", "value": ["Acme", "TechCo"]},
                        {"field": "category", "operator": "eq", "value": "premium"}
                    ]
                }

        Filter Operators:
            - eq (equals)
            - ne (not equals)
            - gt (greater than)
            - gte (greater than or equal)
            - lt (less than)
            - lte (less than or equal)
            - in (value in list)
            - nin (value not in list)
            - contains (string contains)
            - starts_with (string starts with)
            - ends_with (string ends with)

        Performance Considerations:
            - Filters are evaluated at batch creation time
            - Only matching objects/documents are included in processing
            - More selective filters = smaller batches = faster processing
            - Use indexed fields (metadata, timestamps) for better performance

        Relationship to Batch Filters:
            - Source filters: Applied at collection definition (consistent across all batches)
            - Batch filters: Applied at batch creation (ad-hoc, per-batch basis)
            - Both can be used together: source filters + batch filters = intersection
      # Each example carries a free-form `description` label next to `filters`;
      # the last one omits `filters` entirely to illustrate "no filtering".
      examples:
        - description: Filter only video content
          filters:
            AND:
              - field: blobs.type
                operator: eq
                value: video
        - description: Filter active content from specific brands
          filters:
            AND:
              - field: metadata.status
                operator: eq
                value: active
              - field: brand_name
                operator: in
                value:
                  - Acme
                  - TechCo
                  - Innovate
        - description: Filter by content type and date range
          filters:
            AND:
              - OR:
                  - field: blobs.type
                    operator: eq
                    value: video
                  - field: blobs.type
                    operator: eq
                    value: image
              - field: created_at
                operator: gte
                value: '2025-10-01T00:00:00Z'
              - field: created_at
                operator: lte
                value: '2025-10-31T23:59:59Z'
        - description: No filters - process all source data
    # Single-value enum: `materialize` is the only declared execution mode.
    TaxonomyExecutionMode:
      title: TaxonomyExecutionMode
      description: How a taxonomy should be executed when attached to a collection.
      type: string
      enum:
        - materialize
    # Recursive boolean filter node. AND / OR / NOT each hold a list whose
    # items are either nested LogicalOperator-Output nodes or FilterCondition
    # leaves. `additionalProperties: true` permits the shorthand
    # {"field.name": value} form mentioned in the description.
    #
    # Property samples use the JSON Schema `examples` array (one entry each)
    # rather than the singular `example` keyword, which OpenAPI 3.1 deprecates
    # and which no other schema in this document uses.
    LogicalOperator-Output:
      properties:
        AND:
          anyOf:
            - items:
                anyOf:
                  - $ref: '#/components/schemas/LogicalOperator-Output'
                  - $ref: '#/components/schemas/FilterCondition'
              type: array
            - type: 'null'
          title: And
          description: Logical AND operation - all conditions must be true
          examples:
            - - field: name
                operator: eq
                value: John
              - field: age
                operator: gte
                value: 30
        OR:
          anyOf:
            - items:
                anyOf:
                  - $ref: '#/components/schemas/LogicalOperator-Output'
                  - $ref: '#/components/schemas/FilterCondition'
              type: array
            - type: 'null'
          title: Or
          description: Logical OR operation - at least one condition must be true
          examples:
            - - field: status
                operator: eq
                value: active
              - field: role
                operator: eq
                value: admin
        NOT:
          anyOf:
            - items:
                anyOf:
                  - $ref: '#/components/schemas/LogicalOperator-Output'
                  - $ref: '#/components/schemas/FilterCondition'
              type: array
            - type: 'null'
          title: Not
          description: Logical NOT operation - all conditions must be false
          examples:
            - - field: department
                operator: eq
                value: HR
              - field: location
                operator: eq
                value: remote
        case_sensitive:
          anyOf:
            - type: boolean
            - type: 'null'
          title: Case Sensitive
          description: Whether to perform case-sensitive matching
          default: false
          examples:
            - true
      additionalProperties: true
      type: object
      title: LogicalOperator
      description: >-
        Represents a logical operation (AND, OR, NOT) on filter conditions.


        Allows nesting with a defined depth limit.


        Also supports shorthand syntax where field names can be passed directly

        as key-value pairs for equality filtering (e.g., {"metadata.title":
        "value"}).
    # String enum with three declared styles: full_chain, best_match, combined.
    HierarchicalEnrichmentStyle:
      title: HierarchicalEnrichmentStyle
      description: |-
        How hierarchical taxonomy results should be structured in enriched documents.

        Controls the field naming pattern for multi-tier taxonomy enrichment.
      type: string
      enum:
        - full_chain
        - best_match
        - combined
    # String enum of alert triggers: on_ingest, scheduled, on_demand.
    AlertExecutionMode:
      title: AlertExecutionMode
      description: When the alert should execute.
      type: string
      enum:
        - on_ingest
        - scheduled
        - on_demand
    # Pairs a retriever input name (`input_key`) with the place its value
    # comes from (`source`, an InputMappingSource). Both keys are required.
    AlertInputMapping:
      title: AlertInputMapping
      description: |-
        Maps a retriever input to a document field or constant.

        Input mappings define how to construct retriever inputs from the
        ingested document. This allows the same alert to be used with
        different collections that have different field structures.

        Examples:
            # Map document embedding to retriever query
            {
                "input_key": "query_embedding",
                "source": {"source_type": "document_field", "path": "features.video_embedding"}
            }

            # Use constant collection for search target
            {
                "input_key": "collection_id",
                "source": {"source_type": "constant", "value": "col_known_incidents"}
            }

        Attributes:
            input_key: The retriever input parameter name
            source: Where to get the value from
      type: object
      required:
        - input_key
        - source
      properties:
        input_key:
          title: Input Key
          description: The retriever input parameter name
          type: string
          examples:
            - query_embedding
            - collection_id
            - category_filter
        source:
          description: Where to get the value from
          $ref: "#/components/schemas/InputMappingSource"
      examples:
        - input_key: query_embedding
          source:
            source_type: document_field
            path: features.video_embedding
        - input_key: target_collection
          source:
            source_type: constant
            value: col_known_incidents
    # Integer enum (1-4). The description below maps each number to a phase
    # name: TAXONOMY=1, CLUSTER=2, ALERT=3, RETRIEVER_ENRICHMENT=4.
    # The `AlertApplicationConfig(...)` snippets inside the description are
    # illustrative text only, not part of the schema.
    PostProcessingPhase:
      type: integer
      enum:
        - 1
        - 2
        - 3
        - 4
      title: PostProcessingPhase
      description: >-
        Execution phases for post-processing applications.


        Applications execute in phase order (lower = earlier).

        Within a phase, applications execute by priority (higher = earlier).


        Phases:
            TAXONOMY (1): Classification and labeling operations
            CLUSTER (2): Grouping and clustering operations
            ALERT (3): Notifications and alerts (default for alerts)
            RETRIEVER_ENRICHMENT (4): Retriever-based enrichment operations

        The default phase for each application type:
            - TaxonomyApplicationConfig: TAXONOMY
            - ClusterApplicationConfig: CLUSTER
            - AlertApplicationConfig: ALERT
            - RetrieverEnrichmentConfig: RETRIEVER_ENRICHMENT

        Users can override the phase via the `execution_phase` field to run

        applications in non-default order. For example, an alert can be
        configured

        to run in Phase 1 alongside taxonomies if early notification is needed.


        Example:
            # Default: Alert runs after taxonomies and clusters
            AlertApplicationConfig(alert_id="alt_123", execution_phase=PostProcessingPhase.ALERT)

            # Override: Run alert early, in taxonomy phase
            AlertApplicationConfig(alert_id="alt_urgent", execution_phase=PostProcessingPhase.TAXONOMY)
    # Same shape as AlertInputMapping: a required `input_key` plus a required
    # `source` (InputMappingSource) describing where the value is read from.
    EnrichmentInputMapping:
      title: EnrichmentInputMapping
      description: >-
        Maps a document field or constant to a retriever input parameter.


        Defines how to construct retriever inputs from the document being
        enriched.


        Attributes:
            input_key: The retriever input parameter name
            source: Where to get the value from (document field or constant)
      type: object
      required:
        - input_key
        - source
      properties:
        input_key:
          title: Input Key
          description: The retriever input parameter name
          type: string
          examples:
            - query
            - query_embedding
            - collection_id
        source:
          description: Where to get the value from (document field or constant)
          $ref: "#/components/schemas/InputMappingSource"
      examples:
        - input_key: query
          source:
            source_type: document_field
            path: title
        - input_key: collection_id
          source:
            source_type: constant
            value: col_reference_data
    # Copies one field of a retriever result onto the document.
    # `source_field` and `target_field` are required; `mode` defaults to
    # `first`.
    WriteBackFieldMapping:
      title: WriteBackFieldMapping
      description: |-
        Maps a field from retriever results back to the document.

        Controls how retriever result fields are written to the source document.

        Attributes:
            source_field: Field path in retriever result (dot notation for nested fields)
            target_field: Field name to write on the document
            mode: How to aggregate values across multiple results:
                - "first": Write value from first result only (default)
                - "all_as_array": Collect values from all results into a list
                - "concat": Concatenate string values with ", " separator
      type: object
      required:
        - source_field
        - target_field
      properties:
        source_field:
          title: Source Field
          description: Field path in retriever result (dot notation supported)
          type: string
          examples:
            - ai_safety_insight.text
            - category
            - score
        target_field:
          title: Target Field
          description: Field name to write on the document
          type: string
          examples:
            - _enrichment_category
            - safety_score
            - related_items
        mode:
          title: Mode
          description: >-
            How to aggregate values from multiple results. 'first': top result
            only. 'all_as_array': collect into list. 'concat': join strings with
            ', '.
          type: string
          default: first
          enum:
            - first
            - all_as_array
            - concat
    # String enum of the types a bucket schema field may declare. Per the
    # description below, the values fall into metadata (JSON) types and
    # file (blob) types; the enum itself does not encode that split.
    BucketSchemaFieldType:
      type: string
      enum:
        - string
        - number
        - integer
        - float
        - boolean
        - object
        - array
        - date
        - datetime
        - text
        - image
        - audio
        - video
        - pdf
        - excel
      title: BucketSchemaFieldType
      description: >-
        Supported data types for bucket schema fields.


        Types fall into two categories:


        1. **Metadata Types** (JSON types):
           - Stored as object metadata
           - Standard JSON-compatible types
           - Not processed by extractors (unless explicitly mapped)
           - Examples: string, number, boolean, date

        2. **File Types** (blobs):
           - Stored as files/blobs
           - Processed by extractors
           - Require file content (URL or base64)
           - Examples: text, image, video, pdf

        **GIF Special Handling**:
            GIF files can be declared as either IMAGE or VIDEO type:

            - As IMAGE: GIF is embedded as a single static image (first frame)
            - As VIDEO: GIF is decomposed frame-by-frame with embeddings per frame

            The multimodal extractor detects GIFs via MIME type (image/gif) and routes
            them based on your schema declaration. Use VIDEO for animated GIFs where
            frame-level search is needed, IMAGE for static/thumbnail use cases.

        NOTE: For retriever input schemas that need to accept document
        references

        (e.g., "find similar documents"), use RetrieverInputSchemaFieldType
        instead,

        which includes all bucket types plus document_reference.
    # String enum of split methods. Unlike the sibling enums it had no
    # `description`; the one added here only restates the declared values.
    # NOTE(review): what is being split (video? audio?) is not visible in this
    # schema — confirm against the schemas that reference it before expanding.
    SplitMethod:
      type: string
      enum:
        - time
        - scene
        - silence
      title: SplitMethod
      description: Supported split methods (time, scene, or silence).
    # Tuning knobs for a generative model call. No property is required;
    # the four numeric settings carry defaults, while `response_mime_type`
    # and `response_schema` are nullable with no default declared.
    GenerationConfig:
      title: GenerationConfig
      description: Configuration for generative models.
      type: object
      properties:
        candidate_count:
          title: Candidate Count
          description: Number of candidate responses to generate for video description.
          type: integer
          default: 1
        max_output_tokens:
          title: Max Output Tokens
          description: Maximum number of tokens for the generated video description.
          type: integer
          default: 1024
        temperature:
          title: Temperature
          description: >-
            Controls randomness for video description generation.
            Higher is more random.
          type: number
          default: 0.2
        top_p:
          title: Top P
          description: Nucleus sampling (top-p) for video description generation.
          type: number
          default: 0.8
        response_mime_type:
          title: Response Mime Type
          description: MIME type for response (e.g., 'application/json')
          anyOf:
            - type: string
            - type: "null"
        response_schema:
          title: Response Schema
          description: JSON schema for structured output
          anyOf:
            - type: object
              additionalProperties: true
            - type: "null"
    # String enum of text chunking units; `none` is a literal enum value.
    TextSplitStrategy:
      title: TextSplitStrategy
      description: Strategy for splitting text into chunks.
      type: string
      enum:
        - characters
        - words
        - sentences
        - paragraphs
        - pages
        - time_segments
        # Quoted so the literal string "none" is never mistaken for a null.
        - 'none'
    # String enum of accepted embedding model identifiers.
    # The description previously stated a single naming format; several
    # declared values (hyphenated or `@`-suffixed names) do not match it, so
    # the description now says so explicitly.
    EmbeddingModel:
      type: string
      enum:
        - laion_clip_vit_l_14_v1
        - multilingual_e5_large_instruct_v1
        - vertex_multimodal_embedding
        - multimodalembedding@001
        - gemini-embedding-2
        - google_siglip_base_v1
        - google_siglip_so400m_v1
        - text-embedding-3-small
        - text-embedding-3-large
        - face_identity_arcface_r100_v1
        - all_minilm_l6_v2_v1
      title: EmbeddingModel
      description: |-
        Embedding model identifiers.

        Typical format: {provider}_{model_name}_{version}

        Some identifiers (e.g. multimodalembedding@001, gemini-embedding-2,
        text-embedding-3-small) do not follow this pattern; always use the
        exact enum value.
    # String enum with two crawl modes: deterministic and semantic.
    CrawlMode:
      title: CrawlMode
      description: |-
        Mode for crawling web pages.

        Values:
            DETERMINISTIC: BFS crawl following all links up to max_depth
            SEMANTIC: LLM-guided crawl prioritizing pages relevant to crawl_goal
      type: string
      enum:
        - deterministic
        - semantic
    # String enum with three page-rendering strategies: static, javascript,
    # auto.
    RenderStrategy:
      title: RenderStrategy
      description: |-
        Strategy for rendering web pages.

        Values:
            STATIC: Fast HTTP fetch, works for most sites
            JAVASCRIPT: Browser rendering via Playwright for SPAs
            AUTO: Try static first, fall back to JS if content too short
      type: string
      enum:
        - static
        - javascript
        - auto
    # String enum of page-content chunking units; `none` is a literal value.
    ChunkStrategy:
      title: ChunkStrategy
      description: Strategy for splitting page content into chunks.
      type: string
      enum:
        # Quoted so the literal string "none" is never mistaken for a null.
        - 'none'
        - sentences
        - paragraphs
        - words
        - characters
    # String enum selecting what a document ID is derived from: url,
    # position, or content (see the hash recipes in the description).
    DocumentIdStrategy:
      title: DocumentIdStrategy
      description: |-
        Strategy for generating deterministic document IDs.

        Values:
            URL: hash(page_url + chunk_index) - stable across re-crawls
            POSITION: hash(seed_url + page_index + chunk_index) - order-based
            CONTENT: hash(content) - deduplicates identical content
      type: string
      enum:
        - url
        - position
        - content
    # String enum of value sources for an input: payload, literal, vector,
    # blob.
    InputSourceType:
      title: InputSourceType
      description: Where the value for an input should be retrieved from.
      type: string
      enum:
        - payload
        - literal
        - vector
        - blob
    # One field/operator/value comparison.
    FilterCondition:
      title: FilterCondition
      type: object
      description: |-
        Represents a single filter condition.

        Attributes:
            field: The field to filter on
            operator: The comparison operator
            value: The value to compare against
      # `operator` is not listed here; it declares `eq` as its default.
      required:
        - field
        - value
      properties:
        field:
          title: Field
          type: string
          description: Field name to filter on
        operator:
          # In OpenAPI 3.1 a $ref may carry sibling keywords such as `default`.
          $ref: '#/components/schemas/FilterOperator'
          default: eq
          description: Comparison operator
        value:
          title: Value
          description: Value to compare against
          # Either a DynamicValue object or, via the empty schema `{}`, any
          # other JSON value.
          anyOf:
            - $ref: '#/components/schemas/DynamicValue'
            - {}
    # Describes where a retriever input value comes from. Only `source_type`
    # is required; `path` and `value` apply to specific source types, as
    # stated in their descriptions.
    InputMappingSource:
      title: InputMappingSource
      type: object
      description: >-
        Defines how to get a value for a retriever input.


        Can be a document field reference, a constant value, or a source blob
        URL.


        Attributes:
            source_type: Where the value comes from
            path: JSONPath to document field (when source_type='document_field'),
                  or blob property name (when source_type='source_blob', e.g. 'image')
            value: Constant value to use (when source_type='constant')
      required:
        - source_type
      properties:
        source_type:
          title: Source Type
          description: Where the value comes from
          type: string
          enum:
            - document_field
            - constant
            - source_blob
        path:
          title: Path
          description: >-
            JSONPath to document field (when source_type='document_field'), or
            blob property name (when source_type='source_blob', e.g. 'image')
          # Nullable string.
          anyOf:
            - type: string
            - type: 'null'
          examples:
            - features.video_embedding
            - metadata.category
            - _taxonomy_product_label
            - image
        value:
          title: Value
          description: Constant value to use (when source_type='constant')
          # The empty schema `{}` accepts any JSON value; null is also listed
          # explicitly.
          anyOf:
            - {}
            - type: 'null'
          # Examples are deliberately typed: a string, a number, and a boolean.
          examples:
            - col_known_incidents
            - 0.85
            - true
    # String enum of the comparison operators a filter condition may use.
    FilterOperator:
      title: FilterOperator
      description: Supported filter operators across database implementations.
      type: string
      enum:
        - eq
        - ne
        - gt
        - lt
        - gte
        - lte
        - in
        - nin
        - contains
        - starts_with
        - ends_with
        - regex
        - exists
        - is_null
        - text
        - phrase
    # Object that points at a path in the runtime query request instead of
    # carrying a literal value. Only `field` is required.
    DynamicValue:
      title: DynamicValue
      type: object
      description: A value that should be dynamically resolved from the query request.
      required:
        - field
      properties:
        type:
          # Constant tag: the only allowed value is `dynamic`, which is also
          # the declared default.
          title: Type
          type: string
          const: dynamic
          default: dynamic
        field:
          title: Field
          type: string
          description: >-
            The dot-notation path to the value in the runtime query request,
            e.g., 'inputs.user_id'
          examples:
            - inputs.query_text
            - filters.AND[0].value

````