> ## Documentation Index
> Fetch the complete documentation index at: https://docs.mixpeek.com/docs/llms.txt
> Use this file to discover all available pages before exploring further.

# Add Objects to Batch

> Add objects to an existing batch. The batch must be in 'draft' status.



## OpenAPI

````yaml post /v1/buckets/{bucket_identifier}/batches/{batch_id}/objects
openapi: 3.1.0
# API-level metadata: title, version, human-readable summary, and contact
# details for the published API.
info:
  title: Mixpeek API
  # Quoted so the version is read as the string "0.82", not a float.
  version: '0.82'
  description: >-
    This is the Mixpeek API, providing access to various endpoints
    for data processing and retrieval.
  termsOfService: 'https://mixpeek.com/terms'
  contact:
    name: Mixpeek Support
    email: info@mixpeek.com
    url: 'https://mixpeek.com/contact'
# Base URLs that the paths below are resolved against.
servers:
  - description: Production
    url: 'https://api.mixpeek.com'
# Empty list: no security requirement is declared at the document level.
# NOTE(review): the operation in this document lists 401 and 403 responses,
# but no security scheme is referenced in the visible portion — confirm
# whether a requirement should be declared here.
security: []
paths:
  /v1/buckets/{bucket_identifier}/batches/{batch_id}/objects:
    # POST: add object IDs (request body) to the batch addressed by the two
    # path parameters; the updated batch is returned on success.
    post:
      operationId: add_objects_to_batch_v1_buckets__bucket_identifier__batches__batch_id__objects_post
      summary: Add Objects to Batch
      description: >-
        Add objects to an existing batch.
        The batch must be in 'draft' status.
      tags:
        - Bucket Batches
      parameters:
        # Path parameters: both are required strings.
        - name: bucket_identifier
          in: path
          required: true
          description: The unique identifier of the bucket.
          schema:
            title: Bucket Identifier
            type: string
            description: The unique identifier of the bucket.
        - name: batch_id
          in: path
          required: true
          description: The unique identifier of the batch.
          schema:
            title: Batch Id
            type: string
            description: The unique identifier of the batch.
        # Optional query flag; defaults to false (validation is performed).
        - name: skip_validation
          in: query
          required: false
          description: >-
            Skip object existence validation. Use this for large batches
            (>10k objects) or when you're certain all object IDs are valid.
            Improves performance significantly.
          schema:
            title: Skip Validation
            type: boolean
            default: false
            description: >-
              Skip object existence validation. Use this for large batches
              (>10k objects) or when you're certain all object IDs are valid.
              Improves performance significantly.
      # A JSON body is mandatory; its shape is AddObjectsToBatchRequest.
      requestBody:
        required: true
        content:
          application/json:
            schema: {$ref: '#/components/schemas/AddObjectsToBatchRequest'}
      responses:
        # Success: the batch is returned as a BatchModel.
        '200':
          description: Successful Response
          content:
            application/json:
              schema: {$ref: '#/components/schemas/BatchModel'}
        # Error responses share the ErrorResponse schema, except 422, which
        # uses HTTPValidationError.
        '400':
          description: Bad Request
          content:
            application/json:
              schema: {$ref: '#/components/schemas/ErrorResponse'}
        '401':
          description: Unauthorized
          content:
            application/json:
              schema: {$ref: '#/components/schemas/ErrorResponse'}
        '403':
          description: Forbidden
          content:
            application/json:
              schema: {$ref: '#/components/schemas/ErrorResponse'}
        '404':
          description: Not Found
          content:
            application/json:
              schema: {$ref: '#/components/schemas/ErrorResponse'}
        '422':
          description: Validation Error
          content:
            application/json:
              schema: {$ref: '#/components/schemas/HTTPValidationError'}
        '500':
          description: Internal Server Error
          content:
            application/json:
              schema: {$ref: '#/components/schemas/ErrorResponse'}
components:
  schemas:
    AddObjectsToBatchRequest:
      # Request body schema: an object with a single required property,
      # object_ids, which must be an array of at least one string.
      title: AddObjectsToBatchRequest
      description: The request model for adding objects to an existing batch.
      type: object
      required:
        - object_ids
      properties:
        object_ids:
          title: Object Ids
          description: A list of object IDs to add to the batch.
          type: array
          minItems: 1
          items:
            type: string
          examples:
            # A single example value: a two-element list of IDs.
            - [object_789, object_101]
    BatchModel:
      properties:
        batch_id:
          type: string
          title: Batch Id
          description: >-
            OPTIONAL (auto-generated if not provided). Unique identifier for
            this batch. Format: 'btch_' prefix followed by 12-character secure
            token. Generated using generate_secure_token() from
            shared.utilities.helpers. Used to query batch status and track
            processing across tiers. Immutable after creation.
          examples:
            - btch_abc123xyz789
            - btch_video_batch_01
        bucket_id:
          type: string
          title: Bucket Id
          description: >-
            REQUIRED. Unique identifier of the bucket containing the objects to
            process. Must be a valid bucket ID that exists in the system. All
            object_ids must belong to this bucket. Format: Bucket ID as defined
            when bucket was created.
          examples:
            - bkt_videos
            - bkt_documents_q4
        namespace_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Namespace Id
          description: Namespace this batch belongs to. Stored at creation time.
        status:
          $ref: '#/components/schemas/TaskStatusEnum'
          description: >-
            OPTIONAL (defaults to DRAFT). Current processing status of the
            batch. Lifecycle: DRAFT → PENDING → IN_PROGRESS → COMPLETED/FAILED.
            DRAFT: Batch created but not yet submitted. PENDING: Batch submitted
            and queued for processing. IN_PROGRESS: Batch currently processing
            (one or more tiers active). COMPLETED: All tiers successfully
            completed. FAILED: One or more tiers failed. Aggregated from
            tier_tasks statuses during multi-tier processing.
          default: DRAFT
          examples:
            - DRAFT
            - PENDING
            - IN_PROGRESS
            - COMPLETED
            - FAILED
        object_ids:
          items:
            type: string
          type: array
          minItems: 0
          title: Object Ids
          description: >-
            List of object IDs to include in this batch. All objects must exist
            in the specified bucket_id. These objects are the source data for
            tier 0 processing. Collection-sourced batches may have empty
            object_ids. Objects are processed in parallel within each tier.
          examples:
            - - obj_video_001
              - obj_video_002
            - - obj_doc_123
        dedup_strategy:
          $ref: '#/components/schemas/DedupStrategy'
          description: >-
            Controls how objects already processed in prior batches are handled.
            Scoped to (bucket, collection). 'skip': Don't reprocess objects that
            already have documents. 'replace': Delete existing documents and
            reprocess. 'force': Process regardless, allowing duplicates.
          default: skip
        dedup_audit:
          anyOf:
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Dedup Audit
          description: >-
            Per-collection dedup decisions. Keys are collection_ids; values
            contain dedup_strategy, total_input, skipped, processed, and
            skipped_object_ids (up to 1000). Written at TWO stages, deep-merged
            per collection: (1) the API at manifest build, when smart-skip
            enforcement excludes already-complete objects before any engine
            submission (fields prefixed manifest_*), and (2) the Engine after
            its resume filter runs on whatever residue was submitted.
        collection_ids:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Collection Ids
          description: >-
            OPTIONAL. List of all collection IDs involved in this batch's
            processing. Automatically populated during DAG resolution from
            dag_tiers. Includes collections from all tiers (flattened view of
            dag_tiers). Used for quick lookups without traversing tier
            structure. Format: List of collection IDs across all tiers.
          examples:
            - - col_chunks
            - - col_chunks
              - col_frames
              - col_scenes
        error:
          anyOf:
            - type: string
            - type: 'null'
          title: Error
          description: >-
            OPTIONAL. Legacy error message field for backward compatibility.
            None if batch succeeded or is still processing. Contains
            human-readable error description from first failed tier. DEPRECATED:
            Use tier_tasks[].errors for detailed error information. For
            multi-tier batches, typically contains the error from the first
            failed tier. Check tier_tasks array for tier-specific error details
            and error_summary for aggregation.
          examples:
            - 'Failed to process batch: Object not found'
            - 'Tier 1 failed: Out of memory during frame extraction'
            - Collection 'col_frames' not found
        failure_reason:
          anyOf:
            - type: string
            - type: 'null'
          title: Failure Reason
          description: >-
            OPTIONAL. Human-readable explanation of why the batch failed. None
            if batch succeeded, is still processing, or is in DRAFT/PENDING
            state. Populated automatically when a batch transitions to FAILED
            status. Provides a concise, actionable summary of the root cause.
            Common reasons include: Ray job failure (spot preemption, OOM, code
            errors), 0 documents written (processing completed but produced no
            output), processing stall (no activity detected for extended
            period), or task exception (submission/validation failures). Use
            this field for user-facing error displays and alerting.
          examples:
            - 'Ray job failed: ImportError: No module named ''google.genai'''
            - >-
              Processing completed but produced 0 documents. Check Ray job logs
              for '[FailureAggregator]' entries.
            - >-
              Processing stalled: no activity for 30 minutes. Job was cancelled
              automatically.
            - Namespace ns_abc123 no longer exists (may have been deleted).
        error_summary:
          anyOf:
            - additionalProperties:
                type: integer
              type: object
            - type: 'null'
          title: Error Summary
          description: >-
            OPTIONAL. Aggregated summary of errors across ALL tiers in the
            batch. None if batch succeeded or is still processing. Maps
            error_type (category) to total count of affected documents across
            all tiers. Provides quick batch-wide overview of error distribution.
            Example: {'dependency': 15, 'authentication': 25, 'validation': 5}
            means across all tiers, 15 documents failed with dependency errors,
            25 with auth errors, 5 with validation errors. Automatically
            aggregated from tier_tasks[].error_summary. Used for batch health
            dashboard and error trend analysis.
          examples:
            - null
            - dependency: 15
            - authentication: 25
              runtime: 10
              validation: 5
        failure_category:
          anyOf:
            - $ref: '#/components/schemas/FailureCategory'
            - type: 'null'
          description: >-
            OPTIONAL. Machine-readable classification of the batch failure. None
            if batch succeeded or is still processing. Auto-derived from
            `failure_reason` if not explicitly set, so existing writers that
            only populate `failure_reason` still get a category on read.
            Categories: timeout (stall/no-progress), infrastructure (OOM,
            workers died, spot preemption), orphaned (RayJob CRD gone), pipeline
            (genuine extractor/pipeline failure), validation (submission/schema
            errors), unknown (uncategorized). Use this instead of parsing
            failure_reason.
          examples:
            - null
            - timeout
            - infrastructure
            - pipeline
        failed_objects:
          items:
            $ref: '#/components/schemas/FailedObjectRecord'
          type: array
          title: Failed Objects
          description: >-
            OPTIONAL. List of per-object failure records from batch processing.
            Populated when individual objects fail while others succeed. Each
            record includes the object_id, error message, error classification
            (transient/permanent/resource), and timestamp. When this list is
            non-empty and some objects succeeded, batch status is
            COMPLETED_WITH_ERRORS. Enables targeted resubmission of only failed
            objects.
          examples:
            - []
            - - error: 'Unsupported codec: VP9'
                error_type: permanent
                object_id: obj_video_001
                timestamp: '2025-11-29T10:15:30Z'
              - error: Connection timeout to embedding service
                error_type: transient
                object_id: obj_doc_123
                timestamp: '2025-11-29T10:15:35Z'
        failed_object_count:
          type: integer
          minimum: 0
          title: Failed Object Count
          description: >-
            OPTIONAL. Count of objects that failed during batch processing.
            Shorthand for len(failed_objects). Stored separately for efficient
            queries and sorting without loading full failed_objects array.
          default: 0
          examples:
            - 0
            - 3
            - 15
        type:
          $ref: '#/components/schemas/BatchType'
          description: >-
            OPTIONAL (defaults to BUCKET). Type of batch. BUCKET: Standard batch
            processing bucket objects through collections. COLLECTION: Reserved
            for future collection-only batch processing. Currently only BUCKET
            type is supported.
          default: BUCKET
          examples:
            - BUCKET
        manifest_key:
          anyOf:
            - type: string
            - type: 'null'
          title: Manifest Key
          description: >-
            OPTIONAL. S3 key where the batch manifest is stored. Contains
            metadata and row data (Parquet) for Engine processing. For tier 0,
            points to bucket object manifest. For tier N+, points to collection
            document manifest. Format: S3 path (e.g.,
            'namespace_id/internal_id/manifests/tier_0.parquet'). Generated
            during batch submission.
          examples:
            - ns_abc/org_123/manifests/tier_0.parquet
            - ns_xyz/org_456/manifests/tier_1.parquet
        task_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Task Id
          description: >-
            OPTIONAL. Primary task ID for the batch (typically tier 0 task).
            Used for backward compatibility with single-tier batch tracking. For
            multi-tier batches, prefer querying tier_tasks array for granular
            tracking. Format: Task ID as generated for tier 0.
          examples:
            - task_tier0_abc123
            - task_batch_001
        loaded_object_ids:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Loaded Object Ids
          description: >-
            OPTIONAL. List of object IDs that were successfully validated and
            loaded into the batch. Subset of object_ids that passed validation.
            Used to track which objects are ready for processing. None if batch
            hasn't been validated yet.
          examples:
            - - obj_video_001
              - obj_video_002
        internal_metadata:
          anyOf:
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Internal Metadata
          description: >-
            OPTIONAL. Internal engine/job metadata for system use. May contain:
            job_id (provider-specific), engine_version, processing hints,
            last_health_check. last_health_check: Most recent health check
            results with health_status, enriched_documents,
            vector_populated_count, stall_duration_seconds, recommendations,
            missing_features. Populated asynchronously (non-blocking,
            best-effort). Used for troubleshooting batch processing issues via
            API. NOTE: In MongoDB, this is stored under '_internal.processing'
            path.
          examples:
            - include_history: true
              last_health_check:
                enriched_documents: 98
                health_status: HEALTHY
                missing_features:
                  - text_embedding
                processed_documents: 100
                recommendations: []
                stall_duration_seconds: 0
                timestamp: '2025-11-06T10:05:00Z'
                total_documents: 100
                vector_populated_count: 98
        metadata:
          $ref: '#/components/schemas/BatchMetadata'
          description: >-
            OPTIONAL. User-defined metadata for the batch. Has typed fields
            (campaign_id, source, tags, notes) and also accepts arbitrary extra
            keys. Persisted with the batch and returned in API responses. Not
            used by the system for processing logic.
          examples:
            - campaign_id: Q4_2025
              tags:
                - video
                - high-priority
            - notes: TubeScience daily ingest
              source: s3://raw-uploads/2026-05/
        tier_tasks:
          items:
            $ref: '#/components/schemas/TierTaskInfo'
          type: array
          title: Tier Tasks
          description: >-
            OPTIONAL. List of tier task tracking information for multi-tier
            processing. Each element represents one tier in the processing
            pipeline. Empty array for simple single-tier batches. Populated
            during batch submission with tier 0 info, then appended as tiers
            progress. Each TierTaskInfo contains: tier_num, task_id, status,
            collection_ids, timestamps. Used for granular monitoring: 'Show me
            status of tier 2' or 'Retry tier 1'. Array index typically matches
            tier_num (tier_tasks[0] = tier 0, tier_tasks[1] = tier 1, etc.).
          examples:
            - []
            - - collection_ids:
                  - col_chunks
                status: COMPLETED
                task_id: task_tier0_abc
                tier_num: 0
        current_tier:
          anyOf:
            - type: integer
              minimum: 0
            - type: 'null'
          title: Current Tier
          description: >-
            OPTIONAL. Zero-based index of the currently processing tier. None if
            batch hasn't started processing (status=DRAFT or PENDING). Updated
            as batch progresses through tiers. Used to show processing progress:
            'Processing tier 2 of 5'. Set to last tier number when batch
            completes. Example: If processing tier 1 (frames), current_tier=1.
          examples:
            - 0
            - 1
            - 2
        total_tiers:
          type: integer
          minimum: 1
          title: Total Tiers
          description: >-
            OPTIONAL (defaults to 1). Total number of tiers in the collection
            DAG. Minimum 1 (tier 0 only = bucket → collection). Set during DAG
            resolution when batch is submitted. Equals len(dag_tiers) if
            dag_tiers is populated. Used to calculate progress: current_tier /
            total_tiers. Example: 5-tier pipeline (bucket → chunks → frames →
            scenes → summaries) has total_tiers=5.
          default: 1
          examples:
            - 1
            - 3
            - 5
        dag_tiers:
          anyOf:
            - items:
                items:
                  type: string
                type: array
              type: array
            - type: 'null'
          title: Dag Tiers
          description: >-
            OPTIONAL. Complete DAG tier structure for this batch. List of tiers,
            where each tier is a list of collection IDs to process at that
            stage. Tier 0 = bucket-sourced collections. Tier N (N > 0) =
            collection-sourced collections. Collections within same tier have no
            dependencies (can run in parallel). Collections in tier N+1 depend
            on collections in tier N. Populated during DAG resolution at batch
            submission. Used for tier-by-tier processing orchestration. Example:
            [['col_chunks'], ['col_frames', 'col_objects'], ['col_scenes']] = 3
            tiers where frames and objects run in parallel at tier 1.
          examples:
            - - - col_chunks
            - - - col_chunks
              - - col_frames
            - - - col_chunks
              - - col_frames
                - col_objects
              - - col_scenes
        created_at:
          type: string
          format: date-time
          title: Created At
          description: >-
            OPTIONAL (auto-set on creation). ISO 8601 timestamp when batch was
            created. Set using current_time() from shared.utilities.helpers.
            Immutable after creation. Used for batch age tracking and cleanup of
            old batches.
          examples:
            - '2025-11-03T10:00:00Z'
        progress:
          anyOf:
            - $ref: '#/components/schemas/BatchProgress'
            - type: 'null'
          description: >-
            OPTIONAL. Live progress snapshot updated approximately every 10
            seconds while the batch is IN_PROGRESS. Written by the Ray
            ProgressActor inside the engine job. None when status is DRAFT or
            PENDING (job not started), or after COMPLETED/FAILED. Use this to
            show real-time progress bars: processed/total objects, percent
            complete, throughput (items_per_second), and estimated time
            remaining (eta_seconds).
          examples:
            - null
            - batch_count: 703
              errors: 0
              eta_seconds: 7308
              items_per_second: 12.5
              percent: 33
              processed: 45000
              total: 136356
        documents_written:
          anyOf:
            - type: integer
            - type: 'null'
          title: Documents Written
          description: >-
            OPTIONAL. Read-time aggregate of documents_written from tier_tasks
            and extractor_jobs. None means the completion callback has not
            reported write accounting yet.
          examples:
            - null
            - 1000
        status_diagnostics:
          additionalProperties: true
          type: object
          title: Status Diagnostics
          description: >-
            Read-time diagnostics explaining terminal status, error indicators,
            and document-write accounting.
        health:
          anyOf:
            - type: string
            - type: 'null'
          title: Health
          description: >-
            OPTIONAL. Computed health status for actively processing batches.
            Only populated when status is PROCESSING or IN_PROGRESS. Values:
            'healthy' (recent activity detected), 'stalled' (no activity for 5+
            minutes), 'unknown' (no heartbeat data yet). Computed from
            tier_tasks[].last_activity_at and updated_at. Use this to detect
            stuck batches before the internal stall detector kills them.
          examples:
            - healthy
            - stalled
            - unknown
            - null
        cost:
          anyOf:
            - $ref: '#/components/schemas/BatchCost'
            - type: 'null'
          description: >-
            OPTIONAL. Dollar cost of this batch, derived at read time from the
            credits recorded against it in usage_records (credits_consumed *
            $0.001/credit). Lets callers answer 'what did this batch cost?' from
            the batch response without GCP billing labels. None when cost could
            not be looked up (e.g. the usage store was unavailable); 0
            credits/USD when no usage has been recorded for the batch yet.
        last_activity_at:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Last Activity At
          description: >-
            OPTIONAL. Timestamp of the most recent activity across all tier
            tasks. Aggregated from tier_tasks[].last_activity_at — the latest
            heartbeat from any tier. Updated approximately every 10 seconds by
            the BatchJobPoller while processing. A stale value (minutes old)
            while status is PROCESSING indicates the batch may be stalled. None
            for batches that have not started processing or have no heartbeat
            data.
        retry_count:
          type: integer
          minimum: 0
          title: Retry Count
          description: >-
            OPTIONAL (defaults to 0). Number of times this batch has been
            auto-retried due to transient infrastructure failures (spot node
            preemption, OOM, actor death). Incremented each time the batch is
            automatically requeued after a retryable failure. User-facing: lets
            users see that retries happened transparently.
          default: 0
          examples:
            - 0
            - 1
            - 2
            - 3
        max_retries:
          type: integer
          minimum: 0
          title: Max Retries
          description: >-
            OPTIONAL (defaults to 3). Maximum number of automatic retries for
            transient failures. When retry_count reaches max_retries, the batch
            stays in FAILED state. Only transient/infrastructure failures
            trigger retries — validation and data errors do not.
          default: 3
          examples:
            - 3
            - 5
        last_retry_at:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Last Retry At
          description: >-
            OPTIONAL. ISO 8601 timestamp of the most recent auto-retry attempt.
            None if the batch has never been retried. Used to calculate
            exponential backoff for subsequent retries.
          examples:
            - null
            - '2025-11-03T10:35:00Z'
        retry_reason:
          anyOf:
            - type: string
            - type: 'null'
          title: Retry Reason
          description: >-
            OPTIONAL. Human-readable reason for the most recent auto-retry. None
            if the batch has never been retried. Describes the transient failure
            that triggered the retry (e.g., 'Spot node preempted', 'Ray actor
            died', 'OOM killed').
          examples:
            - null
            - Spot node preempted
            - Ray actor died unexpectedly
            - Worker node evicted (OOM)
        webhook_url:
          anyOf:
            - type: string
            - type: 'null'
          title: Webhook Url
          description: >-
            OPTIONAL. URL to receive an HTTP POST notification when the batch
            reaches a terminal state (COMPLETED, FAILED, or CANCELED). Set at
            submit time via SubmitBatchRequest. The webhook is fire-and-forget:
            delivery failures are logged but never affect batch processing.
          examples:
            - https://example.com/webhooks/batch-complete
            - null
        updated_at:
          type: string
          format: date-time
          title: Updated At
          description: >-
            OPTIONAL (auto-updated). ISO 8601 timestamp when batch was last
            modified. Updated using current_time() whenever batch status or
            tier_tasks change. Used to track batch activity and identify stale
            batches.
          examples:
            - '2025-11-03T10:30:00Z'
        status_message:
          anyOf:
            - type: string
            - type: 'null'
          title: Status Message
          description: >-
            COMPUTED. Human-readable description of the current batch state.
            Examples: 'Processing 724/50,000 objects (1.4%)', 'Queued — 2
            batches ahead', 'Completed in 5m 23s', 'Loading model (stage 1/3)'.
            Computed on read, not stored in the database.
        estimated_completion:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Estimated Completion
          description: >-
            COMPUTED. Estimated completion timestamp based on current
            throughput. Derived from progress.eta_seconds + now. None if
            throughput data is unavailable. Computed on read, not stored in the
            database.
      type: object
      required:
        - bucket_id
      title: BatchModel
      description: >-
        Model representing a batch of objects for processing through
        collections.


        A batch groups bucket objects together for processing through one or
        more collections.

        Batches support multi-tier processing where collections are processed in
        dependency order

        (e.g., bucket → chunks → frames → scenes). Each tier has independent
        task tracking.


        Use Cases:
            - Process multiple objects through collections in a single batch
            - Track progress of multi-tier decomposition pipelines
            - Monitor and retry individual processing tiers
            - Query batch status and tier-specific task information

        Lifecycle:
            1. Created in DRAFT status with object_ids
            2. Submitted for processing → status changes to PENDING
            3. Each tier processes sequentially (tier 0 → tier 1 → ... → tier N)
            4. Batch completes when all tiers finish (status=COMPLETED) or any tier fails (status=FAILED)

        Multi-Tier Processing:
            - Tier 0: Bucket objects → Collections (bucket as source)
            - Tier N (N > 0): Collection documents → Collections (upstream collection as source)
            - Each tier gets independent task tracking via tier_tasks array
            - Processing proceeds tier-by-tier with automatic chaining

        Requirements:
            - batch_id: OPTIONAL (auto-generated if not provided)
            - bucket_id: REQUIRED
            - status: OPTIONAL (defaults to DRAFT)
            - object_ids: REQUIRED for processing (must have at least 1 object)
            - collection_ids: OPTIONAL (discovered via DAG resolution)
            - tier_tasks: OPTIONAL (populated during processing)
            - current_tier: OPTIONAL (set during processing)
            - total_tiers: OPTIONAL (defaults to 1, set during DAG resolution)
            - dag_tiers: OPTIONAL (populated during DAG resolution)
      examples:
        - batch_id: btch_simple_001
          bucket_id: bkt_videos
          collection_ids:
            - col_chunks
          dag_tiers:
            - - col_chunks
          description: Simple single-tier batch (DRAFT)
          metadata:
            campaign_id: Q4_2025
          object_ids:
            - obj_video_001
            - obj_video_002
          status: DRAFT
          tier_tasks: []
          total_tiers: 1
          type: BUCKET
        - batch_id: btch_multitier_002
          bucket_id: bkt_videos
          collection_ids:
            - col_chunks
            - col_frames
            - col_scenes
          current_tier: 1
          dag_tiers:
            - - col_chunks
            - - col_frames
            - - col_scenes
          description: 'Multi-tier batch (IN_PROGRESS): Currently processing tier 1'
          manifest_key: ns_abc/org_123/manifests/tier_1.parquet
          metadata:
            user_email: user@example.com
          object_ids:
            - obj_video_001
          status: IN_PROGRESS
          task_id: task_tier0_abc123
          tier_tasks:
            - collection_ids:
                - col_chunks
              completed_at: '2025-11-03T10:05:00Z'
              source_type: bucket
              started_at: '2025-11-03T10:00:00Z'
              status: COMPLETED
              task_id: task_tier0_abc123
              tier_num: 0
            - collection_ids:
                - col_frames
              parent_task_id: task_tier0_abc123
              source_collection_ids:
                - col_chunks
              source_type: collection
              started_at: '2025-11-03T10:05:00Z'
              status: IN_PROGRESS
              task_id: task_tier1_def456
              tier_num: 1
            - collection_ids:
                - col_scenes
              source_collection_ids:
                - col_frames
              source_type: collection
              status: PENDING
              tier_num: 2
          total_tiers: 3
          type: BUCKET
        - batch_id: btch_complete_003
          bucket_id: bkt_videos
          collection_ids:
            - col_chunks
            - col_frames
            - col_scenes
          current_tier: 2
          dag_tiers:
            - - col_chunks
            - - col_frames
            - - col_scenes
          description: 'Multi-tier batch (COMPLETED): All 3 tiers finished'
          object_ids:
            - obj_video_001
          status: COMPLETED
          tier_tasks:
            - collection_ids:
                - col_chunks
              completed_at: '2025-11-03T10:05:00Z'
              status: COMPLETED
              task_id: task_tier0_abc
              tier_num: 0
            - collection_ids:
                - col_frames
              completed_at: '2025-11-03T10:10:00Z'
              status: COMPLETED
              task_id: task_tier1_def
              tier_num: 1
            - collection_ids:
                - col_scenes
              completed_at: '2025-11-03T10:15:00Z'
              status: COMPLETED
              task_id: task_tier2_ghi
              tier_num: 2
          total_tiers: 3
          type: BUCKET
        - batch_id: btch_failed_004
          bucket_id: bkt_videos
          collection_ids:
            - col_chunks
            - col_frames
          current_tier: 1
          dag_tiers:
            - - col_chunks
            - - col_frames
          description: 'Multi-tier batch (FAILED): Tier 1 failed with dependency error'
          error: 'Tier 1 failed: Missing required package: google-genai'
          error_summary:
            authentication: 2
            dependency: 5
          object_ids:
            - obj_video_001
          status: FAILED
          tier_tasks:
            - collection_ids:
                - col_chunks
              completed_at: '2025-11-03T10:05:00Z'
              errors: []
              status: COMPLETED
              task_id: task_tier0_abc
              tier_num: 0
            - collection_ids:
                - col_frames
              completed_at: '2025-11-03T10:06:30Z'
              error_summary:
                authentication: 2
                dependency: 5
              errors:
                - affected_count: 5
                  component: VertexMultimodalService
                  error_type: dependency
                  message: 'Missing required package: google-genai'
                  recovery_suggestion: Install google-genai package
                  stage: gemini_extraction
                - affected_count: 2
                  component: VertexMultimodalService
                  error_type: authentication
                  message: Invalid Vertex AI credentials
              status: FAILED
              task_id: task_tier1_def
              tier_num: 1
          total_tiers: 2
          type: BUCKET
    ErrorResponse:
      properties:
        success:
          type: boolean
          title: Success
          description: Always false for error responses
          default: false
        status:
          type: integer
          title: Status
          description: HTTP status code for this error
        error:
          $ref: '#/components/schemas/ErrorDetail'
          description: Error details payload
      type: object
      required:
        - status
        - error
      title: ErrorResponse
      description: Error response model.
      examples:
        - error:
            details:
              id: ns_123
              resource: namespace
            message: Namespace not found
            type: NotFoundError
          status: 404
          success: false
    HTTPValidationError:
      properties:
        detail:
          items:
            $ref: '#/components/schemas/ValidationError'
          type: array
          title: Detail
      type: object
      title: HTTPValidationError
    TaskStatusEnum:
      type: string
      enum:
        - PENDING
        - QUEUED
        - IN_PROGRESS
        - PROCESSING
        - COMPLETED
        - COMPLETED_WITH_ERRORS
        - FAILED
        - CANCELED
        - INTERRUPTED
        - UNKNOWN
        - SKIPPED
        - DRAFT
        - ACTIVE
        - ARCHIVED
        - SUSPENDED
      title: TaskStatusEnum
      description: |-
        Enumeration of task statuses for tracking asynchronous operations.

        Task statuses indicate the current state of asynchronous operations like
        batch processing, object ingestion, clustering, and taxonomy execution.

        Status Categories:
            Operation Statuses: Track progress of async operations
            Lifecycle Statuses: Track entity state (buckets, collections, namespaces)

        Values:
            PENDING: Task is queued but has not started processing yet
            QUEUED: Task is queued and has not started yet (non-terminal; keep polling)
            IN_PROGRESS: Task is currently being executed
            PROCESSING: Task is actively processing data (similar to IN_PROGRESS)
            COMPLETED: Task finished successfully with no errors
            COMPLETED_WITH_ERRORS: Task finished but some items failed (partial success)
            FAILED: Task encountered an error and could not complete
            CANCELED: Task was manually canceled by a user or system
            INTERRUPTED: Task was interrupted before it finished
            UNKNOWN: Task status could not be determined
            SKIPPED: Task was intentionally skipped
            DRAFT: Task is in draft state and not yet submitted

            ACTIVE: Entity is active and operational (for buckets, collections, etc.)
            ARCHIVED: Entity has been archived
            SUSPENDED: Entity has been temporarily suspended

        Terminal Statuses:
            COMPLETED, COMPLETED_WITH_ERRORS, FAILED, CANCELED are terminal statuses.
            Once a task reaches these states, it will not transition to another state.

        Partial Success Handling:
            COMPLETED_WITH_ERRORS indicates that the operation completed but some
            documents/items failed. The task result includes:
            - List of successful items
            - List of failed items with error details
            - Success rate percentage
            This allows clients to handle partial success scenarios appropriately.

        Polling Guidance:
            - Poll tasks in PENDING, QUEUED, IN_PROGRESS, or PROCESSING states
            - Stop polling when task reaches COMPLETED, COMPLETED_WITH_ERRORS, FAILED, or CANCELED
            - Use exponential backoff (1s → 30s) when polling
    DedupStrategy:
      type: string
      enum:
        - skip
        - replace
        - force
      title: DedupStrategy
      description: |-
        Controls how duplicate objects are handled during batch processing.

        Dedup is scoped to (bucket_id, collection_id): an object is considered
        a duplicate if the target collection already has documents produced from
        the same source object in any prior batch.
    FailureCategory:
      type: string
      enum:
        - timeout
        - infrastructure
        - orphaned
        - pipeline
        - validation
        - unknown
      title: FailureCategory
      description: |-
        Batch-level failure classification.

        Coarser-grained than ErrorCategory (which classifies individual object
        errors). FailureCategory is set on the batch itself to tell users
        *why the batch as a whole failed* — timeout, infra, orphan, pipeline,
        validation, or unknown. Drives the "Batch failed: <category>" badge in
        Studio and lets callers distinguish retryable infra blips from genuine
        pipeline bugs without parsing human-readable strings.
    FailedObjectRecord:
      properties:
        object_id:
          type: string
          title: Object Id
          description: ID of the object that failed processing.
          examples:
            - obj_video_001
            - obj_doc_123
        error:
          type: string
          title: Error
          description: Human-readable error message describing what went wrong.
          examples:
            - 'Invalid video format: unsupported codec'
            - Connection timeout to embedding service
            - CUDA out of memory
        error_type:
          type: string
          enum:
            - transient
            - permanent
            - resource
          title: Error Type
          description: >-
            Classification of the error for retry decisions. transient: network,
            timeout, temporary service issues (worth retrying). permanent: bad
            data, unsupported format (will never succeed). resource: GPU OOM,
            quota exceeded (may succeed with different resources).
          examples:
            - transient
            - permanent
            - resource
        timestamp:
          type: string
          title: Timestamp
          description: ISO 8601 timestamp when the error occurred.
          examples:
            - '2025-11-29T10:15:30Z'
      type: object
      required:
        - object_id
        - error
        - error_type
        - timestamp
      title: FailedObjectRecord
      description: >-
        Record of a single object that failed during batch processing.


        Stored on the batch document in MongoDB to provide per-object error

        visibility without requiring a separate query to the failed_documents

        collection.


        Used to:

        - Show users exactly which objects failed and why

        - Classify errors for retry decisions (transient vs permanent vs
        resource)

        - Enable selective resubmission of failed objects
    BatchType:
      type: string
      enum:
        - BUCKET
        - COLLECTION
      title: BatchType
      description: The type of batch.
    BatchMetadata:
      properties:
        campaign_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Campaign Id
          description: Identifier linking this batch to a marketing or processing campaign.
          examples:
            - Q4_2025
            - onboarding_wave_3
        source:
          anyOf:
            - type: string
            - type: 'null'
          title: Source
          description: >-
            Origin of the batch data (e.g. an S3 prefix, partner name, or
            pipeline stage).
          examples:
            - s3://raw-uploads/2026-05/
            - partner_acme
        tags:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Tags
          description: Free-form tags for filtering and grouping batches.
          examples:
            - - video
              - high-priority
            - - backfill
              - Q2
        notes:
          anyOf:
            - type: string
            - type: 'null'
          title: Notes
          description: >-
            Free-form notes about this batch (intent, context, special
            handling).
          examples:
            - Re-run after Whisper quota fix
            - TubeScience daily ingest
      additionalProperties: true
      type: object
      title: BatchMetadata
      description: |-
        Typed user-defined metadata for a batch.

        Known fields are validated and surfaced in API docs. Additional
        arbitrary keys are accepted via ``model_config extra="allow"``.
    TierTaskInfo:
      properties:
        tier_num:
          type: integer
          minimum: 0
          title: Tier Num
          description: >-
            REQUIRED. Zero-based tier number indicating the processing stage.
            Tier 0 = initial bucket-to-collection processing (bucket objects as
            source). Tier N (N > 0) = collection-to-collection processing
            (upstream documents as source). Used to determine processing order
            and identify which stage a task represents. Example: In a 4-tier
            pipeline (bucket → chunks → frames → scenes → summaries),
            chunks=tier 0, frames=tier 1, scenes=tier 2, summaries=tier 3.
          examples:
            - 0
            - 1
            - 2
        task_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Task Id
          description: >-
            OPTIONAL. Unique task identifier for this tier's processing task.
            None if tier has not yet started (status=PENDING). Assigned when
            tier processing begins (status=IN_PROGRESS). Used to query task
            status via GET /v1/tasks/{task_id}. Format: 'task_' prefix followed
            by secure token. Generated using generate_secure_token() from
            shared.utilities.helpers.
          examples:
            - task_tier0_abc123
            - task_tier1_def456
            - task_tier2_ghi789
        status:
          $ref: '#/components/schemas/TaskStatusEnum'
          description: >-
            REQUIRED. Current processing status of this tier's task. Lifecycle:
            PENDING → IN_PROGRESS → COMPLETED/FAILED. PENDING: Tier scheduled
            but not yet started. IN_PROGRESS: Tier currently processing
            documents. COMPLETED: Tier successfully processed all documents.
            FAILED: Tier encountered an error and stopped processing. Used to
            determine overall batch status and whether to proceed to next tier.
          default: PENDING
          examples:
            - PENDING
            - IN_PROGRESS
            - COMPLETED
            - FAILED
        collection_ids:
          items:
            type: string
          type: array
          minItems: 1
          title: Collection Ids
          description: >-
            REQUIRED. List of collection IDs being processed in this tier.
            Flattened from extractor_jobs for convenience. Each tier can process
            one or more collections in parallel. Collections in the same tier
            have no dependencies on each other. Format: Collection IDs as
            defined when collections were created. Minimum 1 collection per
            tier. Example: Tier 1 might process ['col_frames_30fps',
            'col_frames_60fps'] in parallel.
          examples:
            - - col_chunks
            - - col_frames
              - col_scenes
            - - col_summaries
        extractor_jobs:
          items:
            $ref: '#/components/schemas/ExtractorJobInfo'
          type: array
          title: Extractor Jobs
          description: >-
            List of extractor jobs for this tier (one per unique
            feature_extractor_type). NEW as of 2025-12-31: Tiers now support
            multiple Ray jobs. Empty list for backwards compatibility with old
            batches. Tier completes when ALL extractor_jobs reach COMPLETED
            status.
        source_type:
          type: string
          title: Source Type
          description: >-
            REQUIRED. Type of data source for this tier's processing. 'bucket':
            Tier 0 processing where source is bucket objects from the objects
            table. 'collection': Tier N (N > 0) processing where source is
            documents from upstream collection(s). Determines how the API
            prepares the input dataset manifest for the Engine. Bucket sources
            query the objects table and include file blobs. Collection sources
            query the documents table and include processed features.
          examples:
            - bucket
            - collection
        source_collection_ids:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Source Collection Ids
          description: >-
            Conditionally required: must be None for tier 0; REQUIRED for tier
            N (N > 0). List of upstream collection IDs that provide documents as
            input to this tier. Typically contains collection IDs from the
            previous tier (tier_num - 1). Used by the API to query documents
            from these collections for processing. These upstream documents are
            converted to a Parquet manifest for the Engine. Example: If tier 1
            processes 'col_frames' and sources from tier 0's 'col_chunks', then
            source_collection_ids=['col_chunks'].
          examples:
            - - col_chunks
            - - col_frames
            - - col_scenes
              - col_objects
        parent_task_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Parent Task Id
          description: >-
            OPTIONAL. Task ID of the previous tier (tier_num - 1) that processed
            before this tier. Used to link tiers together for audit trail and
            lineage tracking. None for tier 0 (no parent). Enables queries like
            'show all tiers that processed after tier 0' or 'trace back through
            all parent tiers to find the original batch'. Format: Same as
            task_id (e.g., 'task_tier0_abc123').
          examples:
            - task_tier0_abc123
            - task_tier1_def456
        started_at:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Started At
          description: >-
            OPTIONAL. ISO 8601 timestamp when this tier began processing. None
            if tier has not yet started (status=PENDING). Set using
            current_time() from shared.utilities.helpers when tier starts. Used
            to calculate tier processing duration and identify long-running
            tiers. Example: '2025-11-03T10:00:00Z'.
          examples:
            - '2025-11-03T10:00:00Z'
            - '2025-11-03T14:30:15Z'
        completed_at:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Completed At
          description: >-
            OPTIONAL. ISO 8601 timestamp when this tier finished processing
            (success or failure). None if tier has not yet completed
            (status=PENDING or IN_PROGRESS). Set using current_time() from
            shared.utilities.helpers when tier completes. Used to calculate tier
            processing duration (completed_at - started_at). Set for both
            COMPLETED and FAILED statuses. Example: '2025-11-03T10:05:00Z'.
          examples:
            - '2025-11-03T10:05:00Z'
            - '2025-11-03T14:35:42Z'
        duration_ms:
          anyOf:
            - type: number
            - type: 'null'
          title: Duration Ms
          description: >-
            OPTIONAL. Processing duration in milliseconds for this tier.
            Calculated as (completed_at - started_at) when tier completes. None
            if tier has not yet completed or if started_at was not set. Provides
            a pre-computed duration for easy querying without timestamp math.
            Set for both COMPLETED and FAILED statuses.
          examples:
            - 300000
            - 5000.5
            - 12345.67
        errors:
          items:
            $ref: '#/components/schemas/BatchErrorDetail'
          type: array
          title: Errors
          description: >-
            OPTIONAL. List of detailed errors that occurred during tier
            processing. Empty list if tier succeeded or has not yet completed.
            Each error includes: error_type, message, component, stage,
            traceback, timestamp. Multiple errors may occur if different
            documents fail with different issues. Used for detailed error
            analysis, debugging, and intelligent retry logic. Example: Multiple
            documents failing with different errors (dependency vs auth). For
            backward compatibility, check if list is empty for
            success/in-progress status.
          examples:
            - []
            - - affected_count: 5
                component: VertexMultimodalService
                error_type: dependency
                message: 'Missing required package: google-genai'
                recovery_suggestion: Install google-genai package
                stage: gemini_extraction
            - - affected_count: 10
                component: VertexMultimodalService
                error_type: authentication
                message: Invalid Vertex AI credentials
              - affected_count: 3
                component: GeminiExtractor
                error_type: validation
                message: Schema validation failed
        error_summary:
          anyOf:
            - additionalProperties:
                type: integer
              type: object
            - type: 'null'
          title: Error Summary
          description: >-
            OPTIONAL. Aggregated summary of errors by error type. None if tier
            succeeded or has not yet completed. Maps error_type (category) to
            count of affected documents. Provides quick overview of error
            distribution without parsing full error list. Example:
            {'dependency': 5, 'authentication': 10, 'validation': 3} means 5
            documents failed with dependency errors, 10 with auth errors, 3 with
            validation. Automatically generated from errors list for
            convenience. Used for batch health monitoring and error trend
            analysis.
          examples:
            - null
            - dependency: 5
            - authentication: 10
              runtime: 2
              validation: 3
        performance:
          anyOf:
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Performance
          description: >-
            OPTIONAL. Performance metrics summary for this tier's execution.
            Automatically populated after tier completion by collecting data
            from ClickHouse analytics. Contains: total_time_ms (total execution
            time), avg_latency_ms (average operation latency), bottlenecks (list
            of slowest operations), stage_count (number of profiled stages).
            Used for troubleshooting performance issues and identifying
            bottlenecks. None if tier has not completed or performance data
            collection failed. Populated asynchronously (non-blocking,
            best-effort).
          examples:
            - avg_latency_ms: 234.56
              bottlenecks:
                - avg_time_ms: 113.58
                  execution_count: 50
                  max_time_ms: 234.56
                  stage_name: gcs_batch_upload_all_segments
                  total_time_ms: 5678.9
                - avg_time_ms: 69.14
                  execution_count: 50
                  max_time_ms: 123.45
                  stage_name: pipeline_run
                  total_time_ms: 3456.78
              stage_count: 5
              timestamp: '2025-11-06T10:05:00Z'
              total_time_ms: 12345.67
        ray_job_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Ray Job Id
          description: >-
            OPTIONAL. Ray/Anyscale job ID for tracking the infrastructure-level
            processing job. None if tier has not yet started or if the job ID
            was not returned by the engine. Set when tier processing is
            submitted to Ray/Anyscale via the Engine. Used for cancelling
            running jobs and monitoring infrastructure-level status. Format:
            'raysubmit_' prefix followed by job identifier (e.g.,
            'raysubmit_9pDAyZbd5MN281TB'). This is the job ID that appears in
            the Ray/Anyscale dashboard.
          examples:
            - raysubmit_9pDAyZbd5MN281TB
            - raysubmit_ABC123XYZ456
        requires_gpu:
          anyOf:
            - type: boolean
            - type: 'null'
          title: Requires Gpu
          description: >-
            OPTIONAL. Whether this tier's Ray job was scheduled onto a GPU
            worker group. Populated at submit time from the engine's
            requires_gpu decision (built-in extractors default to GPU; custom
            plugins opt in via compute_profile). Lets users confirm a custom
            plugin actually landed on a GPU without kubectl access.
          examples:
            - null
            - true
            - false
        worker_groups:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Worker Groups
          description: >-
            OPTIONAL. Names of the Ray worker groups this tier's job is eligible
            to run on. Derived from the compute profile chosen at submit time
            (e.g., ['gpu-workers'] for requires_gpu=True, ['cpu-workers']
            otherwise). Surfaces resource allocation in the batch response so
            users don't need kubectl access to debug scheduling.
          examples:
            - null
            - - gpu-workers
            - - cpu-workers
        celery_task_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Celery Task Id
          description: >-
            OPTIONAL. Background task ID for tracking the worker processing this
            tier. None if tier has not yet started or is not processed via
            background task. Set when the tier processing task is triggered.
            Used for revoking pending/running tasks during batch cancellation or
            deletion. Format: UUID string (e.g.,
            'a1b2c3d4-e5f6-7890-abcd-ef1234567890').
          examples:
            - a1b2c3d4-e5f6-7890-abcd-ef1234567890
        source_documents_fetched:
          anyOf:
            - type: integer
            - type: 'null'
          title: Source Documents Fetched
          description: >-
            OPTIONAL. Number of source items fetched as input for this tier. For
            Tier 0 (bucket source), this is the number of objects from the
            bucket. For Tier N (N > 0, collection source), this is the count of
            documents from upstream collection(s). Set at the start of tier
            artifact building in build_tier_n_artifacts(). If 0, the source is
            empty - for collection sources, check upstream tier completion.
          examples:
            - 100
            - 0
            - 1500
        documents_after_source_filter:
          anyOf:
            - type: integer
            - type: 'null'
          title: Documents After Source Filter
          description: >-
            OPTIONAL. Number of documents remaining after applying
            source_filters. source_filters are optional conditions that exclude
            documents from processing. If this is 0 but source_documents_fetched
            > 0, your source_filters are too restrictive. Check that filter
            fields exist in source documents and conditions match expected
            values.
          examples:
            - 95
            - 0
            - 1200
        documents_missing_input_fields:
          anyOf:
            - type: integer
            - type: 'null'
          title: Documents Missing Input Fields
          description: >-
            OPTIONAL. Number of documents missing required input_mapping fields.
            input_mappings define which fields from source documents map to
            extractor inputs. If this equals documents_after_source_filter, ALL
            documents are missing required fields. Common cause: upstream
            extractor didn't produce expected output (e.g., video_segment_url).
            Check upstream extractor configuration and verify output field
            names.
          examples:
            - 0
            - 95
            - 50
        documents_submitted_to_engine:
          anyOf:
            - type: integer
            - type: 'null'
          title: Documents Submitted To Engine
          description: >-
            OPTIONAL. Number of documents actually submitted to the Ray/Engine
            for processing. This is documents_after_source_filter minus
            documents_missing_input_fields. If 0, no documents were sent to the
            engine - check source_filters and input_mappings. If > 0 but
            documents_written = 0, the engine failed to process documents.
          examples:
            - 95
            - 0
            - 1150
        documents_written:
          anyOf:
            - type: integer
            - type: 'null'
          title: Documents Written
          description: >-
            OPTIONAL. Total document count in target collection(s) after
            processing. Set after tier completion by querying the collection
            document count. Compare with documents_before_processing to get the
            delta (new docs created). If 0 but documents_submitted_to_engine >
            0, check tier errors for processing failures.
          examples:
            - 95
            - 0
            - 1150
        documents_before_processing:
          anyOf:
            - type: integer
            - type: 'null'
          title: Documents Before Processing
          description: >-
            OPTIONAL. Document count in target collection(s) before this tier
            started processing. Compare (documents_written -
            documents_before_processing) to get the number of NEW documents
            created by this batch tier. A delta of 0 means all objects were
            already processed (deduplication) or all failed silently.
          examples:
            - 0
            - 50
            - 1000
        last_activity_at:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Last Activity At
          description: >-
            OPTIONAL. Timestamp of the last BatchJobPoller heartbeat confirming
            the tier's Ray job(s) were non-terminal (RUNNING or PENDING).
            Updated approximately every 10 seconds while the tier is
            IN_PROGRESS. A stale last_activity_at (minutes old) indicates the
            job may be stalled or lost. Use this to distinguish an actively
            running batch from one that is silently stuck.
        ray_job_status:
          anyOf:
            - type: string
            - type: 'null'
          title: Ray Job Status
          description: >-
            OPTIONAL. Last observed Ray job status as of last_activity_at.
            Values: 'RUNNING' (actively processing), 'PENDING' (queued, not yet
            started), 'SUCCEEDED', 'FAILED'. For multi-extractor tiers, see
            extractor_jobs[].ray_job_status for per-job granularity.
          examples:
            - RUNNING
            - PENDING
        ray_job_logs:
          anyOf:
            - type: string
            - type: 'null'
          title: Ray Job Logs
          description: >-
            OPTIONAL. Persisted Ray job logs captured when the job reached a
            terminal state (SUCCEEDED or FAILED). Contains the last 500 lines of
            the head pod's stdout. Populated automatically by the batch poller
            before pods are cleaned up by K8s, so logs remain available for
            debugging after the job's infrastructure is gone.
        ray_job_logs_captured_at:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Ray Job Logs Captured At
          description: OPTIONAL. Timestamp when ray_job_logs were captured.
        submission_params:
          anyOf:
            - $ref: '#/components/schemas/SubmissionParams'
            - type: 'null'
          description: >-
            OPTIONAL. Parameters used for the first Ray/GKE job submission in
            this tier. For per-extractor params, see
            extractor_jobs[].submission_params.
        infrastructure_events:
          items:
            $ref: '#/components/schemas/InfrastructureDetail'
          type: array
          title: Infrastructure Events
          description: >-
            Infrastructure-level events correlated with this tier's execution
            (OOM, preemption, etc.).
        audit:
          anyOf:
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Audit
          description: >-
            OPTIONAL. Tier completion invariant audit, persisted by the
            complete_tier callback. Shape: {tier_num, submitted, processed,
            failed, skipped, lost, balanced, notes}. Phase 1 of
            INGESTION_RELIABILITY_PLAN.md. ``lost > 0`` means objects were
            submitted but absent from both processed_objects and
            failed_documents — investigate via the /audit endpoint.
        audit_override_reason:
          anyOf:
            - type: string
            - type: 'null'
          title: Audit Override Reason
          description: >-
            OPTIONAL. Set when the audit overrode the orchestrator-supplied tier
            status (e.g. promoted COMPLETED → COMPLETED_WITH_ERRORS because of
            lost objects).
      type: object
      required:
        - tier_num
        - source_type
      title: TierTaskInfo
      description: >-
        Tracking information for a single collection processing tier's task.


        In multi-tier collection processing, each tier represents a processing
        stage in the

        decomposition pipeline (e.g., bucket → chunks → frames → scenes). Each
        tier gets its

        own independent task for granular monitoring, status tracking, and retry
        capabilities.


        Use Cases:
            - Monitor progress of individual tiers in a multi-tier batch
            - Retry failed tiers without reprocessing successful ones
            - Track lineage of processing through parent_task_id linkage
            - Query task status for specific processing stages

        Tier Definitions:
            - Tier 0: Bucket → Collection (bucket objects as source)
            - Tier N (N > 0): Collection → Collection (upstream collection documents as source)

        Lifecycle:
            1. Created with status=PENDING when tier is scheduled
            2. Updated to status=IN_PROGRESS when processing starts (task_id assigned)
            3. Finalized to status=COMPLETED or status=FAILED when tier completes

        Requirements:
            - tier_num: REQUIRED. Zero-based tier number indicating processing stage
            - task_id: OPTIONAL. None until tier processing starts
            - status: REQUIRED. Defaults to PENDING
            - collection_ids: REQUIRED. Collections being processed in this tier
            - source_type: REQUIRED. "bucket" for tier 0, "collection" for tier N+
            - source_collection_ids: OPTIONAL. Required for tier N+ (collection sources)
            - parent_task_id: OPTIONAL. Links to previous tier for audit trail
            - started_at: OPTIONAL. Set when tier processing begins
            - completed_at: OPTIONAL. Set when tier processing finishes
            - error: OPTIONAL. Set if tier fails with error details
      examples:
        - collection_ids:
            - col_chunks
          completed_at: '2025-11-03T10:05:00Z'
          description: 'Tier 0 task (completed): Bucket → Collection'
          errors: []
          ray_job_id: raysubmit_ABC123
          source_type: bucket
          started_at: '2025-11-03T10:00:00Z'
          status: COMPLETED
          task_id: task_tier0_abc123
          tier_num: 0
        - collection_ids:
            - col_frames
          description: 'Tier 1 task (in progress): Collection → Collection'
          errors: []
          parent_task_id: task_tier0_abc123
          ray_job_id: raysubmit_DEF456
          source_collection_ids:
            - col_chunks
          source_type: collection
          started_at: '2025-11-03T10:05:00Z'
          status: IN_PROGRESS
          task_id: task_tier1_def456
          tier_num: 1
        - collection_ids:
            - col_scenes
          description: 'Tier 2 task (pending): Not yet started'
          errors: []
          source_collection_ids:
            - col_frames
          source_type: collection
          status: PENDING
          tier_num: 2
        - collection_ids:
            - col_frames
          completed_at: '2025-11-03T10:06:30Z'
          description: 'Tier 1 task (failed): Processing errors'
          error_summary:
            dependency: 2
          errors:
            - affected_count: 2
              affected_document_ids:
                - doc_123
                - doc_456
              component: VertexMultimodalService
              error_type: dependency
              message: 'Missing required package: google-genai'
              recovery_suggestion: 'Install google-genai package: pip install google-genai'
              stage: gemini_extraction
              timestamp: '2025-11-03T10:06:00Z'
          parent_task_id: task_tier0_abc123
          ray_job_id: raysubmit_XYZ789
          source_collection_ids:
            - col_chunks
          source_type: collection
          started_at: '2025-11-03T10:05:00Z'
          status: FAILED
          task_id: task_tier1_xyz789
          tier_num: 1
    # BatchProgress: progress counters, throughput figures and stage/phase
    # details for a batch. The schema declares no `required` list; plain
    # integer counters default to 0 and every other scalar field is nullable.
    BatchProgress:
      title: BatchProgress
      type: object
      description: >-
        Live progress snapshot written by ProgressPoller every ~10 seconds while
        a batch is IN_PROGRESS.


        Populated by the Ray ProgressActor running inside the engine job.

        None when the batch has not started processing yet (DRAFT/PENDING) or if
        the engine

        image pre-dates this field.
      properties:
        processed:
          title: Processed
          description: Number of objects fully processed so far.
          type: integer
          default: 0
        total:
          title: Total
          description: >-
            Total objects to process. None until the dataset is loaded by the
            engine.
          anyOf:
            - type: integer
            - type: 'null'
        percent:
          title: Percent
          description: Completion percentage (0–100). None until total is known.
          anyOf:
            - type: number
            - type: 'null'
        items_per_second:
          title: Items Per Second
          description: Current throughput. Averaged over elapsed time since job start.
          anyOf:
            - type: number
            - type: 'null'
        eta_seconds:
          title: Eta Seconds
          description: >-
            Estimated seconds remaining. None until both total and throughput
            are known.
          anyOf:
            - type: number
            - type: 'null'
        batch_count:
          title: Batch Count
          description: Number of Ray map_batch micro-batches completed.
          type: integer
          default: 0
        errors:
          title: Errors
          description: Number of items that errored during processing.
          type: integer
          default: 0
        first_error:
          title: First Error
          description: >-
            First item-level error captured during processing (truncated to ~500
            chars), e.g. 'ValueError: cannot embed empty text'. Explains WHAT
            the `errors` counter is counting. None when no item-level error has
            been captured (or the engine image pre-dates this field). See also
            batch-level error_summary (error-type counts) and failed_objects
            (per-object detail).
          anyOf:
            - type: string
            - type: 'null'
        extraction_first_error:
          title: Extraction First Error
          description: >-
            First map/extraction-stage error (rows dropped before the datasink).
            Distinct from datasink write errors. Typically equals first_error
            unless a later write-stage error was also captured.
          anyOf:
            - type: string
            - type: 'null'
        documents_skipped:
          title: Documents Skipped
          description: >-
            Number of input rows the processor explicitly marked skipped
            (metadata-only objects, missing required embedding fields,
            content-flag filtered, null-text chunks). Skipped rows are not
            failures and not successes; they count toward the tier-completion
            invariant via the audit. Phase 1.3 of INGESTION_RELIABILITY_PLAN.md.
          type: integer
          default: 0
        # The next three fields reference sibling schemas and are nullable.
        current_stage:
          description: >-
            Current processing stage as reported by the Ray engine. Populated
            once the engine begins work (may lag by ~10s). Useful to distinguish
            'loading model' from 'no activity' during early processing.
          anyOf:
            - $ref: '#/components/schemas/BatchStageInfo'
            - type: 'null'
        phases:
          description: >-
            Honest streaming-phase breakdown (BACKE-762): dispatch → extraction
            → write, each with its own processed/total/status. Exactly one phase
            is 'active' and its key equals current_stage.name. The SOLE
            per-stage source of truth — use this rather than current_stage for
            per-phase counters. None on engine images that pre-date this field.
          anyOf:
            - $ref: '#/components/schemas/BatchPhases'
            - type: 'null'
        active_step:
          description: >-
            Active pipeline step for multi-step extractors. Shows which
            processor is currently running (e.g., 'GroundingDINOProcessor 1/3').
            None for single-step pipelines.
          anyOf:
            - $ref: '#/components/schemas/BatchStepInfo'
            - type: 'null'
        overshoot_percent:
          title: Overshoot Percent
          description: >-
            When processed exceeds total (due to Ray Data retries or pipeline
            data expansion), this field shows the excess percentage above 100%.
            For example, 32.0 means 132% of items have been processed. None when
            processed <= total.
          anyOf:
            - type: number
            - type: 'null'
        queue_position:
          title: Queue Position
          description: >-
            1-based position in the Ray job submission queue. Non-null only
            while the batch is waiting for a concurrency slot. Once a slot is
            acquired (job submitted to Ray), this becomes null.
          anyOf:
            - type: integer
            - type: 'null'
        stage_history:
          title: Stage History
          description: >-
            Completed stage timing breakdown. Each entry: name, index, total,
            started_at (epoch), ended_at (epoch), duration_seconds. Populated as
            stages complete; the current (in-progress) stage is in
            current_stage.
          # Entries are free-form objects; their keys are not constrained here.
          anyOf:
            - type: array
              items:
                type: object
                additionalProperties: true
            - type: 'null'
        documents_written:
          title: Documents Written
          description: Derived documents written across completed tier/extractor jobs.
          anyOf:
            - type: integer
            - type: 'null'
        status_warnings:
          title: Status Warnings
          description: >-
            Read-time warnings that explain ambiguous terminal status or
            accounting gaps.
          type: array
          items:
            type: string
    # BatchCost: credits recorded for a batch plus the USD figure derived
    # from them. Every field has a default, so nothing is listed as required.
    BatchCost:
      title: BatchCost
      type: object
      description: |-
        Dollar cost of a batch, derived from recorded credit usage.

        Mixpeek meters work in *credits* (recorded in the usage_records
        collection at credit-consume time, keyed by ``resource_id=batch_id`` /
        ``resource_type="batch"``). There is no separately-stored per-batch USD
        figure, so cost is derived at read time as
        ``credits_consumed * CREDIT_RATE_USD`` using the same canonical
        $0.001/credit rate the billing/invoicing path uses. This lets callers
        answer "what did this batch cost?" from the batch GET response without
        GCP billing labels or a separate billing query.
      properties:
        credits_consumed:
          title: Credits Consumed
          description: >-
            Total credits recorded against this batch in usage_records
            (resource_type='batch'). 0 if no usage has been recorded yet (e.g.
            DRAFT batches, ENTERPRISE tiers that skip credit consumption, or
            batches submitted before metering existed).
          type: integer
          minimum: 0
          default: 0
          examples:
            - 0
            - 100
            - 4820
        cost_usd:
          title: Cost Usd
          description: >-
            Derived dollar cost = credits_consumed * $0.001/credit
            (CREDIT_RATE_USD). Same rate used by monthly invoicing, so this
            matches what the batch contributes to the bill.
          type: number
          minimum: 0
          default: 0
          # Each example equals the matching credits_consumed example * 0.001.
          examples:
            - 0
            - 0.1
            - 4.82
        credit_rate_usd:
          title: Credit Rate Usd
          description: USD per credit used to derive cost_usd ($0.001/credit).
          type: number
          default: 0.001
    # ErrorDetail: error payload with a mandatory human-readable `message`
    # and machine-readable `type`; `code` and `details` are nullable extras.
    ErrorDetail:
      title: ErrorDetail
      type: object
      description: Error detail model.
      required:
        - message
        - type
      properties:
        message:
          title: Message
          description: Human-readable error message
          type: string
        type:
          title: Type
          description: Stable error type identifier (machine-readable)
          type: string
        code:
          title: Code
          description: >-
            Fine-grained error code for programmatic handling (e.g.,
            namespace_name_taken, feature_extractor_not_found). Present only
            when consumers may need to branch on a specific error condition.
          anyOf:
            - type: string
            - type: 'null'
        details:
          title: Details
          description: >-
            Optional structured details to help debugging (validation errors,
            IDs, etc.)
          # Free-form object: any keys are accepted.
          anyOf:
            - type: object
              additionalProperties: true
            - type: 'null'
    # ValidationError: a single validation failure. All three fields
    # (`loc`, `msg`, `type`) are required.
    ValidationError:
      title: ValidationError
      type: object
      required:
        - loc
        - msg
        - type
      properties:
        loc:
          title: Location
          type: array
          # Each location segment is either a string or an integer.
          items:
            anyOf:
              - type: string
              - type: integer
        msg:
          title: Message
          type: string
        type:
          title: Error Type
          type: string
    # ExtractorJobInfo: per-extractor job record within a tier. Only
    # `extractor_type` is required; identifiers, timestamps and counters are
    # nullable, and `status` defaults to PENDING.
    ExtractorJobInfo:
      title: ExtractorJobInfo
      type: object
      description: >-
        Tracking information for a single feature extractor job within a tier.


        Each tier can have multiple extractor jobs running in parallel, one per
        unique

        feature_extractor_type. This allows different extractors to have
        independent:

        - Resource requirements (GPUs, CPU, memory)

        - Status tracking and retry logic

        - Ray job IDs and Celery task IDs

        - Completion times and performance metrics


        Example:
            Tier 1 with collections using different extractors:
            - col_A: image_extractor
            - col_B: face_identity_extractor
            - col_C: image_extractor

            Creates 2 ExtractorJobInfo instances:
            1. extractor_type="image_extractor", collection_ids=["col_A", "col_C"]
            2. extractor_type="face_identity_extractor", collection_ids=["col_B"]

        Lifecycle:
            1. Created with status=PENDING when tier is scheduled
            2. Updated to IN_PROGRESS when Ray job starts
            3. Finalized to COMPLETED/FAILED when Ray job completes
      required:
        - extractor_type
      properties:
        extractor_type:
          title: Extractor Type
          description: >-
            Feature extractor type (e.g., 'image_extractor',
            'face_identity_extractor')
          type: string
          examples:
            - image_extractor
            - face_identity_extractor
            - video_extractor
        collection_ids:
          title: Collection Ids
          description: Collections processed by this extractor job
          type: array
          minItems: 1
          items:
            type: string
          # Each example is itself a list of collection IDs.
          examples:
            - - col_abc123
            - - col_def456
              - col_ghi789
        extractor_id:
          title: Extractor Id
          description: >-
            Concrete extractor identifier used for this job, e.g.
            'universal_extractor_v1'.
          anyOf:
            - type: string
            - type: 'null'
          examples:
            - universal_extractor_v1
        ray_job_id:
          title: Ray Job Id
          description: Ray job ID for this extractor job
          anyOf:
            - type: string
            - type: 'null'
          examples:
            - raysubmit_abc123
        celery_task_id:
          title: Celery Task Id
          description: Background task ID that submitted this processing job
          anyOf:
            - type: string
            - type: 'null'
          examples:
            - celery_task_abc123
        callback_job_id:
          title: Callback Job Id
          description: >-
            Job ID expected in the tier completion callback for this extractor
            job.
          anyOf:
            - type: string
            - type: 'null'
          examples:
            - celery_fast_path_btch_abc123_0_universal_extractor_v1
        execution_mode:
          title: Execution Mode
          description: Execution backend used for this extractor job.
          anyOf:
            - type: string
            - type: 'null'
          examples:
            - celery_fast_path_universal
        status:
          description: Current status of this extractor job
          $ref: '#/components/schemas/TaskStatusEnum'
          default: PENDING
          examples:
            - PENDING
            - IN_PROGRESS
            - COMPLETED
            - FAILED
        started_at:
          title: Started At
          description: When this extractor job started processing
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
        completed_at:
          title: Completed At
          description: When this extractor job finished processing
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
        duration_ms:
          title: Duration Ms
          description: Processing duration in milliseconds
          anyOf:
            - type: number
            - type: 'null'
        documents_written:
          title: Documents Written
          description: Number of documents written by this extractor job
          anyOf:
            - type: integer
            - type: 'null'
        documents_skipped:
          title: Documents Skipped
          description: >-
            OPTIONAL. Number of input rows skipped (not failed) — typically
            metadata-only objects with no embeddable content, content-flag
            filtered rows, or null-text chunks. Skipped rows count toward the
            tier-completion invariant (processed + failed + skipped ==
            submitted) so a tier with 100 inputs and 100 skips still ends in a
            valid terminal state.
          anyOf:
            - type: integer
            - type: 'null'
          examples:
            - 0
            - 12
            - 100
        # `errors` holds structured entries; `error` is a single nullable string.
        errors:
          title: Errors
          description: Detailed errors from this extractor job
          type: array
          items:
            $ref: '#/components/schemas/BatchErrorDetail'
        error:
          title: Error
          description: >-
            OPTIONAL. Simple error message string for quick debugging. Set when
            the Ray job fails with error details from JobStatusMonitor. For
            detailed error information, see errors array.
          anyOf:
            - type: string
            - type: 'null'
          examples:
            - Ray job FAILED
            - Timeout waiting for job completion
        last_activity_at:
          title: Last Activity At
          description: >-
            OPTIONAL. Timestamp of the last BatchJobPoller heartbeat confirming
            this extractor job was non-terminal (RUNNING or PENDING). Updated
            approximately every 10 seconds while IN_PROGRESS. A stale value
            (minutes old) may indicate a stuck or lost job.
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
        ray_job_status:
          title: Ray Job Status
          description: >-
            OPTIONAL. Last observed Ray job status for this extractor job as of
            last_activity_at. Values: 'RUNNING', 'PENDING', 'SUCCEEDED',
            'FAILED'. Use with last_activity_at to assess whether the job is
            actively running.
          anyOf:
            - type: string
            - type: 'null'
          examples:
            - RUNNING
            - PENDING
        submission_params:
          description: >-
            Parameters used for Ray/GKE job submission. Persisted at submit time
            for debugging.
          anyOf:
            - $ref: '#/components/schemas/SubmissionParams'
            - type: 'null'
    # BatchErrorDetail: structured record of one batch-processing error.
    # Only `error_type` and `message` are listed under `required`; every other
    # field is optional in this schema, and `affected_count` defaults to 1.
    BatchErrorDetail:
      properties:
        error_type:
          $ref: '#/components/schemas/ErrorCategory'
          description: >-
            REQUIRED. Category of error that occurred. Used for filtering, retry
            logic, and error analytics. DEPENDENCY: Missing packages/modules
            (e.g., google-genai not installed). AUTHENTICATION: Invalid
            credentials or expired tokens. VALIDATION: Schema mismatches or
            invalid input data. RUNTIME: Code exceptions or processing failures.
            NETWORK: Connectivity issues or timeouts. RESOURCE: Out of memory or
            disk space.
          examples:
            - dependency
            - authentication
            - validation
        message:
          type: string
          title: Message
          description: >-
            REQUIRED. Human-readable error message. Concise description of what
            went wrong. Should be actionable and help users understand the
            issue.
          examples:
            - 'Missing required package: google-genai'
            - Invalid API key for Vertex AI
            - 'Schema validation failed: missing required field ''products'''
        component:
          anyOf:
            - type: string
            - type: 'null'
          title: Component
          description: >-
            OPTIONAL. Component or service where the error occurred. Helps
            identify which part of the system failed. Examples: service class
            names, module names, or feature names.
          examples:
            - VertexMultimodalService
            - WhisperTranscriptionService
            - GeminiExtractor
        stage:
          anyOf:
            - type: string
            - type: 'null'
          title: Stage
          description: >-
            OPTIONAL. Processing stage where the error occurred. Identifies
            which pipeline stage failed. Examples: pipeline stage names from
            collection configuration.
          examples:
            - gemini_extraction
            - whisper_transcription
            - embedding_generation
        traceback:
          anyOf:
            - type: string
            - type: 'null'
          title: Traceback
          description: >-
            OPTIONAL. Full Python traceback for debugging. Includes stack trace
            for code-level troubleshooting. Should be truncated if too long
            (e.g., max 2000 chars).
        # Description previously said "REQUIRED", but the field is absent from
        # the `required` list below; the wording now matches the schema.
        timestamp:
          type: string
          format: date-time
          title: Timestamp
          description: >-
            OPTIONAL per this schema (not listed under required). ISO 8601
            timestamp when the error occurred. Used for chronological error
            tracking and debugging.
          examples:
            - '2025-11-29T10:15:30Z'
        affected_document_ids:
          items:
            type: string
          type: array
          title: Affected Document Ids
          description: >-
            OPTIONAL. List of document IDs affected by this error. For
            object-level errors: contains single document ID. For batch-level
            aggregation: contains all affected document IDs. Used to identify
            scope of impact.
          examples:
            - - doc_123
            - - doc_123
              - doc_456
              - doc_789
        # Description previously said "REQUIRED", which contradicted the
        # `default: 1` and the `required` list below.
        affected_count:
          type: integer
          minimum: 1
          title: Affected Count
          description: >-
            OPTIONAL. Number of documents affected by this error; defaults to 1
            when omitted. For object-level: typically 1. For batch-level
            aggregation: total count of affected documents. Used for error
            impact analysis.
          default: 1
          examples:
            - 1
            - 10
            - 50
        recovery_suggestion:
          anyOf:
            - type: string
            - type: 'null'
          title: Recovery Suggestion
          description: >-
            OPTIONAL. Actionable suggestion for resolving the error. Helps users
            quickly fix common issues. Examples: install missing package, check
            credentials, update schema.
          examples:
            - 'Install google-genai package: pip install google-genai'
            - Verify your Vertex AI credentials are configured correctly
            - Update your schema to include required 'products' field
        metadata:
          additionalProperties: true
          type: object
          title: Metadata
          description: >-
            OPTIONAL. Additional error context and metadata. Free-form
            dictionary for error-specific details. Examples: retry_count,
            last_retry_at, error_code, http_status.
          examples:
            - last_retry_at: '2025-11-29T10:10:00Z'
              retry_count: 2
            - api_endpoint: https://vertex.googleapis.com
              http_status: 401
      type: object
      required:
        - error_type
        - message
      title: BatchErrorDetail
      description: >-
        Detailed error information for batch processing failures.


        Provides structured error tracking at both object and batch levels.

        Enables better debugging, retry logic, and error analytics.


        Use Cases:
            - Track specific errors that occurred during processing
            - Identify error patterns across multiple documents
            - Provide actionable recovery suggestions
            - Enable intelligent retry logic based on error type

        Object-level tracking: Attached to individual document processing
        failures

        Batch-level tracking: Aggregated summaries in batch metadata
      # Example instances use only declared properties. The labels that used
      # to sit in an undeclared `description` key are kept as comments so the
      # rendered payloads do not show a field the schema does not define.
      examples:
        # Dependency error: Missing package
        - affected_count: 1
          affected_document_ids:
            - doc_123
          component: VertexMultimodalService
          error_type: dependency
          message: Cannot import 'genai' from 'google' package
          metadata:
            import_path: google.genai
            package_name: google-genai
          recovery_suggestion: 'Install google-genai package: pip install google-genai'
          stage: gemini_extraction
          timestamp: '2025-11-29T10:15:30Z'
          traceback: 'ImportError: cannot import name ''genai'' from ''google''...'
        # Authentication error: Invalid credentials
        - affected_count: 2
          affected_document_ids:
            - doc_456
            - doc_789
          component: VertexMultimodalService
          error_type: authentication
          message: 'Invalid grant: Bad Request'
          metadata:
            http_status: 401
            retry_count: 3
          recovery_suggestion: Verify your Vertex AI service account credentials
          stage: gemini_extraction
          timestamp: '2025-11-29T10:20:00Z'
    # SubmissionParams: snapshot of the parameters used to submit a Ray/GKE
    # job. Every field is nullable and none is required.
    SubmissionParams:
      title: SubmissionParams
      type: object
      description: >-
        Parameters used when submitting a Ray/GKE job, persisted for post-hoc
        debugging.
      properties:
        entrypoint:
          title: Entrypoint
          anyOf:
            - type: string
            - type: 'null'
        deployment_mode:
          title: Deployment Mode
          description: '''ray'' (local/dev) or ''gke'' (production).'
          anyOf:
            - type: string
            - type: 'null'
        # Resource settings.
        requires_gpu:
          title: Requires Gpu
          anyOf:
            - type: boolean
            - type: 'null'
        num_cpus:
          title: Num Cpus
          anyOf:
            - type: number
            - type: 'null'
        num_gpus:
          title: Num Gpus
          anyOf:
            - type: number
            - type: 'null'
        memory_bytes:
          title: Memory Bytes
          anyOf:
            - type: integer
            - type: 'null'
        priority:
          title: Priority
          anyOf:
            - type: integer
            - type: 'null'
        # Plugin and image settings.
        plugin_archives:
          title: Plugin Archives
          anyOf:
            - type: array
              items:
                type: string
            - type: 'null'
        plugin_dependencies:
          title: Plugin Dependencies
          anyOf:
            - type: array
              items:
                type: string
            - type: 'null'
        image_uri:
          title: Image Uri
          anyOf:
            - type: string
            - type: 'null'
        extractor_name:
          title: Extractor Name
          anyOf:
            - type: string
            - type: 'null'
        extractor_version:
          title: Extractor Version
          anyOf:
            - type: string
            - type: 'null'
        env_vars_keys:
          title: Env Vars Keys
          description: Environment variable keys (values omitted for security).
          anyOf:
            - type: array
              items:
                type: string
            - type: 'null'
        manifest_key:
          title: Manifest Key
          description: >-
            S3 key of the batch manifest (metadata.json) used for this job. The
            manifest survives Ray cluster rebuilds, so the poller can resubmit
            an in-cluster (raysubmit_*) job killed by a redeploy.
          anyOf:
            - type: string
            - type: 'null'
        submitted_at:
          title: Submitted At
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
    # One infrastructure event record. Only `event_type` is required; the other
    # fields are optional and may be null.
    InfrastructureDetail:
      title: InfrastructureDetail
      type: object
      description: A single infrastructure event correlated with a tier execution.
      required: [event_type]
      properties:
        event_type:
          $ref: '#/components/schemas/InfraEventType'
        detected_at:
          title: Detected At
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
        raw_signal:
          title: Raw Signal
          description: The error text that triggered classification.
          anyOf:
            - type: string
            - type: 'null'
        node_id:
          title: Node Id
          anyOf:
            - type: string
            - type: 'null'
        pod_name:
          title: Pod Name
          anyOf:
            - type: string
            - type: 'null'
    # Label for the stage a batch job is currently in. `name`, `index` and
    # `total` are required; the remaining fields are optional.
    BatchStageInfo:
      title: BatchStageInfo
      type: object
      description: |-
        Current processing stage reported by the Ray engine.

        BACKE-762: ``name`` is now the honest *active* streaming phase —
        "dispatch" → "extraction" → "write" — derived from the phase machine
        (the true bottleneck), NOT the premature "writing" label that the
        datasink used to emit on the first row. It is a LABEL ONLY; per-phase
        counters live in ``BatchProgress.phases``. ``index``/``total`` stay as
        N-of-3. The stage name resets the stall timer, so users can see *what*
        the job is doing even when processed=0 (e.g., waiting for a model load).
      required: [name, index, total]
      properties:
        name:
          title: Name
          type: string
          description: >-
            Honest active streaming phase: 'dispatch', 'extraction', or
            'write' (BACKE-762). Equals the one phase in
            BatchProgress.phases whose status is 'active'.
          examples: [dispatch, extraction, write]
        index:
          title: Index
          type: integer
          description: 1-based stage index within the current job.
        total:
          title: Total
          type: integer
          description: Total stages in the current job.
        stage_elapsed_seconds:
          title: Stage Elapsed Seconds
          type: number
          description: Seconds spent in this stage so far.
          default: 0
        # NOTE(review): the description below places the sub-stage within
        # 'processing', but `name` documents only 'dispatch', 'extraction' and
        # 'write' — confirm which phase is meant.
        sub_stage:
          title: Sub Stage
          description: >-
            Sub-stage within 'processing': 'model_loading' while models
            are initializing, 'inferring' once the first batch completes.
            Helps explain why processed=0.
          anyOf:
            - type: string
            - type: 'null'
          examples: [model_loading, inferring]
    # Three optional phase entries, each described by the BatchPhaseDetail schema.
    BatchPhases:
      title: BatchPhases
      type: object
      # Literal block scalar: line breaks below are part of the value.
      description: |-
        Honest streaming-phase breakdown (BACKE-762) — the SOLE per-stage source.

        dispatch → extraction → write. Exactly one phase is ``active``, and its key
        equals ``BatchStageInfo.name``. Consumed by Studio's BatchProgressDetail.
      properties:
        dispatch:
          $ref: '#/components/schemas/BatchPhaseDetail'
        extraction:
          $ref: '#/components/schemas/BatchPhaseDetail'
        write:
          $ref: '#/components/schemas/BatchPhaseDetail'
    # Position of the running step inside a multi-step pipeline. All four
    # properties are required.
    BatchStepInfo:
      title: BatchStepInfo
      type: object
      # Literal block scalar: line breaks below are part of the value.
      description: |-
        Active pipeline step for multi-step extractors (e.g., GroundingDINO → SigLIP).

        Surfaces which step is currently running and its position in the pipeline,
        so users can track progress through complex multi-model pipelines.
      required: [name, index, total, step_elapsed_seconds]
      properties:
        name:
          title: Name
          type: string
          description: >-
            Processor class name
            (e.g., 'GroundingDINOProcessor', 'SigLIPProcessor').
        index:
          title: Index
          type: integer
          description: 1-based step index within the pipeline.
        total:
          title: Total
          type: integer
          description: Total number of steps in the pipeline.
        step_elapsed_seconds:
          title: Step Elapsed Seconds
          type: number
          description: Seconds spent in this step so far.
    # String enum with six allowed values.
    ErrorCategory:
      title: ErrorCategory
      type: string
      # Literal block scalar: line breaks below are part of the value.
      description: |-
        Categories for batch processing errors.

        Used to classify errors for better observability, retry logic, and debugging.
        Helps distinguish between transient errors (worth retrying) and permanent errors.
      enum:
        - dependency
        - authentication
        - validation
        - runtime
        - network
        - resource
    # String enum referenced by InfrastructureDetail.event_type.
    # The schema carries no description of its own.
    InfraEventType:
      title: InfraEventType
      type: string
      enum:
        - oom
        - preemption
        - node_failure
        - ray_bug
        - rolling_update
        - unknown
    # Counters and status for a single phase. No property is required;
    # `processed` defaults to 0 and `status` defaults to 'pending'.
    BatchPhaseDetail:
      title: BatchPhaseDetail
      type: object
      description: |-
        Per-phase progress for one streaming phase (BACKE-762).

        Ray Data streams, so extraction and write run concurrently. Each phase
        carries its own honest counters. ``status`` is one of pending|active|
        done|error. Exactly one phase is ``active`` at a time, and it matches
        ``BatchStageInfo.name``. ``total`` is null when indeterminate (the write
        phase stays null until extraction drains — expanding pipelines emit N
        output points per input object).
      properties:
        processed:
          title: Processed
          type: integer
          description: Items completed in this phase.
          default: 0
        total:
          title: Total
          description: Items expected in this phase; null when indeterminate.
          anyOf:
            - type: integer
            - type: 'null'
        # NOTE(review): typed as a free-form string although the descriptions
        # enumerate four values — confirm whether an `enum` is appropriate.
        status:
          title: Status
          type: string
          description: "Phase status: 'pending', 'active', 'done', or 'error'."
          default: pending
          examples: [pending, active, done, error]

````