Create a new batch for grouping bucket objects.
REQUIRED: Bearer token authentication using your API key. Format: 'Bearer sk_xxxxxxxxxxxxx'. You can create API keys in the Mixpeek dashboard under Organization Settings.
"Bearer YOUR_API_KEY"
"Bearer sk_xxxxxxxxxxxxx"
REQUIRED: Namespace identifier for scoping this request. All resources (collections, buckets, taxonomies, etc.) are scoped to a namespace. You can provide either the namespace name or namespace ID. Format: ns_xxxxxxxxxxxxx (ID) or a custom name like 'my-namespace'
"ns_abc123def456"
"production"
"my-namespace"
The unique identifier of the bucket.
Skip object existence validation. Use this for large batches (>10k objects) or when you're certain all object IDs are valid. Improves performance significantly.
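To make the request shape concrete, here is a minimal sketch of assembling (not sending) the create-batch call. The URL path, the namespace header name, and the `skip_validation` query-parameter name are illustrative assumptions, not confirmed API details; check the Mixpeek API reference for the exact route and parameter names.

```python
import json

def build_create_batch_request(api_key, namespace, bucket_id, object_ids,
                               skip_validation=False):
    # Assemble (but do not send) the pieces of a create-batch call.
    # NOTE: the URL path, the "X-Namespace" header name, and the
    # skip_validation query parameter are illustrative guesses.
    if not object_ids:
        raise ValueError("a batch needs at least one object ID")
    headers = {
        "Authorization": f"Bearer {api_key}",  # e.g. Bearer sk_xxxxxxxxxxxxx
        "X-Namespace": namespace,              # namespace name or ns_... ID
        "Content-Type": "application/json",
    }
    params = {"skip_validation": "true"} if skip_validation else {}
    url = f"https://api.mixpeek.com/v1/buckets/{bucket_id}/batches"  # assumed route
    return url, headers, params, json.dumps({"object_ids": object_ids})

url, headers, params, body = build_create_batch_request(
    "sk_xxxxxxxxxxxxx", "my-namespace", "bkt_videos",
    ["obj_video_001", "obj_video_002"], skip_validation=True)
```

Passing `skip_validation=True` only makes sense for large batches (>10k objects) where you already know every object ID is valid.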
Request model for creating a new batch.
Batches group bucket objects for processing into collections. When you submit a batch, all objects in the batch are processed through the collections associated with the bucket.
Batch Processing Flow: create the batch (DRAFT), submit it, and the objects are processed through the bucket's associated collections tier by tier.
Examples:
Single object batch: {"object_ids": ["obj_123"]}
Multiple objects batch: {"object_ids": ["obj_123", "obj_456", "obj_789"]}
REQUIRED. List of object IDs to include in the batch. Objects must exist in the bucket where the batch is created. Minimum 1 object, no maximum limit. All objects will be processed when the batch is submitted. Collections with collection sources (decomposition trees) are processed automatically via DAG resolution - no need to create separate batches.
["object_789", "object_101"]
[
  "obj_video_001",
  "obj_video_002",
  "obj_video_003"
]
Successful Response
Model representing a batch of objects for processing through collections.
A batch groups bucket objects together for processing through one or more collections. Batches support multi-tier processing where collections are processed in dependency order (e.g., bucket → chunks → frames → scenes). Each tier has independent task tracking.
Use Cases:
- Process multiple objects through collections in a single batch
- Track progress of multi-tier decomposition pipelines
- Monitor and retry individual processing tiers
- Query batch status and tier-specific task information
Lifecycle:
1. Created in DRAFT status with object_ids
2. Submitted for processing → status changes to PENDING
3. Each tier processes sequentially (tier 0 → tier 1 → ... → tier N)
4. Batch completes when all tiers finish (status=COMPLETED) or any tier fails (status=FAILED)
Multi-Tier Processing:
- Tier 0: Bucket objects → Collections (bucket as source)
- Tier N (N > 0): Collection documents → Collections (upstream collection as source)
- Each tier gets independent task tracking via the tier_tasks array
- Processing proceeds tier-by-tier with automatic chaining
Requirements:
- batch_id: OPTIONAL (auto-generated if not provided)
- bucket_id: REQUIRED
- status: OPTIONAL (defaults to DRAFT)
- object_ids: REQUIRED for processing (must have at least 1 object)
- collection_ids: OPTIONAL (discovered via DAG resolution)
- tier_tasks: OPTIONAL (populated during processing)
- current_tier: OPTIONAL (set during processing)
- total_tiers: OPTIONAL (defaults to 1, set during DAG resolution)
- dag_tiers: OPTIONAL (populated during DAG resolution)
REQUIRED. Unique identifier of the bucket containing the objects to process. Must be a valid bucket ID that exists in the system. All object_ids must belong to this bucket. Format: Bucket ID as defined when bucket was created.
"bkt_videos"
"bkt_documents_q4"
OPTIONAL (auto-generated if not provided). Unique identifier for this batch. Format: 'btch_' prefix followed by 12-character secure token. Generated using generate_secure_token() from shared.utilities.helpers. Used to query batch status and track processing across tiers. Immutable after creation.
"btch_abc123xyz789"
"btch_video_batch_01"
OPTIONAL (defaults to DRAFT). Current processing status of the batch. Lifecycle: DRAFT → PENDING → IN_PROGRESS → COMPLETED/FAILED. DRAFT: Batch created but not yet submitted. PENDING: Batch submitted and queued for processing. IN_PROGRESS: Batch currently processing (one or more tiers active). COMPLETED: All tiers successfully completed. FAILED: One or more tiers failed. Aggregated from tier_tasks statuses during multi-tier processing.
Allowed values: PENDING, QUEUED, IN_PROGRESS, PROCESSING, COMPLETED, COMPLETED_WITH_ERRORS, FAILED, CANCELED, UNKNOWN, SKIPPED, DRAFT, ACTIVE, ARCHIVED, SUSPENDED
"DRAFT"
"PENDING"
"IN_PROGRESS"
"COMPLETED"
"FAILED"
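Since the status lifecycle above ends in a terminal state, a client typically polls until one is reached. A minimal, transport-agnostic sketch (the `fetch_status` callable stands in for whatever GET-batch wrapper you use; no specific endpoint is assumed):

```python
import time

TERMINAL_STATUSES = {"COMPLETED", "FAILED", "CANCELED"}

def wait_for_batch(fetch_status, poll_seconds=5.0, timeout=3600.0, sleep=time.sleep):
    # fetch_status: zero-argument callable returning the batch's current
    # status string, e.g. a thin wrapper around your GET-batch call.
    waited = 0.0
    while waited <= timeout:
        status = fetch_status()
        if status in TERMINAL_STATUSES:
            return status
        sleep(poll_seconds)
        waited += poll_seconds
    raise TimeoutError("batch did not reach a terminal state in time")
```

Injecting `sleep` keeps the loop testable; in production the defaults (5 s poll, 1 h timeout) are reasonable starting points.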
List of object IDs to include in this batch. All objects must exist in the specified bucket_id. These objects are the source data for tier 0 processing. Collection-sourced batches may have empty object_ids. Objects are processed in parallel within each tier.
["obj_video_001", "obj_video_002"]
["obj_doc_123"]
OPTIONAL. List of all collection IDs involved in this batch's processing. Automatically populated during DAG resolution from dag_tiers. Includes collections from all tiers (flattened view of dag_tiers). Used for quick lookups without traversing tier structure. Format: List of collection IDs across all tiers.
["col_chunks"]
OPTIONAL. Legacy error message field for backward compatibility. None if the batch succeeded or is still processing. For multi-tier batches, typically contains the human-readable error description from the first failed tier. DEPRECATED: use tier_tasks[].errors for tier-specific error details and error_summary for aggregation.
"Failed to process batch: Object not found"
OPTIONAL. Human-readable explanation of why the batch failed. None if batch succeeded, is still processing, or is in DRAFT/PENDING state. Populated automatically when a batch transitions to FAILED status. Provides a concise, actionable summary of the root cause. Common reasons include: Ray job failure (spot preemption, OOM, code errors), 0 documents written (processing completed but produced no output), processing stall (no activity detected for extended period), or Celery task exception (submission/validation failures). Use this field for user-facing error displays and alerting.
"Ray job failed: ImportError: No module named 'google.genai'"
OPTIONAL. Aggregated summary of errors across ALL tiers in the batch. None if batch succeeded or is still processing. Maps error_type (category) to total count of affected documents across all tiers. Provides quick batch-wide overview of error distribution. Example: {'dependency': 15, 'authentication': 25, 'validation': 5} means across all tiers, 15 documents failed with dependency errors, 25 with auth errors, 5 with validation errors. Automatically aggregated from tier_tasks[].error_summary. Used for batch health dashboard and error trend analysis.
null
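The aggregation described above (per-tier error counts rolled up by error_type) can be sketched as follows; `tier_tasks` here is assumed to be a list of dicts shaped like the tier_tasks array, each optionally carrying its own `error_summary`:

```python
from collections import Counter

def aggregate_error_summary(tier_tasks):
    # Roll up error_type -> total affected documents across every
    # tier's own error_summary, mirroring the documented aggregation.
    totals = Counter()
    for tier in tier_tasks:
        totals.update(tier.get("error_summary") or {})
    return dict(totals) or None  # None when no tier reported errors
```

For example, tiers reporting `{'dependency': 10, 'authentication': 25}` and `{'dependency': 5, 'validation': 5}` aggregate to `{'dependency': 15, 'authentication': 25, 'validation': 5}`.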
OPTIONAL (defaults to BUCKET). Type of batch. BUCKET: Standard batch processing bucket objects through collections. COLLECTION: Reserved for future collection-only batch processing. Currently only BUCKET type is supported.
Allowed values: BUCKET, COLLECTION
"BUCKET"
OPTIONAL. S3 key where the batch manifest is stored. Contains metadata and row data (Parquet) for Engine processing. For tier 0, points to bucket object manifest. For tier N+, points to collection document manifest. Format: S3 path (e.g., 'namespace_id/internal_id/manifests/tier_0.parquet'). Generated during batch submission.
"ns_abc/org_123/manifests/tier_0.parquet"
OPTIONAL. Primary task ID for the batch (typically tier 0 task). Used for backward compatibility with single-tier batch tracking. For multi-tier batches, prefer querying tier_tasks array for granular tracking. Format: Task ID as generated for tier 0.
"task_tier0_abc123"
OPTIONAL. List of object IDs that were successfully validated and loaded into the batch. Subset of object_ids that passed validation. Used to track which objects are ready for processing. None if batch hasn't been validated yet.
["obj_video_001", "obj_video_002"]
OPTIONAL. Internal engine/job metadata for system use. May contain: job_id (provider-specific), engine_version, processing hints, last_health_check. last_health_check holds the most recent health check results: health_status, enriched_documents, vector_populated_count, stall_duration_seconds, recommendations, missing_features. Populated asynchronously via a Celery task (non-blocking, best-effort). Used for troubleshooting batch processing issues via the API. NOTE: in MongoDB this is stored under the '_internal.processing' path.
{
"include_history": true,
"last_health_check": {
"enriched_documents": 98,
"health_status": "HEALTHY",
"missing_features": ["text_embedding"],
"processed_documents": 100,
"recommendations": [],
"stall_duration_seconds": 0,
"timestamp": "2025-11-06T10:05:00Z",
"total_documents": 100,
"vector_populated_count": 98
}
}
OPTIONAL. User-defined metadata for the batch. Arbitrary key-value pairs for user tracking and organization. Persisted with the batch and returned in API responses. Not used by the system for processing logic. Examples: campaign_id, user_email, processing_notes.
{
"campaign_id": "Q4_2025",
"priority": "high"
}
{
"project": "video_analysis",
"user_email": "user@example.com"
}
OPTIONAL. List of tier task tracking information for multi-tier processing. Each element represents one tier in the processing pipeline. Empty array for simple single-tier batches. Populated during batch submission with tier 0 info, then appended as tiers progress. Each TierTaskInfo contains: tier_num, task_id, status, collection_ids, timestamps. Used for granular monitoring: 'Show me status of tier 2' or 'Retry tier 1'. Array index typically matches tier_num (tier_tasks[0] = tier 0, tier_tasks[1] = tier 1, etc.).
[]
[
{
"collection_ids": ["col_chunks"],
"status": "COMPLETED",
"task_id": "task_tier0_abc",
"tier_num": 0
}
]
OPTIONAL. Zero-based index of the currently processing tier. None if batch hasn't started processing (status=DRAFT or PENDING). Updated as batch progresses through tiers. Used to show processing progress: 'Processing tier 2 of 5'. Set to last tier number when batch completes. Example: If processing tier 1 (frames), current_tier=1.
x >= 0
0
OPTIONAL (defaults to 1). Total number of tiers in the collection DAG. Minimum 1 (tier 0 only = bucket → collection). Set during DAG resolution when batch is submitted. Equals len(dag_tiers) if dag_tiers is populated. Used to calculate progress: current_tier / total_tiers. Example: 5-tier pipeline (bucket → chunks → frames → scenes → summaries) has total_tiers=5.
x >= 1
1
3
5
OPTIONAL. Complete DAG tier structure for this batch. List of tiers, where each tier is a list of collection IDs to process at that stage. Tier 0 = bucket-sourced collections. Tier N (N > 0) = collection-sourced collections. Collections within same tier have no dependencies (can run in parallel). Collections in tier N+1 depend on collections in tier N. Populated during DAG resolution at batch submission. Used for tier-by-tier processing orchestration. Example: [['col_chunks'], ['col_frames', 'col_objects'], ['col_scenes']] = 3 tiers where frames and objects run in parallel at tier 1.
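The relationships stated above (total_tiers equals len(dag_tiers); collection_ids is the flattened view of dag_tiers) can be sketched directly:

```python
def resolve_dag_fields(dag_tiers):
    # total_tiers == len(dag_tiers), with a minimum of 1 tier;
    # collection_ids is the flattened, tier-ordered view of dag_tiers.
    total_tiers = max(len(dag_tiers), 1)
    collection_ids = [cid for tier in dag_tiers for cid in tier]
    return total_tiers, collection_ids

tiers, cols = resolve_dag_fields(
    [["col_chunks"], ["col_frames", "col_objects"], ["col_scenes"]])
# 3 tiers; col_frames and col_objects sit in the same tier, so they
# have no dependency on each other and can run in parallel.
```

This is a client-side reconstruction of the documented invariants, not the server's DAG-resolution code.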
[["col_chunks"]]
OPTIONAL (auto-set on creation). ISO 8601 timestamp when batch was created. Set using current_time() from shared.utilities.helpers. Immutable after creation. Used for batch age tracking and cleanup of old batches.
"2025-11-03T10:00:00Z"
OPTIONAL. Live progress snapshot updated approximately every 10 seconds while the batch is IN_PROGRESS. Written by the Ray ProgressActor inside the engine job. None when status is DRAFT or PENDING (job not started), or after COMPLETED/FAILED. Use this to show real-time progress bars: processed/total objects, percent complete, throughput (items_per_second), and estimated time remaining (eta_seconds).
null
OPTIONAL. Computed health status for actively processing batches. Only populated when status is PROCESSING or IN_PROGRESS. Values: 'healthy' (recent activity detected), 'stalled' (no activity for 5+ minutes), 'unknown' (no heartbeat data yet). Computed from tier_tasks[].last_activity_at and updated_at. Use this to detect stuck batches before the internal stall detector kills them.
"healthy"
OPTIONAL. Timestamp of the most recent activity across all tier tasks. Aggregated from tier_tasks[].last_activity_at — the latest heartbeat from any tier. Updated approximately every 10 seconds by the BatchJobPoller while processing. A stale value (minutes old) while status is PROCESSING indicates the batch may be stalled. None for batches that have not started processing or have no heartbeat data.
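The health classification described above (healthy / stalled after 5+ minutes of silence / unknown without heartbeat data) can be reproduced client-side from status and the last-activity timestamp. This is a sketch of the documented rule, not the server's internal stall detector:

```python
from datetime import datetime, timedelta, timezone

STALL_THRESHOLD = timedelta(minutes=5)

def compute_health(status, last_activity_at, now=None):
    # Health only applies to actively processing batches.
    if status not in ("PROCESSING", "IN_PROGRESS"):
        return None
    if last_activity_at is None:
        return "unknown"  # no heartbeat data yet
    now = now or datetime.now(timezone.utc)
    age = now - last_activity_at
    return "stalled" if age >= STALL_THRESHOLD else "healthy"
```

A "stalled" result while the server still reports PROCESSING is the cue to investigate before the internal stall detector kills the batch.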
OPTIONAL (defaults to 0). Number of times this batch has been auto-retried due to transient infrastructure failures (spot node preemption, OOM, actor death). Incremented each time the batch is automatically requeued after a retryable failure. User-facing: lets users see that retries happened transparently.
x >= 0
0
1
2
3
OPTIONAL (defaults to 3). Maximum number of automatic retries for transient failures. When retry_count reaches max_retries, the batch stays in FAILED state. Only transient/infrastructure failures trigger retries — validation and data errors do not.
x >= 0
3
5
OPTIONAL. ISO 8601 timestamp of the most recent auto-retry attempt. None if the batch has never been retried. Used to calculate exponential backoff for subsequent retries.
null
OPTIONAL. Human-readable reason for the most recent auto-retry. None if the batch has never been retried. Describes the transient failure that triggered the retry (e.g., 'Spot node preempted', 'Ray actor died', 'OOM killed').
null
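The retry fields above combine into a simple decision: retry only transient failures, only below max_retries, with exponentially growing delays. The exact backoff formula is not published here, so the doubling schedule and the base/cap values below are illustrative assumptions:

```python
def next_retry_delay_seconds(retry_count, base=60, cap=3600):
    # Assumed schedule: delay doubles per prior attempt
    # (60s, 120s, 240s, ...), capped at one hour.
    return min(base * (2 ** retry_count), cap)

def should_auto_retry(retry_count, max_retries, is_transient):
    # Only transient/infrastructure failures (spot preemption, OOM,
    # actor death) retry, and only while retry_count < max_retries;
    # validation and data errors never retry.
    return is_transient and retry_count < max_retries
```

Once `retry_count` reaches `max_retries` (default 3), the batch remains FAILED.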
OPTIONAL. URL to receive an HTTP POST notification when the batch reaches a terminal state (COMPLETED, FAILED, or CANCELED). Set at submit time via SubmitBatchRequest. The webhook is fire-and-forget: delivery failures are logged but never affect batch processing.
"https://example.com/webhooks/batch-complete"
OPTIONAL (auto-updated). ISO 8601 timestamp when batch was last modified. Updated using current_time() whenever batch status or tier_tasks change. Used to track batch activity and identify stale batches.
"2025-11-03T10:30:00Z"
COMPUTED. Human-readable description of the current batch state. Examples: 'Processing 724/50,000 objects (1.4%)', 'Queued — 2 batches ahead', 'Completed in 5m 23s', 'Loading model (stage 1/3)'. Computed on read, not stored in the database.
COMPUTED. Estimated completion timestamp based on current throughput. Derived from progress.eta_seconds + now. None if throughput data is unavailable. Computed on read, not stored in the database.