> ## Documentation Index
> Fetch the complete documentation index at: https://docs.mixpeek.com/docs/llms.txt
> Use this file to discover all available pages before exploring further.

# Describe collection features

> List feature addresses and metadata available in this collection


## OpenAPI

````yaml get /v1/collections/{collection_identifier}/features
# OpenAPI document header: spec version, API metadata, and server list.
openapi: 3.1.0
info:
  title: Mixpeek API
  # Quoted so the version stays a string rather than parsing as a float.
  version: '0.82'
  description: >-
    This is the Mixpeek API, providing access to various endpoints
    for data processing and retrieval.
  termsOfService: https://mixpeek.com/terms
  contact:
    email: info@mixpeek.com
    name: Mixpeek Support
    url: https://mixpeek.com/contact
servers:
  - description: Production
    url: https://api.mixpeek.com
# No document-wide security requirement is declared here.
security: []
paths:
  /v1/collections/{collection_identifier}/features:
    # GET: list the features available on one collection.
    get:
      operationId: describe_collection_features_route_v1_collections__collection_identifier__features_get
      summary: Describe collection features
      description: List feature addresses and metadata available in this collection
      tags:
        - Collections
      parameters:
        # The collection may be addressed by either its ID or its name.
        - in: path
          name: collection_identifier
          required: true
          description: The ID or name of the collection to describe
          schema:
            title: Collection Identifier
            type: string
            description: The ID or name of the collection to describe
      responses:
        '200':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/DescribeCollectionFeaturesResponse'
          description: Successful Response
        # 400/401/403/404/500 all share the ErrorResponse envelope.
        '400':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
          description: Bad Request
        '401':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
          description: Unauthorized
        '403':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
          description: Forbidden
        '404':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
          description: Not Found
        # 422 is the only error response with a different body shape.
        '422':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
          description: Validation Error
        '500':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
          description: Internal Server Error
components:
  schemas:
    # Response body for GET /v1/collections/{collection_identifier}/features:
    # a single required `features` array of CollectionFeatureDescriptor items.
    DescribeCollectionFeaturesResponse:
      properties:
        features:
          items:
            $ref: '#/components/schemas/CollectionFeatureDescriptor'
          type: array
          title: Features
          description: Feature extractors and fields enabled on this collection
      type: object
      required:
        - features
      title: DescribeCollectionFeaturesResponse
      examples:
        # The example item supplies every field CollectionFeatureDescriptor
        # marks as required (feature_address, feature_extractor_name, version,
        # vector_index) so that it validates against the schema it illustrates.
        - features:
            # NOTE(review): the feature_address value is illustrative only;
            # confirm the exact address format returned by the service.
            - feature_address: 'mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1'
              feature_extractor_name: text_extractor
              version: v1
              vector_index:
                name: text_extractor_v1_embedding
                description: Dense vector embedding for text content using E5-Large model
                dimensions: 1024
                type: dense
                distance: cosine
                inference_name: multilingual_e5_large_instruct_v1
                supported_inputs:
                  - text
                  - string
              primary: true
    # Error envelope referenced by the 400/401/403/404/500 responses.
    ErrorResponse:
      title: ErrorResponse
      description: Error response model.
      type: object
      required:
        - status
        - error
      properties:
        # `success` is not in `required`; it carries a default of false.
        success:
          title: Success
          type: boolean
          default: false
          description: Always false for error responses
        status:
          title: Status
          type: integer
          description: HTTP status code for this error
        error:
          description: Error details payload
          $ref: '#/components/schemas/ErrorDetail'
      examples:
        - success: false
          status: 404
          error:
            type: NotFoundError
            message: Namespace not found
            details:
              resource: namespace
              id: ns_123
    # Body of the 422 response: a list of individual validation failures.
    HTTPValidationError:
      title: HTTPValidationError
      type: object
      properties:
        detail:
          title: Detail
          type: array
          items:
            $ref: '#/components/schemas/ValidationError'
    # One feature exposed by a collection, together with its vector index.
    CollectionFeatureDescriptor:
      title: CollectionFeatureDescriptor
      description: >-
        Descriptor for a collection's available feature using existing
        models/keys.
      type: object
      # `primary` is the only optional field; it defaults to false.
      required:
        - feature_address
        - feature_extractor_name
        - version
        - vector_index
      properties:
        feature_address:
          title: Feature Address
          type: string
          description: Fully qualified feature address
        feature_extractor_name:
          title: Feature Extractor Name
          type: string
          description: Extractor name
        version:
          title: Version
          type: string
          description: Extractor version
        vector_index:
          description: >-
            Vector index configuration (name, dimensions, type, distance,
            inference_name)
          $ref: '#/components/schemas/VectorIndex'
        primary:
          title: Primary
          type: boolean
          default: false
          description: True if this is the primary output (short address allowed)
    # Inner error payload referenced from ErrorResponse.error.
    ErrorDetail:
      title: ErrorDetail
      description: Error detail model.
      type: object
      required:
        - message
        - type
      properties:
        message:
          title: Message
          type: string
          description: Human-readable error message
        type:
          title: Type
          type: string
          description: Stable error type identifier (machine-readable)
        # `code` and `details` are optional and may also be null.
        code:
          title: Code
          description: >-
            Fine-grained error code for programmatic handling (e.g.,
            namespace_name_taken, feature_extractor_not_found). Present only
            when consumers may need to branch on a specific error condition.
          anyOf:
            - type: string
            - type: 'null'
        details:
          title: Details
          description: >-
            Optional structured details to help debugging (validation errors,
            IDs, etc.)
          anyOf:
            - type: object
              additionalProperties: true
            - type: 'null'
    # A single validation failure, as listed in HTTPValidationError.detail.
    ValidationError:
      title: ValidationError
      type: object
      required:
        - loc
        - msg
        - type
      properties:
        # Each location segment is either a string or an integer.
        loc:
          title: Location
          type: array
          items:
            anyOf:
              - type: string
              - type: integer
        msg:
          title: Message
          type: string
        type:
          title: Error Type
          type: string
    # Configuration of one vector index. Only `description` and `type` are
    # required; every other property is optional (most are nullable).
    VectorIndex:
      properties:
        name:
          anyOf:
            - type: string
            - type: 'null'
          title: Name
          description: >-
            OPTIONAL. Qdrant named vector identifier. If not provided,
            auto-derived from inference_service_id using the same conversion as
            inference_name (org/model -> org__model with hyphens as
            underscores). This enables cross-extractor compatibility: extractors
            using the same model will share the same named vector in Qdrant,
            allowing direct vector search across collections without fusion
            logic.
          examples:
            - intfloat__multilingual_e5_large_instruct
            - google__siglip_base_patch16_224
            - jinaai__jina_embeddings_v2_base_code
        description:
          type: string
          minLength: 10
          title: Description
          description: >-
            REQUIRED. Human-readable description of what this vector index
            represents. Explain the content type, use cases, and search
            characteristics. Shown in API documentation and collection metadata.
            Be specific about what embeddings are stored here.
          examples:
            - Dense vector embedding for text content using E5-Large model
            - Video segment embeddings for semantic visual search
            - Sparse keyword expansion embeddings for explainable search
        dimensions:
          anyOf:
            - type: integer
              minimum: 1
            - type: 'null'
          title: Dimensions
          description: >-
            Number of vector dimensions. REQUIRED for DENSE vectors (e.g., 1024
            for E5-Large, 1408 for multimodal). NOT REQUIRED for SPARSE vectors
            (dimensions determined dynamically). Must match the output
            dimensions of the inference service. Cannot be changed after index
            creation without recreating the collection.
          examples:
            - 1024
            - 1408
            - 768
            - 512
        type:
          $ref: '#/components/schemas/VectorType'
          description: >-
            REQUIRED. Vector storage format type. Determines how vectors are
            stored and searched in Qdrant. Use DENSE for traditional embeddings
            (most common), SPARSE for keyword-based models like SPLADE,
            MULTI_DENSE for late-interaction models like ColBERT. Must match the
            output format of your inference service.
          examples:
            - dense
            - sparse
            - multi_dense
        distance:
          anyOf:
            - type: string
            - type: 'null'
          title: Distance
          description: >-
            Distance metric for similarity search. OPTIONAL - defaults to
            'cosine' (normalized dot product). Options: 'cosine' (most common,
            normalized), 'dot' (raw dot product), 'euclidean' (L2 distance),
            'manhattan' (L1 distance). Cosine recommended for most embeddings as
            it's scale-invariant. Must match the metric your model was trained
            with.
          default: cosine
          examples:
            - cosine
            - dot
            - euclidean
        # NOTE(review): this description and its examples previously listed
        # FLOAT16, which is absent from the VectorDataType enum (float32,
        # uint8). The text now matches the enum; if the service does accept
        # float16, add it to VectorDataType and restore the guidance here.
        datatype:
          anyOf:
            - $ref: '#/components/schemas/VectorDataType'
            - type: 'null'
          description: >-
            Data type for storing vector values. OPTIONAL - defaults to FLOAT32
            (standard precision). Use FLOAT32 for general use (4 bytes per
            dimension). Use UINT8 for maximum compression (quantization, ~2%
            quality loss). Lower precision = smaller storage + faster search,
            slightly lower accuracy.
          default: float32
          examples:
            - float32
            - uint8
        on_disk:
          anyOf:
            - type: boolean
            - type: 'null'
          title: On Disk
          description: >-
            OPTIONAL. If true, vectors stored on disk instead of RAM. Defaults
            to true for memory efficiency. Set to false for faster search with
            higher memory usage. Trade-off: on_disk=true saves ~95% RAM but ~10x
            slower search. Recommended to keep default (true) unless RAM is
            abundant and low latency critical.
        # Items must be BucketSchemaFieldType values; PDF content is the enum
        # value `pdf` (the enum has no `document` member).
        supported_inputs:
          anyOf:
            - items:
                $ref: '#/components/schemas/BucketSchemaFieldType'
              type: array
            - type: 'null'
          title: Supported Inputs
          description: >-
            OPTIONAL. List of bucket schema field types this vector can process.
            Validates that input fields are compatible with this index.
            Examples: TEXT and STRING for text embeddings, VIDEO and IMAGE for
            multimodal embeddings, PDF for PDF extractors. Used for
            validation during collection creation.
          examples:
            - - text
              - string
            - - video
              - image
            - - pdf
        inference_name:
          anyOf:
            - type: string
            - type: 'null'
          title: Inference Name
          description: >-
            DEPRECATED: Use inference_service_id instead. Identifier of the
            inference service to generate embeddings. Must reference a valid
            inference service registered in the system. Examples:
            'multilingual_e5_large_instruct_v1' for text,
            'vertex_multimodal_embedding' for video, 'laion_clip_vit_l_14_v1'
            for images. This determines which model creates the vectors during
            ingestion. Cannot be changed after collection creation.
          examples:
            - multilingual_e5_large_instruct_v1
            - vertex_multimodal_embedding
            - laion_clip_vit_l_14_v1
            - openai_text_embedding_3_small
        inference_service_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Inference Service Id
          description: >-
            RECOMMENDED. Service ID in org/name format (e.g.,
            'intfloat/e5-large'). When set, dimensions and distance are
            automatically derived from the registry. This is the canonical
            identifier for cross-plugin compatibility. Plugins using the same
            service_id can search across each other's vectors. Takes precedence
            over inference_name when both are set.
          examples:
            - intfloat/e5-large
            - google/vertex-multimodal
            - google/siglip
            - jinaai/jina-code-v2
        purpose:
          anyOf:
            - $ref: '#/components/schemas/VectorPurpose'
            - type: 'null'
          description: >-
            RECOMMENDED. Semantic purpose of this vector index. Enables
            pipelines to look up vector configs by purpose (text, code, image)
            without needing to know the specific inference_service_id. This
            provides automatic configuration - the pipeline just says 'give me
            the text vector' and gets the correct column name. If not specified,
            pipeline must use inference_service_id lookup.
          examples:
            - text
            - code
            - image
            - multimodal
        vector_name_override:
          anyOf:
            - type: string
            - type: 'null'
          title: Vector Name Override
          description: >-
            OPTIONAL. Override for Qdrant named vector identifier. When set,
            this value is used as the Qdrant vector name instead of
            auto-deriving from inference_service_id. This enables multiple
            vectors from the same embedding model with different storage names.
            The inference_service_id is still used for cross-extractor
            compatibility checking, but storage uses this custom name. Use case:
            A single extractor producing N vectors (e.g., title_embedding,
            body_embedding) using the same model but needing separate storage.
          examples:
            - title_embedding
            - body_embedding
            - summary_embedding
        supports_multi_query:
          type: boolean
          title: Supports Multi Query
          description: >-
            Whether this vector index supports multi-content queries at
            retrieval time. When True, the feature_search stage accepts
            input_mode='multi_content' — a list of URLs and/or text strings that
            are embedded together in one API call to produce a single query
            vector. Only set for extractors whose underlying model natively
            supports multi-file input (e.g., gemini_multifile_extractor using
            Gemini Embedding 2).
          default: false
      type: object
      required:
        - description
        - type
      title: VectorIndex
      # The "Requirements" list below mirrors the `required` array and the
      # per-property OPTIONAL/RECOMMENDED/DEPRECATED markers above.
      # NOTE(review): "Key Concepts" describes `name` as
      # {extractor}_{version}_{output}, while the `name` property documents an
      # org__model auto-derivation from inference_service_id - confirm which
      # convention is current and align the two.
      description: >-
        Configuration for a single vector index in Qdrant.


        Defines the fully-qualified vector index including storage name,
        dimensions,

        distance metric, and inference service. This is the actual index that
        gets

        created in Qdrant and used for vector similarity search.


        Key Concepts:
            - The `name` field is the FULL qualified name used as the Qdrant collection name
            - Format: {extractor}_{version}_{output} (e.g., "text_extractor_v1_embedding")
            - This ensures namespace isolation between extractors and versions
            - Different from VectorIndexDefinition.name which is the short user-facing name

        Use Cases:
            - Define vector storage configuration for feature extractors
            - Specify inference service and model parameters
            - Configure distance metrics for similarity search
            - Set storage optimization (on-disk for large vectors)

        Requirements:
            - name: OPTIONAL - Auto-derived from inference_service_id when omitted
            - description: REQUIRED - Explain what this vector represents
            - dimensions: REQUIRED for DENSE vectors, OPTIONAL for SPARSE
            - type: REQUIRED - Must match VectorType enum
            - inference_service_id: RECOMMENDED - Takes precedence over inference_name
            - inference_name: DEPRECATED - Use inference_service_id instead
      examples:
        - datatype: float32
          description: >-
            Dense vector embedding for text content using E5-Large multilingual
            model. Optimized for semantic search across 100+ languages.
          dimensions: 1024
          distance: cosine
          inference_name: multilingual_e5_large_instruct_v1
          name: text_extractor_v1_embedding
          supported_inputs:
            - text
            - string
          type: dense
        - datatype: float32
          description: >-
            Dense vector embeddings for video segments using Google's multimodal
            model. Supports visual semantic search.
          dimensions: 1408
          distance: cosine
          inference_name: vertex_multimodal_embedding
          name: multimodal_extractor_v1_video_embedding
          supported_inputs:
            - video
            - image
          type: dense
        - datatype: float32
          description: >-
            Dense vector embeddings for images using SigLIP model. Supports
            visual semantic search.
          dimensions: 768
          distance: cosine
          inference_name: siglip_base_v1
          name: image_extractor_v1_embedding
          supported_inputs:
            - image
          type: dense
        - description: >-
            Title embedding using E5-Large - same model as body but stored
            separately.
          inference_service_id: intfloat/multilingual-e5-large-instruct
          type: dense
          vector_name_override: title_embedding
        - description: >-
            Body embedding using E5-Large - same model as title but stored
            separately.
          inference_service_id: intfloat/multilingual-e5-large-instruct
          type: dense
          vector_name_override: body_embedding
    # Storage-format enum referenced by VectorIndex.type.
    VectorType:
      title: VectorType
      description: |-
        Vector types supported by the Mixpeek system.

        Defines the storage format and structure of embeddings in Qdrant.

        Values:
            DENSE: Traditional float array embeddings (e.g., [0.1, 0.2, 0.3]).
                   Most common format. Used by: text_extractor, multimodal_extractor, image_extractor.
                   Storage: ~4KB per 1024-dim vector. Fast cosine/dot similarity search.

            SPARSE: Index-value pairs for sparse embeddings (e.g., SPLADE, BM25).
                    Only stores non-zero dimensions. Format: {indices: [1,5,9], values: [0.8,0.6,0.4]}.
                    Storage: ~20KB. Keyword-based semantic search.

            MULTI_DENSE: List of dense vectors for late interaction models (e.g., ColBERT).
                         Each document has multiple embeddings. Format: [[0.1,0.2], [0.3,0.4], ...].
                         Storage: ~500KB. High-precision retrieval.

        Examples:
            - DENSE for general semantic search (text_extractor, multimodal_extractor)
            - SPARSE for keyword expansion and explainability
            - MULTI_DENSE for maximum precision retrieval
      type: string
      # Wire values are lowercase; the description names them in upper case.
      enum:
        - dense
        - sparse
        - multi_dense
    # Element data type enum referenced by VectorIndex.datatype.
    VectorDataType:
      title: VectorDataType
      description: Vector data type.
      type: string
      enum:
        - float32
        - uint8
    # Field-type enum referenced by the items of VectorIndex.supported_inputs.
    BucketSchemaFieldType:
      title: BucketSchemaFieldType
      type: string
      enum:
        - string
        - number
        - integer
        - float
        - boolean
        - object
        - array
        - date
        - datetime
        - text
        - image
        - audio
        - video
        - pdf
        - excel
      # The folded text below relies on its blank lines and extra indentation
      # for line breaks, so its layout is kept exactly as written.
      description: >-
        Supported data types for bucket schema fields.


        Types fall into two categories:


        1. **Metadata Types** (JSON types):
           - Stored as object metadata
           - Standard JSON-compatible types
           - Not processed by extractors (unless explicitly mapped)
           - Examples: string, number, boolean, date

        2. **File Types** (blobs):
           - Stored as files/blobs
           - Processed by extractors
           - Require file content (URL or base64)
           - Examples: text, image, video, pdf

        **GIF Special Handling**:
            GIF files can be declared as either IMAGE or VIDEO type:

            - As IMAGE: GIF is embedded as a single static image (first frame)
            - As VIDEO: GIF is decomposed frame-by-frame with embeddings per frame

            The multimodal extractor detects GIFs via MIME type (image/gif) and routes
            them based on your schema declaration. Use VIDEO for animated GIFs where
            frame-level search is needed, IMAGE for static/thumbnail use cases.

        NOTE: For retriever input schemas that need to accept document
        references

        (e.g., "find similar documents"), use RetrieverInputSchemaFieldType
        instead,

        which includes all bucket types plus document_reference.
    # Purpose enum referenced by VectorIndex.purpose.
    VectorPurpose:
      title: VectorPurpose
      description: |-
        Semantic purpose of a vector index.

        Used by pipelines to look up vector configs by purpose without
        needing to know the specific inference_service_id.
      type: string
      enum:
        - text
        - code
        - image
        - multimodal
        - video
        - audio
        - sparse

````