NEW: Managed multimodal retrieval. Explore platform →
    Similar · Cross-Media

    Multimodal Search with MVS

    Build multimodal search by embedding different content types (text, images, video frames) with your own models and searching across them in a single MVS namespace. Use CLIP or any multimodal embedding model for cross-modal retrieval.

    text
    image
    video
    Multi-Tier

    "dog playing outdoors"

    Why This Matters

    Multimodal search without a managed pipeline. Use your preferred CLIP, SigLIP, or any embedding model to embed text, images, and video into a shared vector space, then search across all of them with MVS.

    import open_clip
    import torch
    from PIL import Image
    from mixpeek import Mixpeek
    # MVS client and the namespace that holds every modality's vectors
    mvs = Mixpeek(api_key="your-mvs-key")
    NAMESPACE = "multimodal-search"

    # Load a CLIP model for shared text/image embedding space
    model, _, preprocess = open_clip.create_model_and_transforms(
        "ViT-B-32", pretrained="laion2b_s34b_b79k"
    )
    # open_clip returns the model in training mode; switch to eval so any
    # train-only layers (dropout, stochastic depth, BatchNorm updates) are
    # disabled and embeddings are deterministic.
    model.eval()
    tokenizer = open_clip.get_tokenizer("ViT-B-32")
    def embed_text(text: str) -> list[float]:
        """Embed a single string with the CLIP text encoder.

        The string is tokenized as a batch of one, encoded without gradient
        tracking, and scaled to unit L2 norm along the last dimension.

        Args:
            text: The string to embed.

        Returns:
            The normalized embedding of ``text`` as a plain list of floats.
        """
        batch = tokenizer([text])
        with torch.no_grad():
            encoded = model.encode_text(batch)
            unit = encoded / encoded.norm(dim=-1, keepdim=True)
        # Batch of one: return the only row
        return unit[0].tolist()
    def embed_image(image_path: str) -> list[float]:
        """Embed a single image file with the CLIP image encoder.

        The image is opened, run through ``preprocess``, encoded without
        gradient tracking, and scaled to unit L2 norm along the last dimension.

        Args:
            image_path: Path of the image file to embed.

        Returns:
            The normalized embedding of the image as a plain list of floats.
        """
        # Image.open keeps the underlying file open; the context manager
        # closes it as soon as preprocess has produced the tensor instead of
        # leaving the handle to the garbage collector.
        with Image.open(image_path) as img:
            image = preprocess(img).unsqueeze(0)
        with torch.no_grad():
            features = model.encode_image(image)
            features /= features.norm(dim=-1, keepdim=True)
        # Batch of one: return the only row
        return features[0].tolist()
    # Index the text samples: one upsert call per string, each storing the
    # text embedding plus metadata tagging it as modality "text".
    texts = [
        "A golden retriever playing fetch in the park",
        "Technical architecture diagram for microservices",
        "Sunset over the ocean with sailboats",
    ]
    for text in texts:
        text_document = {
            "dense_embedding": embed_text(text),
            "metadata": {"modality": "text", "content": text},
        }
        mvs.namespaces.documents.upsert(
            namespace=NAMESPACE,
            documents=[text_document],
        )
    # Index the image samples: one upsert call per file, each storing the
    # image embedding plus metadata tagging it as modality "image".
    image_files = ["dog_park.jpg", "architecture.png", "sunset.jpg"]
    for img in image_files:
        image_document = {
            "dense_embedding": embed_image(img),
            "metadata": {"modality": "image", "file": img},
        }
        mvs.namespaces.documents.upsert(
            namespace=NAMESPACE,
            documents=[image_document],
        )
    # Cross-modal search: a single text query is embedded once and matched
    # against every document in the namespace, text and image alike.
    query = "dog playing outdoors"
    results = mvs.namespaces.documents.search(
        namespace=NAMESPACE,
        query={"dense_embedding": embed_text(query)},
        top_k=5,
    )
    for doc in results:
        metadata = doc["metadata"]
        modality = metadata["modality"]
        # Text documents carry "content"; image documents carry "file"
        content = metadata.get("content") or metadata.get("file")
        print(f"{doc['score']:.3f} | [{modality}] {content}")
    # Same query, restricted to image documents via a modality filter
    image_query_vector = embed_text(query)
    image_results = mvs.namespaces.documents.search(
        namespace=NAMESPACE,
        query={"dense_embedding": image_query_vector},
        filters={"modality": "image"},
        top_k=5,
    )

    Feature Extractors

    Retriever Stages

    limit

    Truncate results to a maximum count with optional offset for pagination

    reduce