Mixpeek Logo
    Enhanced

    Content Clustering Pipeline

    Automatically group similar content together using embedding-based clustering. Discover themes, identify duplicates, and organize large content libraries.

    text
    image
    video
    Multi-Tier
    2.1K runs
    Deploy Recipe
    from mixpeek import Mixpeek

    # Build the API client. Replace the placeholder with a real key before running.
    client = Mixpeek(api_key="YOUR_API_KEY")

    # Create a namespace and, inside it, a collection configured with the
    # "text-embedding-v2" extractor.
    namespace = client.namespaces.create(name="clusters")
    collection = client.collections.create(
        namespace_id=namespace.id,
        name="articles",
        extractors=["text-embedding-v2"]
    )

    # Upload content
    client.buckets.upload(
        collection_id=collection.id,
        url="s3://your-bucket/articles/"
    )

    # Create clusters
    # NOTE(review): cluster_job is not consulted before listing below. If the
    # clustering job runs asynchronously, the listing may need to wait for it
    # to finish -- confirm against the SDK documentation.
    cluster_job = client.clusters.create(
        namespace_id=namespace.id,
        collection_ids=[collection.id],
        num_clusters=20,
        algorithm="kmeans"
    )

    # Get cluster assignments
    clusters = client.clusters.list(namespace_id=namespace.id)
    for cluster in clusters:
        # The loop body must be indented under the `for`; the flattened
        # original raised an IndentationError here.
        print(f"Cluster: {cluster.label} ({cluster.document_count} docs)")

    Feature Extractors

    Retriever Stages