Content Clustering Pipeline
Automatically group similar content together using embedding-based clustering. Discover themes, identify duplicates, and organize large content libraries.
# Example: group similar articles with embedding-based clustering.
from mixpeek import Mixpeek

# Replace the placeholder with a real API key before running.
client = Mixpeek(api_key="YOUR_API_KEY")

# Create a namespace and, inside it, a collection whose documents are
# processed by the "text-embedding-v2" extractor.
namespace = client.namespaces.create(name="clusters")
collection = client.collections.create(
    namespace_id=namespace.id,
    name="articles",
    extractors=["text-embedding-v2"],
)

# Upload content
client.buckets.upload(
    collection_id=collection.id,
    url="s3://your-bucket/articles/",
)

# Create clusters
# NOTE(review): the returned job object is not used below; if clustering runs
# asynchronously, it may need to finish before listing — confirm against the
# SDK documentation.
cluster_job = client.clusters.create(
    namespace_id=namespace.id,
    collection_ids=[collection.id],
    num_clusters=20,
    algorithm="kmeans",
)

# Get cluster assignments
clusters = client.clusters.list(namespace_id=namespace.id)
for cluster in clusters:
    print(f"Cluster: {cluster.label} ({cluster.document_count} docs)")
