Execute Raw Inference

curl --request POST \
  --url https://api.mixpeek.com/v1/inference \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "inputs": {},
  "provider": "openai",
  "model": "gpt-4o-mini",
  "inference_name": "my_text_embedder_1_0_0",
  "feature_uri": "mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1",
  "parameters": {
    "max_tokens": 500,
    "temperature": 0.7
  },
  "enable_semantic_cache": false,
  "cache_delta": 0.5
}
'

import requests

url = "https://api.mixpeek.com/v1/inference"

payload = {
    "inputs": {},
    "provider": "openai",
    "model": "gpt-4o-mini",
    "inference_name": "my_text_embedder_1_0_0",
    "feature_uri": "mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1",
    "parameters": {
        "max_tokens": 500,
        "temperature": 0.7
    },
    "enable_semantic_cache": False,
    "cache_delta": 0.5
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

response = requests.post(url, json=payload, headers=headers)

print(response.text)

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify({
    inputs: {},
    provider: 'openai',
    model: 'gpt-4o-mini',
    inference_name: 'my_text_embedder_1_0_0',
    feature_uri: 'mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1',
    parameters: {max_tokens: 500, temperature: 0.7},
    enable_semantic_cache: false,
    cache_delta: 0.5
  })
};

fetch('https://api.mixpeek.com/v1/inference', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.mixpeek.com/v1/inference",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'inputs' => [
        
    ],
    'provider' => 'openai',
    'model' => 'gpt-4o-mini',
    'inference_name' => 'my_text_embedder_1_0_0',
    'feature_uri' => 'mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1',
    'parameters' => [
        'max_tokens' => 500,
        'temperature' => 0.7
    ],
    'enable_semantic_cache' => false,
    'cache_delta' => 0.5
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main posts a sample raw-inference request to the Mixpeek API and prints the
// response body to standard output.
//
// Replace <token> with a real API token before running. A failure while
// building the request, sending it, or reading the response is printed and
// ends the program instead of being silently discarded.
func main() {
	url := "https://api.mixpeek.com/v1/inference"

	payload := strings.NewReader("{\n  \"inputs\": {},\n  \"provider\": \"openai\",\n  \"model\": \"gpt-4o-mini\",\n  \"inference_name\": \"my_text_embedder_1_0_0\",\n  \"feature_uri\": \"mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1\",\n  \"parameters\": {\n    \"max_tokens\": 500,\n    \"temperature\": 0.7\n  },\n  \"enable_semantic_cache\": false,\n  \"cache_delta\": 0.5\n}")

	req, err := http.NewRequest(http.MethodPost, url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so it must not be dereferenced here.
		fmt.Println("sending request:", err)
		return
	}
	// Only schedule the close once we know there is a response to close.
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body:", err)
		return
	}

	fmt.Println(string(body))
}

HttpResponse<String> response = Unirest.post("https://api.mixpeek.com/v1/inference")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"inputs\": {},\n  \"provider\": \"openai\",\n  \"model\": \"gpt-4o-mini\",\n  \"inference_name\": \"my_text_embedder_1_0_0\",\n  \"feature_uri\": \"mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1\",\n  \"parameters\": {\n    \"max_tokens\": 500,\n    \"temperature\": 0.7\n  },\n  \"enable_semantic_cache\": false,\n  \"cache_delta\": 0.5\n}")
  .asString();

require 'uri'
require 'net/http'

url = URI("https://api.mixpeek.com/v1/inference")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true

request = Net::HTTP::Post.new(url)
request["Authorization"] = 'Bearer <token>'
request["Content-Type"] = 'application/json'
request.body = "{\n  \"inputs\": {},\n  \"provider\": \"openai\",\n  \"model\": \"gpt-4o-mini\",\n  \"inference_name\": \"my_text_embedder_1_0_0\",\n  \"feature_uri\": \"mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1\",\n  \"parameters\": {\n    \"max_tokens\": 500,\n    \"temperature\": 0.7\n  },\n  \"enable_semantic_cache\": false,\n  \"cache_delta\": 0.5\n}"

response = http.request(request)
puts response.read_body

{
  "data": "<unknown>",
  "provider": "<string>",
  "model": "<string>",
  "latency_ms": 123,
  "tokens_used": {
    "completion": 120,
    "prompt": 15,
    "total": 135
  },
  "cached": false
}

{
  "status": 123,
  "error": {
    "message": "<string>",
    "type": "<string>",
    "code": "<string>",
    "details": {}
  },
  "success": false
}

{
  "status": 123,
  "error": {
    "message": "<string>",
    "type": "<string>",
    "code": "<string>",
    "details": {}
  },
  "success": false
}

{
  "status": 123,
  "error": {
    "message": "<string>",
    "type": "<string>",
    "code": "<string>",
    "details": {}
  },
  "success": false
}

{
  "status": 123,
  "error": {
    "message": "<string>",
    "type": "<string>",
    "code": "<string>",
    "details": {}
  },
  "success": false
}

{
  "detail": [
    {
      "loc": [
        "<string>"
      ],
      "msg": "<string>",
      "type": "<string>",
      "input": "<unknown>",
      "ctx": {}
    }
  ]
}

{
  "status": 123,
  "error": {
    "message": "<string>",
    "type": "<string>",
    "code": "<string>",
    "details": {}
  },
  "success": false
}

Inference

Execute Raw Inference

Execute raw inference with provider+model or custom plugin.

This endpoint provides direct access to inference services without the retriever framework overhead. Supports two modes:

Provider + Model: Use standard providers (openai, google, anthropic)
Custom Plugin: Use your custom inference plugins by inference_name (or by a feature_uri that resolves to an inference_name)

Supported Providers

openai: GPT models, embeddings, Whisper transcription
google: Gemini models, Vertex multimodal embeddings (1408D)
anthropic: Claude models

Examples

Custom Plugin (by inference_name)

{
    "inference_name": "my_text_embedder_1_0_0",
    "inputs": {"text": "hello world"},
    "parameters": {}
}

Custom Plugin (by feature_uri)

{
    "feature_uri": "mixpeek://my_custom_embedder@1.0.0/embedding",
    "inputs": {"text": "hello world"},
    "parameters": {}
}

Builtin Embedder (by feature_uri)

{
    "feature_uri": "mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1",
    "inputs": {"text": "hello world"},
    "parameters": {}
}

Chat Completion

{
    "provider": "openai",
    "model": "gpt-4o-mini",
    "inputs": {"prompts": ["What is AI?"]},
    "parameters": {"temperature": 0.7, "max_tokens": 500}
}

Text Embedding (OpenAI)

{
    "provider": "openai",
    "model": "text-embedding-3-large",
    "inputs": {"text": "machine learning"},
    "parameters": {}
}

Text Embedding (Google Vertex Multimodal - 1408D)

{
    "provider": "google",
    "model": "multimodalembedding",
    "inputs": {"text": "machine learning"},
    "parameters": {}
}

Image Embedding (Google Vertex Multimodal - 1408D)

{
    "provider": "google",
    "model": "multimodalembedding",
    "inputs": {"image_url": "https://example.com/image.jpg"},
    "parameters": {}
}

Image Embedding from Base64

{
    "provider": "google",
    "model": "multimodalembedding",
    "inputs": {"image_base64": "<base64-encoded-image>"},
    "parameters": {}
}

Video Embedding (Google Vertex Multimodal - 1408D)

{
    "provider": "google",
    "model": "multimodalembedding",
    "inputs": {"video_url": "https://example.com/video.mp4"},
    "parameters": {}
}

Video Embedding from Base64

{
    "provider": "google",
    "model": "multimodalembedding",
    "inputs": {"video_base64": "<base64-encoded-video>"},
    "parameters": {}
}

Audio Transcription

{
    "provider": "openai",
    "model": "whisper-1",
    "inputs": {"audio_url": "https://example.com/audio.mp3"},
    "parameters": {}
}

Vision (Multimodal LLM)

{
    "provider": "openai",
    "model": "gpt-4o",
    "inputs": {
        "prompts": ["Describe this image"],
        "image_url": "https://example.com/image.jpg"
    },
    "parameters": {"temperature": 0.5}
}

Args: request — FastAPI request object (populated by middleware); payload — raw inference request.

Returns: Inference response with results and metadata

Raises: 400 Bad Request — invalid provider, model, or inputs; 401 Unauthorized — missing or invalid API key; 429 Too Many Requests — rate limit exceeded; 500 Internal Server Error — inference execution failed.

POST

inference

Execute Raw Inference

curl --request POST \
  --url https://api.mixpeek.com/v1/inference \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "inputs": {},
  "provider": "openai",
  "model": "gpt-4o-mini",
  "inference_name": "my_text_embedder_1_0_0",
  "feature_uri": "mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1",
  "parameters": {
    "max_tokens": 500,
    "temperature": 0.7
  },
  "enable_semantic_cache": false,
  "cache_delta": 0.5
}
'

import requests

url = "https://api.mixpeek.com/v1/inference"

payload = {
    "inputs": {},
    "provider": "openai",
    "model": "gpt-4o-mini",
    "inference_name": "my_text_embedder_1_0_0",
    "feature_uri": "mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1",
    "parameters": {
        "max_tokens": 500,
        "temperature": 0.7
    },
    "enable_semantic_cache": False,
    "cache_delta": 0.5
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

response = requests.post(url, json=payload, headers=headers)

print(response.text)

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify({
    inputs: {},
    provider: 'openai',
    model: 'gpt-4o-mini',
    inference_name: 'my_text_embedder_1_0_0',
    feature_uri: 'mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1',
    parameters: {max_tokens: 500, temperature: 0.7},
    enable_semantic_cache: false,
    cache_delta: 0.5
  })
};

fetch('https://api.mixpeek.com/v1/inference', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.mixpeek.com/v1/inference",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'inputs' => [
        
    ],
    'provider' => 'openai',
    'model' => 'gpt-4o-mini',
    'inference_name' => 'my_text_embedder_1_0_0',
    'feature_uri' => 'mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1',
    'parameters' => [
        'max_tokens' => 500,
        'temperature' => 0.7
    ],
    'enable_semantic_cache' => false,
    'cache_delta' => 0.5
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main posts a sample raw-inference request to the Mixpeek API and prints the
// response body to standard output.
//
// Replace <token> with a real API token before running. A failure while
// building the request, sending it, or reading the response is printed and
// ends the program instead of being silently discarded.
func main() {
	url := "https://api.mixpeek.com/v1/inference"

	payload := strings.NewReader("{\n  \"inputs\": {},\n  \"provider\": \"openai\",\n  \"model\": \"gpt-4o-mini\",\n  \"inference_name\": \"my_text_embedder_1_0_0\",\n  \"feature_uri\": \"mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1\",\n  \"parameters\": {\n    \"max_tokens\": 500,\n    \"temperature\": 0.7\n  },\n  \"enable_semantic_cache\": false,\n  \"cache_delta\": 0.5\n}")

	req, err := http.NewRequest(http.MethodPost, url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so it must not be dereferenced here.
		fmt.Println("sending request:", err)
		return
	}
	// Only schedule the close once we know there is a response to close.
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body:", err)
		return
	}

	fmt.Println(string(body))
}

HttpResponse<String> response = Unirest.post("https://api.mixpeek.com/v1/inference")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"inputs\": {},\n  \"provider\": \"openai\",\n  \"model\": \"gpt-4o-mini\",\n  \"inference_name\": \"my_text_embedder_1_0_0\",\n  \"feature_uri\": \"mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1\",\n  \"parameters\": {\n    \"max_tokens\": 500,\n    \"temperature\": 0.7\n  },\n  \"enable_semantic_cache\": false,\n  \"cache_delta\": 0.5\n}")
  .asString();

require 'uri'
require 'net/http'

url = URI("https://api.mixpeek.com/v1/inference")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true

request = Net::HTTP::Post.new(url)
request["Authorization"] = 'Bearer <token>'
request["Content-Type"] = 'application/json'
request.body = "{\n  \"inputs\": {},\n  \"provider\": \"openai\",\n  \"model\": \"gpt-4o-mini\",\n  \"inference_name\": \"my_text_embedder_1_0_0\",\n  \"feature_uri\": \"mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1\",\n  \"parameters\": {\n    \"max_tokens\": 500,\n    \"temperature\": 0.7\n  },\n  \"enable_semantic_cache\": false,\n  \"cache_delta\": 0.5\n}"

response = http.request(request)
puts response.read_body

{
  "data": "<unknown>",
  "provider": "<string>",
  "model": "<string>",
  "latency_ms": 123,
  "tokens_used": {
    "completion": 120,
    "prompt": 15,
    "total": 135
  },
  "cached": false
}

{
  "status": 123,
  "error": {
    "message": "<string>",
    "type": "<string>",
    "code": "<string>",
    "details": {}
  },
  "success": false
}

{
  "status": 123,
  "error": {
    "message": "<string>",
    "type": "<string>",
    "code": "<string>",
    "details": {}
  },
  "success": false
}

{
  "status": 123,
  "error": {
    "message": "<string>",
    "type": "<string>",
    "code": "<string>",
    "details": {}
  },
  "success": false
}

{
  "status": 123,
  "error": {
    "message": "<string>",
    "type": "<string>",
    "code": "<string>",
    "details": {}
  },
  "success": false
}

{
  "detail": [
    {
      "loc": [
        "<string>"
      ],
      "msg": "<string>",
      "type": "<string>",
      "input": "<unknown>",
      "ctx": {}
    }
  ]
}

{
  "status": 123,
  "error": {
    "message": "<string>",
    "type": "<string>",
    "code": "<string>",
    "details": {}
  },
  "success": false
}

Authorizations

Authorization

string

header

required

Bearer authentication header of the form Bearer <token>, where <token> is your auth token.

Body

application/json

Request for raw inference without retriever framework.

This endpoint provides direct access to inference services with minimal configuration. Ideal for simple LLM calls, embeddings, transcription, or vision tasks without requiring collection setup or retriever configuration.

You can either use:

provider + model for standard providers (openai, google, anthropic)
inference_name (or a feature_uri that resolves to one) for custom plugins

Examples:

# Chat completion (provider + model)
{
    "provider": "openai",
    "model": "gpt-4o-mini",
    "inputs": {"prompts": ["What is AI?"]},
    "parameters": {"temperature": 0.7, "max_tokens": 500}
}

# Text embedding (provider + model)
{
    "provider": "openai",
    "model": "text-embedding-3-large",
    "inputs": {"text": "machine learning"},
    "parameters": {}
}

# Custom plugin (inference_name)
{
    "inference_name": "my_text_embedder_1_0_0",
    "inputs": {"text": "hello world"},
    "parameters": {}
}

# Audio transcription
{
    "provider": "openai",
    "model": "whisper-1",
    "inputs": {"audio_url": "https://example.com/audio.mp3"},
    "parameters": {}
}

# Vision (multimodal)
{
    "provider": "openai",
    "model": "gpt-4o",
    "inputs": {
        "prompts": ["Describe this image"],
        "image_url": "https://example.com/image.jpg"
    },
    "parameters": {"temperature": 0.5}
}

inputs

Inputs · object

required

Model-specific inputs. Chat: {prompts: [str]}, Embeddings: {text: str} or {texts: [str]}, Transcription: {audio_url: str}, Vision: {prompts: [str], image_url: str}

Examples:

{
  "prompts": ["What is the capital of France?"]
}

{ "text": "machine learning" }

{
  "audio_url": "https://example.com/audio.mp3"
}

provider

string | null

Provider name: openai, google, anthropic (required if inference_name not set)

Example:

"openai"

model

string | null

Model identifier specific to the provider (required if inference_name not set)

Example:

"gpt-4o-mini"

inference_name

string | null

Custom plugin inference name (alternative to provider+model)

Example:

"my_text_embedder_1_0_0"

feature_uri

string | null

Feature URI to resolve to inference_name (alternative to inference_name). Format: mixpeek://{extractor}@{version}/{vector_index_name}

Example:

"mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1"

parameters

Parameters · object | null

Optional parameters for inference. Common: temperature (float), max_tokens (int), schema (dict for structured output)

Example:

{ "max_tokens": 500, "temperature": 0.7 }

enable_semantic_cache

boolean

default:false

Enable semantic caching (vCache) for LLM chat operations. When enabled, semantically similar prompts may return cached responses, reducing latency and cost. Only applies to chat/completion models.

cache_delta

number | null

Maximum error rate for semantic cache (0.0-1.0). Lower values are more conservative. Default uses system setting (0.02 = 2%).

Required range: 0 <= x <= 1

Response

Successful Response

Response from raw inference.

Returns the inference results along with metadata about the request.

data

any

required

Inference results (structure varies by modality)

provider

string

required

Provider that was used

model

string

required

Model that was used

latency_ms

number

required

Total inference latency in milliseconds

tokens_used

Tokens Used · object | null

Token usage statistics (if available)

Show child attributes

Example:

{
  "completion": 120,
  "prompt": 15,
  "total": 135
}

cached

boolean

default:false

Whether the response was served from semantic cache (vCache)

Kill Task Search Resources