OpenAI-compatible API

The flagship drop-in surface: anything written against the OpenAI Chat Completions API works by changing only the base URL and the API key.

Chat completions

Standard messages, system prompts, and sampling parameters. The model id is treated as intent and echoed back unchanged.

from openai import OpenAI

client = OpenAI(api_key="llm_live_...", base_url="https://app.directinference.com/di/v1")

resp = client.chat.completions.create(
    model="gpt-5.5-mini",
    messages=[
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "Name three uses for a paperclip."},
    ],
    temperature=0.7,
    max_tokens=300,
)

print(resp.choices[0].message.content)

import OpenAI from "openai";

const client = new OpenAI({
  apiKey: "llm_live_...",
  baseURL: "https://app.directinference.com/di/v1",
});

const resp = await client.chat.completions.create({
  model: "gpt-5.5-mini",
  messages: [
    { role: "system", content: "You are a concise assistant." },
    { role: "user", content: "Name three uses for a paperclip." },
  ],
  temperature: 0.7,
  max_tokens: 300,
});

console.log(resp.choices[0].message.content);

curl https://app.directinference.com/di/v1/chat/completions \
  -H "Authorization: Bearer llm_live_..." \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-5.5-mini",
    "messages": [
      { "role": "system", "content": "You are a concise assistant." },
      { "role": "user", "content": "Name three uses for a paperclip." }
    ],
    "temperature": 0.7,
    "max_tokens": 300
  }'

client := openai.NewClient(
  option.WithAPIKey("llm_live_..."),
  option.WithBaseURL("https://app.directinference.com/di/v1"),
)

resp, err := client.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{
  Model: "gpt-5.5-mini",
  Messages: []openai.ChatCompletionMessageParamUnion{
    openai.SystemMessage("You are a concise assistant."),
    openai.UserMessage("Name three uses for a paperclip."),
  },
  Temperature: openai.Float(0.7),
  MaxTokens:   openai.Int(300),
})
if err != nil {
  panic(err)
}

fmt.Println(resp.Choices[0].Message.Content)

Streaming

Set stream: true (stream=True in Python; in Go, call NewStreaming instead) to receive the reply token by token as Server-Sent Events. The stream terminates with the usual [DONE] sentinel.

stream = client.chat.completions.create(
    model="gpt-5.5-mini",
    messages=[{"role": "user", "content": "Stream a haiku about latency."}],
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)

const stream = await client.chat.completions.create({
  model: "gpt-5.5-mini",
  messages: [{ role: "user", content: "Stream a haiku about latency." }],
  stream: true,
});

for await (const chunk of stream) {
  process.stdout.write(chunk.choices[0]?.delta?.content ?? "");
}

curl https://app.directinference.com/di/v1/chat/completions \
  -H "Authorization: Bearer llm_live_..." \
  -H "Content-Type: application/json" \
  -N \
  -d '{
    "model": "gpt-5.5-mini",
    "messages": [{ "role": "user", "content": "Stream a haiku about latency." }],
    "stream": true
  }'

stream := client.Chat.Completions.NewStreaming(context.TODO(), openai.ChatCompletionNewParams{
  Model: "gpt-5.5-mini",
  Messages: []openai.ChatCompletionMessageParamUnion{
    openai.UserMessage("Stream a haiku about latency."),
  },
})

for stream.Next() {
  chunk := stream.Current()
  if len(chunk.Choices) > 0 {
    fmt.Print(chunk.Choices[0].Delta.Content)
  }
}
if err := stream.Err(); err != nil {
  panic(err)
}

Tools & function calling

Pass tools with JSON-Schema parameters; the response carries tool_calls, which you execute yourself and whose results you send back in a follow-up request. Tool-shaped requests map to the code request type.

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

resp = client.chat.completions.create(
    model="gpt-5.5-mini",
    messages=[{"role": "user", "content": "What is the weather in Paris?"}],
    tools=tools,
)

for call in resp.choices[0].message.tool_calls or []:
    print(call.function.name, call.function.arguments)

const tools = [{
  type: "function",
  function: {
    name: "get_weather",
    description: "Get the current weather for a city.",
    parameters: {
      type: "object",
      properties: { city: { type: "string" } },
      required: ["city"],
    },
  },
}] as const;

const resp = await client.chat.completions.create({
  model: "gpt-5.5-mini",
  messages: [{ role: "user", content: "What is the weather in Paris?" }],
  tools,
});

for (const call of resp.choices[0].message.tool_calls ?? []) {
  console.log(call.function.name, call.function.arguments);
}

curl https://app.directinference.com/di/v1/chat/completions \
  -H "Authorization: Bearer llm_live_..." \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-5.5-mini",
    "messages": [{ "role": "user", "content": "What is the weather in Paris?" }],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
          "type": "object",
          "properties": { "city": { "type": "string" } },
          "required": ["city"]
        }
      }
    }]
  }'

tools := []openai.ChatCompletionToolParam{{
  Function: openai.FunctionDefinitionParam{
    Name:        "get_weather",
    Description: openai.String("Get the current weather for a city."),
    Parameters: openai.FunctionParameters{
      "type": "object",
      "properties": map[string]any{
        "city": map[string]string{"type": "string"},
      },
      "required": []string{"city"},
    },
  },
}}

resp, err := client.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{
  Model:    "gpt-5.5-mini",
  Messages: []openai.ChatCompletionMessageParamUnion{
    openai.UserMessage("What is the weather in Paris?"),
  },
  Tools: tools,
})
if err != nil {
  panic(err)
}

for _, call := range resp.Choices[0].Message.ToolCalls {
  fmt.Println(call.Function.Name, call.Function.Arguments)
}

Vision

Send image content parts alongside text. Image input always uses the vision request type, regardless of the model id you send.

resp = client.chat.completions.create(
    model="gpt-5.5-mini",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}},
        ],
    }],
)

print(resp.choices[0].message.content)

const resp = await client.chat.completions.create({
  model: "gpt-5.5-mini",
  messages: [{
    role: "user",
    content: [
      { type: "text", text: "What is in this image?" },
      { type: "image_url", image_url: { url: "https://example.com/photo.jpg" } },
    ],
  }],
});

console.log(resp.choices[0].message.content);

curl https://app.directinference.com/di/v1/chat/completions \
  -H "Authorization: Bearer llm_live_..." \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-5.5-mini",
    "messages": [{
      "role": "user",
      "content": [
        { "type": "text", "text": "What is in this image?" },
        { "type": "image_url", "image_url": { "url": "https://example.com/photo.jpg" } }
      ]
    }]
  }'

Structured output

Use response_format with a JSON schema to constrain the reply. A response schema maps the call to the json request type.

resp = client.chat.completions.create(
    model="gpt-5.5-mini",
    messages=[{"role": "user", "content": "Extract the name and age from: Ada is 36."}],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "person",
            "schema": {
                "type": "object",
                "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
                "required": ["name", "age"],
            },
        },
    },
)

print(resp.choices[0].message.content)   # strict JSON

const resp = await client.chat.completions.create({
  model: "gpt-5.5-mini",
  messages: [{ role: "user", content: "Extract the name and age from: Ada is 36." }],
  response_format: {
    type: "json_schema",
    json_schema: {
      name: "person",
      schema: {
        type: "object",
        properties: { name: { type: "string" }, age: { type: "integer" } },
        required: ["name", "age"],
      },
    },
  },
});

console.log(resp.choices[0].message.content); // strict JSON

curl https://app.directinference.com/di/v1/chat/completions \
  -H "Authorization: Bearer llm_live_..." \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-5.5-mini",
    "messages": [{ "role": "user", "content": "Extract the name and age from: Ada is 36." }],
    "response_format": {
      "type": "json_schema",
      "json_schema": {
        "name": "person",
        "schema": {
          "type": "object",
          "properties": { "name": { "type": "string" }, "age": { "type": "integer" } },
          "required": ["name", "age"]
        }
      }
    }
  }'

Caching & response headers

Reuse a stable prompt prefix to cut cost and time-to-first-token: add a cache_control breakpoint to the cacheable content (see "Prompt caching"). Every response also reports the classified request type in the X-DI-Request-Type header (see "Response headers").