Quickstarts
These snippets show the public call shapes for query, chat, and embed.
query sends the exact prompt string and never applies a chat template. A
plain prompt is only for completion-style/base models; for decoder-only chat or
instruct GGUFs, render the model’s template yourself. Local query also supports encoder-decoder GGUF text models. chat sends role-tagged
messages. embed returns vectors and needs an embedding-capable local model
loaded with embedding mode enabled.
Local context naming differs only by language casing: browser and Node.js use
contextKey; Python and Rust use context_key.
See Examples And Demos for runnable end-to-end files.
Browser Local
npm install @sipp/sipp
import { SippClient, type ChatMessage } from '@sipp/sipp';
// Client used to register endpoints and issue the query/chat/embed calls below.
const client = new SippClient();
// Role-tagged messages for chat.
const messages: readonly ChatMessage[] = [
{ role: 'system', content: 'Answer concisely.' },
{ role: 'user', content: 'Explain local browser inference.' },
];
// Hand-rendered prompt for query, which sends the string exactly as given and
// never applies a chat template.
const queryPrompt = [
'<|system|>',
'Answer concisely.',
'<|user|>',
'Explain local browser inference.',
'<|assistant|>',
].join('\n');
// Register the local text model under 'text', using the WebGPU backend.
const textEndpoint = await client.add('text', {
kind: 'local',
source: '/models/chat.gguf',
options: { backend: 'webgpu', runtime: { context: { n_ctx: 2048 } } },
});
// query: raw prompt; replace markers with the target model's template.
// The call returns a handle; awaiting its `.response` yields the result.
const query = await client.query(queryPrompt, {
endpoint: textEndpoint,
maxTokens: 64,
contextKey: 'browser-query',
}).response;
// chat: role messages; local runtime uses tokenizer.chat_template.
const chat = await client.chat(messages, {
endpoint: textEndpoint,
maxTokens: 64,
contextKey: 'browser-chat',
}).response;
// Separate endpoint for embeddings: `embeddings: true` turns on the embedding
// mode that embed needs from a local model.
const embedEndpoint = await client.add('embed', {
kind: 'local',
source: '/models/embed.gguf',
options: {
backend: 'webgpu',
runtime: { context: { n_ctx: 2048, embeddings: true, pooling: 'mean' } },
},
});
// embed: vector output; local endpoint must be embedding-capable.
const embedding = await client.embed('Sipp embedding input.', {
endpoint: embedEndpoint,
contextKey: 'browser-embed',
normalize: true,
}).response;
// Print both generated texts and the length of the embedding vector.
console.log(query.text, chat.text, embedding.values.length);
// Close the client once all calls have completed.
await client.close();
Node.js Local
npm install @sipp/sipp-server
import { SippClient } from '@sipp/sipp-server';
// Client used to register endpoints and issue the query/chat/embed calls below.
const client = new SippClient();
// Role-tagged messages for chat.
const messages = [
{ role: 'system', content: 'Answer concisely.' },
{ role: 'user', content: 'Explain local Node.js inference.' },
];
// Hand-rendered prompt for query, which sends the string exactly as given and
// never applies a chat template.
const queryPrompt = [
'<|system|>',
'Answer concisely.',
'<|user|>',
'Explain local Node.js inference.',
'<|assistant|>',
].join('\n');
// Generation options shared by the query and chat requests.
const textOptions = { maxTokens: 64 };
// Model paths come from the first two CLI arguments, falling back to
// 'chat.gguf' and 'embed.gguf' when an argument is absent.
const textModel = process.argv[2] ?? 'chat.gguf';
const embedModel = process.argv[3] ?? 'embed.gguf';
// Register the local text model under 'text'.
const textEndpoint = await client.add('text', {
kind: 'local',
modelPath: textModel,
config: { context: { n_ctx: 2048 } },
});
// query: raw prompt; replace markers with the target model's template.
// Node.js calls take one request object; awaiting `.response` yields the result.
const query = await client.query({
endpoint: textEndpoint,
prompt: queryPrompt,
options: textOptions,
local: { contextKey: 'node-query' },
}).response;
// chat: role messages; local runtime uses tokenizer.chat_template.
const chat = await client.chat({
endpoint: textEndpoint,
messages,
options: textOptions,
local: { contextKey: 'node-chat' },
}).response;
// Separate endpoint for embeddings: `embeddings: true` turns on the embedding
// mode that embed needs from a local model.
const embedEndpoint = await client.add('embed', {
kind: 'local',
modelPath: embedModel,
config: { context: { n_ctx: 2048, embeddings: true, pooling: 'mean' } },
});
// embed: vector output; local endpoint must be embedding-capable.
const embedding = await client.embed({
endpoint: embedEndpoint,
input: 'Sipp embedding input.',
local: { contextKey: 'node-embed', normalize: true },
}).response;
// Print both generated texts and the length of the embedding vector.
// NOTE(review): the browser and gateway examples end with
// `await client.close()`; confirm whether this client should be closed too.
console.log(query.text, chat.text, embedding.values.length);
Local query also supports encoder-decoder GGUF text models, while many
encoder-decoder models cannot use chat because they do not declare
tokenizer.chat_template. Encoder-decoder text models do not produce
embeddings through this runtime.
Python Local
# The sipppy CUDA wheel is currently published via GitHub Releases; the full release matrix is in progress.
pip install sipppy
from sipp import (
ChatMessage,
SippClient,
SippTextOptions,
ContextRuntimeConfig,
LocalEmbedOptions,
LocalTextOptions,
LocalModelDescriptor,
NativeRuntimeConfig,
)
# Client used to register endpoints and issue the query/chat/embed calls below.
client = SippClient()
# Role-tagged messages for chat.
messages = [
ChatMessage("system", "Answer concisely."),
ChatMessage("user", "Explain local Python inference."),
]
# Hand-rendered prompt for query, which sends the string exactly as given and
# never applies a chat template.
query_prompt = "\n".join(
[
"<|system|>",
"Answer concisely.",
"<|user|>",
"Explain local Python inference.",
"<|assistant|>",
]
)
# Generation options shared by the query and chat calls.
text_options = SippTextOptions(max_tokens=64)
# Register the local text model under "text"; no runtime config is passed here.
text_endpoint = client.add("text", LocalModelDescriptor("chat.gguf"))
# query: raw prompt; replace markers with the target model's template.
# Each call returns a handle; .result() on it yields the response.
query = client.query(
query_prompt,
endpoint=text_endpoint,
options=text_options,
local=LocalTextOptions(context_key="python-query"),
).result()
# chat: role messages; local runtime uses tokenizer.chat_template.
chat = client.chat(
messages,
endpoint=text_endpoint,
options=text_options,
local=LocalTextOptions(context_key="python-chat"),
).result()
# Separate endpoint for embeddings: embeddings=True turns on the embedding
# mode that embed needs from a local model.
embed_endpoint = client.add(
"embed",
LocalModelDescriptor(
"embed.gguf",
NativeRuntimeConfig(
context=ContextRuntimeConfig(
n_ctx=2048,
embeddings=True,
pooling="mean",
),
),
),
)
# embed: vector output; local endpoint must be embedding-capable.
embedding = client.embed(
"Sipp embedding input.",
endpoint=embed_endpoint,
local=LocalEmbedOptions(context_key="python-embed", normalize=True),
).result()
# Results are read with key access; print both texts and the vector length.
print(query["text"], chat["text"], len(embedding["values"]))
Rust Local
cargo add sipp-rs
#![allow(unused)]
fn main() {
// NOTE(review): the statements below use `.await?`, which needs an async
// context that returns a Result; this plain `fn main()` wrapper looks like a
// documentation-tool artifact — confirm how the snippet is meant to be run.
use sipp::engine::{
ChatMessage, ChatRole, ContextRuntimeConfig, NativeRuntimeConfig, PoolingType,
};
use sipp::{
SippChatRequest, SippClient, SippEmbedRequest, SippQueryRequest,
SippTextOptions, EndpointDescriptor, LocalEmbedOptions, LocalTextOptions,
};
// Client used to register endpoints and issue the query/chat/embed calls below.
let mut client = SippClient::new();
// Role-tagged messages for chat.
let messages = vec![
ChatMessage::new(ChatRole::System, "Answer concisely."),
ChatMessage::new(ChatRole::User, "Explain local Rust inference."),
];
// Hand-rendered prompt for query, which sends the string exactly as given and
// never applies a chat template.
let query_prompt = [
"<|system|>",
"Answer concisely.",
"<|user|>",
"Explain local Rust inference.",
"<|assistant|>",
]
.join("\n");
// Generation options shared by the query and chat requests.
let text_options = SippTextOptions {
max_tokens: Some(64),
..Default::default()
};
// Register the local text model under "text" with the default runtime config.
let text_endpoint = client
.add("text", EndpointDescriptor::local("chat.gguf", Default::default()))
.await?;
// query: raw prompt; replace markers with the target model's template.
// The endpoint and options are cloned because the chat request reuses them.
let query = client
.query(SippQueryRequest {
endpoint: Some(text_endpoint.clone()),
prompt: query_prompt,
options: text_options.clone(),
local: LocalTextOptions {
context_key: Some("rust-query".to_string()),
..Default::default()
},
..Default::default()
})
.await?;
// chat: role messages; local runtime uses tokenizer.chat_template.
let chat = client
.chat(SippChatRequest {
endpoint: Some(text_endpoint),
messages,
options: text_options,
local: LocalTextOptions {
context_key: Some("rust-chat".to_string()),
..Default::default()
},
..Default::default()
})
.await?;
// Separate endpoint for embeddings, configured by embed_config() below.
let embed_endpoint = client
.add("embed", EndpointDescriptor::local("embed.gguf", embed_config()))
.await?;
// embed: vector output; local endpoint must be embedding-capable.
let embedding = client
.embed(SippEmbedRequest {
endpoint: Some(embed_endpoint),
input: "Sipp embedding input.".to_string(),
local: LocalEmbedOptions {
context_key: Some("rust-embed".to_string()),
normalize: Some(true),
},
..Default::default()
})
.await?;
// Print both generated texts and the length of the embedding vector.
println!("{}, {}, {}", query.text, chat.text, embedding.values.len());
// Builds the runtime config for the embedding endpoint: `embeddings` turns on
// the embedding mode that embed needs, with mean pooling and n_ctx of 2048;
// every other field keeps its default.
fn embed_config() -> NativeRuntimeConfig {
NativeRuntimeConfig {
context: ContextRuntimeConfig {
n_ctx: Some(2048),
embeddings: Some(true),
pooling: Some(PoolingType::Mean),
..Default::default()
},
..Default::default()
}
}
}
Gateway
Gateway clients keep model paths, provider credentials, target policy, and metrics in the gateway process. The example uses the browser package shape; Node.js uses the same request-object shape shown above.
import { SippClient, type ChatMessage } from '@sipp/sipp';
// Client used to register the gateway endpoint and issue the calls below.
const client = new SippClient();
// One gateway endpoint serves query, chat and embed in this example.
// getGatewayToken() is not defined in this snippet; supply your own function
// that resolves to the bearer token.
const endpoint = await client.add('gateway', {
kind: 'gateway',
target: 'local',
baseUrl: 'https://gateway.example.com',
authentication: { kind: 'bearer', value: await getGatewayToken() },
});
// Role-tagged messages for chat.
const messages: readonly ChatMessage[] = [
{ role: 'system', content: 'Answer concisely.' },
{ role: 'user', content: 'Explain gateway inference.' },
];
// Hand-rendered prompt for query; the gateway forwards it unchanged.
const queryPrompt = [
'<|system|>',
'Answer concisely.',
'<|user|>',
'Explain gateway inference.',
'<|assistant|>',
].join('\n');
// query: gateway forwards the raw prompt to the selected target.
// The call returns a handle; awaiting its `.response` yields the result.
const query = await client.query(queryPrompt, {
endpoint,
maxTokens: 64,
}).response;
// chat: gateway maps role messages for the selected provider/local target.
const chat = await client.chat(messages, { endpoint, maxTokens: 64 }).response;
// embed: target must support embeddings.
const embedding = await client.embed('Sipp embedding input.', {
endpoint,
}).response;
// Print both generated texts and the length of the embedding vector.
console.log(query.text, chat.text, embedding.values.length);
// Close the client once all calls have completed.
await client.close();
Gateway query preserves the raw prompt, so it is the gateway path for custom
templates or local encoder-decoder targets. Gateway embed requires the target
to support embeddings.
Direct Provider
Use direct provider endpoints only in trusted server code (e.g., a self-hosted service). Provider support is
model-specific: query needs a completion-compatible provider or model,
chat needs a chat model, and embed needs an embedding model.
import { SippClient } from '@sipp/sipp-server';
/**
 * Reads a required environment variable.
 *
 * @param name - Name of the variable to look up in `process.env`.
 * @returns The variable's value, guaranteed to be a non-empty string.
 * @throws Error with the message `<name> is required` when the variable is
 *   unset or set to the empty string.
 */
function env(name: string): string {
  const raw = process.env[name];
  // Accept only a value that is both present and non-empty.
  if (raw != null && raw !== '') {
    return raw;
  }
  throw new Error(`${name} is required`);
}
// Client used to register the provider endpoints and issue the calls below.
const client = new SippClient();
// Role-tagged messages for chat.
const chatMessages = [
{ role: 'system', content: 'Answer concisely.' },
{ role: 'user', content: 'Explain provider inference.' },
];
// One endpoint per capability, since provider support is model-specific.
// Model names, base URL and API keys are all read from required environment
// variables via env().
const completionEndpoint = await client.add('completion', {
kind: 'provider',
provider: 'openai_compatible',
model: env('COMPLETION_MODEL'),
baseUrl: env('COMPLETION_BASE_URL'),
apiKey: env('COMPLETION_API_KEY'),
});
const chatEndpoint = await client.add('chat', {
kind: 'provider',
provider: 'openai',
model: env('OPENAI_CHAT_MODEL'),
apiKey: env('OPENAI_API_KEY'),
});
const embedEndpoint = await client.add('embed', {
kind: 'provider',
provider: 'openai',
model: env('OPENAI_EMBED_MODEL'),
apiKey: env('OPENAI_API_KEY'),
});
// query: raw completion prompt for a completion-compatible provider.
// The call returns a handle; awaiting its `.response` yields the result.
const query = await client.query({
endpoint: completionEndpoint,
prompt: 'Write one provider inference sentence.',
options: { maxTokens: 64 },
}).response;
// chat: provider-native role messages.
const chat = await client.chat({
endpoint: chatEndpoint,
messages: chatMessages,
options: { maxTokens: 64 },
}).response;
// embed: provider-native embedding model.
const embedding = await client.embed({
endpoint: embedEndpoint,
input: 'Sipp embedding input.',
}).response;
// Print both generated texts and the length of the embedding vector.
console.log(query.text, chat.text, embedding.values.length);
Runtime Tuning
Local endpoint tuning, browser WebGPU options, worker/threading choices, generation options, and provider/gateway option buckets are documented in Runtime Options.
Building and Running from Source Code
Runnable source examples and demos live in the maintainer lane: Source Builds.