37
"github.com/jmorganca/ollama/api"
40
// dynExtServer is the receiver type for the Predict/Encode/Decode/Embedding/Close
// methods in this file; each of them forwards to a C.dyn_llama_server_* call.
// NOTE(review): only the `s` field is visible in this excerpt — the struct may
// declare further fields; confirm against the full file.
type dynExtServer struct {
41
// s is the cgo server handle passed as the first argument to every
// C.dyn_llama_server_* call made by the methods on this type.
s C.struct_dynamic_llama_server
48
// newExtServerResp prepares a C.ext_server_resp_t whose msg field points at a
// C-allocated buffer of the requested size, for the C side to write a message into.
//
// The buffer is created by copying a zero-filled Go slice with C.CBytes, so it
// lives in C memory and must be released explicitly; callers in this file do
// that with `defer freeExtServerResp(resp)`.
//
// NOTE(review): the parameter name `len` shadows the builtin inside this function.
// NOTE(review): the remainder of the function (any msg_len assignment and the
// return) is not visible in this excerpt.
func newExtServerResp(len C.size_t) C.ext_server_resp_t {
49
var resp C.ext_server_resp_t
51
// Zero-filled Go scratch slice; only used as the source for the C copy below.
bytes := make([]byte, len)
52
resp.msg = (*C.char)(C.CBytes(bytes))
56
// freeExtServerResp releases the message buffer held by resp using C.free.
//
// The msg_len == 0 test guards the free; the body of that branch is not visible
// in this excerpt — presumably an early return for responses that never had a
// buffer allocated; confirm against the full file.
func freeExtServerResp(resp C.ext_server_resp_t) {
57
if resp.msg_len == 0 {
60
C.free(unsafe.Pointer(resp.msg))
63
// extServerResponseToErr turns the message carried in resp into a Go error.
//
// NOTE(review): the C string is passed to fmt.Errorf as the *format string*, so
// any '%' in the server's message is interpreted as a formatting verb and the
// resulting text is garbled (go vet also flags non-constant format strings).
// fmt.Errorf("%s", C.GoString(resp.msg)) would preserve the message verbatim.
func extServerResponseToErr(resp C.ext_server_resp_t) error {
64
return fmt.Errorf(C.GoString(resp.msg))
70
// newDynExtServer loads the llama server shared library found at `library` via
// C.dyn_init, fills a C.ext_server_params_t from model, adapters, projectors and
// opts, then initializes and starts the server through the dyn_llama_server_*
// entry points.
//
// Parameters:
//   - library:    path of the shared library to load; its directory is handed to updatePath.
//   - model:      model path, copied into sparams.model.
//   - adapters:   LoRA adapter paths; one C list node is allocated per entry.
//   - projectors: multimodal projector paths; only projectors[0] is used.
//   - opts:       runtime options mapped onto the corresponding sparams fields.
//
// Returns an LLM and an error; the visible failure path returns a nil LLM with
// an "Unable to load dynamic library" error built from the C response message.
//
// NOTE(review): this excerpt omits many lines of the function (branch
// conditions, the declaration of `llm`, the success return). Comments below
// describe only what the visible lines establish.
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
72
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
75
// Hand the library's directory to updatePath (which only acts when GOOS is "windows").
updatePath(filepath.Dir(library))
76
// C copy of the library path; freed when this function returns.
libPath := C.CString(library)
77
defer C.free(unsafe.Pointer(libPath))
78
resp := newExtServerResp(512)
79
defer freeExtServerResp(resp)
80
var srv C.struct_dynamic_llama_server
81
C.dyn_init(libPath, &srv, &resp)
84
// The condition guarding this failure return is not visible in this excerpt.
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
90
slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
92
var sparams C.ext_server_params_t
93
// C.CString allocates with malloc; the deferred free releases it on return.
sparams.model = C.CString(model)
94
defer C.free(unsafe.Pointer(sparams.model))
96
sparams.embedding = true
97
sparams.n_ctx = C.uint(opts.NumCtx)
98
sparams.n_batch = C.uint(opts.NumBatch)
99
sparams.n_gpu_layers = C.int(opts.NumGPU)
100
sparams.main_gpu = C.int(opts.MainGPU)
101
sparams.n_parallel = 1
104
sparams.rope_freq_base = 0.0
105
sparams.rope_freq_scale = 0.0
106
sparams.memory_f16 = C.bool(opts.F16KV)
107
sparams.use_mlock = C.bool(opts.UseMLock)
108
sparams.use_mmap = C.bool(opts.UseMMap)
111
// numa is set to 1 or 0; the condition choosing between the two assignments
// is not visible in this excerpt.
sparams.numa = C.int(1)
113
sparams.numa = C.int(0)
116
// Build the LoRA adapter list: one malloc'd node per adapter path.
sparams.lora_adapters = nil
117
for i := 0; i < len(adapters); i++ {
118
la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
119
// NOTE(review): defers inside this loop run when newDynExtServer returns, not
// at the end of each iteration, so every node and adapter string stays
// allocated until then.
defer C.free(unsafe.Pointer(la))
120
la.adapter = C.CString(adapters[i])
121
defer C.free(unsafe.Pointer(la.adapter))
122
la.scale = C.float(1.0)
125
// Head-of-list assignment; the surrounding condition is not visible here.
sparams.lora_adapters = la
127
// Walk to the last node (the one whose next is nil) — presumably to append
// la there; the loop body is not visible in this excerpt.
tmp := sparams.lora_adapters
128
for ; tmp.next != nil; tmp = tmp.next {
134
if len(projectors) > 0 {
136
// Only the first projector is passed to the server.
sparams.mmproj = C.CString(projectors[0])
137
defer C.free(unsafe.Pointer(sparams.mmproj))
142
sparams.n_threads = C.uint(opts.NumThread)
144
// Any non-empty OLLAMA_DEBUG value enables verbose logging on the C side.
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
145
sparams.verbose_logging = C.bool(true)
147
sparams.verbose_logging = C.bool(false)
150
slog.Info("Initializing llama server")
151
initResp := newExtServerResp(128)
152
defer freeExtServerResp(initResp)
153
// NOTE(review): `llm` is not declared in any line visible in this excerpt.
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
156
// The check that selects this error path, and what is returned afterwards,
// are not visible here; the error is logged at debug level.
err := extServerResponseToErr(initResp)
157
slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
161
slog.Info("Starting llama main loop")
162
C.dyn_llama_server_start(llm.s)
166
// Predict encodes a completion request as JSON, submits it with
// C.dyn_llama_server_completion, and then pulls results with
// C.dyn_llama_server_completion_next_result, decoding each one from JSON.
//
// The request carries the prompt, the sampling options from predict.Options,
// any images, and — when predict.Format is "json" — the jsonGrammar grammar.
// Submission is attempted up to maxRetries times; if that bound is hit the
// function returns a "max retries exceeded" error.
//
// NOTE(review): how ctx and fn are used is not visible in this excerpt
// (presumably ctx cancellation triggers the completion_cancel call and fn
// receives each PredictResult — confirm against the full file).
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
167
// Response struct reused for the completion and cancel calls below.
resp := newExtServerResp(128)
168
defer freeExtServerResp(resp)
170
if len(predict.Images) > 0 {
171
slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
174
// JSON payload for the server; keys are the server-side parameter names.
request := map[string]any{
175
"prompt": predict.Prompt,
177
"n_predict": predict.Options.NumPredict,
178
"n_keep": predict.Options.NumKeep,
179
"temperature": predict.Options.Temperature,
180
"top_k": predict.Options.TopK,
181
"top_p": predict.Options.TopP,
182
"tfs_z": predict.Options.TFSZ,
183
"typical_p": predict.Options.TypicalP,
184
"repeat_last_n": predict.Options.RepeatLastN,
185
"repeat_penalty": predict.Options.RepeatPenalty,
186
"presence_penalty": predict.Options.PresencePenalty,
187
"frequency_penalty": predict.Options.FrequencyPenalty,
188
"mirostat": predict.Options.Mirostat,
189
"mirostat_tau": predict.Options.MirostatTau,
190
"mirostat_eta": predict.Options.MirostatEta,
191
"penalize_nl": predict.Options.PenalizeNewline,
192
"seed": predict.Options.Seed,
193
"stop": predict.Options.Stop,
194
"image_data": predict.Images,
195
"cache_prompt": true,
198
if predict.Format == "json" {
199
request["grammar"] = jsonGrammar
202
// Initial delay between attempts is 100 microseconds; whether it grows per
// retry is not visible in this excerpt.
retryDelay := 100 * time.Microsecond
203
for retries := 0; retries < maxRetries; retries++ {
205
// The condition guarding this sleep is not visible here.
time.Sleep(retryDelay)
210
// Encode with HTML escaping disabled so '<', '>' and '&' in the prompt are
// sent to the server unescaped.
buffer := &bytes.Buffer{}
211
enc := json.NewEncoder(buffer)
212
enc.SetEscapeHTML(false)
214
if err := enc.Encode(request); err != nil {
215
return fmt.Errorf("failed to marshal data: %w", err)
218
req := C.CString(buffer.String())
219
// NOTE(review): this defer appears to sit inside the retry loop, in which
// case each attempt's request string is only freed when Predict returns.
defer C.free(unsafe.Pointer(req))
221
C.dyn_llama_server_completion(llm.s, req, &resp)
223
// The check on resp that selects this error return is not visible here.
return extServerResponseToErr(resp)
232
// Cancel the in-flight task identified by resp.id; the trigger for this path
// is not visible in this excerpt.
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
234
return extServerResponseToErr(resp)
239
var result C.ext_server_task_result_t
240
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
241
// Copy the JSON into a Go string before handing the result back to the C
// side, so json_resp stays usable after the release call.
json_resp := C.GoString(result.json_resp)
242
C.dyn_llama_server_release_task_result(llm.s, &result)
245
// `p` is declared on a line not visible in this excerpt.
if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
246
// On a decode failure the task is cancelled; two error variants follow, one
// that also reports the cancel response message and one that does not. The
// condition choosing between them is not visible here.
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
248
return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
250
return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
254
// An error result whose JSON mentions "slot unavailable" is singled out —
// presumably to retry the request; the branch body is not visible here.
if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
266
// Either the decoded payload or the C result flag can mark the final chunk.
if p.Stop || bool(result.stop) {
269
PromptEvalCount: p.Timings.PromptN,
270
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
271
EvalCount: p.Timings.PredictedN,
272
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
284
return fmt.Errorf("max retries exceeded")
287
// Encode tokenizes prompt by sending a JSON TokenizeRequest to
// C.dyn_llama_server_tokenize and decoding the JSON reply into a
// TokenizeResponse, whose Tokens field is returned.
//
// Errors are returned for a failed request marshal, for a server-reported
// failure (via extServerResponseToErr), and for an undecodable reply.
//
// NOTE(review): the conditions guarding the first two error returns are not
// visible in this excerpt.
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
288
data, err := json.Marshal(TokenizeRequest{Content: prompt})
290
return nil, fmt.Errorf("marshaling encode data: %w", err)
292
// C copy of the request JSON; freed on return.
req := C.CString(string(data))
293
defer C.free(unsafe.Pointer(req))
294
// Set by the C call; released through dyn_llama_server_release_json_resp below.
var json_resp *C.char
295
resp := newExtServerResp(128)
296
defer freeExtServerResp(resp)
297
C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
299
return nil, extServerResponseToErr(resp)
301
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
303
var encoded TokenizeResponse
304
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
305
return nil, fmt.Errorf("unmarshal encode response: %w", err2)
308
// NOTE(review): `err` here is still the json.Marshal error from the top of the
// function — presumably always nil at this point; returning a literal nil
// would make that explicit.
return encoded.Tokens, err
311
// Decode converts tokens back to text by sending a JSON DetokenizeRequest to
// C.dyn_llama_server_detokenize and decoding the JSON reply into a
// DetokenizeResponse, whose Content field is returned.
//
// An empty token slice is special-cased up front; the body of that branch is
// not visible in this excerpt (presumably an early return — confirm).
//
// NOTE(review): the conditions guarding the marshal-error and server-error
// returns are not visible in this excerpt.
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
312
if len(tokens) == 0 {
315
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
317
return "", fmt.Errorf("marshaling decode data: %w", err)
320
// C copy of the request JSON; freed on return.
req := C.CString(string(data))
321
defer C.free(unsafe.Pointer(req))
322
// Set by the C call; released through dyn_llama_server_release_json_resp below.
var json_resp *C.char
323
resp := newExtServerResp(128)
324
defer freeExtServerResp(resp)
325
C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
327
return "", extServerResponseToErr(resp)
329
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
331
var decoded DetokenizeResponse
332
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
333
// NOTE(review): the message says "encode" although this is the decode path —
// looks copied from Encode.
return "", fmt.Errorf("unmarshal encode response: %w", err2)
336
// NOTE(review): `err` is the json.Marshal error from above — presumably always
// nil here; a literal nil would be clearer.
return decoded.Content, err
339
// Embedding requests an embedding for input from C.dyn_llama_server_embedding
// and returns the Embedding field of the decoded EmbeddingResponse.
//
// The request payload reuses the TokenizeRequest type (a single Content field).
//
// NOTE(review): the conditions guarding the marshal-error and server-error
// returns are not visible in this excerpt.
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
340
data, err := json.Marshal(TokenizeRequest{Content: input})
342
return nil, fmt.Errorf("error marshaling embed data: %w", err)
345
// C copy of the request JSON; freed on return.
req := C.CString(string(data))
346
defer C.free(unsafe.Pointer(req))
347
// Set by the C call; released through dyn_llama_server_release_json_resp below.
var json_resp *C.char
348
resp := newExtServerResp(128)
349
defer freeExtServerResp(resp)
350
C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
352
return nil, extServerResponseToErr(resp)
354
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
356
var embedding EmbeddingResponse
357
if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
358
// NOTE(review): the message says "tokenize" although this decodes an
// embedding response.
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
361
return embedding.Embedding, nil
364
// Close stops the server by calling C.dyn_llama_server_stop on the handle.
// NOTE(review): any further cleanup after the stop call is not visible in this
// excerpt.
func (llm *dynExtServer) Close() {
365
C.dyn_llama_server_stop(llm.s)
369
// updatePath rewrites the process PATH so that dir comes first. It only acts
// when runtime.GOOS is "windows".
//
// PATH is split on ';', the existing entries are scanned, and the new value is
// dir followed by the retained entries, joined with ';' and stored with
// os.Setenv.
//
// NOTE(review): several lines are missing from this excerpt — what happens when
// an entry already equals dir, the declaration of index `i`, and any trimming
// of pathComponents before the join are not visible.
func updatePath(dir string) {
370
if runtime.GOOS == "windows" {
371
// Parent directory of dir, used below as a case-insensitive prefix filter.
tmpDir := filepath.Dir(dir)
372
pathComponents := strings.Split(os.Getenv("PATH"), ";")
374
for _, comp := range pathComponents {
375
// Case-insensitive match of an existing entry against dir; the branch body
// is not visible here.
if strings.EqualFold(comp, dir) {
379
// Entries that do NOT start with tmpDir (case-insensitively) are kept by
// writing them back into the slice — presumably compacting it in place and
// dropping entries under tmpDir; confirm against the full file.
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
380
pathComponents[i] = comp
384
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
385
slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
386
// NOTE(review): the error returned by os.Setenv is ignored.
os.Setenv("PATH", newPath)