// llama training example (ggml) — source file: 1639 lines, 61.1 KB
1#include "ggml.h"
2#include "train.h"
3
4#include <cassert>
5#include <cstdlib>
6#include <cstring>
7#include <random>
8#include <vector>
9
10#if defined(_MSC_VER)
11#pragma warning(disable: 4244 4267) // possible loss of data
12#endif
13
14#ifdef LLAMA_DEFAULT_RMS_EPS
15constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
16#else
17constexpr float rms_norm_eps = 5e-6f;
18#endif
19
20static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
21struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
22
23if (plan.work_size > 0) {
24buf.resize(plan.work_size);
25plan.work_data = buf.data();
26}
27
28ggml_graph_compute(graph, &plan);
29}
30
31static struct ggml_tensor * randomize_tensor(
32struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
33) {
34switch (ndims) {
35case 1:
36for (int i0 = 0; i0 < ne[0]; i0++) {
37((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin;
38}
39break;
40case 2:
41for (int i1 = 0; i1 < ne[1]; i1++) {
42for (int i0 = 0; i0 < ne[0]; i0++) {
43((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
44}
45}
46break;
47case 3:
48for (int i2 = 0; i2 < ne[2]; i2++) {
49for (int i1 = 0; i1 < ne[1]; i1++) {
50for (int i0 = 0; i0 < ne[0]; i0++) {
51((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
52}
53}
54}
55break;
56case 4:
57for (int i3 = 0; i3 < ne[3]; i3++) {
58for (int i2 = 0; i2 < ne[2]; i2++) {
59for (int i1 = 0; i1 < ne[1]; i1++) {
60for (int i0 = 0; i0 < ne[0]; i0++) {
61((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
62}
63}
64}
65}
66break;
67default:
68assert(false);
69}
70
71return tensor;
72}
73
// Model hyper-parameters (defaults match LLaMA-7B).
struct llama_hparams {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512; // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;

    bool operator!=(const llama_hparams & other) const {
        // all members are uint32_t, so the struct has no padding and a raw
        // byte compare is a valid equality test; compare against 0 explicitly
        // (was an implicit int->bool conversion, inconsistent with
        // llama_hparams_lora::operator!=)
        return memcmp(this, &other, sizeof(llama_hparams)) != 0;
    }
};
87
88static uint32_t get_n_ff(const struct llama_hparams* hparams) {
89const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
90return n_ff;
91}
92
// Hyper-parameters for the LoRA variant of the model; mirrors llama_hparams
// plus the low-rank dimension n_lora used by the factored weights.
struct llama_hparams_lora {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512; // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;
    uint32_t n_lora  = 64;  // inner dimension of the two-tensor (a*b) factorization

    bool operator!=(const llama_hparams_lora & other) const {
        // byte-wise compare is valid: all members are uint32_t, no padding
        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
    }
};
107
// Weights of one full-rank transformer layer. All tensors are created in
// init_model and live in the model's ggml context.
struct llama_layer {
    // normalization (applied before attention)
    struct ggml_tensor * attention_norm;

    // attention projections: query, key, value, output
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization (applied before the feed-forward block)
    struct ggml_tensor * ffn_norm;

    // feed-forward: w1/w3 project up to n_ff, w2 projects back to n_embd
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};
126
// Weights of one LoRA transformer layer: each attention projection is split
// into two low-rank factors (*a and *b); the feed-forward stays full-rank.
struct llama_layer_lora {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention: low-rank factor pairs for q/k/v/o
    // (*a is [n_lora, n_embd], *b is [n_embd, n_lora] — see init_model_lora)
    struct ggml_tensor * wqa;
    struct ggml_tensor * wqb;
    struct ggml_tensor * wka;
    struct ggml_tensor * wkb;
    struct ggml_tensor * wva;
    struct ggml_tensor * wvb;
    struct ggml_tensor * woa;
    struct ggml_tensor * wob;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};
149
150
// Key/value cache. k and v are flat f32 tensors covering all layers
// (and, in the batched path, all batch rows); they live in their own
// ggml context allocated by init_kv_cache / init_kv_cache_lora.
struct llama_kv_cache {
    struct ggml_context * ctx = NULL;

    struct ggml_tensor * k;
    struct ggml_tensor * v;

    // llama_ctx_buffer buf;

    int n; // number of tokens currently in the cache
};
161
// Full-rank model: token embeddings, final norm, output projection and
// n_layer transformer layers, all allocated in `ctx` by init_model.
struct llama_model {
    struct ggml_context * ctx = NULL;

    llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;    // final normalization weight
    struct ggml_tensor * output;  // lm_head projection [n_embd, n_vocab]

    std::vector<llama_layer> layers;
};
174
// LoRA model: like llama_model but the output projection is factored into
// outputa [n_lora, n_vocab] and outputb [n_embd, n_lora] (see init_model_lora).
struct llama_model_lora {
    struct ggml_context * ctx = NULL;

    llama_hparams_lora hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * outputa;
    struct ggml_tensor * outputb;

    std::vector<llama_layer_lora> layers;
};
188
189static void init_model(struct llama_model * model) {
190const auto & hparams = model->hparams;
191
192const uint32_t n_embd = hparams.n_embd;
193const uint32_t n_layer = hparams.n_layer;
194const uint32_t n_vocab = hparams.n_vocab;
195
196const uint32_t n_ff = get_n_ff(&hparams);
197
198struct ggml_context * ctx = model->ctx;
199
200model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
201model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd});
202model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab});
203
204model->layers.resize(n_layer);
205for (uint32_t i = 0; i < n_layer; ++i) {
206auto & layer = model->layers[i];
207
208// std::string layers_i = "layers." + std::to_string(i);
209
210layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
211
212layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
213layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
214layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
215layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
216
217layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd});
218
219layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
220layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
221layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
222}
223}
224
225
226static void init_model_lora(struct llama_model_lora * model) {
227const auto & hparams = model->hparams;
228
229const uint32_t n_embd = hparams.n_embd;
230const uint32_t n_mult = hparams.n_mult;
231const uint32_t n_layer = hparams.n_layer;
232const uint32_t n_vocab = hparams.n_vocab;
233const uint32_t n_lora = hparams.n_lora;
234
235const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
236
237struct ggml_context * ctx = model->ctx;
238
239model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
240model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd});
241model->outputa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight", {n_embd, n_vocab});
242model->outputb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // ("output.weight", {n_embd, n_vocab});
243
244model->layers.resize(n_layer);
245for (uint32_t i = 0; i < n_layer; ++i) {
246auto & layer = model->layers[i];
247
248// std::string layers_i = "layers." + std::to_string(i);
249
250layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
251
252layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
253layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
254layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
255layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
256layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
257layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
258layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
259layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
260
261layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd});
262
263layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
264layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
265layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
266}
267}
268
269static void set_param_model(struct llama_model * model) {
270const auto& hparams = model->hparams;
271
272const uint32_t n_layer = hparams.n_layer;
273
274struct ggml_context* ctx = model->ctx;
275
276ggml_set_param(ctx, model->tok_embeddings);
277ggml_set_param(ctx, model->norm);
278ggml_set_param(ctx, model->output);
279
280for (uint32_t i = 0; i < n_layer; ++i) {
281auto & layer = model->layers[i];
282
283ggml_set_param(ctx, layer.attention_norm);
284ggml_set_param(ctx, layer.wq);
285ggml_set_param(ctx, layer.wk);
286ggml_set_param(ctx, layer.wv);
287ggml_set_param(ctx, layer.wo);
288ggml_set_param(ctx, layer.ffn_norm);
289ggml_set_param(ctx, layer.w1);
290ggml_set_param(ctx, layer.w2);
291ggml_set_param(ctx, layer.w3);
292}
293}
294
// Register every weight tensor of the LoRA model as a trainable parameter
// in its ggml context (both low-rank factors of each projection are trained).
static void set_param_model_lora(struct llama_model_lora * model) {
    const auto& hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct ggml_context* ctx = model->ctx;

    ggml_set_param(ctx, model->tok_embeddings);
    ggml_set_param(ctx, model->norm);
    ggml_set_param(ctx, model->outputa);
    ggml_set_param(ctx, model->outputb);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_set_param(ctx, layer.attention_norm);
        ggml_set_param(ctx, layer.wqa);
        ggml_set_param(ctx, layer.wqb);
        ggml_set_param(ctx, layer.wka);
        ggml_set_param(ctx, layer.wkb);
        ggml_set_param(ctx, layer.wva);
        ggml_set_param(ctx, layer.wvb);
        ggml_set_param(ctx, layer.woa);
        ggml_set_param(ctx, layer.wob);
        ggml_set_param(ctx, layer.ffn_norm);
        ggml_set_param(ctx, layer.w1);
        ggml_set_param(ctx, layer.w2);
        ggml_set_param(ctx, layer.w3);
    }
}
325
326static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
327const auto & hparams = model->hparams;
328
329const uint32_t n_layer = hparams.n_layer;
330
331struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
332
333randomize_tensor_normal(model->tok_embeddings , rnd);
334randomize_tensor_normal(model->norm , rnd);
335randomize_tensor_normal(model->output , rnd);
336
337for (uint32_t i = 0; i < n_layer; ++i) {
338auto & layer = model->layers[i];
339randomize_tensor_normal(layer.attention_norm, rnd);
340
341randomize_tensor_normal(layer.wq, rnd);
342randomize_tensor_normal(layer.wk, rnd);
343randomize_tensor_normal(layer.wv, rnd);
344randomize_tensor_normal(layer.wo, rnd);
345
346randomize_tensor_normal(layer.ffn_norm, rnd);
347
348randomize_tensor_normal(layer.w1, rnd);
349randomize_tensor_normal(layer.w2, rnd);
350randomize_tensor_normal(layer.w3, rnd);
351}
352
353free_random_normal_distribution(rnd);
354}
355
356
// Initialize all LoRA model weights from a normal distribution N(mean, std)
// (presumably clipped to [min, max] — semantics live in train.h helpers).
// Fixed tensor order keeps results reproducible for a given seed.
static void randomize_model_lora(
    struct llama_model_lora * model, int seed, float mean, float std, float min, float max
) {
    const auto & hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);

    randomize_tensor_normal(model->tok_embeddings, rnd);
    randomize_tensor_normal(model->norm   , rnd);
    randomize_tensor_normal(model->outputa, rnd);
    randomize_tensor_normal(model->outputb, rnd);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];
        randomize_tensor_normal(layer.attention_norm, rnd);

        randomize_tensor_normal(layer.wqa, rnd);
        randomize_tensor_normal(layer.wqb, rnd);
        randomize_tensor_normal(layer.wka, rnd);
        randomize_tensor_normal(layer.wkb, rnd);
        randomize_tensor_normal(layer.wva, rnd);
        randomize_tensor_normal(layer.wvb, rnd);
        randomize_tensor_normal(layer.woa, rnd);
        randomize_tensor_normal(layer.wob, rnd);

        randomize_tensor_normal(layer.ffn_norm, rnd);

        randomize_tensor_normal(layer.w1, rnd);
        randomize_tensor_normal(layer.w2, rnd);
        randomize_tensor_normal(layer.w3, rnd);
    }

    free_random_normal_distribution(rnd);
}
393
394static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
395const auto & hparams = model->hparams;
396
397const uint32_t n_ctx = hparams.n_ctx;
398const uint32_t n_embd = hparams.n_embd;
399const uint32_t n_layer = hparams.n_layer;
400
401const int64_t n_mem = n_layer*n_ctx*n_batch;
402const int64_t n_elements = n_embd*n_mem;
403
404// cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
405
406// struct ggml_init_params params;
407// params.mem_size = cache.buf.size;
408// params.mem_buffer = cache.buf.addr;
409// params.no_alloc = false;
410if (!cache->ctx) {
411struct ggml_init_params params;
412params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
413params.mem_buffer = NULL;
414params.no_alloc = false;
415
416cache->ctx = ggml_init(params);
417
418if (!cache->ctx) {
419fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
420exit(1);
421}
422}
423
424cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
425cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
426}
427
// Create the kv-cache context on first use and allocate the flat k/v tensors
// for the LoRA model (n_layer*n_ctx*n_batch positions of n_embd f32 each).
// Returns false on allocation failure instead of exiting (contrast init_kv_cache).
static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
    const auto & hparams = model->hparams;

    const uint32_t n_ctx   = hparams.n_ctx;
    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;

    const int64_t n_mem      = n_layer*n_ctx*n_batch;
    const int64_t n_elements = n_embd*n_mem;

    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

    // struct ggml_init_params params;
    // params.mem_size   = cache.buf.size;
    // params.mem_buffer = cache.buf.addr;
    // params.no_alloc   = false;
    if (!cache->ctx) {
        struct ggml_init_params params;
        // room for both k and v plus 2 MB of ggml bookkeeping overhead
        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
        params.mem_buffer = NULL;
        params.no_alloc   = false;

        cache->ctx = ggml_init(params);

        if (!cache->ctx) {
            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
            return false;
        }
    }

    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);

    return true;
}
463
// Build the forward computation graph for N tokens starting at position n_past,
// reading and updating the kv cache through ggml_set_1d/ggml_set_2d nodes
// (kc/vc are rebound to the set-results so the cache writes are part of the
// graph). Expands the graph into gf and returns the [n_vocab, N] logits tensor.
// Note: the intricate byte-offset arithmetic into kc/vc assumes the cache was
// sized by init_kv_cache with n_batch == 1.
static struct ggml_tensor * forward(
    struct llama_model * model,
    struct llama_kv_cache * cache,
    struct ggml_context * ctx0,
    struct ggml_cgraph * gf,
    struct ggml_tensor * tokens_input,
    const int n_tokens,
    const int n_past
) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;
    const int n_ctx   = hparams.n_ctx;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;

    // copy the input token ids into a graph-local i32 tensor
    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // absolute positions n_past .. n_past+N-1 for RoPE
    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    {
        int * data = (int *) KQ_pos->data;
        for (int i = 0; i < N; ++i) {
            data[i] = n_past + i;
        }
    }

    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // lctx.use_buf(ctx0, 0);

        // norm
        {
            // cur shape [n_embd,N,1,1]
            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [n_embd, N, 1, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));

                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                // write K rows at byte offset of (layer il, position n_past);
                // kc/vc are rebound so subsequent reads see the updated cache
                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
            }

            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Q    shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            // KQ shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // split cached V into n_head heads
            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(vc),
                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(vc)*n_embd);

            // KQV shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].wo,
                    cur);
        }

        // lctx.use_buf(ctx0, 1);

        // residual connection around attention
        // inpFF shape [n_embd,N,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N,1,1]
                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
            }

            // gated FFN: silu(w1*x) * (w3*x), projected back by w2
            // tmp shape [n_ff,N,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);

            // SILU activation
            // cur shape [n_ff,N,1,1]
            cur = ggml_silu(ctx0, cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul(ctx0, cur, tmp);

            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
        }

        // residual connection around the FFN
        // cur shape [n_embd,N,1,1]
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        // inpL shape [n_embd,N,1,1]
        inpL = cur;
    }

    // norm
    {

        // inpL shape [n_embd,N,1,1]
        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N,1,1]
    inpL = ggml_mul_mat(ctx0, model->output, inpL);

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
693
// Batched variant of forward(): builds the graph for N tokens per batch row
// across n_batch rows, with the kv cache laid out per (layer, batch row).
// Shapes are verified at every step via assert_shape_* (train.h helpers).
// Expands the graph into gf and returns logits reshaped to [n_vocab, N, n_batch].
// NOTE(review): all n_batch rows share the same KQ_pos (positions n_past..n_past+N-1).
static struct ggml_tensor * forward_batch(
    struct llama_model * model,
    struct llama_kv_cache * cache,
    struct ggml_context * ctx0,
    struct ggml_cgraph * gf,
    struct ggml_tensor * tokens_input,
    const int n_tokens,
    const int n_past,
    const int n_batch
) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;
    const int n_ctx   = hparams.n_ctx;
    const int n_vocab = hparams.n_vocab;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;
    const int n_ff    = get_n_ff(&hparams);

    // copy all N*n_batch input token ids into a graph-local i32 tensor
    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch);
    memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch);

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // absolute positions n_past .. n_past+N-1 for RoPE (shared by all batch rows)
    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    {
        int * data = (int *) KQ_pos->data;
        for (int i = 0; i < N; ++i) {
            data[i] = n_past + i;
        }
    }

    // inpL shape [n_embd,N*n_batch,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    assert_shape_2d(inpL, n_embd, N*n_batch);

    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // lctx.use_buf(ctx0, 0);

        // norm
        {
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            assert_shape_2d(cur, n_embd, N*n_batch);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Kcur shape [n_embd/n_head, n_head, N, n_batch]
            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [N, n_embd, n_batch, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0,
                    ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_mul_mat(ctx0,
                                model->layers[il].wv,
                                cur),
                        n_embd, N, n_batch),
                        1, 0, 2, 3));

                assert_shape_3d(Vcur, N, n_embd, n_batch);

                // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
                // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
                // k         shape [n_embd * N, n_batch]    == kv_self.k[:,n_past:n_past+N,:,il]
                // v         shape [N, n_embd, n_batch, 1]  == kv_self.v[:,n_past:n_past+N,:,il]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                // write one [n_embd*N] K row per batch element at the offset of
                // (layer il, position n_past); kc/vc are rebound so later reads
                // see the updated cache within this graph
                kc = ggml_set_2d(ctx0, kc,
                        ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch),
                        ggml_element_size(kc)*n_embd*n_ctx,
                        (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc,
                        ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch),
                        ggml_element_size(vc)*n_ctx*n_embd,
                        ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx));

                assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer);
                assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer);
            }

            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Q    shape [n_embd/n_head, N, n_head, n_batch]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);
            assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);

            // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
            // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_4d(ctx0,
                            ggml_view_3d(ctx0,
                                kc,
                                n_embd,
                                (n_past + N),
                                n_batch,
                                n_embd*ggml_element_size(kc),
                                n_ctx*n_embd*ggml_element_size(kc),
                                il*n_batch*n_ctx*n_embd*ggml_element_size(kc)),
                            n_embd/n_head, n_head, n_past + N, n_batch),
                        0, 2, 1, 3);
            assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch);

            // K * Q
            // KQ shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
            assert_shape_4d(KQ, n_past + N, N, n_head, n_batch);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
            assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
            assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
            assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);

            // split cached V into n_head heads
            // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
            // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
            struct ggml_tensor * V =
                ggml_view_4d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head, n_batch,
                        ggml_element_size(vc)*n_ctx,
                        ggml_element_size(vc)*n_ctx*n_embd/n_head,
                        ggml_element_size(vc)*n_ctx*n_embd,
                        il*n_batch*n_ctx*n_embd*ggml_element_size(vc));
            assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch);

            // KQV shape [n_embd/n_head, N, n_head, n_batch]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
            assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, n_batch]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch);
            assert_shape_2d(cur, n_embd, N*n_batch);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].wo,
                    cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // lctx.use_buf(ctx0, 1);

        // residual connection around attention
        // inpFF shape [n_embd,N*n_batch,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
        assert_shape_2d(inpFF, n_embd, N*n_batch);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N*n_batch,1,1]
                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                assert_shape_2d(cur, n_embd, N*n_batch);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N*n_batch,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
                assert_shape_2d(cur, n_embd, N*n_batch);
            }

            // gated FFN: silu(w1*x) * (w3*x), projected back by w2
            // tmp shape [n_ff,N*n_batch,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);
            assert_shape_2d(tmp, n_ff, N*n_batch);

            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // SILU activation
            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_silu(ctx0, cur);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_mul(ctx0, cur, tmp);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // residual connection around the FFN
        // cur shape [n_embd,N*n_batch,1,1]
        cur = ggml_add(ctx0, cur, inpFF);
        assert_shape_2d(cur, n_embd, N*n_batch);

        // input for next layer
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = cur;
        assert_shape_2d(inpL, n_embd, N*n_batch);
    }

    // norm
    {

        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        // inpL = norm*inpL
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);

        assert_shape_2d(inpL, n_embd, N*n_batch);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N*n_batch,1,1]
    inpL = ggml_mul_mat(ctx0, model->output, inpL);
    assert_shape_2d(inpL, n_vocab, N*n_batch);

    {
        // split the flat logits back into per-batch rows
        // inpL shape [n_vocab,N,n_batch,1]
        inpL = ggml_reshape_3d(ctx0,
                        inpL,
                        n_vocab, N, n_batch);
        assert_shape_3d(inpL, n_vocab, N, n_batch);
    }

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
990
// Forward pass for the low-rank (LoRA-style) model variant, single sequence.
// Every dense weight W of the attention/output projections is factored into
// two smaller matrices (e.g. wqa*wqb), so each projection is two matmuls.
// Builds the compute graph into `gf`, appends this step's K/V to `cache`,
// and returns the [n_vocab, N] logits tensor.
//
// tokens_input : I32 tensor holding the N token ids for this step
// n_tokens     : number of new tokens (N)
// n_past       : number of tokens already stored in the KV cache
static struct ggml_tensor * forward_lora(
        struct llama_model_lora * model,
        struct llama_kv_cache   * cache,
        struct ggml_context     * ctx0,
        struct ggml_cgraph      * gf,
        struct ggml_tensor      * tokens_input,
        const  int                n_tokens,
        const  int                n_past
) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;

    const int n_ctx   = hparams.n_ctx;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;

    // copy the token ids into a tensor owned by this graph's context
    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // absolute positions of the new tokens, used by RoPE
    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    {
        int * data = (int *) KQ_pos->data;
        for (int i = 0; i < N; ++i) {
            data[i] = n_past + i;
        }
    }

    // token embeddings lookup
    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL; // saved for the residual connection

        struct ggml_tensor * cur;

        // norm
        {
            // cur shape [n_embd,N,1,1]
            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // Q/K projections are low-rank factored: W = wqa*wqb, W = wka*wkb
            // wq shape [n_embd, n_embd, 1, 1]
            // wk shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * Qcur = ggml_rope(ctx0,
                                            ggml_reshape_3d(ctx0,
                                                ggml_mul_mat(ctx0,
                                                    model->layers[il].wqa,
                                                    ggml_mul_mat(ctx0,
                                                        model->layers[il].wqb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
                                            KQ_pos, n_rot, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0,
                                            ggml_reshape_3d(ctx0,
                                                ggml_mul_mat(ctx0,
                                                    model->layers[il].wka,
                                                    ggml_mul_mat(ctx0,
                                                        model->layers[il].wkb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
                                            KQ_pos, n_rot, 0);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // V projection is low-rank factored: W = wva*wvb
                // wv shape [n_embd, n_embd, 1, 1]
                // Vcur shape [n_embd, N, 1, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0,
                                                ggml_transpose(ctx0,
                                                    ggml_reshape_2d(ctx0,
                                                        ggml_mul_mat(ctx0,
                                                            model->layers[il].wva,
                                                            ggml_mul_mat(ctx0,
                                                                model->layers[il].wvb,
                                                                cur)),
                                                        n_embd, N)));

                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
                // k shape [n_embd * N, 1]     == kv_self.k[:,n_past:n_past+N,il,0]
                // v shape [N, n_embd, 1, 1]   == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                // write K/V into the cache at layer il, positions [n_past, n_past+N)
                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
            }

            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Q shape    [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            // KQ shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));

            // KQ_masked = mask_past(KQ_scaled) -- causal mask
            // KQ_masked shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // split cached V into n_head heads
            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(vc),
                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(vc)*n_embd);

            // KQV shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias); low-rank factored: W = woa*wob
            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].woa,
                    ggml_mul_mat(ctx0,
                        model->layers[il].wob,
                        cur));
        }

        // residual connection around attention
        // inpFF shape [n_embd,N,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N,1,1]
                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
            }

            // SwiGLU-style FFN: w2(silu(w1(x)) * w3(x))
            // tmp shape [n_ff,N,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);

            // SILU activation
            // cur shape [n_ff,N,1,1]
            cur = ggml_silu(ctx0, cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul(ctx0, cur, tmp);

            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
        }

        // residual connection around the FFN
        // cur shape [n_embd,N,1,1]
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        // inpL shape [n_embd,N,1,1]
        inpL = cur;
    }

    // final norm
    {

        // inpL shape [n_embd,N,1,1]
        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);

        //embeddings = inpL;
    }


    // lm_head; output projection is low-rank factored: W = outputa*outputb
    // inpL shape [n_vocab,N,1,1]
    inpL = ggml_mul_mat(ctx0,
            model->outputa,
                ggml_mul_mat(ctx0,
                    model->outputb,
                    inpL));

    // ggml_set_scratch(ctx0, { 0, 0, nullptr, });
    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
1249
1250static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
1251assert(ggml_is_matrix(logits));
1252assert(ggml_is_matrix(probs));
1253assert(ggml_is_vector(best_samples));
1254assert(logits->ne[1] == best_samples->ne[0]);
1255assert(logits->ne[0] == probs->ne[0]);
1256assert(logits->ne[1] == probs->ne[1]);
1257for (int i = 0; i < logits->ne[1]; ++i) {
1258float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]);
1259ggml_set_i32_1d(best_samples, i, 0);
1260for (int k = 0; k < logits->ne[0]; ++k) {
1261float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
1262if (logit > max_logit) {
1263max_logit = logit;
1264ggml_set_i32_1d(best_samples, i, k);
1265}
1266}
1267float psum = 0;
1268for (int k = 0; k < logits->ne[0]; ++k) {
1269float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
1270float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit);
1271psum += p;
1272ggml_set_f32_1d(probs, i * probs->ne[0] + k, p);
1273}
1274for (int k = 0; k < logits->ne[0]; ++k) {
1275float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
1276ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum);
1277}
1278}
1279}
1280
1281static void sample_softmax_batch(
1282struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
1283struct ggml_tensor * best_samples
1284) {
1285GGML_ASSERT(ggml_is_matrix(best_samples));
1286GGML_ASSERT(ggml_is_3d(logits));
1287GGML_ASSERT(ggml_is_3d(probs));
1288int n_tokens = best_samples->ne[0];
1289int n_batch = best_samples->ne[1];
1290int n_vocab = logits->ne[0];
1291GGML_ASSERT(n_tokens == logits->ne[1]);
1292GGML_ASSERT(n_batch == logits->ne[2]);
1293GGML_ASSERT(n_vocab == probs->ne[0]);
1294GGML_ASSERT(n_tokens == probs->ne[1]);
1295GGML_ASSERT(n_batch == probs->ne[2]);
1296
1297for (int k = 0; k < n_batch; ++k) {
1298struct ggml_tensor * best_samples_k = ggml_view_1d(ctx,
1299best_samples,
1300best_samples->ne[0],
1301k*best_samples->nb[1]);
1302struct ggml_tensor * logits_k = ggml_view_2d(ctx,
1303logits,
1304logits->ne[0],
1305logits->ne[1],
1306logits->nb[1],
1307k*logits->nb[2]);
1308struct ggml_tensor * probs_k = ggml_view_2d(ctx,
1309probs,
1310probs->ne[0],
1311probs->ne[1],
1312probs->nb[1],
1313k*probs->nb[2]);
1314sample_softmax(logits_k, probs_k, best_samples_k);
1315}
1316}
1317
1318static void print_row(struct ggml_tensor * probs, int i) {
1319for (int k = 0; k < probs->ne[0]; ++k) {
1320float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
1321printf(" %.2f", p);
1322}
1323printf("\n");
1324}
1325
1326static void print_matrix(struct ggml_tensor * probs) {
1327assert(ggml_is_matrix(probs));
1328for (int i = 0; i < probs->ne[1]; ++i) {
1329for (int k = 0; k < probs->ne[0]; ++k) {
1330float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
1331printf(" %.2f", p);
1332}
1333printf("\n");
1334}
1335}
1336
// Renders a token as an 'X' marker at column `token` within a row of
// n_vocab columns — a crude one-line bar chart of the token id.
static void print_token(int token, int n_vocab) {
    for (int col = 0; col < token; ++col) {
        putchar(' ');
    }
    putchar('X');
    for (int col = token + 1; col < n_vocab; ++col) {
        putchar(' ');
    }
    putchar('\n');
}
1347
1348static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
1349for (int i=0; i<tokens->ne[0]; ++i) {
1350int token = ggml_get_i32_1d(tokens, i);
1351print_token(token, n_vocab);
1352}
1353}
1354
1355static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
1356int n_tokens = tokens_input->ne[0];
1357int n_vocab = targets->ne[0];
1358float randomness = 0.0f;
1359// ggml_set_zero(targets);
1360ggml_set_f32(targets, -1.0f);
1361ggml_set_i32_1d(tokens_input, 0, 0);
1362for (int i=1; i<n_tokens+1; ++i) {
1363float x = example_id + i * 3.14159f * 2.0f * 1.0f * 0.5f / n_tokens;
1364float y = sinf(x);//*cosf(x*1.1f+1.0f);
1365float z = (y+1.0f)*0.5f; // scale to [0..1]
1366z += (frand()-0.5f)*(randomness/n_vocab);
1367z = (z < 0.0f) ? 0.0f : (z > 1.0f) ? 1.0f : z; // clamp to [0..1]
1368int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1));
1369ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f);
1370if (i<n_tokens) {
1371ggml_set_i32_1d(tokens_input, i, token);
1372}
1373}
1374}
1375
1376static void get_example_targets_batch(
1377struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
1378) {
1379GGML_ASSERT(ggml_is_matrix(tokens_input));
1380GGML_ASSERT(ggml_is_3d(targets));
1381int n_tokens = tokens_input->ne[0];
1382int n_batch = tokens_input->ne[1];
1383GGML_ASSERT(n_tokens == targets->ne[1]);
1384GGML_ASSERT(n_batch == targets->ne[2]);
1385
1386for (int k=0; k<n_batch; ++k) {
1387struct ggml_tensor * tokens_input_k = ggml_view_1d(ctx,
1388tokens_input,
1389tokens_input->ne[0],
1390k*tokens_input->nb[1]);
1391struct ggml_tensor * targets_k = ggml_view_2d(ctx,
1392targets,
1393targets->ne[0],
1394targets->ne[1],
1395targets->nb[1],
1396k*targets->nb[2]);
1397get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k);
1398}
1399}
1400
1401static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
1402int n_tokens = tokens_input->ne[0];
1403int n_vocab = targets->ne[0];
1404for (int i=0; i<n_tokens-n_shift; ++i) {
1405ggml_set_i32_1d(tokens_input, i, ggml_get_i32_1d(tokens_input, i + n_shift));
1406for (int k=0; k<n_vocab; ++k) {
1407ggml_set_f32_1d(targets, i*n_vocab + k, ggml_get_f32_1d(targets, (i + n_shift)*n_vocab + k));
1408}
1409}
1410}
1411
1412static struct ggml_tensor * square_error_loss(
1413struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
1414) {
1415// todo: instead of a-b: a[1:]-b[:-1]
1416return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
1417}
1418
1419static struct ggml_tensor * cross_entropy_loss(
1420struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
1421) {
1422const float eps = 1e-3f;
1423return
1424ggml_sum(ctx,
1425ggml_neg(ctx,
1426ggml_sum_rows(ctx,
1427ggml_mul(ctx,
1428ggml_soft_max(ctx, a),
1429ggml_log(ctx,
1430ggml_add1(ctx,
1431ggml_soft_max(ctx, b),
1432ggml_new_f32(ctx, eps)))))));
1433}
1434
1435int main(int argc, char ** argv) {
1436if (argc < 1) {
1437fprintf(stderr, "usage: %s\n", argv[0]);
1438
1439return 1;
1440}
1441
1442struct ggml_init_params lcparams;
1443lcparams.mem_size = 1024ll*1024ll*1024ll;
1444lcparams.mem_buffer = NULL;
1445lcparams.no_alloc = false;
1446
1447struct llama_model model;
1448model.hparams.n_vocab = 8;
1449model.hparams.n_ctx = 8;
1450model.hparams.n_embd = 32;
1451model.hparams.n_mult = 2;
1452model.hparams.n_head = 8;
1453model.hparams.n_layer = 1;
1454model.hparams.n_rot = std::min(16u, model.hparams.n_embd / model.hparams.n_head);
1455
1456// model.hparams.n_embd = 32;
1457// model.hparams.n_mult = 2;
1458// model.hparams.n_head = 4;
1459// model.hparams.n_layer = 8;
1460// model.hparams.n_rot = 8;
1461
1462model.ctx = ggml_init(lcparams);
1463printf("init model\n");
1464init_model(&model);
1465set_param_model(&model);
1466
1467randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
1468
1469/*
1470struct llama_model_lora model_lora;
1471// model.hparams.n_vocab = 6;
1472// model.hparams.n_ctx = 64;
1473// model.hparams.n_embd = 128;
1474// model.hparams.n_mult = 2;
1475// model.hparams.n_head = 8;
1476// model.hparams.n_layer = 6;
1477// model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head;
1478
1479model_lora.hparams.n_vocab = 16;
1480model_lora.hparams.n_ctx = 32;
1481model_lora.hparams.n_embd = 256;
1482model_lora.hparams.n_mult = 2;
1483model_lora.hparams.n_head = 16;
1484model_lora.hparams.n_layer = 1;
1485model_lora.hparams.n_lora = 64;
1486model_lora.hparams.n_rot = MIN(16, model_lora.hparams.n_embd / model_lora.hparams.n_head);
1487// model.hparams.n_rot = (model.hparams.n_embd / model.hparams.n_head) / 2;
1488
1489// model.hparams.n_embd = 32;
1490// model.hparams.n_mult = 2;
1491// model.hparams.n_head = 4;
1492// model.hparams.n_layer = 8;
1493// model.hparams.n_rot = 8;
1494
1495model_lora.ctx = ggml_init(lcparams);
1496printf("init model_lora\n");
1497init_model_lora(&model_lora);
1498set_param_model_lora(&model_lora);
1499
1500randomize_model_lora(&model_lora, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
1501*/
1502int n_batch = 8;
1503// key + value cache for the self attention
1504struct llama_kv_cache kv_self;
1505printf("init_kv_cache\n");
1506kv_self.ctx = model.ctx;
1507init_kv_cache(&kv_self, &model, n_batch);
1508//init_kv_cache_lora(&kv_self, &model_lora);
1509
1510size_t compute_size = 1024ll*1024ll*1024ll;
1511uint8_t * compute_addr = new uint8_t[compute_size];
1512
1513int n_examples = 256;
1514int n_tokens = model.hparams.n_ctx;
1515int n_vocab = model.hparams.n_vocab;
1516
1517std::vector<uint8_t> work_buffer;
1518
1519for (int ex=0; ex<n_examples; ++ex) {
1520struct ggml_init_params params = {
1521/*.mem_size =*/ compute_size,
1522/*.mem_buffer =*/ compute_addr,
1523/*.no_alloc =*/ false,
1524};
1525
1526struct ggml_context * ctx0 = ggml_init(params);
1527
1528struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
1529struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
1530struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
1531struct ggml_tensor * targets = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
1532
1533int n_past = 0;
1534
1535struct ggml_cgraph * gf = NULL;
1536gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
1537
1538get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
1539
1540struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
1541// struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
1542struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
1543
1544ggml_build_forward_expand(gf, e);
1545ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
1546
1547float error_before_opt = ggml_get_f32_1d(e, 0);
1548
1549struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
1550opt_params_lbfgs.print_forward_graph = false;
1551opt_params_lbfgs.print_backward_graph = false;
1552opt_params_lbfgs.lbfgs.n_iter = 16;
1553ggml_opt(ctx0, opt_params_lbfgs, e);
1554//
1555ggml_build_forward_expand(gf, e);
1556ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
1557
1558float error_after_opt = ggml_get_f32_1d(e, 0);
1559
1560if (ex % 8 == 0) {
1561printf("Example %d\n", (ex+1));
1562printf("error_before_opt: %.2f\n", error_before_opt);
1563printf("error_after_opt: %.2f\n", error_after_opt);
1564}
1565
1566if (ex % 64 == 0) {
1567sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples);
1568// printf("probabilities after optimization:\n");
1569// print_matrix(after_opt_probs);
1570printf("best samples after optimization:\n");
1571print_tokens(after_opt_best_samples, n_vocab);
1572}
1573
1574ggml_free(ctx0);
1575}
1576
1577{
1578int n_gen = 128;
1579int sample_ctx = n_tokens-n_tokens/8;
1580
1581printf("Generating %d tokens.\n", n_gen);
1582
1583struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens);
1584struct ggml_tensor * targets = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
1585
1586get_example_targets(137, tokens_input, targets);
1587for (int i=sample_ctx; i<n_tokens; ++i) {
1588ggml_set_i32_1d(tokens_input, i, n_vocab/2);
1589}
1590
1591for (int i=0; i<sample_ctx-1; ++i) {
1592print_token(ggml_get_i32_1d(tokens_input, i), n_vocab);
1593}
1594printf("---\n");
1595for (int i=0; i<n_gen; ++i) {
1596struct ggml_init_params params = {
1597/*.mem_size =*/ compute_size,
1598/*.mem_buffer =*/ compute_addr,
1599/*.no_alloc =*/ false,
1600};
1601struct ggml_context * ctx0 = ggml_init(params);
1602
1603struct ggml_cgraph * gf = NULL;
1604gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
1605
1606int n_past = 0;
1607struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
1608
1609ggml_build_forward_expand(gf, logits);
1610ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
1611
1612struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
1613struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
1614
1615sample_softmax(logits, probs, best_samples);
1616
1617// int sample_at = n_tokens-1;
1618int token = ggml_get_i32_1d(best_samples, sample_ctx-1);
1619
1620// print_row(probs, sample_at);
1621print_token(token, n_vocab);
1622
1623lshift_examples(tokens_input, targets, 1);
1624ggml_set_i32_1d(tokens_input, 0, 0);
1625ggml_set_i32_1d(tokens_input, sample_ctx-1, token);
1626
1627ggml_free(ctx0);
1628}
1629}
1630
1631print_matrix(model.tok_embeddings);
1632printf("done\n");
1633
1634// ggml_free(kv_self.ctx);
1635// ggml_free(model_lora.ctx);
1636ggml_free(model.ctx);
1637
1638return 0;
1639}
1640