// llama training example (ggml) — source file: 1639 lines, 61.1 KB
1#include "ggml.h"
2#include "train.h"
3
4#include <cassert>
5#include <cstdlib>
6#include <cstring>
7#include <random>
8#include <vector>
9
10#if defined(_MSC_VER)
11#pragma warning(disable: 4244 4267) // possible loss of data
12#endif
13
14#ifdef LLAMA_DEFAULT_RMS_EPS
15constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
16#else
17constexpr float rms_norm_eps = 5e-6f;
18#endif
19
20static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
21struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
22
23if (plan.work_size > 0) {
24buf.resize(plan.work_size);
25plan.work_data = buf.data();
26}
27
28ggml_graph_compute(graph, &plan);
29}
30
31static struct ggml_tensor * randomize_tensor(
32struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
33) {
34switch (ndims) {
35case 1:
36for (int i0 = 0; i0 < ne[0]; i0++) {
37((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin;
38}
39break;
40case 2:
41for (int i1 = 0; i1 < ne[1]; i1++) {
42for (int i0 = 0; i0 < ne[0]; i0++) {
43((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
44}
45}
46break;
47case 3:
48for (int i2 = 0; i2 < ne[2]; i2++) {
49for (int i1 = 0; i1 < ne[1]; i1++) {
50for (int i0 = 0; i0 < ne[0]; i0++) {
51((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
52}
53}
54}
55break;
56case 4:
57for (int i3 = 0; i3 < ne[3]; i3++) {
58for (int i2 = 0; i2 < ne[2]; i2++) {
59for (int i1 = 0; i1 < ne[1]; i1++) {
60for (int i0 = 0; i0 < ne[0]; i0++) {
61((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
62}
63}
64}
65}
66break;
67default:
68assert(false);
69}
70
71return tensor;
72}
73
// Model hyper-parameters (defaults match LLaMA-7B).
struct llama_hparams {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512; // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;

    bool operator!=(const llama_hparams & other) const {
        // all members are uint32_t, so the struct has no padding and a raw
        // byte compare is a valid equality test; compare against 0 explicitly
        // (was an implicit int->bool conversion, inconsistent with
        // llama_hparams_lora::operator!=)
        return memcmp(this, &other, sizeof(llama_hparams)) != 0;
    }
};
87
88static uint32_t get_n_ff(const struct llama_hparams* hparams) {
89const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
90return n_ff;
91}
92
// Hyper-parameters for the LoRA variant of the model; mirrors llama_hparams
// plus the low-rank dimension n_lora used by the factored weights.
struct llama_hparams_lora {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512; // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;
    uint32_t n_lora  = 64;  // inner dimension of the two-tensor (a*b) factorization

    bool operator!=(const llama_hparams_lora & other) const {
        // byte-wise compare is valid: all members are uint32_t, no padding
        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
    }
};
107
// Weights of one full-rank transformer layer. All tensors are created in
// init_model and live in the model's ggml context.
struct llama_layer {
    // normalization (applied before attention)
    struct ggml_tensor * attention_norm;

    // attention projections: query, key, value, output
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization (applied before the feed-forward block)
    struct ggml_tensor * ffn_norm;

    // feed-forward: w1/w3 project up to n_ff, w2 projects back to n_embd
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};
126
// Weights of one LoRA transformer layer: each attention projection is split
// into two low-rank factors (*a and *b); the feed-forward stays full-rank.
struct llama_layer_lora {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention: low-rank factor pairs for q/k/v/o
    // (*a is [n_lora, n_embd], *b is [n_embd, n_lora] — see init_model_lora)
    struct ggml_tensor * wqa;
    struct ggml_tensor * wqb;
    struct ggml_tensor * wka;
    struct ggml_tensor * wkb;
    struct ggml_tensor * wva;
    struct ggml_tensor * wvb;
    struct ggml_tensor * woa;
    struct ggml_tensor * wob;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};
149
150
// Key/value cache. k and v are flat f32 tensors covering all layers
// (and, in the batched path, all batch rows); they live in their own
// ggml context allocated by init_kv_cache / init_kv_cache_lora.
struct llama_kv_cache {
    struct ggml_context * ctx = NULL;

    struct ggml_tensor * k;
    struct ggml_tensor * v;

    // llama_ctx_buffer buf;

    int n; // number of tokens currently in the cache
};
161
// Full-rank model: token embeddings, final norm, output projection and
// n_layer transformer layers, all allocated in `ctx` by init_model.
struct llama_model {
    struct ggml_context * ctx = NULL;

    llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;    // final normalization weight
    struct ggml_tensor * output;  // lm_head projection [n_embd, n_vocab]

    std::vector<llama_layer> layers;
};
174
// LoRA model: like llama_model but the output projection is factored into
// outputa [n_lora, n_vocab] and outputb [n_embd, n_lora] (see init_model_lora).
struct llama_model_lora {
    struct ggml_context * ctx = NULL;

    llama_hparams_lora hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * outputa;
    struct ggml_tensor * outputb;

    std::vector<llama_layer_lora> layers;
};
188
189static void init_model(struct llama_model * model) {
190const auto & hparams = model->hparams;
191
192const uint32_t n_embd = hparams.n_embd;
193const uint32_t n_layer = hparams.n_layer;
194const uint32_t n_vocab = hparams.n_vocab;
195
196const uint32_t n_ff = get_n_ff(&hparams);
197
198struct ggml_context * ctx = model->ctx;
199
200model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
201model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd});
202model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab});
203
204model->layers.resize(n_layer);
205for (uint32_t i = 0; i < n_layer; ++i) {
206auto & layer = model->layers[i];
207
208// std::string layers_i = "layers." + std::to_string(i);
209
210layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
211
212layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
213layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
214layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
215layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
216
217layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd});
218
219layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
220layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
221layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
222}
223}
224
225
226static void init_model_lora(struct llama_model_lora * model) {
227const auto & hparams = model->hparams;
228
229const uint32_t n_embd = hparams.n_embd;
230const uint32_t n_mult = hparams.n_mult;
231const uint32_t n_layer = hparams.n_layer;
232const uint32_t n_vocab = hparams.n_vocab;
233const uint32_t n_lora = hparams.n_lora;
234
235const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
236
237struct ggml_context * ctx = model->ctx;
238
239model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
240model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd});
241model->outputa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight", {n_embd, n_vocab});
242model->outputb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // ("output.weight", {n_embd, n_vocab});
243
244model->layers.resize(n_layer);
245for (uint32_t i = 0; i < n_layer; ++i) {
246auto & layer = model->layers[i];
247
248// std::string layers_i = "layers." + std::to_string(i);
249
250layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
251
252layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
253layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
254layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
255layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
256layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
257layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
258layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
259layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
260
261layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd});
262
263layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
264layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
265layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
266}
267}
268
269static void set_param_model(struct llama_model * model) {
270const auto& hparams = model->hparams;
271
272const uint32_t n_layer = hparams.n_layer;
273
274struct ggml_context* ctx = model->ctx;
275
276ggml_set_param(ctx, model->tok_embeddings);
277ggml_set_param(ctx, model->norm);
278ggml_set_param(ctx, model->output);
279
280for (uint32_t i = 0; i < n_layer; ++i) {
281auto & layer = model->layers[i];
282
283ggml_set_param(ctx, layer.attention_norm);
284ggml_set_param(ctx, layer.wq);
285ggml_set_param(ctx, layer.wk);
286ggml_set_param(ctx, layer.wv);
287ggml_set_param(ctx, layer.wo);
288ggml_set_param(ctx, layer.ffn_norm);
289ggml_set_param(ctx, layer.w1);
290ggml_set_param(ctx, layer.w2);
291ggml_set_param(ctx, layer.w3);
292}
293}
294
// Register every weight tensor of the LoRA model as a trainable parameter
// in its ggml context (both low-rank factors of each projection are trained).
static void set_param_model_lora(struct llama_model_lora * model) {
    const auto& hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct ggml_context* ctx = model->ctx;

    ggml_set_param(ctx, model->tok_embeddings);
    ggml_set_param(ctx, model->norm);
    ggml_set_param(ctx, model->outputa);
    ggml_set_param(ctx, model->outputb);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_set_param(ctx, layer.attention_norm);
        ggml_set_param(ctx, layer.wqa);
        ggml_set_param(ctx, layer.wqb);
        ggml_set_param(ctx, layer.wka);
        ggml_set_param(ctx, layer.wkb);
        ggml_set_param(ctx, layer.wva);
        ggml_set_param(ctx, layer.wvb);
        ggml_set_param(ctx, layer.woa);
        ggml_set_param(ctx, layer.wob);
        ggml_set_param(ctx, layer.ffn_norm);
        ggml_set_param(ctx, layer.w1);
        ggml_set_param(ctx, layer.w2);
        ggml_set_param(ctx, layer.w3);
    }
}
325
326static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
327const auto & hparams = model->hparams;
328
329const uint32_t n_layer = hparams.n_layer;
330
331struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
332
333randomize_tensor_normal(model->tok_embeddings , rnd);
334randomize_tensor_normal(model->norm , rnd);
335randomize_tensor_normal(model->output , rnd);
336
337for (uint32_t i = 0; i < n_layer; ++i) {
338auto & layer = model->layers[i];
339randomize_tensor_normal(layer.attention_norm, rnd);
340
341randomize_tensor_normal(layer.wq, rnd);
342randomize_tensor_normal(layer.wk, rnd);
343randomize_tensor_normal(layer.wv, rnd);
344randomize_tensor_normal(layer.wo, rnd);
345
346randomize_tensor_normal(layer.ffn_norm, rnd);
347
348randomize_tensor_normal(layer.w1, rnd);
349randomize_tensor_normal(layer.w2, rnd);
350randomize_tensor_normal(layer.w3, rnd);
351}
352
353free_random_normal_distribution(rnd);
354}
355
356
// Initialize all LoRA model weights from a normal distribution N(mean, std)
// (presumably clipped to [min, max] — semantics live in train.h helpers).
// Fixed tensor order keeps results reproducible for a given seed.
static void randomize_model_lora(
    struct llama_model_lora * model, int seed, float mean, float std, float min, float max
) {
    const auto & hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);

    randomize_tensor_normal(model->tok_embeddings, rnd);
    randomize_tensor_normal(model->norm   , rnd);
    randomize_tensor_normal(model->outputa, rnd);
    randomize_tensor_normal(model->outputb, rnd);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];
        randomize_tensor_normal(layer.attention_norm, rnd);

        randomize_tensor_normal(layer.wqa, rnd);
        randomize_tensor_normal(layer.wqb, rnd);
        randomize_tensor_normal(layer.wka, rnd);
        randomize_tensor_normal(layer.wkb, rnd);
        randomize_tensor_normal(layer.wva, rnd);
        randomize_tensor_normal(layer.wvb, rnd);
        randomize_tensor_normal(layer.woa, rnd);
        randomize_tensor_normal(layer.wob, rnd);

        randomize_tensor_normal(layer.ffn_norm, rnd);

        randomize_tensor_normal(layer.w1, rnd);
        randomize_tensor_normal(layer.w2, rnd);
        randomize_tensor_normal(layer.w3, rnd);
    }

    free_random_normal_distribution(rnd);
}
393
394static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
395const auto & hparams = model->hparams;
396
397const uint32_t n_ctx = hparams.n_ctx;
398const uint32_t n_embd = hparams.n_embd;
399const uint32_t n_layer = hparams.n_layer;
400
401const int64_t n_mem = n_layer*n_ctx*n_batch;
402const int64_t n_elements = n_embd*n_mem;
403
404// cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
405
406// struct ggml_init_params params;
407// params.mem_size = cache.buf.size;
408// params.mem_buffer = cache.buf.addr;
409// params.no_alloc = false;
410if (!cache->ctx) {
411struct ggml_init_params params;
412params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
413params.mem_buffer = NULL;
414params.no_alloc = false;
415
416cache->ctx = ggml_init(params);
417
418if (!cache->ctx) {
419fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
420exit(1);
421}
422}
423
424cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
425cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
426}
427
// Create the kv-cache context on first use and allocate the flat k/v tensors
// for the LoRA model (n_layer*n_ctx*n_batch positions of n_embd f32 each).
// Returns false on allocation failure instead of exiting (contrast init_kv_cache).
static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
    const auto & hparams = model->hparams;

    const uint32_t n_ctx   = hparams.n_ctx;
    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;

    const int64_t n_mem      = n_layer*n_ctx*n_batch;
    const int64_t n_elements = n_embd*n_mem;

    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

    // struct ggml_init_params params;
    // params.mem_size   = cache.buf.size;
    // params.mem_buffer = cache.buf.addr;
    // params.no_alloc   = false;
    if (!cache->ctx) {
        struct ggml_init_params params;
        // room for both k and v plus 2 MB of ggml bookkeeping overhead
        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
        params.mem_buffer = NULL;
        params.no_alloc   = false;

        cache->ctx = ggml_init(params);

        if (!cache->ctx) {
            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
            return false;
        }
    }

    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);

    return true;
}
463
// Build the forward computation graph for N tokens starting at position n_past,
// reading and updating the kv cache through ggml_set_1d/ggml_set_2d nodes
// (kc/vc are rebound to the set-results so the cache writes are part of the
// graph). Expands the graph into gf and returns the [n_vocab, N] logits tensor.
// Note: the intricate byte-offset arithmetic into kc/vc assumes the cache was
// sized by init_kv_cache with n_batch == 1.
static struct ggml_tensor * forward(
    struct llama_model * model,
    struct llama_kv_cache * cache,
    struct ggml_context * ctx0,
    struct ggml_cgraph * gf,
    struct ggml_tensor * tokens_input,
    const int n_tokens,
    const int n_past
) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;
    const int n_ctx   = hparams.n_ctx;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;

    // copy the input token ids into a graph-local i32 tensor
    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // absolute positions n_past .. n_past+N-1 for RoPE
    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    {
        int * data = (int *) KQ_pos->data;
        for (int i = 0; i < N; ++i) {
            data[i] = n_past + i;
        }
    }

    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // lctx.use_buf(ctx0, 0);

        // norm
        {
            // cur shape [n_embd,N,1,1]
            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [n_embd, N, 1, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));

                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                // write K rows at byte offset of (layer il, position n_past);
                // kc/vc are rebound so subsequent reads see the updated cache
                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
            }

            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Q    shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            // KQ shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // split cached V into n_head heads
            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(vc),
                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(vc)*n_embd);

            // KQV shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].wo,
                    cur);
        }

        // lctx.use_buf(ctx0, 1);

        // residual connection around attention
        // inpFF shape [n_embd,N,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N,1,1]
                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
            }

            // gated FFN: silu(w1*x) * (w3*x), projected back by w2
            // tmp shape [n_ff,N,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);

            // SILU activation
            // cur shape [n_ff,N,1,1]
            cur = ggml_silu(ctx0, cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul(ctx0, cur, tmp);

            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
        }

        // residual connection around the FFN
        // cur shape [n_embd,N,1,1]
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        // inpL shape [n_embd,N,1,1]
        inpL = cur;
    }

    // norm
    {

        // inpL shape [n_embd,N,1,1]
        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N,1,1]
    inpL = ggml_mul_mat(ctx0, model->output, inpL);

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
693
// Batched variant of forward(): builds the graph for N tokens per batch row
// across n_batch rows, with the kv cache laid out per (layer, batch row).
// Shapes are verified at every step via assert_shape_* (train.h helpers).
// Expands the graph into gf and returns logits reshaped to [n_vocab, N, n_batch].
// NOTE(review): all n_batch rows share the same KQ_pos (positions n_past..n_past+N-1).
static struct ggml_tensor * forward_batch(
    struct llama_model * model,
    struct llama_kv_cache * cache,
    struct ggml_context * ctx0,
    struct ggml_cgraph * gf,
    struct ggml_tensor * tokens_input,
    const int n_tokens,
    const int n_past,
    const int n_batch
) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;
    const int n_ctx   = hparams.n_ctx;
    const int n_vocab = hparams.n_vocab;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;
    const int n_ff    = get_n_ff(&hparams);

    // copy all N*n_batch input token ids into a graph-local i32 tensor
    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch);
    memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch);

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // absolute positions n_past .. n_past+N-1 for RoPE (shared by all batch rows)
    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    {
        int * data = (int *) KQ_pos->data;
        for (int i = 0; i < N; ++i) {
            data[i] = n_past + i;
        }
    }

    // inpL shape [n_embd,N*n_batch,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    assert_shape_2d(inpL, n_embd, N*n_batch);

    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // lctx.use_buf(ctx0, 0);

        // norm
        {
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            assert_shape_2d(cur, n_embd, N*n_batch);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Kcur shape [n_embd/n_head, n_head, N, n_batch]
            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [N, n_embd, n_batch, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0,
                    ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_mul_mat(ctx0,
                                model->layers[il].wv,
                                cur),
                        n_embd, N, n_batch),
                        1, 0, 2, 3));

                assert_shape_3d(Vcur, N, n_embd, n_batch);

                // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
                // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
                // k         shape [n_embd * N, n_batch]    == kv_self.k[:,n_past:n_past+N,:,il]
                // v         shape [N, n_embd, n_batch, 1]  == kv_self.v[:,n_past:n_past+N,:,il]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                // write one [n_embd*N] K row per batch element at the offset of
                // (layer il, position n_past); kc/vc are rebound so later reads
                // see the updated cache within this graph
                kc = ggml_set_2d(ctx0, kc,
                        ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch),
                        ggml_element_size(kc)*n_embd*n_ctx,
                        (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc,
                        ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch),
                        ggml_element_size(vc)*n_ctx*n_embd,
                        ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx));

                assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer);
                assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer);
            }

            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Q    shape [n_embd/n_head, N, n_head, n_batch]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);
            assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);

            // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
            // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_4d(ctx0,
                            ggml_view_3d(ctx0,
                                kc,
                                n_embd,
                                (n_past + N),
                                n_batch,
                                n_embd*ggml_element_size(kc),
                                n_ctx*n_embd*ggml_element_size(kc),
                                il*n_batch*n_ctx*n_embd*ggml_element_size(kc)),
                            n_embd/n_head, n_head, n_past + N, n_batch),
                        0, 2, 1, 3);
            assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch);

            // K * Q
            // KQ shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
            assert_shape_4d(KQ, n_past + N, N, n_head, n_batch);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
            assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
            assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
            assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);

            // split cached V into n_head heads
            // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
            // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
            struct ggml_tensor * V =
                ggml_view_4d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head, n_batch,
                        ggml_element_size(vc)*n_ctx,
                        ggml_element_size(vc)*n_ctx*n_embd/n_head,
                        ggml_element_size(vc)*n_ctx*n_embd,
                        il*n_batch*n_ctx*n_embd*ggml_element_size(vc));
            assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch);

            // KQV shape [n_embd/n_head, N, n_head, n_batch]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
            assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, n_batch]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch);
            assert_shape_2d(cur, n_embd, N*n_batch);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].wo,
                    cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // lctx.use_buf(ctx0, 1);

        // residual connection around attention
        // inpFF shape [n_embd,N*n_batch,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
        assert_shape_2d(inpFF, n_embd, N*n_batch);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N*n_batch,1,1]
                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                assert_shape_2d(cur, n_embd, N*n_batch);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N*n_batch,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
                assert_shape_2d(cur, n_embd, N*n_batch);
            }

            // gated FFN: silu(w1*x) * (w3*x), projected back by w2
            // tmp shape [n_ff,N*n_batch,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);
            assert_shape_2d(tmp, n_ff, N*n_batch);

            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // SILU activation
            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_silu(ctx0, cur);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_mul(ctx0, cur, tmp);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // residual connection around the FFN
        // cur shape [n_embd,N*n_batch,1,1]
        cur = ggml_add(ctx0, cur, inpFF);
        assert_shape_2d(cur, n_embd, N*n_batch);

        // input for next layer
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = cur;
        assert_shape_2d(inpL, n_embd, N*n_batch);
    }

    // norm
    {

        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        // inpL = norm*inpL
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);

        assert_shape_2d(inpL, n_embd, N*n_batch);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N*n_batch,1,1]
    inpL = ggml_mul_mat(ctx0, model->output, inpL);
    assert_shape_2d(inpL, n_vocab, N*n_batch);

    {
        // split the flat logits back into per-batch rows
        // inpL shape [n_vocab,N,n_batch,1]
        inpL = ggml_reshape_3d(ctx0,
                        inpL,
                        n_vocab, N, n_batch);
        assert_shape_3d(inpL, n_vocab, N, n_batch);
    }

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
990
// Forward pass for the low-rank (LoRA-style) model variant, single sequence.
// Every dense weight W of the attention/output projections is factored into
// two smaller matrices (e.g. wqa*wqb), so each projection is two matmuls.
// Builds the compute graph into `gf`, appends this step's K/V to `cache`,
// and returns the [n_vocab, N] logits tensor.
//
// tokens_input : I32 tensor holding the N token ids for this step
// n_tokens     : number of new tokens (N)
// n_past       : number of tokens already stored in the KV cache
static struct ggml_tensor * forward_lora(
        struct llama_model_lora * model,
        struct llama_kv_cache   * cache,
        struct ggml_context     * ctx0,
        struct ggml_cgraph      * gf,
        struct ggml_tensor      * tokens_input,
        const  int                n_tokens,
        const  int                n_past
) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;

    const int n_ctx   = hparams.n_ctx;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;

    // copy the token ids into a tensor owned by this graph's context
    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // absolute positions of the new tokens, used by RoPE
    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    {
        int * data = (int *) KQ_pos->data;
        for (int i = 0; i < N; ++i) {
            data[i] = n_past + i;
        }
    }

    // token embeddings lookup
    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL; // saved for the residual connection

        struct ggml_tensor * cur;

        // norm
        {
            // cur shape [n_embd,N,1,1]
            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // Q/K projections are low-rank factored: W = wqa*wqb, W = wka*wkb
            // wq shape [n_embd, n_embd, 1, 1]
            // wk shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * Qcur = ggml_rope(ctx0,
                                            ggml_reshape_3d(ctx0,
                                                ggml_mul_mat(ctx0,
                                                    model->layers[il].wqa,
                                                    ggml_mul_mat(ctx0,
                                                        model->layers[il].wqb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
                                            KQ_pos, n_rot, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0,
                                            ggml_reshape_3d(ctx0,
                                                ggml_mul_mat(ctx0,
                                                    model->layers[il].wka,
                                                    ggml_mul_mat(ctx0,
                                                        model->layers[il].wkb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
                                            KQ_pos, n_rot, 0);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // V projection is low-rank factored: W = wva*wvb
                // wv shape [n_embd, n_embd, 1, 1]
                // Vcur shape [n_embd, N, 1, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0,
                                                ggml_transpose(ctx0,
                                                    ggml_reshape_2d(ctx0,
                                                        ggml_mul_mat(ctx0,
                                                            model->layers[il].wva,
                                                            ggml_mul_mat(ctx0,
                                                                model->layers[il].wvb,
                                                                cur)),
                                                        n_embd, N)));

                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
                // k shape [n_embd * N, 1]     == kv_self.k[:,n_past:n_past+N,il,0]
                // v shape [N, n_embd, 1, 1]   == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                // write K/V into the cache at layer il, positions [n_past, n_past+N)
                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
            }

            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Q shape    [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            // KQ shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));

            // KQ_masked = mask_past(KQ_scaled) -- causal mask
            // KQ_masked shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // split cached V into n_head heads
            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(vc),
                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(vc)*n_embd);

            // KQV shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias); low-rank factored: W = woa*wob
            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].woa,
                    ggml_mul_mat(ctx0,
                        model->layers[il].wob,
                        cur));
        }

        // residual connection around attention
        // inpFF shape [n_embd,N,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N,1,1]
                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
            }

            // SwiGLU-style FFN: w2(silu(w1(x)) * w3(x))
            // tmp shape [n_ff,N,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);

            // SILU activation
            // cur shape [n_ff,N,1,1]
            cur = ggml_silu(ctx0, cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul(ctx0, cur, tmp);

            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
        }

        // residual connection around the FFN
        // cur shape [n_embd,N,1,1]
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        // inpL shape [n_embd,N,1,1]
        inpL = cur;
    }

    // final norm
    {

        // inpL shape [n_embd,N,1,1]
        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);

        //embeddings = inpL;
    }


    // lm_head; output projection is low-rank factored: W = outputa*outputb
    // inpL shape [n_vocab,N,1,1]
    inpL = ggml_mul_mat(ctx0,
            model->outputa,
                ggml_mul_mat(ctx0,
                    model->outputb,
                    inpL));

    // ggml_set_scratch(ctx0, { 0, 0, nullptr, });
    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
1249
1250static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
1251assert(ggml_is_matrix(logits));
1252assert(ggml_is_matrix(probs));
1253assert(ggml_is_vector(best_samples));
1254assert(logits->ne[1] == best_samples->ne[0]);
1255assert(logits->ne[0] == probs->ne[0]);
1256assert(logits->ne[1] == probs->ne[1]);
1257for (int i = 0; i < logits->ne[1]; ++i) {
1258float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]);
1259ggml_set_i32_1d(best_samples, i, 0);
1260for (int k = 0; k < logits->ne[0]; ++k) {
1261float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
1262if (logit > max_logit) {
1263max_logit = logit;
1264ggml_set_i32_1d(best_samples, i, k);
1265}
1266}
1267float psum = 0;
1268for (int k = 0; k < logits->ne[0]; ++k) {
1269float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
1270float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit);
1271psum += p;
1272ggml_set_f32_1d(probs, i * probs->ne[0] + k, p);
1273}
1274for (int k = 0; k < logits->ne[0]; ++k) {
1275float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
1276ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum);
1277}
1278}
1279}
1280
1281static void sample_softmax_batch(
1282struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
1283struct ggml_tensor * best_samples
1284) {
1285GGML_ASSERT(ggml_is_matrix(best_samples));
1286GGML_ASSERT(ggml_is_3d(logits));
1287GGML_ASSERT(ggml_is_3d(probs));
1288int n_tokens = best_samples->ne[0];
1289int n_batch = best_samples->ne[1];
1290int n_vocab = logits->ne[0];
1291GGML_ASSERT(n_tokens == logits->ne[1]);
1292GGML_ASSERT(n_batch == logits->ne[2]);
1293GGML_ASSERT(n_vocab == probs->ne[0]);
1294GGML_ASSERT(n_tokens == probs->ne[1]);
1295GGML_ASSERT(n_batch == probs->ne[2]);
1296
1297for (int k = 0; k < n_batch; ++k) {
1298struct ggml_tensor * best_samples_k = ggml_view_1d(ctx,
1299best_samples,
1300best_samples->ne[0],
1301k*best_samples->nb[1]);
1302struct ggml_tensor * logits_k = ggml_view_2d(ctx,
1303logits,
1304logits->ne[0],
1305logits->ne[1],
1306logits->nb[1],
1307k*logits->nb[2]);
1308struct ggml_tensor * probs_k = ggml_view_2d(ctx,
1309probs,
1310probs->ne[0],
1311probs->ne[1],
1312probs->nb[1],
1313k*probs->nb[2]);
1314sample_softmax(logits_k, probs_k, best_samples_k);
1315}
1316}
1317
1318static void print_row(struct ggml_tensor * probs, int i) {
1319for (int k = 0; k < probs->ne[0]; ++k) {
1320float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
1321printf(" %.2f", p);
1322}
1323printf("\n");
1324}
1325
1326static void print_matrix(struct ggml_tensor * probs) {
1327assert(ggml_is_matrix(probs));
1328for (int i = 0; i < probs->ne[1]; ++i) {
1329for (int k = 0; k < probs->ne[0]; ++k) {
1330float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
1331printf(" %.2f", p);
1332}
1333printf("\n");
1334}
1335}
1336
// Renders a token as an 'X' marker at column `token` within a row of
// n_vocab columns — a crude one-line bar chart of the token id.
static void print_token(int token, int n_vocab) {
    for (int col = 0; col < token; ++col) {
        putchar(' ');
    }
    putchar('X');
    for (int col = token + 1; col < n_vocab; ++col) {
        putchar(' ');
    }
    putchar('\n');
}
1347
1348static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
1349for (int i=0; i<tokens->ne[0]; ++i) {
1350int token = ggml_get_i32_1d(tokens, i);
1351print_token(token, n_vocab);
1352}
1353}
1354
1355static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
1356int n_tokens = tokens_input->ne[0];
1357int n_vocab = targets->ne[0];
1358float randomness = 0.0f;
1359// ggml_set_zero(targets);
1360ggml_set_f32(targets, -1.0f);
1361ggml_set_i32_1d(tokens_input, 0, 0);
1362for (int i=1; i<n_tokens+1; ++i) {
1363float x = example_id + i * 3.14159f * 2.0f * 1.0f * 0.5f / n_tokens;
1364float y = sinf(x);//*cosf(x*1.1f+1.0f);
1365float z = (y+1.0f)*0.5f; // scale to [0..1]
1366z += (frand()-0.5f)*(randomness/n_vocab);
1367z = (z < 0.0f) ? 0.0f : (z > 1.0f) ? 1.0f : z; // clamp to [0..1]
1368int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1));
1369ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f);
1370if (i<n_tokens) {
1371ggml_set_i32_1d(tokens_input, i, token);
1372}
1373}
1374}
1375
1376static void get_example_targets_batch(
1377struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
1378) {
1379GGML_ASSERT(ggml_is_matrix(tokens_input));
1380GGML_ASSERT(ggml_is_3d(targets));
1381int n_tokens = tokens_input->ne[0];
1382int n_batch = tokens_input->ne[1];
1383GGML_ASSERT(n_tokens == targets->ne[1]);
1384GGML_ASSERT(n_batch == targets->ne[2]);
1385
1386for (int k=0; k<n_batch; ++k) {
1387struct ggml_tensor * tokens_input_k = ggml_view_1d(ctx,
1388tokens_input,
1389tokens_input->ne[0],
1390k*tokens_input->nb[1]);
1391struct ggml_tensor * targets_k = ggml_view_2d(ctx,
1392targets,
1393targets->ne[0],
1394targets->ne[1],
1395targets->nb[1],
1396k*targets->nb[2]);
1397get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k);
1398}
1399}
1400
1401static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
1402int n_tokens = tokens_input->ne[0];
1403int n_vocab = targets->ne[0];
1404for (int i=0; i<n_tokens-n_shift; ++i) {
1405ggml_set_i32_1d(tokens_input, i, ggml_get_i32_1d(tokens_input, i + n_shift));
1406for (int k=0; k<n_vocab; ++k) {
1407ggml_set_f32_1d(targets, i*n_vocab + k, ggml_get_f32_1d(targets, (i + n_shift)*n_vocab + k));
1408}
1409}
1410}
1411
1412static struct ggml_tensor * square_error_loss(
1413struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
1414) {
1415// todo: instead of a-b: a[1:]-b[:-1]
1416return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
1417}
1418
1419static struct ggml_tensor * cross_entropy_loss(
1420struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
1421) {
1422const float eps = 1e-3f;
1423return
1424ggml_sum(ctx,
1425ggml_neg(ctx,
1426ggml_sum_rows(ctx,
1427ggml_mul(ctx,
1428ggml_soft_max(ctx, a),
1429ggml_log(ctx,
1430ggml_add1(ctx,
1431ggml_soft_max(ctx, b),
1432ggml_new_f32(ctx, eps)))))));
1433}
1434
1435int main(int argc, char ** argv) {
1436if (argc < 1) {
1437fprintf(stderr, "usage: %s\n", argv[0]);
1438
1439return 1;
1440}
1441
1442struct ggml_init_params lcparams;
1443lcparams.mem_size = 1024ll*1024ll*1024ll;
1444lcparams.mem_buffer = NULL;
1445lcparams.no_alloc = false;
1446
1447struct llama_model model;
1448model.hparams.n_vocab = 8;
1449model.hparams.n_ctx = 8;
1450model.hparams.n_embd = 32;
1451model.hparams.n_mult = 2;
1452model.hparams.n_head = 8;
1453model.hparams.n_layer = 1;
1454model.hparams.n_rot = std::min(16u, model.hparams.n_embd / model.hparams.n_head);
1455
1456// model.hparams.n_embd = 32;
1457// model.hparams.n_mult = 2;
1458// model.hparams.n_head = 4;
1459// model.hparams.n_layer = 8;
1460// model.hparams.n_rot = 8;
1461
1462model.ctx = ggml_init(lcparams);
1463printf("init model\n");
1464init_model(&model);
1465set_param_model(&model);
1466
1467randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
1468
1469/*
1470struct llama_model_lora model_lora;
1471// model.hparams.n_vocab = 6;
1472// model.hparams.n_ctx = 64;
1473// model.hparams.n_embd = 128;
1474// model.hparams.n_mult = 2;
1475// model.hparams.n_head = 8;
1476// model.hparams.n_layer = 6;
1477// model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head;
1478
1479model_lora.hparams.n_vocab = 16;
1480model_lora.hparams.n_ctx = 32;
1481model_lora.hparams.n_embd = 256;
1482model_lora.hparams.n_mult = 2;
1483model_lora.hparams.n_head = 16;
1484model_lora.hparams.n_layer = 1;
1485model_lora.hparams.n_lora = 64;
1486model_lora.hparams.n_rot = MIN(16, model_lora.hparams.n_embd / model_lora.hparams.n_head);
1487// model.hparams.n_rot = (model.hparams.n_embd / model.hparams.n_head) / 2;
1488
1489// model.hparams.n_embd = 32;
1490// model.hparams.n_mult = 2;
1491// model.hparams.n_head = 4;
1492// model.hparams.n_layer = 8;
1493// model.hparams.n_rot = 8;
1494
1495model_lora.ctx = ggml_init(lcparams);
1496printf("init model_lora\n");
1497init_model_lora(&model_lora);
1498set_param_model_lora(&model_lora);
1499
1500randomize_model_lora(&model_lora, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
1501*/
1502int n_batch = 8;
1503// key + value cache for the self attention
1504struct llama_kv_cache kv_self;
1505printf("init_kv_cache\n");
1506kv_self.ctx = model.ctx;
1507init_kv_cache(&kv_self, &model, n_batch);
1508//init_kv_cache_lora(&kv_self, &model_lora);
1509
1510size_t compute_size = 1024ll*1024ll*1024ll;
1511uint8_t * compute_addr = new uint8_t[compute_size];
1512
1513int n_examples = 256;
1514int n_tokens = model.hparams.n_ctx;
1515int n_vocab = model.hparams.n_vocab;
1516
1517std::vector<uint8_t> work_buffer;
1518
1519for (int ex=0; ex<n_examples; ++ex) {
1520struct ggml_init_params params = {
1521/*.mem_size =*/ compute_size,
1522/*.mem_buffer =*/ compute_addr,
1523/*.no_alloc =*/ false,
1524};
1525
1526struct ggml_context * ctx0 = ggml_init(params);
1527
1528struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
1529struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
1530struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
1531struct ggml_tensor * targets = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
1532
1533int n_past = 0;
1534
1535struct ggml_cgraph * gf = NULL;
1536gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
1537
1538get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
1539
1540struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
1541// struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
1542struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
1543
1544ggml_build_forward_expand(gf, e);
1545ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
1546
1547float error_before_opt = ggml_get_f32_1d(e, 0);
1548
1549struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
1550opt_params_lbfgs.print_forward_graph = false;
1551opt_params_lbfgs.print_backward_graph = false;
1552opt_params_lbfgs.lbfgs.n_iter = 16;
1553ggml_opt(ctx0, opt_params_lbfgs, e);
1554//
1555ggml_build_forward_expand(gf, e);
1556ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
1557
1558float error_after_opt = ggml_get_f32_1d(e, 0);
1559
1560if (ex % 8 == 0) {
1561printf("Example %d\n", (ex+1));
1562printf("error_before_opt: %.2f\n", error_before_opt);
1563printf("error_after_opt: %.2f\n", error_after_opt);
1564}
1565
1566if (ex % 64 == 0) {
1567sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples);
1568// printf("probabilities after optimization:\n");
1569// print_matrix(after_opt_probs);
1570printf("best samples after optimization:\n");
1571print_tokens(after_opt_best_samples, n_vocab);
1572}
1573
1574ggml_free(ctx0);
1575}
1576
1577{
1578int n_gen = 128;
1579int sample_ctx = n_tokens-n_tokens/8;
1580
1581printf("Generating %d tokens.\n", n_gen);
1582
1583struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens);
1584struct ggml_tensor * targets = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
1585
1586get_example_targets(137, tokens_input, targets);
1587for (int i=sample_ctx; i<n_tokens; ++i) {
1588ggml_set_i32_1d(tokens_input, i, n_vocab/2);
1589}
1590
1591for (int i=0; i<sample_ctx-1; ++i) {
1592print_token(ggml_get_i32_1d(tokens_input, i), n_vocab);
1593}
1594printf("---\n");
1595for (int i=0; i<n_gen; ++i) {
1596struct ggml_init_params params = {
1597/*.mem_size =*/ compute_size,
1598/*.mem_buffer =*/ compute_addr,
1599/*.no_alloc =*/ false,
1600};
1601struct ggml_context * ctx0 = ggml_init(params);
1602
1603struct ggml_cgraph * gf = NULL;
1604gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
1605
1606int n_past = 0;
1607struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
1608
1609ggml_build_forward_expand(gf, logits);
1610ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
1611
1612struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
1613struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
1614
1615sample_softmax(logits, probs, best_samples);
1616
1617// int sample_at = n_tokens-1;
1618int token = ggml_get_i32_1d(best_samples, sample_ctx-1);
1619
1620// print_row(probs, sample_at);
1621print_token(token, n_vocab);
1622
1623lshift_examples(tokens_input, targets, 1);
1624ggml_set_i32_1d(tokens_input, 0, 0);
1625ggml_set_i32_1d(tokens_input, sample_ctx-1, token);
1626
1627ggml_free(ctx0);
1628}
1629}
1630
1631print_matrix(model.tok_embeddings);
1632printf("done\n");
1633
1634// ggml_free(kv_self.ctx);
1635// ggml_free(model_lora.ctx);
1636ggml_free(model.ctx);
1637
1638return 0;
1639}
1640