#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "log.h"

#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cmath>    // std::float_t (used by llama_file::read_f32)
#include <cstring>
#include <cstdarg>
#include <cinttypes>
#include <ctime>
#include <random>
#include <stdexcept>
#include <sstream>
#include <algorithm>
#include <string>

// GGUF keys & tensor names.

#define KV_GENERAL_ARCHITECTURE        "general.architecture"
#define KV_GENERAL_NAME                "general.name"

#define KV_TOKENIZER_MODEL             "tokenizer.ggml.model"
#define KV_TOKENIZER_LIST              "tokenizer.ggml.tokens"
#define KV_TOKENIZER_TOKEN_TYPE        "tokenizer.ggml.token_type"
#define KV_TOKENIZER_SCORES            "tokenizer.ggml.scores"
#define KV_TOKENIZER_BOS_ID            "tokenizer.ggml.bos_token_id"
#define KV_TOKENIZER_EOS_ID            "tokenizer.ggml.eos_token_id"
#define KV_TOKENIZER_UNK_ID            "tokenizer.ggml.unknown_token_id"
#define KV_TOKENIZER_SEP_ID            "tokenizer.ggml.seperator_token_id" // [sic] misspelling kept to match the key written by existing GGUF tooling
#define KV_TOKENIZER_PAD_ID            "tokenizer.ggml.padding_token_id"
#define KV_TOKENIZER_HF_JSON           "tokenizer.huggingface.json"

#define KV_CONTEXT_LENGTH              "llama.context_length"
#define KV_EMBEDDING_LENGTH            "llama.embedding_length"
#define KV_BLOCK_COUNT                 "llama.block_count"
#define KV_FEED_FORWARD_LENGTH         "llama.feed_forward_length"
#define KV_ATTENTION_HEAD_COUNT        "llama.attention.head_count"
#define KV_ATTENTION_HEAD_COUNT_KV     "llama.attention.head_count_kv"
#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
#define KV_ROPE_DIMENSION_COUNT        "llama.rope.dimension_count"

#define TN_TOKEN_EMBD  "token_embd.weight"
#define TN_OUTPUT_NORM "output_norm.weight"
#define TN_OUTPUT      "output.weight"
#define TN_ATTN_NORM   "blk.%d.attn_norm.weight"
#define TN_ATTN_Q      "blk.%d.attn_q.weight"
#define TN_ATTN_K      "blk.%d.attn_k.weight"
#define TN_ATTN_V      "blk.%d.attn_v.weight"
#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
#define TN_FFN_NORM    "blk.%d.ffn_norm.weight"
#define TN_FFN_GATE    "blk.%d.ffn_gate.weight"
#define TN_FFN_DOWN    "blk.%d.ffn_down.weight"
#define TN_FFN_UP      "blk.%d.ffn_up.weight"

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define LLAMA_FILE_MAGIC_GGJT      0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3

#define TOKENIZER_NAME "llama"
#define UNKNOWN_TOKEN_ID 0
#define BOS_TOKEN_ID 1
#define EOS_TOKEN_ID 2

//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct {
    int dim;        // transformer dimension
    int hidden_dim; // for ffn layers
    int n_layers;   // number of layers
    int n_heads;    // number of query heads
    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
    int vocab_size; // vocabulary size, usually 256 (byte-level)
    int seq_len;    // max sequence length
} Config;
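
// A llama2.c checkpoint is this Config header (seven int32 values) followed
// directly by the raw float32 weights, in the order read by
// checkpoint_init_weights() below. By llama2.c convention, a negative
// vocab_size means the classifier weights (wcls) are stored separately
// instead of being shared with the token embedding table.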

struct TransformerWeights {
    // token embedding table
    std::vector<float> token_embedding_table; // (vocab_size, dim)
    // weights for rmsnorms
    std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
    std::vector<float> rms_ffn_weight; // (layer, dim)
    // weights for matmuls
    std::vector<float> wq; // (layer, dim, dim)
    std::vector<float> wk; // (layer, dim, dim)
    std::vector<float> wv; // (layer, dim, dim)
    std::vector<float> wo; // (layer, dim, dim)
    // weights for ffn
    std::vector<float> w1; // (layer, hidden_dim, dim)
    std::vector<float> w2; // (layer, dim, hidden_dim)
    std::vector<float> w3; // (layer, hidden_dim, dim)
    // final rmsnorm
    std::vector<float> rms_final_weight; // (dim,)
    // freq_cis for RoPE relative positional embeddings
    // std::vector<float> freq_cis_real; // (seq_len, dim/2)
    // std::vector<float> freq_cis_imag; // (seq_len, dim/2)
    // (optional) classifier weights for the logits, on the last layer
    std::vector<float> wcls;
};

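// Resize all weight buffers according to the Config. With grouped-query
// attention (n_kv_heads < n_heads) the K/V projections are smaller than Q by
// a factor of n_heads / n_kv_heads, computed below as n_multiqueries.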
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
    const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
    try {
        w->token_embedding_table.resize(p->vocab_size * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n", __func__, p->vocab_size, p->dim, p->vocab_size * p->dim);

        w->rms_att_weight.resize(p->n_layers * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n", __func__, p->n_layers, p->dim, p->n_layers * p->dim);

        w->rms_ffn_weight.resize(p->n_layers * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n", __func__, p->n_layers, p->dim, p->n_layers * p->dim);

        w->wq.resize(p->n_layers * p->dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n", __func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n", __func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n", __func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wo.resize(p->n_layers * p->dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n", __func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n", __func__, p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n", __func__, p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

        w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n", __func__, p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->rms_final_weight.resize(p->dim);
        LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n", __func__, p->dim);

        if (shared_weights) {
            w->wcls = {};
        } else {
            w->wcls.resize(p->vocab_size * p->dim);
            LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n", __func__, p->vocab_size, p->dim, p->vocab_size * p->dim);
        }
    }
    catch (std::length_error &) {
        die("Invalid configuration. Failed to allocate memory for weights");
    }
}

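// Read the flat float32 weight arrays of a llama2.c checkpoint into the
// buffers sized by alloc_weights(). The read order must match llama2.c's
// export order exactly; returns 0 on success, 1 on any short read.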
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
    if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
    if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
    if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
    if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
    if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
    if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
    if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
    if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
    if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
    if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
    if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;

    // Skip freq_cis_real & freq_cis_imag
    int head_size = p->dim / p->n_heads;
    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);

    if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;

    // Check we didn't forget to read anything
    auto curr = ftell(f);
    fseek(f, 0, SEEK_END);
    auto end = ftell(f);
    if (curr != end) {
        LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
        return 1;
    }

    return 0;
}

static void print_sample_weights(TransformerWeights *w){
    LOG_INF("----- Quick print of first of the weight values of all the variables\n");
    LOG_INF("%f\n", w->token_embedding_table[0]);
    LOG_INF("%f\n", w->rms_att_weight[0]);
    LOG_INF("%f\n", w->rms_ffn_weight[0]);

    LOG_INF("%f\n", w->wq[0]);
    LOG_INF("%f\n", w->wk[0]);
    LOG_INF("%f\n", w->wv[0]);
    LOG_INF("%f\n", w->wo[0]);
    LOG_INF("%f\n", w->w1[0]);
    LOG_INF("%f\n", w->w2[0]);
    LOG_INF("%f\n", w->w3[0]);
    LOG_INF("%f\n", w->rms_final_weight[0]);
    if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.

struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
    using ttype = llama_token_type;

    struct token_data {
        token text;
        float score;
        ttype type;
    };

    std::unordered_map<token, id> token_to_id;
    std::vector<token_data>       id_to_token;
};

struct my_llama_hparams {
    uint32_t n_vocab   = 32000;
    uint32_t n_ctx     = 512;   // this is provided as user input?
    uint32_t n_embd    = 4096;
    uint32_t n_ff      = 11008;
    uint32_t n_mult    = 4;
    uint32_t n_head    = 32;
    uint32_t n_head_kv = 32;
    uint32_t n_layer   = 32;
    uint32_t n_rot     = 64;

    // memcmp is safe here: the struct holds only uint32_t fields, so there is no padding
    bool operator!=(const my_llama_hparams& other) const {
        return memcmp(this, &other, sizeof(my_llama_hparams)) != 0;
    }
};

struct my_llama_layer {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct my_llama_model {
    struct ggml_context * ctx = NULL;

    std::string name;

    my_llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * output;

    std::vector<my_llama_layer> layers;

    uint32_t train_its = 0;
    uint32_t train_samples = 0;
    uint32_t train_tokens = 0;
};

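// Command-line / training parameters. Only a subset is consumed by this
// converter (the file names, n_ctx, n_rotmax, mem_model_gb); the rest appear
// to be leftovers from the related training examples.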
struct train_params {
    const char * fn_vocab_model;
    const char * fn_llama2c_model;
    const char * fn_llama2c_output_model;
    const char * fn_train_data;
    const char * fn_checkpoint_in;
    const char * fn_checkpoint_out;
    const char * fn_model_out;

    uint32_t seed;

    int n_ctx;
    int n_embd;
    int n_mult;
    int n_head;
    int n_layer;
    int n_rotmax;

    int n_threads;
    int n_batch;
    int n_examples;
    int n_predict;

    int print_info_interval;
    int print_details_interval;

    bool samples_start_after_nl;
    bool use_adam;
    bool use_flash;
    bool use_scratch;

    // only adam
    int   warmup;
    int   cos_decay_steps;
    float cos_decay_restart;
    float cos_decay_alpha;

    int   lbfgs_n_iter;
    int   adam_n_iter;
    float adam_alpha;
    float adam_decay;

    int mem_model_gb;
    int mem_compute_gb;
    int mem_compute0_gb;
    int mem_compute1_gb;
};

static void print_params(struct my_llama_hparams * params) {
    LOG_INF("%s: n_vocab:   %u\n", __func__, params->n_vocab);
    LOG_INF("%s: n_ctx:     %u\n", __func__, params->n_ctx);
    LOG_INF("%s: n_embd:    %u\n", __func__, params->n_embd);
    LOG_INF("%s: n_mult:    %u\n", __func__, params->n_mult);
    LOG_INF("%s: n_head:    %u\n", __func__, params->n_head);
    LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
    LOG_INF("%s: n_ff:      %u\n", __func__, params->n_ff);
    LOG_INF("%s: n_layer:   %u\n", __func__, params->n_layer);
    LOG_INF("%s: n_rot:     %u\n", __func__, params->n_rot);
}

static void print_tensor_info(const struct ggml_context * ctx) {
    for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        LOG_INF("%s: Allocating ", __func__);
        int64_t total = 1;
        int i = 0;
        for (; i < ggml_n_dims(t); ++i) {
            if (i > 0) LOG("x ");
            LOG("[%" PRId64 "] ", t->ne[i]);
            total *= t->ne[i];
        }
        if (i > 1) LOG("= [%" PRId64 "] ", total);
        LOG("float space for %s\n", ggml_get_name(t));
    }
}

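// Note on ggml shapes: ne[0] is the innermost (row) dimension, so
// ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab) below creates a
// matrix with n_vocab rows of n_embd floats each.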
static void init_model(struct my_llama_model * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;

    const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;

    const uint32_t n_ff = hparams.n_ff;
    struct ggml_context * ctx = model->ctx;

    model->train_its = 0;
    model->train_samples = 0;
    model->train_tokens = 0;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);

    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
    ggml_set_name(model->norm,           "norm.weight");
    ggml_set_name(model->output,         "output.weight");

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);

        ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());

        ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
        ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
        ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
        ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());

        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());

        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
    }

    print_tensor_info(ctx);
}

static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

static void print_row(struct ggml_tensor * probs, int i) {
    for (int k = 0; k < probs->ne[0]; ++k) {
        float p = get_f32_2d(probs, k, i);
        LOG(" %f", p);
    }
    LOG("\n");
}

static void print_matrix(struct ggml_tensor * probs) {
    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
            LOG(" %.2f", p);
        }
        LOG("\n");
    }
}

struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            size = 0;
        } else {
            seek(0, SEEK_END);
            size = tell();
            seek(0, SEEK_SET);
        }
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        GGML_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        GGML_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            die_fmt("fread failed: %s", strerror(errno));
        }
        if (ret != 1) {
            die("unexpectedly reached end of file");
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::float_t read_f32() {
        std::float_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    ~llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

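// A GGUF file starts with the 4-byte magic "GGUF"; llama2.c vocab files carry
// no magic, so this check decides which branch load_vocab() takes.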
static bool is_ggml_file(const char * filename) {
    llama_file file(filename, "rb");
    if (file.size < 4) {
        return false;
    }
    std::string magic = file.read_string(4);
    return magic == GGUF_MAGIC;
}

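// SentencePiece convention: spaces inside tokens are stored as U+2581 "▁"
// (UTF-8 "\xe2\x96\x81") so that word boundaries survive as plain text.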
static std::string llama_escape_whitespaces(const std::string & text) {
    std::ostringstream out;
    for (char c : text) {
        if (c == ' ') out << "\xe2\x96\x81";
        else out << c;
    }
    return out.str();
}

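// Load the vocabulary either from an existing GGUF model file or, as a
// fallback, from a raw llama2.c tokenizer file (a u32 max_token_length
// header, then per token: f32 score, u32 byte length, the token bytes).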
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
    if (is_ggml_file(filename)) {
        LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
            /*.no_alloc = */ false,
            /*.ctx      = */ &ctx_data,
        };

        struct gguf_context * ctx = gguf_init_from_file(filename, params);
        GGML_ASSERT(ctx != NULL);

        const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
        GGML_ASSERT(model_idx >= 0);
        std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
        GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);

        const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
        GGML_ASSERT(token_idx >= 0);

        const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
        GGML_ASSERT(score_idx >= 0);
        const float * scores = (const float *) gguf_get_arr_data(ctx, score_idx);

        const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
        GGML_ASSERT(toktype_idx >= 0);
        const int * toktypes = (const int *) gguf_get_arr_data(ctx, toktype_idx);

        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
        if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
            die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
        }

        vocab->id_to_token.resize(n_vocab);

        for (uint32_t i = 0; i < n_vocab; i++) {
            std::string word = gguf_get_arr_str(ctx, token_idx, i);

            vocab->token_to_id[word] = i;

            auto & token_data = vocab->id_to_token[i];
            token_data.text  = std::move(word);
            token_data.score = scores[i];
            token_data.type  = (llama_token_type) toktypes[i];
        }
        ggml_free(ctx_data);
        gguf_free(ctx);
    } else {
        // assume llama2.c vocabulary
        LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
        llama_file file(filename, "rb");
        if (!file.fp) {
            die_fmt("%s: %s", strerror(errno), filename);
        }
        const int n_vocab = config->vocab_size;
        /* uint32_t max_token_length = */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
        for (llama_vocab::id id = 0; id < n_vocab; ++id) {
            float_t score = file.read_f32();
            uint32_t len = file.read_u32();
            std::string text = file.read_string(len);

            unsigned char byte_val;
            llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
            if (id == UNKNOWN_TOKEN_ID) {
                text = "<unk>";
                type = LLAMA_TOKEN_TYPE_UNKNOWN;
            } else if (id == BOS_TOKEN_ID) {
                text = "<s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (id == EOS_TOKEN_ID) {
                text = "</s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (text.empty()) {
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
                // Text of byte tokens is already in the expected format.
                type = LLAMA_TOKEN_TYPE_BYTE;
            } else {
                type = LLAMA_TOKEN_TYPE_NORMAL;
            }
            text = llama_escape_whitespaces(text);

            vocab->id_to_token[id].text = text;
            vocab->id_to_token[id].score = score;
            vocab->id_to_token[id].type = type;
            vocab->token_to_id.emplace(text, id);
        }
    }
}

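// Copy a flat llama2.c float array into a ggml tensor element by element;
// karpathy_weights must hold at least as many floats as gg_weights has
// elements, with matching element order (flat index -> ggml_unravel_index).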
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
    int size = 1;
    for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
        size *= gg_weights->ne[dim];
    }
    for (int ct = 0; ct < size; ++ct) {
        int64_t i0 = 0; int64_t i1 = 0;
        int64_t i2 = 0; int64_t i3 = 0;
        ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
        ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
    }
}

static void save_as_llama_model(
    struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
) {
    // convert AK weights into GG weights one by one.
    // w->token_embedding_table -> model->tok_embeddings
    // float*                   -> struct ggml_tensor
    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
    convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());

    convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
    //print_row(model->norm, 0);

    // for rms-att-weight
    int row_length = model->hparams.n_embd;
    int n_ff = model->hparams.n_ff;

    const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;

    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
        auto & layer = model->layers[i];
        // 1d
        convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
        convert_weights_ak_to_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);

        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
        convert_weights_ak_to_gg(layer.wq            , &w->wq[i*row_length*row_length]);
        convert_weights_ak_to_gg(layer.wo            , &w->wo[i*row_length*row_length]);
        // from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length/n_multiqueries]);
        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length/n_multiqueries]);

        convert_weights_ak_to_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
        convert_weights_ak_to_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
        convert_weights_ak_to_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
    }

    struct gguf_context * ctx = gguf_init_empty();

    std::vector<const char*> tokens;
    std::vector<float> scores;
    std::vector<llama_token_type> token_types;
    for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
        tokens.push_back(token_data.text.c_str());
        scores.push_back(token_data.score);
        token_types.push_back(token_data.type);
    }
    gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());

    gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);

    gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
    gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");

    // special tokens
    gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);

    gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
    gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
    gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
    gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
    gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
    gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);

    // write tensors
    ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
    gguf_add_tensor(ctx, model->tok_embeddings);

    ggml_set_name(model->norm, TN_OUTPUT_NORM);
    gguf_add_tensor(ctx, model->norm);

    ggml_set_name(model->output, TN_OUTPUT);
    gguf_add_tensor(ctx, model->output);

    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_format_name(layer.wq, TN_ATTN_Q, i);
        gguf_add_tensor(ctx, layer.wq);

        ggml_format_name(layer.wk, TN_ATTN_K, i);
        gguf_add_tensor(ctx, layer.wk);

        ggml_format_name(layer.wv, TN_ATTN_V, i);
        gguf_add_tensor(ctx, layer.wv);

        ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
        gguf_add_tensor(ctx, layer.wo);

        ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
        gguf_add_tensor(ctx, layer.attention_norm);

        ggml_format_name(layer.w1, TN_FFN_GATE, i);
        gguf_add_tensor(ctx, layer.w1);

        ggml_format_name(layer.w2, TN_FFN_DOWN, i);
        gguf_add_tensor(ctx, layer.w2);

        ggml_format_name(layer.w3, TN_FFN_UP, i);
        gguf_add_tensor(ctx, layer.w3);

        ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
        gguf_add_tensor(ctx, layer.ffn_norm);
    }

    gguf_write_to_file(ctx, filename, false);
    gguf_free(ctx);
}

static struct train_params get_default_train_params() {
    struct train_params params;
    params.fn_vocab_model          = "models/7B/ggml-model-f16.gguf";
    params.fn_llama2c_output_model = "ak_llama_model.bin";
    params.fn_train_data           = "shakespeare.txt";
    params.fn_checkpoint_in        = "checkpoint.bin";
    params.fn_checkpoint_out       = "checkpoint.bin";
    params.fn_model_out            = "ggml-checkpoint-f32.bin";

    params.seed = -1;

    params.n_ctx    = 128;
    params.n_embd   = 256;
    params.n_mult   = 256;
    params.n_head   = 8;
    params.n_layer  = 16;
    params.n_rotmax = 64;

    params.n_threads  = 6;
    params.n_batch    = 8;
    params.n_examples = 8;
    params.n_predict  = 1024;

    params.print_info_interval    = 1;
    params.print_details_interval = 2;

    params.samples_start_after_nl = false;
    params.use_adam               = true;
    params.use_flash              = false;
    params.use_scratch            = true;

    // only adam
    params.warmup            = 100;
    params.cos_decay_steps   = 1000;
    params.cos_decay_restart = 1.1f;
    params.cos_decay_alpha   = 0.0f;

    params.lbfgs_n_iter = 16;
    params.adam_n_iter  = 16;
    params.adam_alpha   = 1e-3f;
    params.adam_decay   = 1e-3f;

    params.mem_model_gb    = 2;
    params.mem_compute_gb  = 24;
    params.mem_compute0_gb = 8;
    params.mem_compute1_gb = 2;

    return params;
}

static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help                       show this help message and exit\n");
    fprintf(stderr, "  --copy-vocab-from-model FNAME    path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
    fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
    fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model);
    fprintf(stderr, "\n");
}

static bool params_parse(int argc, char ** argv, struct train_params * params) {
    bool invalid_param = false;
    bool reqd_param_found = false;
    std::string arg;
    struct train_params default_params = get_default_train_params();
    const std::string arg_prefix = "--";

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        if (arg == "--copy-vocab-from-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_vocab_model = argv[i];
        } else if (arg == "--llama2c-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            reqd_param_found = true;
            params->fn_llama2c_model = argv[i];
        } else if (arg == "--llama2c-output-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_llama2c_output_model = argv[i];
        } else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv, &default_params);
            exit(0);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            print_usage(argc, argv, &default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv, &default_params);
        exit(1);
    }
    if (!reqd_param_found){
        fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
        print_usage(argc, argv, &default_params);
        exit(1);
    }

    return true;
}

static std::string basename(const std::string & path) {
    size_t pos = path.find_last_of("/\\");
    if (pos == std::string::npos) {
        return path;
    }
    return path.substr(pos + 1);
}

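// Example invocation (the binary and checkpoint names below are illustrative;
// your build may name the executable differently):
//
//   ./convert-llama2c-to-ggml \
//       --copy-vocab-from-model models/7B/ggml-model-f16.gguf \
//       --llama2c-model stories15M.bin \
//       --llama2c-output-model stories15M.gguf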
int main(int argc, char ** argv) {
    gpt_init();

    struct train_params params = get_default_train_params();
    if (!params_parse(argc, argv, &params)) {
        return 1;
    }

    Config config;
    TransformerWeights weights = {};
    {
        LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
        FILE * file = fopen(params.fn_llama2c_model, "rb");
        if (!file) {
            LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
        // read in the config header
        if (fread(&config, sizeof(Config), 1, file) != 1) {
            LOG_ERR("%s: Unable to read llama2c config from %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
        // a negative vocab_size is the llama2.c convention for unshared classifier weights
        auto shared_weights = config.vocab_size > 0;
        config.vocab_size = abs(config.vocab_size);

        // read in the Transformer weights
        alloc_weights(&weights, &config, shared_weights);
        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
            LOG_ERR("%s: Unable to initialize transformer weights from %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
        fclose(file);
    }

    struct llama_vocab vocab;
    load_vocab(params.fn_vocab_model, &config, &vocab);

    struct my_llama_model model;
    model.hparams.n_vocab   = config.vocab_size; //llama_n_vocab(lctx);
    model.hparams.n_ctx     = params.n_ctx;
    model.hparams.n_embd    = config.dim; //params.n_embd;
    model.hparams.n_ff      = config.hidden_dim;
    model.hparams.n_mult    = 32; //params.n_mult;
    model.hparams.n_head    = config.n_heads; //params.n_head;
    model.hparams.n_head_kv = config.n_kv_heads;
    model.hparams.n_layer   = config.n_layers; //params.n_layer;
    model.hparams.n_rot     = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);

    print_params(&model.hparams);

    struct ggml_init_params lcparams;
    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
    lcparams.mem_buffer = NULL;
    lcparams.no_alloc   = false;

    model.ctx = ggml_init(lcparams);

    init_model(&model);
    model.name = basename(params.fn_llama2c_model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

    LOG_INF("%s: Saved llama2.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);

    ggml_free(model.ctx);
    return 0;
}