llama

cvector-generator.cpp
503 строки · 17.9 Кб
Перенос по словам
1
#include "arg.h"
2
#include "common.h"
3
#include "llama.h"
4
#include "ggml.h"
5
#include "pca.hpp"
6
#include "mean.hpp"
7

8
#ifdef GGML_USE_CUDA
9
#include "ggml-cuda.h"
10
#endif
11

12
#ifdef GGML_USE_METAL
13
#include "ggml-metal.h"
14
#endif
15

16
#include <algorithm>
17
#include <climits>
18
#include <cstdio>
19
#include <cstring>
20
#include <fstream>
21
#include <iostream>
22
#include <string>
23
#include <tuple>
24
#include <vector>
25

26

27
//////////////////////////////////////////////////
28
// utils
29

30
template <class Iter>
31
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
32
    std::string ret;
33
    for (; begin != end; ++begin) {
34
        ret += llama_token_to_piece(ctx, *begin);
35
    }
36

37
    return ret;
38
}
39

40
// Print a few example command lines to stdout.
// The first argument (argc) is unused; argv[0] is printed as the program name.
static void print_usage(int, char ** argv) {
    const char * prog = argv[0];

    printf("\nexample usage:\n");
    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", prog);
    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", prog);
    printf("\n    advanced:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", prog);
    printf("\n    using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", prog);
    printf("\n");
}
48

49
//////////////////////////////////////////////////
50

51

52
// cb_eval is reused for each pair of positive - negative prompt
53
struct callback_data {
54
    ggml_context * ctx_ggml = nullptr;   // holds v_pos, v_neg, v_diff_filtered
55

56
    int n_layers = 0;
57
    int n_tokens = 0;
58
    bool is_eval_pos = true;
59

60
    // each element of the vector correspond to one layer
61
    std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
62
    std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
63
    std::vector<struct ggml_tensor *> v_diff_filtered;   // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
64

65
    // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
66
    void save_tensor_for_layer(struct ggml_tensor * t) {
67
        GGML_ASSERT(t->type == GGML_TYPE_F32);
68

69
        if (ctx_ggml == nullptr) {
70
            // alloc a new ctx_ggml if needed
71
            struct ggml_init_params params_ggml = {
72
                /*.mem_size   =*/ ggml_tensor_overhead() * n_layers * 3u,
73
                /*.mem_buffer =*/ NULL,
74
                /*.no_alloc   =*/ true,
75
            };
76
            ctx_ggml = ggml_init(params_ggml);
77
        }
78

79
        // copy tensor data
80
        auto n_bytes = ggml_nbytes(t);
81
        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
82
        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
83
        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
84
        ggml_set_name(t_layer, ggml_get_name(t));
85
        //print_debug_tensor(t_layer);
86

87
        if (is_eval_pos) {
88
            v_pos.push_back(t_layer);
89
        } else {
90
            v_neg.push_back(t_layer);
91
        }
92
    }
93

94
    // calculate diff (v_pos - v_neg) and place the result back to v_pos
95
    // all zero rows in the diff tensor will also be removed
96
    // NOTE: final layer is ignored. we only have (n_layers - 1) to process
97
    std::vector<struct ggml_tensor *> calc_diff() {
98
        for (float il = 0; il < v_pos.size(); il++) {
99
            float * a = (float *) v_pos[il]->data;
100
            float * b = (float *) v_neg[il]->data;
101
            size_t n_elem = ggml_nelements(v_pos[il]);
102
            for (size_t j = 0; j < n_elem; j++) {
103
                a[j] -= b[j];
104
            }
105
            //print_debug_tensor(v_pos[i]);
106
            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
107
            v_diff_filtered.push_back(diff_filtered);
108
        }
109
        return v_diff_filtered; // for convinient, we return the result std::vector
110
    }
111

112
    // delete zero rows from a given 2D tensor
113
    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
114
        //printf("filter_nonzero_rows\n");
115
        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
116
            // check if given row containing all zero elements
117
            int n_cols = t->ne[0]; // hint: should be equal to n_embd
118
            for (int col = 0; col < n_cols; ++col) {
119
                if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
120
                    return false;
121
                }
122
            }
123
            return true;
124
        };
125
        std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered)
126
        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
127
            if (!is_row_all_zeros(a, i_row, 1e-6)) {
128
                rows_to_copy.push_back(i_row);
129
            }
130
        }
131

132
        // get "n_nonzero_rows" for the output "diff_filtered"
133
        int n_nonzero_rows = rows_to_copy.size();
134
        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
135
        int n_embd = a->ne[0];
136
        GGML_ASSERT(n_nonzero_rows > 0);
137

138
        // diff_filtered: [n_embd, n_nonzero_rows]
139
        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
140
            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
141
        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
142
        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
143

144
        // copy non-zero rows
145
        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
146
            int src_row = rows_to_copy[dest_row];
147
            for (int i = 0; i < n_embd; i++) {
148
                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
149
                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
150
            }
151
        }
152

153
        //print_debug_tensor(diff_filtered);
154

155
        return diff_filtered;
156
    }
157

158
    // we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
159
    void reset() {
160
        for (auto ptr : v_pos) free(ptr->data);
161
        for (auto ptr : v_neg) free(ptr->data);
162
        for (auto ptr : v_diff_filtered) free(ptr->data);
163
        v_pos.clear();
164
        v_neg.clear();
165
        v_diff_filtered.clear();
166
        if (ctx_ggml) {
167
            ggml_free(ctx_ggml);
168
        }
169
        ctx_ggml = nullptr;
170
    }
171
};
172

173
/**
174
 * process_ctx is used to store the ggml context for pre-post processing the diff vectors
175
 * in short, input => v_diff and output => v_final
176
 */
177
struct train_context {
178
    ggml_context * ctx_ggml;
179
    int n_embd;
180
    int n_layers;
181

182
    /* pair of prompts to be used for generating final vector */
183
    std::vector<std::string> positive_entries;
184
    std::vector<std::string> negative_entries;
185

186
    // each element of the vector correspond to one layer
187
    // NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
188
    // NOTE (2): v_diff is transposed from v_diff_tmp
189
    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
190
    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
191

192
    // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
193
    // v_diff_tmp will get converted unto v_diff later on
194
    std::vector<std::vector<uint8_t>> v_diff_tmp;
195

196
    train_context(int n_embd_, int n_layers_) {
197
        n_embd = n_embd_;
198
        n_layers = n_layers_;
199
        struct ggml_init_params params_ggml = {
200
            /*.mem_size   =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
201
            /*.mem_buffer =*/ NULL,
202
            /*.no_alloc   =*/ true,
203
        };
204
        ctx_ggml = ggml_init(params_ggml);
205
        for (int il = 0; il < n_layers - 1; il++) {
206
            std::vector<uint8_t> empty;
207
            v_diff_tmp.push_back(empty);
208
            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
209
            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
210
            v_final.push_back(t);
211
        }
212
    }
213

214
    // add new rows into existing tensor in v_diff_tmp
215
    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
216
        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
217
        for (int il = 0; il < n_layers - 1; il++) {
218
            auto t = diff_filtered[il];
219
            auto & diff_tmp = v_diff_tmp[il];
220
            size_t curr_size = diff_tmp.size();
221
            diff_tmp.resize(curr_size + ggml_nbytes(t));
222
            memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
223
        }
224
    }
225

226
    // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
227
    // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
228
    void build_v_diff(bool transpose) {
229
        printf("build_v_diff\n");
230
        for (int il = 0; il < n_layers - 1; il++) {
231
            auto & diff_tmp = v_diff_tmp[il];
232
            int n_elem = diff_tmp.size() / sizeof(float);
233
            GGML_ASSERT(n_elem % n_embd == 0);
234
            int n_rows = n_elem / n_embd;
235
            struct ggml_tensor * diff = transpose
236
                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
237
                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
238
            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
239
            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
240
            if (transpose) {
241
                // copy data & transpose
242
                float * arr = (float *) diff_tmp.data();
243
                for (int ir = 0; ir < n_rows; ++ir) {
244
                    for (int ic = 0; ic < n_embd; ++ic) {
245
                        float f = arr[ir*n_embd + ic];
246
                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
247
                    }
248
                }
249
            } else {
250
                // only copy
251
                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
252
            }
253
            v_diff.push_back(diff);
254
            print_debug_tensor(diff);
255
            // free memory of diff_tmp
256
            diff_tmp.resize(0);
257
        }
258
    }
259

260
    ~train_context() {
261
        for (auto ptr : v_final) free(ptr->data);
262
        for (auto ptr : v_diff) free(ptr->data);
263
        // no need to free v_diff_tmp, since we didn't use malloc
264
        ggml_free(ctx_ggml);
265
    }
266
};
267

268
struct tokenized_prompt {
269
    std::vector<llama_token> tokens_pos;
270
    std::vector<llama_token> tokens_neg;
271
    size_t max_seq_len;
272

273
    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
274
        const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
275
        tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
276
        tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
277
        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
278
        padding_seq(ctx, tokens_pos, max_seq_len);
279
        padding_seq(ctx, tokens_neg, max_seq_len);
280
    }
281

282
    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
283
        // TODO: customize padding token
284
        std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
285
        llama_token pad_tok = pad_tokens.back();
286
        while (tokens.size() < len) {
287
            tokens.push_back(pad_tok);
288
        }
289
    }
290
};
291

292
//////////////////////////////////////////////////
293

294
// Convert any value that supports operator<< on a stream into its textual form.
template <typename T>
static std::string to_string(const T & val) {
    std::stringstream buffer;
    buffer << val;
    std::string text = buffer.str();
    return text;
}
300

301
static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
302
    std::vector<std::string> output;
303
    std::ifstream file(path);
304
    if (!file.is_open()) {
305
        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
306
        exit(1);
307
    }
308
    std::string line;
309
    while (std::getline(file, line)) {
310
        bool is_skip = skip_empty_lines && line.empty();
311
        if (!is_skip) {
312
            string_process_escapes(line);
313
            output.push_back(line);
314
        }
315
    }
316
    file.close();
317
    return output;
318
}
319

320
//////////////////////////////////////////////////
321

322
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
323
    auto * cb_data = (callback_data *) user_data;
324
    static const char * l_out_name = "l_out";
325
    const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
326

327
    if (ask) {
328
        return is_l_out;
329
    }
330

331
    if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
332
        return true;
333
    }
334

335
    // save the tensor to current context
336
    cb_data->save_tensor_for_layer(t);
337
    return true;
338
}
339

340
// Clear the KV cache and decode `tokens` as a single batch.
// Returns true on success; on failure prints a message to stderr and returns false.
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_kv_cache_clear(ctx);

    const bool failed = llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0)) != 0;
    if (!failed) {
        return true;
    }

    fprintf(stderr, "%s : failed to eval\n", __func__);
    return false;
}
348

349
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
350
    struct gguf_context * ctx = gguf_init_empty();
351

352
    const std::string arch = "controlvector";
353
    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
354
    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
355
    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
356

357
    for (size_t i = 0; i < v_ctrl.size(); ++i) {
358
        gguf_add_tensor(ctx, v_ctrl[i]);
359
        print_debug_tensor(v_ctrl[i]);
360
        printf("Added tensor: %s\n", v_ctrl[i]->name);
361
    }
362

363
    printf("%s: writing file...\n", __func__);
364
    gguf_write_to_file(ctx, fname.c_str(), false);
365
    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
366
    gguf_free(ctx);
367
}
368

369
/**
370
 * Load prompt files and completion file.
371
 * Then format each pair of prompt + completion to make an entry.
372
 */
373
static int prepare_entries(gpt_params & params, train_context & ctx_train) {
374
    // load prompts
375
    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
376
    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
377
    if (positive_prompts.size() != negative_prompts.size()) {
378
        fprintf(stderr, "number of positive and negative prompts must be equal\n");
379
        return 1;
380
    }
381
    if (positive_prompts.empty()) {
382
        fprintf(stderr, "must provide at least one prompt pair\n");
383
        return 1;
384
    }
385
    ctx_train.positive_entries = positive_prompts;
386
    ctx_train.negative_entries = negative_prompts;
387
    return 0;
388
}
389

390
int main(int argc, char ** argv) {
391
    gpt_params params;
392

393
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
394
        return 1;
395
    }
396

397
    if (params.n_pca_iterations % params.n_pca_batch != 0) {
398
        fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
399
        return 1;
400
    }
401

402

403
    callback_data cb_data;
404

405
    // pass the callback to the backend scheduler
406
    // it will be executed for each node during the graph computation
407
    params.cb_eval = cb_eval;
408
    params.cb_eval_user_data = &cb_data;
409
    params.warmup = false;
410

411
    print_build_info();
412
    llama_backend_init();
413
    llama_numa_init(params.numa);
414

415
    // load the model to get hparams
416
    llama_init_result llama_init = llama_init_from_gpt_params(params);
417

418
    llama_model * model = llama_init.model;
419
    llama_context * ctx = llama_init.context;
420

421
    // int n_ctx = llama_n_ctx(ctx);
422
    int n_layers = llama_n_layer(model);
423
    int n_embd = llama_n_embd(model);
424
    // get model hint param (a.k.a model arch name)
425
    char model_hint[128];
426
    llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
427

428
    // init train_context
429
    train_context ctx_train(n_embd, n_layers);
430

431
    // load and prepare entries for training
432
    prepare_entries(params, ctx_train);
433

434
    // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped
435
    std::vector<tokenized_prompt> tokenized_prompts;
436
    size_t n_total_tokens = 0;
437
    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
438
        tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
439
        n_total_tokens += 2 * t.max_seq_len;
440
        tokenized_prompts.push_back(std::move(t));
441
    }
442

443
    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
444

445
    for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
446
        bool success = false;
447
        tokenized_prompt t = tokenized_prompts[i];
448
        cb_data.n_layers = n_layers;
449
        cb_data.n_tokens = t.max_seq_len;
450

451
        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
452
            (int) i+1, (int) ctx_train.positive_entries.size(),
453
            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
454
            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
455
            (int) t.max_seq_len);
456

457
        cb_data.is_eval_pos = true;
458
        success = get_hidden_layers(ctx, t.tokens_pos);
459
        if (!success) break;
460

461
        cb_data.is_eval_pos = false;
462
        success = get_hidden_layers(ctx, t.tokens_neg);
463
        if (!success) break;
464

465
        // calculate diff and remove all zero rows
466
        auto v_diff_filtered = cb_data.calc_diff();
467

468
        // save & concat the filtered v_diff to ctx_train
469
        ctx_train.concat_diff_tmp(v_diff_filtered);
470

471
        // reset for next iteration
472
        cb_data.reset();
473
    }
474

475
    // done with the model, we can now free it to make gain some memory
476
    printf("Done evaluate prompts, unload model...\n");
477
    llama_free(ctx);
478
    llama_free_model(model);
479

480
    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
481

482
    // prepare ctx_train for PCA
483
    ctx_train.build_v_diff(use_pca);
484

485
    if (use_pca) {
486
        // run PCA
487
        PCA::pca_params pca_params;
488
        pca_params.n_threads    = params.cpuparams.n_threads;
489
        pca_params.n_batch      = params.n_pca_batch;
490
        pca_params.n_iterations = params.n_pca_iterations;
491
        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
492
    } else {
493
        // run mean
494
        mean::run(ctx_train.v_diff, ctx_train.v_final);
495
    }
496

497
    // write output vectors to gguf
498
    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
499

500
    llama_backend_free();
501

502
    return 0;
503
}
504
llama

Использование cookies