llama

imatrix.cpp
646 строк · 21.9 Кб
Перенос по словам
1
#include "arg.h"
2
#include "common.h"
3
#include "log.h"
4
#include "llama.h"
5

6
#include <cmath>
7
#include <cstdio>
8
#include <cstring>
9
#include <ctime>
10
#include <sstream>
11
#include <thread>
12
#include <mutex>
13
#include <vector>
14
#include <fstream>
15
#include <unordered_map>
16
#include <algorithm>
17

18
#if defined(_MSC_VER)
19
#pragma warning(disable: 4244 4267) // possible loss of data
20
#endif
21

22
static void print_usage(int, char ** argv) {
23
    LOG("\nexample usage:\n");
24
    LOG("\n    %s \\\n"
25
            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
26
            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
27
            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
28
    LOG("\n");
29
}
30

31
// Accumulated statistics for a single tensor entry.
struct Stats {
    // running sums of squared activations, one slot per column (per expert and column for MoE entries)
    std::vector<float> values;
    // number of rows that have been added into the matching slot of `values`
    std::vector<int>   counts;
    // number of collection calls (or merged calls from loaded files) for this entry
    int                ncall = 0;
};
36

37
class IMatrixCollector {
38
public:
39
    IMatrixCollector() = default;
40
    void set_params(gpt_params params) { m_params = std::move(params); }
41
    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
42
    void save_imatrix(int ncall = -1) const;
43
    bool load_imatrix(const char * file_name);
44
private:
45
    std::unordered_map<std::string, Stats> m_stats;
46
    gpt_params                             m_params;
47
    std::mutex                             m_mutex;
48
    int                                    m_last_call = 0;
49
    std::vector<float>                     m_src1_data;
50
    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
51
};
52

53
// Strips the prefix and suffix from a tensor name: keeps the text between the
// first '#' and the next '#' (or the end of the string when there is no second '#').
// A name without any '#' is returned unchanged.
// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
static std::string filter_tensor_name(const char * name) {
    const char * first_hash = strchr(name, '#');
    if (first_hash == nullptr) {
        return std::string(name);
    }

    const char * begin       = first_hash + 1;
    const char * second_hash = strchr(begin, '#');
    if (second_hash == nullptr) {
        return std::string(begin);
    }

    return std::string(begin, second_hash);
}
71

72
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
73
    GGML_UNUSED(user_data);
74

75
    const struct ggml_tensor * src0 = t->src[0];
76
    const struct ggml_tensor * src1 = t->src[1];
77
    std::string wname = filter_tensor_name(src0->name);
78

79
    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
80
    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
81
    if (ask) {
82
        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
83
        if (t->op != GGML_OP_MUL_MAT) return false;
84
        // why are small batches ignored (<16 tokens)?
85
        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
86
        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
87
        return true;
88
    }
89

90
    std::lock_guard<std::mutex> lock(m_mutex);
91

92
    // copy the data from the GPU memory if needed
93
    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
94

95
    if (!is_host) {
96
        m_src1_data.resize(ggml_nelements(src1));
97
        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
98
    }
99

100
    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
101

102
    // this has been adapted to the new format of storing merged experts in a single 3d tensor
103
    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
104
    if (t->op == GGML_OP_MUL_MAT_ID) {
105
        //   ids  -> [n_experts_used, n_tokens]
106
        //   src1 -> [cols, n_expert_used, n_tokens]
107
        const ggml_tensor * ids = t->src[2];
108
        const int n_as = src0->ne[2];
109
        const int n_ids = ids->ne[0];
110

111
        // the top-k selected expert ids are stored in the ids tensor
112
        // for simplicity, always copy ids to host, because it is small
113
        // take into account that ids is not contiguous!
114

115
        GGML_ASSERT(ids->ne[1] == src1->ne[2]);
116

117
        m_ids.resize(ggml_nbytes(ids));
118
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
119

120
        auto & e = m_stats[wname];
121

122
        ++e.ncall;
123

124
        if (e.values.empty()) {
125
            e.values.resize(src1->ne[0]*n_as, 0);
126
            e.counts.resize(src1->ne[0]*n_as, 0);
127
        }
128
        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
129
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
130
            exit(1); //GGML_ABORT("fatal error");
131
        }
132
        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
133
        // loop over all possible experts, regardless if they are used or not in the batch
134
        for (int ex = 0; ex < n_as; ++ex) {
135
            size_t e_start = ex*src1->ne[0];
136

137
            for (int idx = 0; idx < n_ids; ++idx) {
138
                for (int row = 0; row < (int)src1->ne[2]; ++row) {
139
                    const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
140

141
                    GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
142

143
                    if (excur != ex) continue;
144

145
                    const int64_t i11 = idx % src1->ne[1];
146
                    const int64_t i12 = row;
147
                    const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
148

149
                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
150
                        e.values[e_start + j] += x[j]*x[j];
151
                        e.counts[e_start + j]++;
152
                        if (!std::isfinite(e.values[e_start + j])) {
153
                            LOG("\n");
154
                            LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
155
                            exit(1);
156
                        }
157
                    }
158
                }
159
            }
160
            if (e.ncall > m_last_call) {
161
                m_last_call = e.ncall;
162
                if (m_last_call % m_params.n_out_freq == 0) {
163
                    save_imatrix();
164
                }
165
                if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
166
                    save_imatrix(m_last_call);
167
                }
168
            }
169
        }
170
    } else {
171
        auto & e = m_stats[wname];
172
        if (e.values.empty()) {
173
            e.values.resize(src1->ne[0], 0);
174
            e.counts.resize(src1->ne[0], 0);
175
        }
176
        else if (e.values.size() != (size_t)src1->ne[0]) {
177
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
178
            exit(1); //GGML_ABORT("fatal error");
179
        }
180
        ++e.ncall;
181
        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
182
        for (int row = 0; row < (int)src1->ne[1]; ++row) {
183
            const float * x = data + row * src1->ne[0];
184
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
185
                e.values[j] += x[j]*x[j];
186
                e.counts[j]++;
187
                if (!std::isfinite(e.values[j])) {
188
                    LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
189
                    exit(1);
190
                }
191
            }
192
        }
193
        if (e.ncall > m_last_call) {
194
            m_last_call = e.ncall;
195
            if (m_last_call % m_params.n_out_freq == 0) {
196
                save_imatrix();
197
            }
198
            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
199
                save_imatrix(m_last_call);
200
            }
201
        }
202
    }
203

204
    return true;
205
}
206

207
void IMatrixCollector::save_imatrix(int ncall) const {
208
    auto fname = m_params.out_file;
209
    if (fname.empty()) {
210
        fname = "imatrix.dat";
211
    }
212

213
    if (ncall > 0) {
214
        fname += ".at_";
215
        fname += std::to_string(ncall);
216
    }
217

218
    // avoid writing imatrix entries that do not have full data
219
    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
220

221
    int n_entries = 0;
222
    std::vector<std::string> to_store;
223

224
    bool is_first = true; // for printing
225
    for (const auto & kv : m_stats) {
226
        const int n_all = kv.second.counts.size();
227

228
        if (n_all == 0) {
229
            continue;
230
        }
231

232
        int n_zeros = 0;
233
        for (const int c : kv.second.counts) {
234
            if (c == 0) {
235
                n_zeros++;
236
            }
237
        }
238

239
        if (n_zeros != 0 && is_first) {
240
            LOG_INF("\n");
241
            is_first = false;
242
        }
243

244
        if (n_zeros == n_all) {
245
            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
246
            continue;
247
        }
248

249
        if (n_zeros > 0) {
250
            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
251
            continue;
252
        }
253

254
        n_entries++;
255
        to_store.push_back(kv.first);
256
    }
257

258
    if (to_store.size() < m_stats.size()) {
259
        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
260
    }
261

262
    std::ofstream out(fname, std::ios::binary);
263
    out.write((const char *) &n_entries, sizeof(n_entries));
264
    for (const auto & name : to_store) {
265
        const auto & stat = m_stats.at(name);
266
        int len = name.size();
267
        out.write((const char *) &len, sizeof(len));
268
        out.write(name.c_str(), len);
269
        out.write((const char *) &stat.ncall, sizeof(stat.ncall));
270
        int nval = stat.values.size();
271
        out.write((const char *) &nval, sizeof(nval));
272
        if (nval > 0) {
273
            std::vector<float> tmp(nval);
274
            for (int i = 0; i < nval; i++) {
275
                tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
276
            }
277
            out.write((const char*)tmp.data(), nval*sizeof(float));
278
        }
279
    }
280

281
    // Write the number of call the matrix was computed with
282
    out.write((const char *) &m_last_call, sizeof(m_last_call));
283

284
    // Write the input filename at the end of the file to later on specify it in quantize
285
    {
286
        int len = m_params.prompt_file.size();
287
        out.write((const char *) &len, sizeof(len));
288
        out.write(m_params.prompt_file.c_str(), len);
289
    }
290

291
    LOGV(1, "\n");
292
    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
293
}
294

295
bool IMatrixCollector::load_imatrix(const char * fname) {
296
    std::ifstream in(fname, std::ios::binary);
297
    if (!in) {
298
        LOG_ERR("%s: failed to open %s\n",__func__, fname);
299
        return false;
300
    }
301
    int n_entries;
302
    in.read((char*)&n_entries, sizeof(n_entries));
303
    if (in.fail() || n_entries < 1) {
304
        LOG_ERR("%s: no data in file %s\n", __func__, fname);
305
        return false;
306
    }
307
    for (int i = 0; i < n_entries; ++i) {
308
        int len; in.read((char *)&len, sizeof(len));
309
        std::vector<char> name_as_vec(len+1);
310
        in.read((char *)name_as_vec.data(), len);
311
        if (in.fail()) {
312
            LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
313
            return false;
314
        }
315
        name_as_vec[len] = 0;
316
        std::string name{name_as_vec.data()};
317
        auto & e = m_stats[std::move(name)];
318
        int ncall;
319
        in.read((char*)&ncall, sizeof(ncall));
320
        int nval;
321
        in.read((char *)&nval, sizeof(nval));
322
        if (in.fail() || nval < 1) {
323
            LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
324
            m_stats = {};
325
            return false;
326
        }
327

328
        if (e.values.empty()) {
329
            e.values.resize(nval, 0);
330
            e.counts.resize(nval, 0);
331
        }
332

333
        std::vector<float> tmp(nval);
334
        in.read((char*)tmp.data(), nval*sizeof(float));
335
        if (in.fail()) {
336
            LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
337
            m_stats = {};
338
            return false;
339
        }
340

341
        // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
342
        for (int i = 0; i < nval; i++) {
343
            e.values[i] += tmp[i];
344
            e.counts[i] += ncall;
345
        }
346
        e.ncall += ncall;
347

348
    }
349
    return true;
350
}
351

352
static IMatrixCollector g_collector;
353

354
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
355
    return g_collector.collect_imatrix(t, ask, user_data);
356
}
357

358

359
// Values computed for a single target token from one row of logits.
struct results_log_softmax {
    double log_softmax; // log of the softmax probability of the token
    float  logit;       // raw logit of the token
    float  prob;        // softmax probability of the token
};
364

365
// Converts logits into probabilities with a numerically stable softmax.
//
//   logits - input scores; may be empty
//
// Returns a vector of the same length as logits (empty for empty input).
static std::vector<float> softmax(const std::vector<float> & logits) {
    std::vector<float> probs(logits.size());
    if (logits.empty()) {
        // nothing to normalize; also avoids reading logits[0] of an empty vector
        return probs;
    }

    const float max_logit = *std::max_element(logits.begin(), logits.end());

    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        // Subtract the maximum logit value from the current logit value for numerical stability
        const float logit = logits[i] - max_logit;
        const float exp_logit = expf(logit);
        sum_exp += exp_logit;
        probs[i] = exp_logit;
    }
    for (float & p : probs) {
        p /= sum_exp;
    }
    return probs;
}
384

385
// Computes the log-softmax, raw logit and softmax probability of token `tok`
// over the first n_vocab entries of `logits`.
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
    // largest logit, subtracted below before exponentiating
    float peak = logits[0];
    for (int k = 1; k < n_vocab; ++k) {
        if (peak < logits[k]) {
            peak = logits[k];
        }
    }

    double denom = 0.0;
    for (int k = 0; k < n_vocab; ++k) {
        denom += expf(logits[k] - peak);
    }

    const float tok_logit = logits[tok];

    results_log_softmax out;
    out.log_softmax = tok_logit - peak - log(denom);
    out.logit       = tok_logit;
    out.prob        = expf(tok_logit - peak) / (float) denom;
    return out;
}
396

397
static void process_logits(
398
    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
399
    double & nll, double & nll2, float * logit_history, float * prob_history) {
400
    std::mutex mutex;
401
    int counter = 0;
402
    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
403
        double local_nll  = 0;
404
        double local_nll2 = 0;
405
        while (true) {
406
            std::unique_lock<std::mutex> lock(mutex);
407
            int i = counter++;
408
            if (i >= n_token) {
409
                nll += local_nll; nll2 += local_nll2;
410
                break;
411
            }
412
            lock.unlock();
413
            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
414
            const double v = -results.log_softmax;
415
            local_nll += v;
416
            local_nll2 += v*v;
417

418
            logit_history[i] = results.logit;
419
            prob_history[i]  = results.prob;
420
        }
421
    };
422
    for (auto & w : workers) {
423
        w = std::thread(compute);
424
    }
425
    compute();
426
    for (auto & w : workers) {
427
        w.join();
428
    }
429
}
430

431
// Tokenizes params.prompt and evaluates it with llama_decode() in chunks of n_ctx tokens.
// When params.compute_ppl is set, a running perplexity estimate is printed per chunk and a
// final estimate at the end, computed over the second half of each chunk.
//
//   ctx    - context to evaluate with
//   params - run parameters (prompt, i_chunk, n_chunks, n_batch, compute_ppl)
//
// Returns false when there are not enough tokens or when an evaluation fails.
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
    const int n_ctx = llama_n_ctx(ctx);

    auto tim1 = std::chrono::high_resolution_clock::now();
    LOG_INF("%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    auto tim2 = std::chrono::high_resolution_clock::now();
    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

    if (params.i_chunk > 0) {
        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
            LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
            return false;
        }
        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
    }

    if (int(tokens.size()) < 2*n_ctx) {
        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
        return false;
    }

    std::vector<float> logit_history;
    std::vector<float> prob_history;

    if (params.compute_ppl) {
        logit_history.resize(tokens.size());
        prob_history.resize(tokens.size());
    }

    const int n_chunk_max = tokens.size() / n_ctx;

    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_batch = params.n_batch;

    int count = 0;
    double nll = 0.0;
    double nll2 = 0.0;

    LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);

    // hardware_concurrency() may return 0 when the value cannot be determined; clamp it
    // so that the unsigned subtraction cannot wrap around to a huge worker count
    const unsigned int n_hw_threads = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers(n_hw_threads - 1);

    const int num_batches = (n_ctx + n_batch - 1) / n_batch;

    // logits of a whole chunk, gathered across batches; allocated once and reused for every chunk
    std::vector<float> logits;
    if (params.compute_ppl && num_batches > 1) {
        logits.reserve((size_t)n_ctx * n_vocab);
    }

    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;

        // start every chunk with an empty buffer (clear() keeps the reserved capacity)
        logits.clear();

        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
        llama_kv_cache_clear(ctx);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

            // save original token and restore it after eval
            const auto token_org = tokens[batch_start];

            // add BOS token for the first batch of each chunk
            if (add_bos && j == 0) {
                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }

            // TODO: use batch.logits to save computations instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                LOG_ERR("%s : failed to eval\n", __func__);
                return false;
            }

            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;

            if (params.compute_ppl && num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }
        }

        const auto t_end = std::chrono::high_resolution_clock::now();

        if (i == 0) {
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
            LOG("%.2f minutes\n", total_seconds / 60.0);
        }

        if (params.compute_ppl) {
            // only the second half of the chunk contributes to the perplexity estimate
            const int first = n_ctx/2;
            const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
            count += n_ctx - first - 1;

            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
            fflush(stdout);
        }
    }
    LOG("\n");

    if (params.compute_ppl) {
        nll2 /= count;
        nll /= count;
        const double ppl = exp(nll);
        nll2 -= nll * nll;
        if (nll2 > 0) {
            nll2 = sqrt(nll2/(count-1));
            LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
        } else {
            LOG("Unexpected negative standard deviation of log(prob)\n");
        }
    }

    return true;
}
569

570
int main(int argc, char ** argv) {
571
    gpt_params params;
572

573
    params.n_ctx = 512;
574
    params.logits_all = true;
575
    params.escape = false;
576

577
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
578
        return 1;
579
    }
580

581
    gpt_init();
582

583
    params.n_batch = std::min(params.n_batch, params.n_ctx);
584

585
    g_collector.set_params(params);
586

587
    for (const auto & in_file : params.in_files) {
588
        LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
589
        if (!g_collector.load_imatrix(in_file.c_str())) {
590
            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
591
            return 1;
592
        }
593
    }
594

595
    if (params.in_files.size() > 1) {
596
        LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
597
        g_collector.save_imatrix();
598
    }
599

600
    llama_backend_init();
601
    llama_numa_init(params.numa);
602

603
    // pass the callback to the backend scheduler
604
    // it will be executed for each node during the graph computation
605
    params.cb_eval = ik_collect_imatrix;
606
    params.cb_eval_user_data = NULL;
607
    params.warmup = false;
608

609
    // init
610
    llama_init_result llama_init = llama_init_from_gpt_params(params);
611

612
    llama_model * model = llama_init.model;
613
    llama_context * ctx = llama_init.context;
614
    if (model == nullptr || ctx == nullptr) {
615
        LOG_ERR("%s : failed to init\n", __func__);
616
        return 1;
617
    }
618

619
    const int n_ctx_train = llama_n_ctx_train(model);
620
    if (params.n_ctx > n_ctx_train) {
621
        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
622
                __func__, n_ctx_train, params.n_ctx);
623
    }
624

625
    // print system information
626
    {
627
        LOG_INF("\n");
628
        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
629
    }
630

631
    if (!compute_imatrix(ctx, params)) {
632
        return 1;
633
    }
634

635
    g_collector.save_imatrix();
636

637
    LOG("\n");
638
    llama_perf_context_print(ctx);
639

640
    llama_free(ctx);
641
    llama_free_model(model);
642

643
    llama_backend_free();
644

645
    return 0;
646
}
647
llama

Использование cookies