// llama.cpp example: eval-callback
// 193 lines · 6.0 KB
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "ggml.h"

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>
/**
 * This is the arbitrary data which will be passed to each callback.
 * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
 */
struct callback_data {
    // scratch buffer used to copy tensor data out of non-host (e.g. GPU) backend buffers
    std::vector<uint8_t> data;
};
19static std::string ggml_ne_string(const ggml_tensor * t) {20std::string str;21for (int i = 0; i < GGML_MAX_DIMS; ++i) {22str += std::to_string(t->ne[i]);23if (i + 1 < GGML_MAX_DIMS) {24str += ", ";25}26}27return str;28}
29
30static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {31GGML_ASSERT(n > 0);32float sum = 0;33for (int64_t i3 = 0; i3 < ne[3]; i3++) {34LOG(" [\n");35for (int64_t i2 = 0; i2 < ne[2]; i2++) {36if (i2 == n && ne[2] > 2*n) {37LOG(" ..., \n");38i2 = ne[2] - n;39}40LOG(" [\n");41for (int64_t i1 = 0; i1 < ne[1]; i1++) {42if (i1 == n && ne[1] > 2*n) {43LOG(" ..., \n");44i1 = ne[1] - n;45}46LOG(" [");47for (int64_t i0 = 0; i0 < ne[0]; i0++) {48if (i0 == n && ne[0] > 2*n) {49LOG("..., ");50i0 = ne[0] - n;51}52size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];53float v;54if (type == GGML_TYPE_F16) {55v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);56} else if (type == GGML_TYPE_F32) {57v = *(float *) &data[i];58} else if (type == GGML_TYPE_I32) {59v = (float) *(int32_t *) &data[i];60} else if (type == GGML_TYPE_I16) {61v = (float) *(int16_t *) &data[i];62} else if (type == GGML_TYPE_I8) {63v = (float) *(int8_t *) &data[i];64} else {65GGML_ABORT("fatal error");66}67LOG("%12.4f", v);68sum += v;69if (i0 < ne[0] - 1) LOG(", ");70}71LOG("],\n");72}73LOG(" ],\n");74}75LOG(" ]\n");76LOG(" sum = %f\n", sum);77}78}
79
80/**
81* GGML operations callback during the graph execution.
82*
83* @param t current tensor
84* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
85* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
86* see ggml_backend_sched_eval_callback
87* @param user_data user data to pass at each call back
88* @return true to receive data or continue the graph, false otherwise
89*/
90static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {91auto * cb_data = (callback_data *) user_data;92
93const struct ggml_tensor * src0 = t->src[0];94const struct ggml_tensor * src1 = t->src[1];95
96if (ask) {97return true; // Always retrieve data98}99
100char src1_str[128] = {0};101if (src1) {102snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());103}104
105LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,106t->name, ggml_type_name(t->type), ggml_op_desc(t),107src0->name, ggml_ne_string(src0).c_str(),108src1 ? src1_str : "",109ggml_ne_string(t).c_str());110
111
112// copy the data from the GPU memory if needed113const bool is_host = ggml_backend_buffer_is_host(t->buffer);114
115if (!is_host) {116auto n_bytes = ggml_nbytes(t);117cb_data->data.resize(n_bytes);118ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);119}120
121if (!ggml_is_quantized(t->type)) {122uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();123ggml_print_tensor(data, t->type, t->ne, t->nb, 3);124}125
126return true;127}
128
129static bool run(llama_context * ctx, const gpt_params & params) {130const bool add_bos = llama_add_bos_token(llama_get_model(ctx));131
132std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);133
134if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {135LOG_ERR("%s : failed to eval\n", __func__);136return false;137}138
139return true;140}
141
142int main(int argc, char ** argv) {143callback_data cb_data;144
145gpt_params params;146
147if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {148return 1;149}150
151gpt_init();152
153llama_backend_init();154llama_numa_init(params.numa);155
156// pass the callback to the backend scheduler157// it will be executed for each node during the graph computation158params.cb_eval = ggml_debug;159params.cb_eval_user_data = &cb_data;160params.warmup = false;161
162// init163llama_init_result llama_init = llama_init_from_gpt_params(params);164
165llama_model * model = llama_init.model;166llama_context * ctx = llama_init.context;167if (model == nullptr || ctx == nullptr) {168LOG_ERR("%s : failed to init\n", __func__);169return 1;170}171
172// print system information173{174LOG_INF("\n");175LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());176LOG_INF("\n");177}178
179bool OK = run(ctx, params);180if (!OK) {181return 1;182}183
184LOG("\n");185llama_perf_context_print(ctx);186
187llama_free(ctx);188llama_free_model(model);189
190llama_backend_free();191
192return 0;193}
194