// llama.cpp example: eval-callback
// 193 lines · 6.0 KB
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "ggml.h"

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>
/**
 * This is the arbitrary data which will be passed to each callback.
 * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
 */
struct callback_data {
    // scratch buffer used to copy tensor data out of non-host (e.g. GPU) backend buffers
    std::vector<uint8_t> data;
};
19static std::string ggml_ne_string(const ggml_tensor * t) {20std::string str;21for (int i = 0; i < GGML_MAX_DIMS; ++i) {22str += std::to_string(t->ne[i]);23if (i + 1 < GGML_MAX_DIMS) {24str += ", ";25}26}27return str;28}
29
30static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {31GGML_ASSERT(n > 0);32float sum = 0;33for (int64_t i3 = 0; i3 < ne[3]; i3++) {34LOG(" [\n");35for (int64_t i2 = 0; i2 < ne[2]; i2++) {36if (i2 == n && ne[2] > 2*n) {37LOG(" ..., \n");38i2 = ne[2] - n;39}40LOG(" [\n");41for (int64_t i1 = 0; i1 < ne[1]; i1++) {42if (i1 == n && ne[1] > 2*n) {43LOG(" ..., \n");44i1 = ne[1] - n;45}46LOG(" [");47for (int64_t i0 = 0; i0 < ne[0]; i0++) {48if (i0 == n && ne[0] > 2*n) {49LOG("..., ");50i0 = ne[0] - n;51}52size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];53float v;54if (type == GGML_TYPE_F16) {55v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);56} else if (type == GGML_TYPE_F32) {57v = *(float *) &data[i];58} else if (type == GGML_TYPE_I32) {59v = (float) *(int32_t *) &data[i];60} else if (type == GGML_TYPE_I16) {61v = (float) *(int16_t *) &data[i];62} else if (type == GGML_TYPE_I8) {63v = (float) *(int8_t *) &data[i];64} else {65GGML_ABORT("fatal error");66}67LOG("%12.4f", v);68sum += v;69if (i0 < ne[0] - 1) LOG(", ");70}71LOG("],\n");72}73LOG(" ],\n");74}75LOG(" ]\n");76LOG(" sum = %f\n", sum);77}78}
79
80/**
81* GGML operations callback during the graph execution.
82*
83* @param t current tensor
84* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
85* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
86* see ggml_backend_sched_eval_callback
87* @param user_data user data to pass at each call back
88* @return true to receive data or continue the graph, false otherwise
89*/
90static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {91auto * cb_data = (callback_data *) user_data;92
93const struct ggml_tensor * src0 = t->src[0];94const struct ggml_tensor * src1 = t->src[1];95
96if (ask) {97return true; // Always retrieve data98}99
100char src1_str[128] = {0};101if (src1) {102snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());103}104
105LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,106t->name, ggml_type_name(t->type), ggml_op_desc(t),107src0->name, ggml_ne_string(src0).c_str(),108src1 ? src1_str : "",109ggml_ne_string(t).c_str());110
111
112// copy the data from the GPU memory if needed113const bool is_host = ggml_backend_buffer_is_host(t->buffer);114
115if (!is_host) {116auto n_bytes = ggml_nbytes(t);117cb_data->data.resize(n_bytes);118ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);119}120
121if (!ggml_is_quantized(t->type)) {122uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();123ggml_print_tensor(data, t->type, t->ne, t->nb, 3);124}125
126return true;127}
128
129static bool run(llama_context * ctx, const gpt_params & params) {130const bool add_bos = llama_add_bos_token(llama_get_model(ctx));131
132std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);133
134if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {135LOG_ERR("%s : failed to eval\n", __func__);136return false;137}138
139return true;140}
141
142int main(int argc, char ** argv) {143callback_data cb_data;144
145gpt_params params;146
147if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {148return 1;149}150
151gpt_init();152
153llama_backend_init();154llama_numa_init(params.numa);155
156// pass the callback to the backend scheduler157// it will be executed for each node during the graph computation158params.cb_eval = ggml_debug;159params.cb_eval_user_data = &cb_data;160params.warmup = false;161
162// init163llama_init_result llama_init = llama_init_from_gpt_params(params);164
165llama_model * model = llama_init.model;166llama_context * ctx = llama_init.context;167if (model == nullptr || ctx == nullptr) {168LOG_ERR("%s : failed to init\n", __func__);169return 1;170}171
172// print system information173{174LOG_INF("\n");175LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());176LOG_INF("\n");177}178
179bool OK = run(ctx, params);180if (!OK) {181return 1;182}183
184LOG("\n");185llama_perf_context_print(ctx);186
187llama_free(ctx);188llama_free_model(model);189
190llama_backend_free();191
192return 0;193}
194