1
#include <torch/csrc/jit/mobile/nnc/context.h>
3
#include <ATen/Functions.h>
4
#include <ATen/core/functional.h>
5
#include <c10/core/CPUAllocator.h>
6
#include <c10/util/irange.h>
8
#include <torch/csrc/jit/mobile/nnc/registry.h>
15
// Version tag of the NNC file format produced by this file. It is written as
// the first element of the root tuple by CompilationUnit::serialize().
constexpr int64_t kProducedNNCFileFormatVersion = 0x1L;
19
// Packs a brace-enclosed list of IValues into a tuple and returns it wrapped
// in an IValue.
c10::IValue Tup(std::initializer_list<c10::IValue> ivalues) {
  auto tuple = c10::ivalue::Tuple::create(ivalues);
  return c10::IValue(std::move(tuple));
}
23
// Packs an owned vector of IValues into a tuple and returns it wrapped in an
// IValue.
//
// `ivalues` is declared as an rvalue reference, but inside the function it is
// a named lvalue; it must be moved explicitly, otherwise the whole vector is
// copied into the tuple even though the caller gave up ownership.
c10::IValue Tup(std::vector<c10::IValue>&& ivalues) {
  return c10::ivalue::Tuple::create(std::move(ivalues));
}
29
// Rebuilds an InputSpec from its serialized form: a generic dict holding the
// expected sizes under "sizes" and the scalar type under "dtype".
InputSpec::InputSpec(const c10::IValue& value) {
  auto fields = value.toGenericDict();

  auto sizes_entry = fields.at("sizes");
  sizes_ = sizes_entry.toIntVector();

  auto dtype_entry = fields.at("dtype");
  dtype_ = dtype_entry.toScalarType();
}
35
// Serializes this spec as a string-keyed dict with the entries "sizes" and
// "dtype"; the InputSpec(const c10::IValue&) constructor reads the same keys.
c10::IValue InputSpec::serialize() const {
  c10::Dict<c10::IValue, c10::IValue> fields(
      at::StringType::get(), at::AnyType::get());

  fields.insert("sizes", sizes_);
  fields.insert("dtype", dtype_);

  return fields;
}
43
// Checks `input` against this spec.
//
// Returns false when the rank or the scalar type differs, or when any
// non-dynamic dimension has a different extent; returns true otherwise.
// A spec size of 0 marks a dynamic dimension and matches any extent.
bool InputSpec::validate(const at::Tensor& input) const {
  // Query the tensor's sizes once; the view stays valid while `input` lives.
  const auto input_sizes = input.sizes();
  if (sizes_.size() != input_sizes.size() || input.scalar_type() != dtype_) {
    return false;
  }

  // Read sizes_ in place: copying the vector on every call buys nothing.
  for (const auto i : c10::irange(sizes_.size())) {
    // InputSpec size 0 means that the dimension is dynamic
    if (sizes_[i] != 0 && sizes_[i] != input_sizes[i]) {
      return false;
    }
  }
  return true;
}
57
// Rebuilds an OutputSpec from its serialized dict form.
//
// "sizes" and "dtype" are always read. "qscale" and "qzero" are read only
// when the dict contains them; otherwise the corresponding member is left
// untouched.
OutputSpec::OutputSpec(const c10::IValue& value) {
  auto fields = value.toGenericDict();

  sizes_ = fields.at("sizes").toIntVector();
  dtype_ = fields.at("dtype").toScalarType();

  const bool has_qscale = fields.contains("qscale");
  if (has_qscale) {
    qscale_ = fields.at("qscale").toDouble();
  }

  const bool has_qzero = fields.contains("qzero");
  if (has_qzero) {
    qzero_ = fields.at("qzero").toInt();
  }
}
69
// Serializes this spec as a string-keyed dict.
//
// "sizes" and "dtype" are always written. "qscale" and "qzero" are written
// only when the corresponding member holds a value, mirroring the constructor,
// which treats both keys as optional. Dereferencing an empty optional here
// would be undefined behavior, hence the explicit guards.
c10::IValue OutputSpec::serialize() const {
  c10::Dict<c10::IValue, c10::IValue> dict(
      at::StringType::get(), at::AnyType::get());
  dict.insert("sizes", sizes_);
  dict.insert("dtype", dtype_);
  if (qscale_) {
    dict.insert("qscale", *qscale_);
  }
  if (qzero_) {
    dict.insert("qzero", *qzero_);
  }
  return dict;
}
83
// Allocates an uninitialized output tensor described by this spec.
//
// Quantized dtypes are allocated as affine-quantized tensors and require both
// qscale_ and qzero_; all other dtypes go through the regular empty-tensor
// path.
// NOTE(review): the listing elides part of the option chain and the check
// condition; dtype_/strided/CPU/no-grad options and `qscale_ && qzero_` are
// assumed here - confirm against the full file.
at::Tensor OutputSpec::allocate() const {
  // Both branches use the same tensor options, so build them once.
  const auto options = at::TensorOptions()
                           .dtype(dtype_)
                           .layout(at::kStrided)
                           .device(at::kCPU)
                           .requires_grad(false);

  if (!isQIntType(dtype_)) {
    return at::empty(sizes_, options);
  }

  TORCH_CHECK(
      qscale_ && qzero_,
      "Quantized output tensor must have qscale_ and qzero_");
  return at::_empty_affine_quantized(sizes_, options, *qscale_, *qzero_);
}
107
// Rebuilds a MemoryPlan from its serialized dict form; the only entry read is
// the integer list stored under "buffer_sizes".
MemoryPlan::MemoryPlan(const c10::IValue& value) {
  auto fields = value.toGenericDict();
  auto sizes_entry = fields.at("buffer_sizes");
  buffer_sizes_ = sizes_entry.toIntVector();
}
112
// Serializes the plan as a string-keyed dict with the single entry
// "buffer_sizes", the key the MemoryPlan(const c10::IValue&) constructor
// reads back.
c10::IValue MemoryPlan::serialize() const {
  c10::Dict<c10::IValue, c10::IValue> fields(
      at::StringType::get(), at::AnyType::get());

  fields.insert("buffer_sizes", buffer_sizes_);

  return fields;
}
119
// Allocates one CPU buffer per entry of buffer_sizes_ and stores the owning
// handles in `state->preallocations_`, in plan order.
//
// `state` must be non-null. Any buffers it already holds are released first,
// so after the call preallocations_ has exactly buffer_sizes_.size() entries
// even when the same state is passed in more than once.
void MemoryPlan::allocate(ExecutionState* state) const {
  auto& allocations = state->preallocations_;
  // Without the clear, a second call would append to the buffers of the
  // first one instead of replacing them.
  allocations.clear();
  allocations.reserve(buffer_sizes_.size());
  for (const int64_t buffer_size : buffer_sizes_) {
    at::DataPtr buffer = c10::GetCPUAllocator()->allocate(buffer_size);
    allocations.emplace_back(std::move(buffer));
  }
}
129
// Rebuilds a Function from its serialized dict form.
//
// Keys read: "name", "nnc_kernel_id", "parameters", "input_specs" (tuple of
// serialized InputSpec), "output_specs" (tuple of serialized OutputSpec),
// "memory_plan" and "sym_shape_pos" (tuple of two-element tuples whose
// elements are read as integers: input index, then dimension index).
Function::Function(const c10::IValue& value) {
  auto dict = value.toGenericDict();
  name_ = c10::QualifiedName(dict.at("name").toStringRef());
  nnc_kernel_id_ = dict.at("nnc_kernel_id").toStringRef();
  parameters_ = dict.at("parameters").toList();

  // input_specs_
  for (const auto& input_value :
       dict.at("input_specs").toTupleRef().elements()) {
    input_specs_.emplace_back(input_value);
  }

  // output_specs_
  for (const auto& output_value :
       dict.at("output_specs").toTupleRef().elements()) {
    output_specs_.emplace_back(output_value);
  }

  // memory_plan_
  memory_plan_ = MemoryPlan(dict.at("memory_plan"));

  // symbolic shape positions
  for (const auto& sym_shape_pos :
       dict.at("sym_shape_pos").toTupleRef().elements()) {
    // Bind by const reference: plain `auto` would copy the pair's element
    // container on every iteration just to read two integers from it.
    const auto& sym_shape_elements = sym_shape_pos.toTupleRef().elements();
    sym_shape_positions_.emplace_back(
        sym_shape_elements[0].toInt(), sym_shape_elements[1].toInt());
  }
}
159
// Serializes this function as a string-keyed dict. The keys written here
// ("name", "nnc_kernel_id", "parameters", "input_specs", "output_specs",
// "memory_plan", "sym_shape_pos") are the ones the
// Function(const c10::IValue&) constructor reads back.
c10::IValue Function::serialize() const {
  c10::Dict<c10::IValue, c10::IValue> fields(
      at::StringType::get(), at::AnyType::get());

  fields.insert("name", name_.qualifiedName());
  fields.insert("nnc_kernel_id", nnc_kernel_id_);
  // TODO: should serialize parameters with Module instead of with each Method.
  // And ideally the parameters should be shared between the compiled model
  // and the original model if we can serialize both in the same model file.
  fields.insert("parameters", parameters_);

  // Input specs, stored as a tuple of serialized InputSpec.
  std::vector<c10::IValue> serialized_inputs;
  serialized_inputs.reserve(input_specs_.size());
  for (const auto& spec : input_specs_) {
    serialized_inputs.emplace_back(spec.serialize());
  }
  fields.insert("input_specs", Tup(std::move(serialized_inputs)));

  // Output specs, stored as a tuple of serialized OutputSpec.
  std::vector<c10::IValue> serialized_outputs;
  serialized_outputs.reserve(output_specs_.size());
  for (const auto& spec : output_specs_) {
    serialized_outputs.emplace_back(spec.serialize());
  }
  fields.insert("output_specs", Tup(std::move(serialized_outputs)));

  // Memory plan.
  fields.insert("memory_plan", memory_plan_.serialize());

  // Symbolic shape positions, each stored as an (input index, dim index)
  // tuple.
  std::vector<c10::IValue> serialized_positions;
  serialized_positions.reserve(sym_shape_positions_.size());
  for (const auto& position : sym_shape_positions_) {
    serialized_positions.emplace_back(
        Tup({position.input_idx_, position.dim_idx_}));
  }
  fields.insert("sym_shape_pos", Tup(std::move(serialized_positions)));

  return fields;
}
201
void Function::init_execution_state() const {
202
if (execution_state_.get() != nullptr) {
206
ExecutionState state;
207
memory_plan_.allocate(&state);
209
// The arguments vector consists of 5 sections: inputs, symbolic shapes,
210
// outputs, parameters and buffers.
211
auto input_args = input_specs_.size();
212
auto sym_shape_args = sym_shape_positions_.size();
213
auto output_args = output_specs_.size();
214
auto param_args = parameters_.size();
215
auto buffer_args = state.preallocations_.size();
217
auto& arguments = state.arguments_;
219
input_args + sym_shape_args + output_args + param_args + buffer_args);
221
// Keep empty slots to fill in inputs/outputs pointers at execution time.
222
arguments.resize(input_args + sym_shape_args + output_args);
224
// Fill in parameters as untyped raw pointers.
225
// The underlying storage of the parameters should be owned by `parameters_`,
226
// which should be alive when `execution_state_` is being used.
227
for (const auto& param : parameters_) {
228
const c10::IValue& ivalue = (c10::IValue)param;
229
if (ivalue.isTensor()) {
230
arguments.emplace_back(ivalue.toTensor().data_ptr());
231
} else if (torch::isCustomClass(ivalue)) {
232
arguments.emplace_back(ivalue.toObjectRef().getSlot(0).toCapsule().get());
234
TORCH_CHECK(false, "Invalid parameter: ", ivalue);
238
// Fill in preallocated buffer pointers.
239
for (const auto& preallocation : state.preallocations_) {
240
arguments.emplace_back(preallocation.get());
243
execution_state_ = std::make_unique<ExecutionState>(std::move(state));
246
// Runs the NNC kernel registered under nnc_kernel_id_ on `inputs` and returns
// the freshly allocated output tensors as a generic list.
//
// Fails a TORCH_CHECK when the kernel is not registered, when the number of
// inputs differs from the number of input specs, or when an input does not
// validate against its spec.
//
// The leading sections of the shared argument vector (inputs, symbolic shape
// values, outputs) are rewritten on every call; the parameter and buffer
// sections behind them were filled by init_execution_state().
// NOTE(review): the trailing arguments of the first two checks are elided in
// the listing and restored here - confirm the exact message pieces.
c10::impl::GenericList Function::run(
    const c10::impl::GenericList& inputs) const {
  TORCH_CHECK(
      registry::has_nnc_kernel(nnc_kernel_id_),
      "Cannot find NNC kernel: ",
      nnc_kernel_id_);

  init_execution_state();

  std::vector<void*>& args = execution_state_->arguments_;

  const auto num_inputs = inputs.size();
  const auto num_sym_shapes = sym_shape_positions_.size();
  const auto num_outputs = output_specs_.size();

  // Start index of each section that follows the inputs.
  const auto sym_shape_base = num_inputs;
  const auto output_base = num_inputs + num_sym_shapes;

  // Fill in input tensors.
  TORCH_CHECK(
      input_specs_.size() == num_inputs,
      "Input size doesn't match the spec, expect: ",
      input_specs_.size(),
      " actual: ",
      num_inputs);
  for (const auto i : c10::irange(num_inputs)) {
    const c10::IValue& input = inputs[i];
    const auto& tensor = input.toTensor();
    TORCH_CHECK(input_specs_[i].validate(tensor), "Invalid input at pos: ", i);
    args[i] = tensor.data_ptr();
  }

  // Fill in symbolic shape values. The kernel receives pointers into
  // `scalar_values`, so reserve up front: the vector must not reallocate while
  // those pointers are in use.
  std::vector<int64_t> scalar_values;
  scalar_values.reserve(num_sym_shapes);
  for (const auto i : c10::irange(num_sym_shapes)) {
    const auto& position = sym_shape_positions_[i];
    const c10::IValue& input = inputs[position.input_idx_];
    scalar_values.push_back(input.toTensor().size(position.dim_idx_));
    args[sym_shape_base + i] = &scalar_values.back();
  }

  // Preallocate and fill in output tensors.
  c10::List<at::Tensor> outputs;
  outputs.reserve(num_outputs);
  for (const auto i : c10::irange(num_outputs)) {
    at::Tensor output = output_specs_[i].allocate();
    outputs.emplace_back(output);
    args[output_base + i] = output.data_ptr();
  }

  // TODO: check consistency, e.g.: code version, input shape and compiled
  // shape, etc.
  auto kernel = registry::get_nnc_kernel(nnc_kernel_id_);
  kernel->execute(args.data());

  return c10::impl::toList(outputs);
}
302
// Rebuilds a CompilationUnit from its serialized root tuple. Element 1 of the
// root is a tuple of serialized functions, each of which is deserialized and
// registered. Element 0 (the format version written by serialize()) is not
// inspected here.
CompilationUnit::CompilationUnit(const c10::IValue& value) {
  const auto& root_elements = value.toTupleRef().elements();
  const auto& serialized_functions = root_elements[1].toTupleRef().elements();
  for (const auto& serialized : serialized_functions) {
    auto fn = std::make_unique<Function>(serialized);
    register_function(std::move(fn));
  }
}
310
// Serializes the unit as a two-element tuple: the produced file format
// version followed by a tuple holding every registered function in
// serialized form.
c10::IValue CompilationUnit::serialize() const {
  std::vector<c10::IValue> serialized_functions;
  serialized_functions.reserve(functions_.size());
  for (const auto& entry : functions_) {
    serialized_functions.push_back(entry.second->serialize());
  }
  return Tup(
      {kProducedNNCFileFormatVersion, Tup(std::move(serialized_functions))});
}
318
// Looks up the function registered under `name` and runs it on `inputs`.
// Fails a TORCH_CHECK when no such function is registered.
c10::impl::GenericList CompilationUnit::run(
    const c10::QualifiedName& name,
    const c10::impl::GenericList& inputs) const {
  Function* const target = find_function(name);
  TORCH_CHECK(
      target != nullptr,
      "Function '",
      name.qualifiedName(),
      "' is not defined.");
  return target->run(inputs);
}
327
void CompilationUnit::register_function(std::unique_ptr<Function> fn) {
329
0 == functions_.count(fn->name()),
331
fn->name().qualifiedName(),
332
"' already defined.");
333
const auto& name = fn->name();
334
functions_.emplace(name, std::move(fn));
337
// Returns a non-owning pointer to the function registered under `name`, or
// nullptr when there is none.
Function* CompilationUnit::find_function(const c10::QualifiedName& name) const {
  const auto entry = functions_.find(name);
  return entry != functions_.end() ? entry->second.get() : nullptr;
}