// *** Tensor Expressions ***
//
// This tutorial covers basics of NNC's tensor expressions, shows basic APIs to
// work with them, and outlines how they are used in the overall TorchScript
// compilation pipeline. This doc is permanently a "work in progress" since NNC
// is under active development and things change fast.
//
// This tutorial's code is compiled in the standard pytorch build, and the
// executable can be found in `build/bin/tutorial_tensorexpr`.
//
// *** What is NNC ***
//
// NNC stands for Neural Net Compiler. It is a component of TorchScript JIT
// and it performs on-the-fly code generation for kernels, which are often a
// combination of multiple aten (torch) operators.
//
// When the JIT interpreter executes a torchscript model, it automatically
// extracts subgraphs from the torchscript IR graph for which specialized code
// can be JIT generated. This usually improves performance as the 'combined'
// kernel created from the subgraph could avoid unnecessary memory traffic that
// is unavoidable when the subgraph is interpreted as-is, operator by operator.
// This optimization is often referred to as 'fusion'. Relatedly, the process
// of finding and extracting subgraphs suitable for NNC code generation is done
// by a JIT pass called 'fuser'.
//
// *** What is TE ***
//
// TE stands for Tensor Expressions. TE is a commonly used approach for
// compiling kernels performing tensor (~matrix) computation. The idea behind
// it is that operators are represented as a mathematical formula describing
// what computation they do (as TEs) and then the TE engine can perform
// mathematical simplification and other optimizations using those formulas
// and eventually generate executable code that would produce the same results
// as the original sequence of operators, but more efficiently.
//
// NNC's design and implementation of TE was heavily inspired by Halide and
// TVM projects.
#include <iostream>
#include <sstream> // for std::istringstream used in printLinesToFrom
#include <string>

#include <c10/util/irange.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/tensorexpr/eval.h>
#include <torch/csrc/jit/tensorexpr/expr.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_printer.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/kernel.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/stmt.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>

using namespace torch::jit::tensorexpr;

#ifdef TORCH_ENABLE_LLVM
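// Pulled in for the LLVMCodeGen sketch in the Codegen section below (the
// usual location of that header; only needed in LLVM-enabled builds).
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>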

// Helper function to print a snippet from a big multi-line string
static void printLinesToFrom(const std::string& input_str, int from, int to);

#endif

int main(int argc, char* argv[]) {
  std::cout << "*** Structure of tensor expressions and statements ***"
            << std::endl;
  {
    // A tensor expression is a tree of expressions. Each expression has a
    // type, and that type defines what sub-expressions the current expression
    // has. For instance, an expression of type 'Mul' would have a type 'kMul'
    // and two subexpressions: LHS and RHS. Each of these two sub-expressions
    // could also be a 'Mul' or some other expression.
    //
    // Let's construct a simple TE:
    ExprPtr lhs = alloc<IntImm>(5);
    ExprPtr rhs = alloc<Var>("x", kInt);
    ExprPtr mul = alloc<Mul>(lhs, rhs);
    std::cout << "Tensor expression: " << *mul << std::endl;
    // Prints: Tensor expression: 5 * x

    // Here we created an expression representing a 5*x computation, where x
    // is an int variable.

    // Another, probably more convenient, way to construct tensor expressions
    // is to use so-called expression handles (as opposed to raw expressions
    // like we did in the previous example). Expression handles overload common
    // operations and allow us to express the same semantics in a more natural
    // way:
    ExprHandle l = 5;
    ExprHandle r = Var::make("x", kInt);
    ExprHandle m = l * r;
    std::cout << "Tensor expression: " << *m.node() << std::endl;
    // Prints: Tensor expression: 5 * x

    // Converting from handles to raw expressions and back is easy:
    ExprHandle handle = Var::make("x", kInt);
    ExprPtr raw_expr_from_handle = handle.node();
    ExprPtr raw_expr = alloc<Var>("x", kInt);
    ExprHandle handle_from_raw_expr = ExprHandle(raw_expr);

    // We could construct arbitrarily complex expressions using mathematical
    // and logical operations, casts between various data types, and a bunch
    // of intrinsics.
    ExprHandle a = Var::make("a", kInt);
    ExprHandle b = Var::make("b", kFloat);
    ExprHandle c = Var::make("c", kFloat);
    ExprHandle x = ExprHandle(5) * a + b / (sigmoid(c) - 3.0f);
    std::cout << "Tensor expression: " << *x.node() << std::endl;
    // Prints: Tensor expression: float(5 * a) + b / ((sigmoid(c)) - 3.f)

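    // Casts can also be constructed explicitly. A minimal sketch, assuming
    // the Cast::make(dtype, expr) factory from ir.h:
    ExprHandle a_as_float = Cast::make(kFloat, a);
    std::cout << "Tensor expression: " << *a_as_float.node() << std::endl;
    // Prints something like: Tensor expression: float(a)
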
    // An ultimate purpose of tensor expressions is to optimize tensor
    // computations, and in order to represent accesses to tensor data, there
    // is a special kind of expression - a load.
    // To construct a load we need two pieces: the base and the indices. The
    // base of a load is a Buf expression, which could be thought of as a
    // placeholder similar to Var, but with dimensions info.
    //
    // Let's construct a simple load:
    BufHandle A("A", {64, 32}, kInt);
    VarPtr i_var = alloc<Var>("i", kInt), j_var = alloc<Var>("j", kInt);
    ExprHandle i(i_var), j(j_var);
    ExprHandle load = Load::make(A.dtype(), A, {i, j});
    std::cout << "Tensor expression: " << *load.node() << std::endl;
    // Prints: Tensor expression: A[i, j]

    // Tensor Expressions are the building blocks of Tensor Statements, which
    // are used to represent computation of a given operator or a group of
    // operators from a fusion group.
    //
    // There are three main kinds of tensor statements:
    //  - block
    //  - store
    //  - loop
    //
    // A Store represents a store to a single element of a tensor (or to a
    // group of elements if it's a vectorized store). Store statements,
    // similarly to Load expressions, have a base and indices, but on top of
    // that they also include a value - an expression representing what needs
    // to be stored at the given memory location. Let's create a Store stmt:
    StmtPtr store_a = Store::make(A, {i, j}, i + j);
    std::cout << "Store statement: " << *store_a << std::endl;
    // Prints: Store statement: A[i, j] = i + j;

    // An operator fills the entire tensor, not just a single element, and to
    // represent this we need to use a For stmt: let's wrap our store stmt
    // with two nested loops to represent that variables i and j need to
    // iterate over some ranges.
    ForPtr loop_j_a = For::make(VarHandle(j_var), 0, 32, store_a);
    ForPtr loop_i_a = For::make(VarHandle(i_var), 0, 64, loop_j_a);

    std::cout << "Nested for loops: " << std::endl << *loop_i_a << std::endl;
    // Prints:
    // Nested for loops:
    // for (const auto i : c10::irange(64)) {
    //   for (const auto j : c10::irange(32)) {
    //     A[i, j] = i + j;
    //   }
    // }

    // A Block statement is used when we need a sequence of other statements.
    // E.g. if a fusion group contains several operators, we initially define
    // a separate loopnest for each of them and put them all into a common
    // block:
    BufHandle B("B", {64, 32}, kInt);
    StmtPtr store_b = Store::make(B, {i, j}, A.load(i, j));
    ForPtr loop_j_b = For::make(VarHandle(j_var), 0, 32, store_b);
    ForPtr loop_i_b = For::make(VarHandle(i_var), 0, 64, loop_j_b);

    BlockPtr block = Block::make({loop_i_a, loop_i_b});
    std::cout << "Compound Block statement: " << std::endl
              << *block << std::endl;
    // Prints:
    // Compound Block statement:
    // {
    //   for (const auto i : c10::irange(64)) {
    //     for (const auto j : c10::irange(32)) {
    //       A[i, j] = i + j;
    //     }
    //   }
    //   for (const auto i : c10::irange(64)) {
    //     for (const auto j : c10::irange(32)) {
    //       B[i, j] = A[i, j];
    //     }
    //   }
    // }

    // Manually constructing nested loops and blocks to represent a
    // computation might be laborious, and instead we can use the 'Compute'
    // API. This API requires us to specify dimensions and a lambda computing
    // a single element of the resulting tensor, and it returns a `Tensor`
    // structure. This structure is simply a pair of a buffer that was created
    // to represent the result of the computation (BufPtr) and a statement
    // representing the computation itself (StmtPtr).
    Tensor C =
        Compute("C", {64, 32}, [&](const VarHandle& i, const VarHandle& j) {
          return i * j;
        });
    std::cout << "Stmt produced by 'Compute' API: " << std::endl
              << *C.stmt() << std::endl;
    // Prints:
    // Stmt produced by 'Compute' API:
    // for (const auto i : c10::irange(64)) {
    //   for (const auto j : c10::irange(32)) {
    //     C[i, j] = i * j;
    //   }
    // }

    // To construct statements to represent computations with reductions, we
    // can use the 'Reduce' API - it is similar to 'Compute' but takes a
    // couple of extra arguments defining how to perform the reduction. Let's
    // define a simple 2D sum of C using that:
    Tensor D = Reduce(
        "D",
        {},
        Sum(),
        [&](const VarHandle& i, const VarHandle& j) { return C.load(i, j); },
        {64, 32});
    std::cout << "Stmt produced by 'Reduce' API: " << std::endl
              << *D.stmt() << std::endl;
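
    // The 'Reduce' API can also keep some dimensions un-reduced. As an
    // illustrative sketch (assuming the same 'Reduce' overload as above,
    // where the lambda takes the output indices first and the reduction
    // indices after them), here is a per-row sum of C:
    Tensor D_rows = Reduce(
        "D_rows",
        {64},
        Sum(),
        [&](const VarHandle& i, const VarHandle& j) { return C.load(i, j); },
        {32});
    std::cout << "Stmt produced by 'Reduce' API (row sums): " << std::endl
              << *D_rows.stmt() << std::endl;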
  }

  std::cout << "*** Loopnest transformations ***" << std::endl;
  {
    // When a statement for the computation is generated, we might want to
    // apply some optimizations to it. These transformations allow us to end
    // up with a statement producing the same results, but more efficiently.
    //
    // Let's look at a couple of transformations that are used in NNC. We will
    // begin with constructing a Block statement like we did before.

    Tensor C =
        Compute("C", {64, 32}, [&](const VarHandle& i, const VarHandle& j) {
          return i * (j + 1);
        });
    BufHandle c_buf(C.buf());
    Tensor D =
        Compute("D", {64, 32}, [&](const VarHandle& i, const VarHandle& j) {
          return c_buf.load(i, j) - i;
        });
    StmtPtr block = Block::make({C.stmt(), D.stmt()});
    std::cout << "Stmt produced by 'Compute' API: " << std::endl
              << *block << std::endl;
    // Prints:
    // Stmt produced by 'Compute' API:
    // {
    //   for (const auto i : c10::irange(64)) {
    //     for (const auto j : c10::irange(32)) {
    //       C[i, j] = i * (j + 1);
    //     }
    //   }
    //   for (const auto i_1 : c10::irange(64)) {
    //     for (const auto j_1 : c10::irange(32)) {
    //       D[i_1, j_1] = (C[i_1, j_1]) - i_1;
    //     }
    //   }
    // }

    // One transformation we can apply to this computation is inlining: i.e.
    // taking the expression that defines values of C and substituting loads
    // from C with it.
    // To do that, we first need to create a special object called LoopNest -
    // all transformations are methods of this class. To create a loopnest we
    // need to provide a list of output buffers and the root statement:
    LoopNest nest(block, {D.buf()});

    // We can always retrieve the Stmt back from LoopNest:
    std::cout << "LoopNest root stmt: " << std::endl
              << *nest.root_stmt() << std::endl;
    // Prints:
    // LoopNest root stmt:
    // {
    //   for (const auto i : c10::irange(64)) {
    //     for (const auto j : c10::irange(32)) {
    //       C[i, j] = i * (j + 1);
    //     }
    //   }
    //   for (const auto i_1 : c10::irange(64)) {
    //     for (const auto j_1 : c10::irange(32)) {
    //       D[i_1, j_1] = (C[i_1, j_1]) - i_1;
    //     }
    //   }
    // }

    // Now we can apply the inlining transformation:
    nest.computeInline(C.buf());
    std::cout << "Stmt after inlining:" << std::endl
              << *nest.root_stmt() << std::endl;
    // Prints:
    // Stmt after inlining:
    // {
    //   for (const auto i : c10::irange(64)) {
    //     for (const auto j : c10::irange(32)) {
    //       D[i, j] = i * (j + 1) - i;
    //     }
    //   }
    // }

    // We can also apply algebraic simplification to a statement:
    StmtPtr simplified = IRSimplifier::simplify(nest.root_stmt());
    std::cout << "Stmt after simplification:" << std::endl
              << *simplified << std::endl;
    // Prints:
    // Stmt after simplification:
    // {
    //   for (const auto i : c10::irange(64)) {
    //     for (const auto j : c10::irange(32)) {
    //       D[i, j] = i * j;
    //     }
    //   }
    // }

    // Many loopnest transformations are stateless and can be applied without
    // creating a LoopNest object. In fact, we plan to make all transformations
    // stateless.
    // splitWithTail is one such transformation: it splits the iteration space
    // of a given loop in two with a given factor.
    ForPtr outer_loop = to<For>(to<Block>(simplified)->stmts().front());
    LoopNest::splitWithTail(outer_loop, 13);
    // Call the simplifier once more to fold some arithmetic.
    simplified = IRSimplifier::simplify(simplified);
    std::cout << "Stmt after splitWithTail:" << std::endl
              << *simplified << std::endl;
    // Prints:
    // Stmt after splitWithTail:
    // {
    //   for (const auto i_outer : c10::irange(4)) {
    //     for (const auto i_inner : c10::irange(13)) {
    //       for (const auto j : c10::irange(32)) {
    //         D[i_inner + 13 * i_outer, j] = i_inner * j + 13 * (i_outer * j);
    //       }
    //     }
    //   }
    //   for (const auto i_tail : c10::irange(12)) {
    //     for (const auto j : c10::irange(32)) {
    //       D[i_tail + 52, j] = i_tail * j + 52 * j;
    //     }
    //   }
    // }
    //
    // Since 64 = 4 * 13 + 12, the split produced a main loop running
    // 4 * 13 = 52 iterations and a tail loop covering the remaining 12.

    // NNC supports a wide range of loop nest transformations, which we are
    // not listing here. Please refer to documentation in
    // https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/tensorexpr/loopnest.h
    // for more details.
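
    // As one more taste, here is a sketch of splitWithMask (assuming the
    // static LoopNest::splitWithMask(ForPtr, int) declared in loopnest.h):
    // unlike splitWithTail, it generates no tail loop and instead masks off
    // the out-of-range iterations inside the split loop.
    Tensor E =
        Compute("E", {64, 32}, [&](const VarHandle& i, const VarHandle& j) {
          return i + j;
        });
    LoopNest nest_e({E});
    ForPtr e_outer = to<For>(to<Block>(nest_e.root_stmt())->stmts().front());
    LoopNest::splitWithMask(e_outer, 13);
    std::cout << "Stmt after splitWithMask:" << std::endl
              << *IRSimplifier::simplify(nest_e.root_stmt()) << std::endl;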
  }

  std::cout << "*** Codegen ***" << std::endl;
  {
    // An ultimate goal of tensor expressions is to provide a mechanism to
    // execute a given computation in the fastest possible way. So far we've
    // looked at how we could describe what computation we're interested in,
    // but we haven't looked at how to actually execute it.
    //
    // Everything so far has been just symbols with no actual data associated;
    // in this section we will look at how we can bridge that gap.

    // Let's start by constructing a simple computation for us to work with:
    BufHandle A("A", {64, 32}, kInt);
    BufHandle B("B", {64, 32}, kInt);
    Tensor X =
        Compute("X", {64, 32}, [&](const VarHandle& i, const VarHandle& j) {
          return A.load(i, j) + B.load(i, j);
        });

    // And let's lower it to a loop nest, as we did in the previous section.
    // We can pass a Tensor object directly:
    LoopNest loopnest({X});
    std::cout << *loopnest.root_stmt() << std::endl;
    // Prints:
    // {
    //   for (const auto i : c10::irange(64)) {
    //     for (const auto j : c10::irange(32)) {
    //       X[i, j] = (A[i, j]) + (B[i, j]);
    //     }
    //   }
    // }

    // Now imagine that we have two actual 64x32 tensors that we want to sum
    // together. How do we pass those tensors to the computation and how do we
    // carry it out?
    //
    // A Codegen object is aimed at providing exactly that functionality.
    // Codegen is an abstract class and concrete codegens are derived from it.
    // Currently, we have three codegens:
    //  1) Simple Evaluator,
    //  2) LLVM Codegen for CPU,
    //  3) CUDA Codegen.
    // In this example we will be using Simple Evaluator, since it's available
    // everywhere.

    // To create a codegen, we need to provide the statement - it specifies
    // the computation we want to perform - and a list of placeholders and
    // tensors used in the computation. The latter part is crucial since it is
    // the only way for the codegen to correlate symbols in the statement with
    // the actual data arrays that we will pass when we actually perform the
    // computation.
    //
    // Let's create a Simple IR Evaluator codegen for our computation:
    SimpleIREvaluator ir_eval(loopnest.root_stmt(), {A, B, X});

    // We are using the simplest codegen and in it almost no work is done at
    // the construction step. Real codegens such as CUDA and LLVM perform
    // compilation during that stage so that when we're about to run the
    // computation everything is ready.

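    // For comparison, here is how an LLVM-based codegen could be constructed
    // for the same statement - a sketch assuming LLVMCodeGen's constructor
    // mirrors SimpleIREvaluator's; it is only available in LLVM-enabled
    // builds:
#ifdef TORCH_ENABLE_LLVM
    LLVMCodeGen llvm_codegen(loopnest.root_stmt(), {A, B, X});
#endif
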
    // Let's now create some inputs and run our computation with them:
    std::vector<int> data_A(64 * 32, 3); // This will be the input A
    std::vector<int> data_B(64 * 32, 5); // This will be the input B
    std::vector<int> data_X(64 * 32, 0); // This will be used for the result

    // Now let's invoke our codegen to perform the computation on our data. We
    // need to provide as many arguments as there were placeholders and
    // tensors at the codegen construction time. A position in those lists
    // defines which symbol (a placeholder or a tensor, referred to as a
    // 'BufferArg' in our codebase) a given data array (referred to as a
    // 'CallArg') corresponds to.
    // Thus, we will provide three arguments: data_A, data_B, and data_X.
    // data_A contains data for the placeholder A, data_B - for the
    // placeholder B, and data_X would be used for the contents of tensor X.
    ir_eval(data_A, data_B, data_X);

    // Let's print one of the elements from each array to verify that the
    // computation did happen:
    std::cout << "A[10] = " << data_A[10] << std::endl
              << "B[10] = " << data_B[10] << std::endl
              << "X[10] = A[10] + B[10] = " << data_X[10] << std::endl;
    // Prints:
    // A[10] = 3
    // B[10] = 5
    // X[10] = A[10] + B[10] = 8
  }

  std::cout << "*** Lowering TorchScript IR to TensorExpr IR ***" << std::endl;
  {
    // This section requires an LLVM-enabled PyTorch build, so we have to use
    // a guard:
#ifdef TORCH_ENABLE_LLVM

    // Often we would like to convert a TorchScript IR to TE rather than
    // construct TE IR from scratch. NNC provides an API to perform such
    // lowering: it takes a TorchScript graph and returns an object that can
    // be used to invoke the generated kernel.
    // This API is currently used by the TorchScript JIT fuser and can also be
    // used ahead of time to pre-compile parts of a model.
    //
    // To get familiar with this API let's first start with defining a simple
    // TorchScript graph:
    const auto graph_string = R"IR(
      graph(%A : Float(5, 3, strides=[3, 1], device=cpu),
            %B : Float(5, 3, strides=[3, 1], device=cpu)):
        %AB : Float(5, 3, strides=[3, 1]) = aten::mul(%A, %B)
        %one : int = prim::Constant[value=1]()
        %AAB : Float(5, 3, strides=[3, 1]) = aten::mul(%A, %AB)
        %AAB_plus_B: Float(5, 3, strides=[3, 1]) = aten::add(%AAB, %B, %one)
        return (%AAB_plus_B))IR";
    auto graph = std::make_shared<torch::jit::Graph>();
    parseIR(graph_string, &*graph);

    // This graph defines a simple computation of A*A*B + B where A and B are
    // input 5x3 tensors.

    // To lower this TorchScript graph to TE, we just need to create a
    // TensorExprKernel object. In its constructor it constructs the
    // corresponding TE IR and compiles it for the given backend (in this
    // example for CPU using the LLVM compiler).
    TensorExprKernel kernel(graph);

    // We can retrieve the generated TE stmt from the kernel object:
    StmtPtr kernel_stmt = kernel.getCodeGenStmt();
    std::cout << "TE Stmt constructed from TorchScript: " << std::endl
              << *kernel_stmt << std::endl;
    // Prints:
    // TE Stmt constructed from TorchScript:
    // {
    //   for (const auto v : c10::irange(5)) {
    //     for (const auto _tail_tail : c10::irange(3)) {
    //       aten_add[_tail_tail + 3 * v] = (tA[_tail_tail + 3 * v]) *
    //       ((tA[_tail_tail + 3 * v]) * (tB[_tail_tail + 3 * v])) +
    //       (tB[_tail_tail + 3 * v]);
    //     }
    //   }
    // }

    // We can also examine generated LLVM IR and assembly code:
    std::cout << "Generated LLVM IR: " << std::endl;
    auto ir_str = kernel.getCodeText("ir");
    printLinesToFrom(ir_str, 15, 20);
    // Prints:
    // Generated LLVM IR:
    //   %9 = bitcast float* %2 to <8 x float>*
    //   %10 = load <8 x float>, <8 x float>* %9 ...
    //   %11 = bitcast float* %5 to <8 x float>*
    //   %12 = load <8 x float>, <8 x float>* %11 ...
    //   %13 = fmul <8 x float> %10, %12
    //   %14 = fmul <8 x float> %10, %13

    std::cout << "Generated assembly: " << std::endl;
    auto asm_str = kernel.getCodeText("asm");
    printLinesToFrom(asm_str, 10, 15);
    // Prints:
    // Generated assembly:
    //   vmulps %ymm1, %ymm0, %ymm2
    //   vfmadd213ps %ymm1, %ymm0, %ymm2
    //   vmovups %ymm2, (%rax)
    //   vmovss 32(%rcx), %xmm0
    //   vmovss 32(%rdx), %xmm1
    //   vmulss %xmm1, %xmm0, %xmm2

    // We can also execute the generated kernel:
    auto A =
        at::ones({5, 3}, torch::TensorOptions(torch::kCPU).dtype(at::kFloat)) *
        2.0;
    auto B =
        at::ones({5, 3}, torch::TensorOptions(torch::kCPU).dtype(at::kFloat)) *
        3.0;
    std::vector<at::Tensor> inputs = {A, B};
    std::vector<torch::IValue> stack = torch::fmap<torch::IValue>(inputs);
    kernel.run(stack);
    auto R = stack[0].toTensor();

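    // As a quick sanity check - an illustrative addition using the standard
    // at::allclose and TORCH_CHECK utilities - compare the kernel's result
    // with the same computation done in eager mode:
    TORCH_CHECK(at::allclose(R, A * A * B + B));
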
    // Let's print one of the elements from the result tensor to verify that
    // the computation did happen and was correct:
    std::cout << "R[2][2] = " << R[2][2] << std::endl;
    // Prints:
    // R[2][2] = 15
    // [ CPUFloatType{} ]
#endif
  }
  return 0;
}

void printLinesToFrom(const std::string& input_str, int from, int to) {
  std::istringstream f(input_str);
  std::string s;
  int idx = 0;
  while (getline(f, s)) {
    if (idx > from) {
      std::cout << s << "\n";
    }
    if (idx++ > to) {
      break;
    }
  }
}