/*
 * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2023, Arm Limited. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
#include "precompiled.hpp"
#include "opto/addnode.hpp"
#include "opto/connode.hpp"
#include "opto/convertnode.hpp"
#include "opto/mulnode.hpp"
#include "opto/rootnode.hpp"
#include "opto/vectorization.hpp"
34
static void print_con_or_idx(const Node* n) {
37
} else if (n->is_ConI()) {
38
jint val = n->as_ConI()->get_int();
39
tty->print("(%4d)", val);
41
tty->print("[%4d]", n->_idx);
46
bool VLoop::check_preconditions() {
48
if (is_trace_preconditions()) {
49
tty->print_cr("\nVLoop::check_preconditions");
51
lpt()->head()->dump();
55
VStatus status = check_preconditions_helper();
56
if (!status.is_success()) {
58
if (is_trace_preconditions()) {
59
tty->print_cr("VLoop::check_preconditions: failed: %s", status.failure_reason());
62
return false; // failure
64
return true; // success
67
// Perform the individual precondition checks; on the first failure return
// a VStatus carrying the failure reason, else cache the loop structure
// (_cl, _iv, _cl_exit, _pre_loop_end) and report success.
VStatus VLoop::check_preconditions_helper() {
  // Only accept vector width that is power of 2
  int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
  if (vector_width < 2 || !is_power_of_2(vector_width)) {
    return VStatus::make_failure(VLoop::FAILURE_VECTOR_WIDTH);
  }

  // Only accept valid counted loops (int)
  if (!_lpt->_head->as_Loop()->is_valid_counted_loop(T_INT)) {
    return VStatus::make_failure(VLoop::FAILURE_VALID_COUNTED_LOOP);
  }
  _cl = _lpt->_head->as_CountedLoop();
  _iv = _cl->phi()->as_Phi();

  if (_cl->is_vectorized_loop()) {
    return VStatus::make_failure(VLoop::FAILURE_ALREADY_VECTORIZED);
  }

  if (_cl->is_unroll_only()) {
    return VStatus::make_failure(VLoop::FAILURE_UNROLL_ONLY);
  }

  // Check for control flow in the body
  _cl_exit = _cl->loopexit();
  bool has_cfg = _cl_exit->in(0) != _cl;
  if (has_cfg && !is_allow_cfg()) {
#ifndef PRODUCT
    if (is_trace_preconditions()) {
      tty->print_cr("VLoop::check_preconditions: fails because of control flow.");
      tty->print("  cl_exit %d", _cl_exit->_idx); _cl_exit->dump();
      tty->print("  cl_exit->in(0) %d", _cl_exit->in(0)->_idx); _cl_exit->in(0)->dump();
      tty->print("  lpt->_head %d", _cl->_idx); _cl->dump();
    }
#endif
    return VStatus::make_failure(VLoop::FAILURE_CONTROL_FLOW);
  }

  // Make sure the are no extra control users of the loop backedge
  if (_cl->back_control()->outcnt() != 1) {
    return VStatus::make_failure(VLoop::FAILURE_BACKEDGE);
  }

  // To align vector memory accesses in the main-loop, we will have to adjust
  // the pre-loop limit.
  if (_cl->is_main_loop()) {
    CountedLoopEndNode* pre_end = _cl->find_pre_loop_end();
    if (pre_end == nullptr) {
      return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
    }
    Node* pre_opaq1 = pre_end->limit();
    if (pre_opaq1->Opcode() != Op_Opaque1) {
      return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
    }
    _pre_loop_end = pre_end;
  }

  return VStatus::make_success();
}
127
// Return true iff all submodules are loaded successfully
128
bool VLoopAnalyzer::setup_submodules() {
130
if (_vloop.is_trace_loop_analyzer()) {
131
tty->print_cr("\nVLoopAnalyzer::setup_submodules");
132
_vloop.lpt()->dump_head();
137
VStatus status = setup_submodules_helper();
138
if (!status.is_success()) {
140
if (_vloop.is_trace_loop_analyzer()) {
141
tty->print_cr("\nVLoopAnalyze::setup_submodules: failed: %s", status.failure_reason());
144
return false; // failed
146
return true; // success
149
// Run all analysis submodules in dependency order (reductions, memory
// slices, body, types, pointers, dependency graph); return the first
// failure status, else success.
VStatus VLoopAnalyzer::setup_submodules_helper() {
  // Skip any loop that has not been assigned max unroll by analysis.
  if (SuperWordLoopUnrollAnalysis && _vloop.cl()->slp_max_unroll() == 0) {
    return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_MAX_UNROLL);
  }

  if (SuperWordReductions) {
    _reductions.mark_reductions();
  }

  _memory_slices.find_memory_slices();

  // If there is no memory slice detected, it means there is no store.
  // If there is no reduction and no store, then we give up, because
  // vectorization is not possible anyway (given current limitations).
  if (!_reductions.is_marked_reduction_loop() &&
      _memory_slices.heads().is_empty()) {
    return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE);
  }

  VStatus body_status = _body.construct();
  if (!body_status.is_success()) {
    return body_status;
  }

  _types.compute_vector_element_type();

  _vpointers.compute_vpointers();

  _dependency_graph.construct();

  return VStatus::make_success();
}
183
void VLoopVPointers::compute_vpointers() {
185
allocate_vpointers_array();
186
compute_and_cache_vpointers();
187
NOT_PRODUCT( if (_vloop.is_trace_vpointers()) { print(); } )
190
void VLoopVPointers::count_vpointers() {
191
_vpointers_length = 0;
192
_body.for_each_mem([&] (const MemNode* mem, int bb_idx) {
197
void VLoopVPointers::allocate_vpointers_array() {
198
uint bytes = _vpointers_length * sizeof(VPointer);
199
_vpointers = (VPointer*)_arena->Amalloc(bytes);
202
void VLoopVPointers::compute_and_cache_vpointers() {
203
int pointers_idx = 0;
204
_body.for_each_mem([&] (MemNode* const mem, int bb_idx) {
205
// Placement new: construct directly into the array.
206
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop);
207
_bb_idx_to_vpointer.at_put(bb_idx, pointers_idx);
212
// Look up the cached VPointer for a memory node inside the loop body.
const VPointer& VLoopVPointers::vpointer(const MemNode* mem) const {
  assert(mem != nullptr && _vloop.in_bb(mem), "only mem in loop");
  int bb_idx = _body.bb_idx(mem);
  int pointers_idx = _bb_idx_to_vpointer.at(bb_idx);
  assert(0 <= pointers_idx && pointers_idx < _vpointers_length, "valid range");
  return _vpointers[pointers_idx];
}
221
void VLoopVPointers::print() const {
222
tty->print_cr("\nVLoopVPointers::print:");
224
_body.for_each_mem([&] (const MemNode* mem, int bb_idx) {
225
const VPointer& p = vpointer(mem);
232
// Construct the dependency graph:
233
// - Data-dependencies: implicit (taken from C2 node inputs).
234
// - Memory-dependencies:
235
// - No edges between different slices.
236
// - No Load-Load edges.
237
// - Inside a slice, add all Store-Load, Load-Store, Store-Store edges,
238
// except if we can prove that the memory does not overlap.
239
void VLoopDependencyGraph::construct() {
240
const GrowableArray<PhiNode*>& mem_slice_heads = _memory_slices.heads();
241
const GrowableArray<MemNode*>& mem_slice_tails = _memory_slices.tails();
244
GrowableArray<MemNode*> slice_nodes;
245
GrowableArray<int> memory_pred_edges;
247
// For each memory slice, create the memory subgraph
248
for (int i = 0; i < mem_slice_heads.length(); i++) {
249
PhiNode* head = mem_slice_heads.at(i);
250
MemNode* tail = mem_slice_tails.at(i);
252
_memory_slices.get_slice_in_reverse_order(head, tail, slice_nodes);
254
// In forward order (reverse of reverse), visit all memory nodes in the slice.
255
for (int j = slice_nodes.length() - 1; j >= 0 ; j--) {
256
MemNode* n1 = slice_nodes.at(j);
257
memory_pred_edges.clear();
259
const VPointer& p1 = _vpointers.vpointer(n1);
260
// For all memory nodes before it, check if we need to add a memory edge.
261
for (int k = slice_nodes.length() - 1; k > j; k--) {
262
MemNode* n2 = slice_nodes.at(k);
264
// Ignore Load-Load dependencies:
265
if (n1->is_Load() && n2->is_Load()) { continue; }
267
const VPointer& p2 = _vpointers.vpointer(n2);
268
if (!VPointer::not_equal(p1.cmp(p2))) {
269
// Possibly overlapping memory
270
memory_pred_edges.append(_body.bb_idx(n2));
273
if (memory_pred_edges.is_nonempty()) {
274
// Data edges are taken implicitly from the C2 graph, thus we only add
275
// a dependency node if we have memory edges.
276
add_node(n1, memory_pred_edges);
284
NOT_PRODUCT( if (_vloop.is_trace_dependency_graph()) { print(); } )
287
// Create a DependencyNode carrying the explicit memory-pred edges for n
// and register it under n's bb index. Only called when edges exist.
void VLoopDependencyGraph::add_node(MemNode* n, GrowableArray<int>& memory_pred_edges) {
  assert(_dependency_nodes.at_grow(_body.bb_idx(n), nullptr) == nullptr, "not yet created");
  assert(!memory_pred_edges.is_empty(), "no need to create a node without edges");
  DependencyNode* dn = new (_arena) DependencyNode(n, memory_pred_edges, _arena);
  _dependency_nodes.at_put_grow(_body.bb_idx(n), dn, nullptr);
}
294
int VLoopDependencyGraph::find_max_pred_depth(const Node* n) const {
295
int max_pred_depth = 0;
296
if (!n->is_Phi()) { // ignore backedge
297
for (PredsIterator it(*this, n); !it.done(); it.next()) {
298
Node* pred = it.current();
299
if (_vloop.in_bb(pred)) {
300
max_pred_depth = MAX2(max_pred_depth, depth(pred));
304
return max_pred_depth;
307
// We iterate over the body, which is already ordered by the dependencies, i.e. pred comes
308
// before use. With a single pass, we can compute the depth of every node, since we can
309
// assume that the depth of all preds is already computed when we compute the depth of use.
310
void VLoopDependencyGraph::compute_depth() {
311
for (int i = 0; i < _body.body().length(); i++) {
312
Node* n = _body.body().at(i);
313
set_depth(n, find_max_pred_depth(n) + 1);
317
for (int i = 0; i < _body.body().length(); i++) {
318
Node* n = _body.body().at(i);
319
int max_pred_depth = find_max_pred_depth(n);
320
if (depth(n) != max_pred_depth + 1) {
322
tty->print_cr("Incorrect depth: %d vs %d", depth(n), max_pred_depth + 1);
325
assert(depth(n) == max_pred_depth + 1, "must have correct depth");
331
void VLoopDependencyGraph::print() const {
332
tty->print_cr("\nVLoopDependencyGraph::print:");
334
tty->print_cr(" Memory pred edges:");
335
for (int i = 0; i < _body.body().length(); i++) {
336
Node* n = _body.body().at(i);
337
const DependencyNode* dn = dependency_node(n);
339
tty->print(" DependencyNode[%d %s:", n->_idx, n->Name());
340
for (uint j = 0; j < dn->memory_pred_edges_length(); j++) {
341
Node* pred = _body.body().at(dn->memory_pred_edge(j));
342
tty->print(" %d %s", pred->_idx, pred->Name());
349
tty->print_cr(" Complete dependency graph:");
350
for (int i = 0; i < _body.body().length(); i++) {
351
Node* n = _body.body().at(i);
352
tty->print(" d%02d Dependencies[%d %s:", depth(n), n->_idx, n->Name());
353
for (PredsIterator it(*this, n); !it.done(); it.next()) {
354
Node* pred = it.current();
355
tty->print(" %d %s", pred->_idx, pred->Name());
362
// Copy the memory pred edges into a compact arena-allocated int array
// so the DependencyNode owns a stable snapshot of its edges.
VLoopDependencyGraph::DependencyNode::DependencyNode(MemNode* n,
                                                     GrowableArray<int>& memory_pred_edges,
                                                     Arena* arena) :
    _node(n),
    _memory_pred_edges_length(memory_pred_edges.length()),
    _memory_pred_edges(nullptr)
{
  assert(memory_pred_edges.is_nonempty(), "not empty");
  uint bytes = memory_pred_edges.length() * sizeof(int);
  _memory_pred_edges = (int*)arena->Amalloc(bytes);
  memcpy(_memory_pred_edges, memory_pred_edges.adr_at(0), bytes);
}
375
// Iterator over all preds of a node: first the relevant C2 inputs
// (data edges), then the explicit memory pred edges (if any).
VLoopDependencyGraph::PredsIterator::PredsIterator(const VLoopDependencyGraph& dependency_graph,
                                                   const Node* node) :
    _dependency_graph(dependency_graph),
    _node(node),
    _dependency_node(dependency_graph.dependency_node(node)),
    _current(nullptr),
    _next_pred(0),
    _end_pred(node->req()),
    _next_memory_pred(0),
    _end_memory_pred((_dependency_node != nullptr) ? _dependency_node->memory_pred_edges_length() : 0)
{
  if (_node->is_Store() || _node->is_Load()) {
    // Load: address
    // Store: address, value
    _next_pred = MemNode::Address;
  } else {
    assert(!_node->is_Mem(), "only loads and stores are expected mem nodes");
    _next_pred = 1; // skip control
  }
  next(); // advance to the first pred
}
397
void VLoopDependencyGraph::PredsIterator::next() {
398
if (_next_pred < _end_pred) {
399
_current = _node->in(_next_pred++);
400
} else if (_next_memory_pred < _end_memory_pred) {
401
int pred_bb_idx = _dependency_node->memory_pred_edge(_next_memory_pred++);
402
_current = _dependency_graph._body.body().at(pred_bb_idx);
404
_current = nullptr; // done
409
int VPointer::Tracer::_depth = 0;
412
VPointer::VPointer(MemNode* const mem, const VLoop& vloop,
413
Node_Stack* nstack, bool analyze_only) :
414
_mem(mem), _vloop(vloop),
415
_base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
417
_debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
419
_nstack(nstack), _analyze_only(analyze_only), _stack_idx(0)
421
, _tracer(vloop.is_trace_pointer_analysis())
424
NOT_PRODUCT(_tracer.ctor_1(mem);)
426
Node* adr = mem->in(MemNode::Address);
427
if (!adr->is_AddP()) {
428
assert(!valid(), "too complex");
431
// Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
432
Node* base = adr->in(AddPNode::Base);
433
// The base address should be loop invariant
434
if (is_loop_member(base)) {
435
assert(!valid(), "base address is loop variant");
438
// unsafe references require misaligned vector access support
439
if (base->is_top() && !Matcher::misaligned_vectors_ok()) {
440
assert(!valid(), "unsafe access");
444
NOT_PRODUCT(if(_tracer._is_trace_alignment) _tracer.store_depth();)
445
NOT_PRODUCT(_tracer.ctor_2(adr);)
449
NOT_PRODUCT(_tracer.ctor_3(adr, i);)
451
if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
452
assert(!valid(), "too complex");
455
adr = adr->in(AddPNode::Address);
456
NOT_PRODUCT(_tracer.ctor_4(adr, i);)
458
if (base == adr || !adr->is_AddP()) {
459
NOT_PRODUCT(_tracer.ctor_5(adr, base, i);)
460
break; // stop looking at addp's
463
if (!invariant(adr)) {
464
// The address must be invariant for the current loop. But if we are in a main-loop,
465
// it must also be invariant of the pre-loop, otherwise we cannot use this address
466
// for the pre-loop limit adjustment required for main-loop alignment.
467
assert(!valid(), "adr is loop variant");
471
if (!base->is_top() && adr != base) {
472
assert(!valid(), "adr and base differ");
476
NOT_PRODUCT(if(_tracer._is_trace_alignment) _tracer.restore_depth();)
477
NOT_PRODUCT(_tracer.ctor_6(mem);)
479
// In the pointer analysis, and especially the AlignVector, analysis we assume that
480
// stride and scale are not too large. For example, we multiply "scale * stride",
481
// and assume that this does not overflow the int range. We also take "abs(scale)"
482
// and "abs(stride)", which would overflow for min_int = -(2^31). Still, we want
483
// to at least allow small and moderately large stride and scale. Therefore, we
484
// allow values up to 2^30, which is only a factor 2 smaller than the max/min int.
485
// Normal performance relevant code will have much lower values. And the restriction
486
// allows us to keep the rest of the autovectorization code much simpler, since we
487
// do not have to deal with overflows.
488
jlong long_scale = _scale;
489
jlong long_stride = _vloop.iv_stride();
490
jlong max_val = 1 << 30;
491
if (abs(long_scale) >= max_val ||
492
abs(long_stride) >= max_val ||
493
abs(long_scale * long_stride) >= max_val) {
494
assert(!valid(), "adr stride*scale is too large");
500
assert(valid(), "Usable");
503
// Following is used to create a temporary object during
504
// the pattern match of an address expression.
505
VPointer::VPointer(VPointer* p) :
506
_mem(p->_mem), _vloop(p->_vloop),
507
_base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
509
_debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
511
_nstack(p->_nstack), _analyze_only(p->_analyze_only), _stack_idx(p->_stack_idx)
513
, _tracer(p->_tracer._is_trace_alignment)
517
// Biggest detectable factor of the invariant.
518
int VPointer::invar_factor() const {
523
int opc = n->Opcode();
524
if (opc == Op_LShiftI && n->in(2)->is_Con()) {
525
return 1 << n->in(2)->get_int();
526
} else if (opc == Op_LShiftL && n->in(2)->is_Con()) {
527
return 1 << n->in(2)->get_int();
529
// All our best-effort has failed.
533
// True iff n's controlling block is inside the current loop nest.
bool VPointer::is_loop_member(Node* n) const {
  Node* n_c = phase()->get_ctrl(n);
  return lpt()->is_member(phase()->get_loop(n_c));
}
538
// True iff n is invariant for the current loop; for a main-loop it must
// additionally dominate the pre-loop head so it can be used in the
// pre-loop limit adjustment.
bool VPointer::invariant(Node* n) const {
  NOT_PRODUCT(Tracer::Depth dd;)
  bool is_not_member = !is_loop_member(n);
  if (is_not_member) {
    CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
    if (cl->is_main_loop()) {
      // Check that n_c dominates the pre loop head node. If it does not, then
      // we cannot use n as invariant for the pre loop CountedLoopEndNode check
      // because n_c is either part of the pre loop or between the pre and the
      // main loop (Illegal invariant happens when n_c is a CastII node that
      // prevents data nodes to flow above the main loop).
      Node* n_c = phase()->get_ctrl(n);
      return phase()->is_dominator(n_c, _vloop.pre_loop_head());
    }
  }
  return is_not_member;
}
556
// Match: k*iv + offset
557
// where: k is a constant that maybe zero, and
558
// offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
559
bool VPointer::scaled_iv_plus_offset(Node* n) {
560
NOT_PRODUCT(Tracer::Depth ddd;)
561
NOT_PRODUCT(_tracer.scaled_iv_plus_offset_1(n);)
564
NOT_PRODUCT(_tracer.scaled_iv_plus_offset_2(n);)
568
if (offset_plus_k(n)) {
569
NOT_PRODUCT(_tracer.scaled_iv_plus_offset_3(n);)
573
int opc = n->Opcode();
574
if (opc == Op_AddI) {
575
if (offset_plus_k(n->in(2)) && scaled_iv_plus_offset(n->in(1))) {
576
NOT_PRODUCT(_tracer.scaled_iv_plus_offset_4(n);)
579
if (offset_plus_k(n->in(1)) && scaled_iv_plus_offset(n->in(2))) {
580
NOT_PRODUCT(_tracer.scaled_iv_plus_offset_5(n);)
583
} else if (opc == Op_SubI || opc == Op_SubL) {
584
if (offset_plus_k(n->in(2), true) && scaled_iv_plus_offset(n->in(1))) {
585
NOT_PRODUCT(_tracer.scaled_iv_plus_offset_6(n);)
588
if (offset_plus_k(n->in(1)) && scaled_iv_plus_offset(n->in(2))) {
590
NOT_PRODUCT(_tracer.scaled_iv_plus_offset_7(n);)
595
NOT_PRODUCT(_tracer.scaled_iv_plus_offset_8(n);)
599
// Match: k*iv where k is a constant that's not zero
600
bool VPointer::scaled_iv(Node* n) {
601
NOT_PRODUCT(Tracer::Depth ddd;)
602
NOT_PRODUCT(_tracer.scaled_iv_1(n);)
604
if (_scale != 0) { // already found a scale
605
NOT_PRODUCT(_tracer.scaled_iv_2(n, _scale);)
611
NOT_PRODUCT(_tracer.scaled_iv_3(n, _scale);)
614
if (_analyze_only && (is_loop_member(n))) {
615
_nstack->push(n, _stack_idx++);
618
int opc = n->Opcode();
619
if (opc == Op_MulI) {
620
if (n->in(1) == iv() && n->in(2)->is_Con()) {
621
_scale = n->in(2)->get_int();
622
NOT_PRODUCT(_tracer.scaled_iv_4(n, _scale);)
624
} else if (n->in(2) == iv() && n->in(1)->is_Con()) {
625
_scale = n->in(1)->get_int();
626
NOT_PRODUCT(_tracer.scaled_iv_5(n, _scale);)
629
} else if (opc == Op_LShiftI) {
630
if (n->in(1) == iv() && n->in(2)->is_Con()) {
631
_scale = 1 << n->in(2)->get_int();
632
NOT_PRODUCT(_tracer.scaled_iv_6(n, _scale);)
635
} else if (opc == Op_ConvI2L || opc == Op_CastII) {
636
if (scaled_iv_plus_offset(n->in(1))) {
637
NOT_PRODUCT(_tracer.scaled_iv_7(n);)
640
} else if (opc == Op_LShiftL && n->in(2)->is_Con()) {
642
// Need to preserve the current _offset value, so
643
// create a temporary object for this expression subtree.
644
// Hacky, so should re-engineer the address pattern match.
645
NOT_PRODUCT(Tracer::Depth dddd;)
647
NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);)
649
if (tmp.scaled_iv_plus_offset(n->in(1))) {
650
int scale = n->in(2)->get_int();
651
_scale = tmp._scale << scale;
652
_offset += tmp._offset << scale;
653
if (tmp._invar != nullptr) {
654
BasicType bt = tmp._invar->bottom_type()->basic_type();
655
assert(bt == T_INT || bt == T_LONG, "");
656
maybe_add_to_invar(register_if_new(LShiftNode::make(tmp._invar, n->in(2), bt)), false);
658
_debug_invar_scale = n->in(2);
661
NOT_PRODUCT(_tracer.scaled_iv_9(n, _scale, _offset, _invar);)
666
NOT_PRODUCT(_tracer.scaled_iv_10(n);)
670
// Match: offset is (k [+/- invariant])
671
// where k maybe zero and invariant is optional, but not both.
672
bool VPointer::offset_plus_k(Node* n, bool negate) {
673
NOT_PRODUCT(Tracer::Depth ddd;)
674
NOT_PRODUCT(_tracer.offset_plus_k_1(n);)
676
int opc = n->Opcode();
677
if (opc == Op_ConI) {
678
_offset += negate ? -(n->get_int()) : n->get_int();
679
NOT_PRODUCT(_tracer.offset_plus_k_2(n, _offset);)
681
} else if (opc == Op_ConL) {
682
// Okay if value fits into an int
683
const TypeLong* t = n->find_long_type();
684
if (t->higher_equal(TypeLong::INT)) {
685
jlong loff = n->get_long();
686
jint off = (jint)loff;
687
_offset += negate ? -off : loff;
688
NOT_PRODUCT(_tracer.offset_plus_k_3(n, _offset);)
691
NOT_PRODUCT(_tracer.offset_plus_k_4(n);)
694
assert((_debug_invar == nullptr) == (_invar == nullptr), "");
696
if (_analyze_only && is_loop_member(n)) {
697
_nstack->push(n, _stack_idx++);
699
if (opc == Op_AddI) {
700
if (n->in(2)->is_Con() && invariant(n->in(1))) {
701
maybe_add_to_invar(n->in(1), negate);
702
_offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
703
NOT_PRODUCT(_tracer.offset_plus_k_6(n, _invar, negate, _offset);)
705
} else if (n->in(1)->is_Con() && invariant(n->in(2))) {
706
_offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
707
maybe_add_to_invar(n->in(2), negate);
708
NOT_PRODUCT(_tracer.offset_plus_k_7(n, _invar, negate, _offset);)
712
if (opc == Op_SubI) {
713
if (n->in(2)->is_Con() && invariant(n->in(1))) {
714
maybe_add_to_invar(n->in(1), negate);
715
_offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
716
NOT_PRODUCT(_tracer.offset_plus_k_8(n, _invar, negate, _offset);)
718
} else if (n->in(1)->is_Con() && invariant(n->in(2))) {
719
_offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
720
maybe_add_to_invar(n->in(2), !negate);
721
NOT_PRODUCT(_tracer.offset_plus_k_9(n, _invar, !negate, _offset);)
726
if (!is_loop_member(n)) {
727
// 'n' is loop invariant. Skip ConvI2L and CastII nodes before checking if 'n' is dominating the pre loop.
728
if (opc == Op_ConvI2L) {
731
if (n->Opcode() == Op_CastII) {
733
assert(!is_loop_member(n), "sanity");
736
// Check if 'n' can really be used as invariant (not in main loop and dominating the pre loop).
738
maybe_add_to_invar(n, negate);
739
NOT_PRODUCT(_tracer.offset_plus_k_10(n, _invar, negate, _offset);)
744
NOT_PRODUCT(_tracer.offset_plus_k_11(n);)
748
// If negate is set, materialize (0 - invar) as a new (or existing) node;
// otherwise return invar unchanged.
Node* VPointer::maybe_negate_invar(bool negate, Node* invar) {
#ifdef ASSERT
  _debug_negate_invar = negate;
#endif
  if (negate) {
    BasicType bt = invar->bottom_type()->basic_type();
    assert(bt == T_INT || bt == T_LONG, "");
    PhaseIterGVN& igvn = phase()->igvn();
    Node* zero = igvn.zerocon(bt);
    phase()->set_ctrl(zero, phase()->C->root());
    Node* sub = SubNode::make(zero, invar, bt);
    invar = register_if_new(sub);
  }
  return invar;
}
764
// GVN-register n: if an identical node already exists, destroy n and
// return the existing one; otherwise register n at its early control.
Node* VPointer::register_if_new(Node* n) const {
  PhaseIterGVN& igvn = phase()->igvn();
  Node* prev = igvn.hash_find_insert(n);
  if (prev != nullptr) {
    // Duplicate: reuse the previously registered node.
    n->destruct(&igvn);
    n = prev;
  } else {
    Node* c = phase()->get_early_ctrl(n);
    phase()->register_new_node(n, c);
  }
  return n;
}
777
// Accumulate a (possibly negated) invariant term into _invar, promoting
// int operands to long with ConvI2L when the two terms differ in type.
void VPointer::maybe_add_to_invar(Node* new_invar, bool negate) {
  new_invar = maybe_negate_invar(negate, new_invar);
  if (_invar == nullptr) {
    // First invariant term: just record it.
    _invar = new_invar;
#ifdef ASSERT
    _debug_invar = new_invar;
#endif
    return;
  }
#ifdef ASSERT
  _debug_invar = NodeSentinel;
#endif
  BasicType new_invar_bt = new_invar->bottom_type()->basic_type();
  assert(new_invar_bt == T_INT || new_invar_bt == T_LONG, "");
  BasicType invar_bt = _invar->bottom_type()->basic_type();
  assert(invar_bt == T_INT || invar_bt == T_LONG, "");

  // The sum is long if either term is long.
  BasicType bt = (new_invar_bt == T_LONG || invar_bt == T_LONG) ? T_LONG : T_INT;
  Node* current_invar = _invar;
  if (invar_bt != bt) {
    assert(bt == T_LONG && invar_bt == T_INT, "");
    assert(new_invar_bt == bt, "");
    current_invar = register_if_new(new ConvI2LNode(current_invar));
  } else if (new_invar_bt != bt) {
    assert(bt == T_LONG && new_invar_bt == T_INT, "");
    assert(invar_bt == bt, "");
    new_invar = register_if_new(new ConvI2LNode(new_invar));
  }
  Node* add = AddNode::make(current_invar, new_invar, bt);
  _invar = register_if_new(add);
}
809
// We use two comparisons, because a subtraction could underflow.
810
#define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \
811
if (a < b) { return -1; } \
812
if (a > b) { return 1; }
814
// To be in the same group, two VPointers must be the same,
815
// except for the offset.
816
int VPointer::cmp_for_sort_by_group(const VPointer** p1, const VPointer** p2) {
817
const VPointer* a = *p1;
818
const VPointer* b = *p2;
820
RETURN_CMP_VALUE_IF_NOT_EQUAL(a->base()->_idx, b->base()->_idx);
821
RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->Opcode(), b->mem()->Opcode());
822
RETURN_CMP_VALUE_IF_NOT_EQUAL(a->scale_in_bytes(), b->scale_in_bytes());
824
int a_inva_idx = a->invar() == nullptr ? 0 : a->invar()->_idx;
825
int b_inva_idx = b->invar() == nullptr ? 0 : b->invar()->_idx;
826
RETURN_CMP_VALUE_IF_NOT_EQUAL(a_inva_idx, b_inva_idx);
831
// We compare by group, then by offset, and finally by node idx.
832
int VPointer::cmp_for_sort(const VPointer** p1, const VPointer** p2) {
833
int cmp_group = cmp_for_sort_by_group(p1, p2);
834
if (cmp_group != 0) { return cmp_group; }
836
const VPointer* a = *p1;
837
const VPointer* b = *p2;
839
RETURN_CMP_VALUE_IF_NOT_EQUAL(a->offset_in_bytes(), b->offset_in_bytes());
840
RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->_idx, b->mem()->_idx);
845
// Function for printing the fields of a VPointer
846
void VPointer::print() const {
847
tty->print("VPointer[mem: %4d %10s, ", _mem->_idx, _mem->Name());
850
tty->print_cr("invalid]");
854
tty->print("base: %4d, ", _base != nullptr ? _base->_idx : 0);
855
tty->print("adr: %4d, ", _adr != nullptr ? _adr->_idx : 0);
858
print_con_or_idx(_base);
860
tty->print(" + offset(%4d)", _offset);
862
tty->print(" + invar");
863
print_con_or_idx(_invar);
865
tty->print_cr(" + scale(%4d) * iv]", _scale);
869
// Following are functions for tracing VPointer match
871
void VPointer::Tracer::print_depth() const {
872
for (int ii = 0; ii < _depth; ++ii) {
877
void VPointer::Tracer::ctor_1(const Node* mem) {
878
if (_is_trace_alignment) {
879
print_depth(); tty->print(" %d VPointer::VPointer: start alignment analysis", mem->_idx); mem->dump();
883
void VPointer::Tracer::ctor_2(Node* adr) {
884
if (_is_trace_alignment) {
887
print_depth(); tty->print(" %d (adr) VPointer::VPointer: ", adr->_idx); adr->dump();
889
print_depth(); tty->print(" %d (base) VPointer::VPointer: ", adr->in(AddPNode::Base)->_idx); adr->in(AddPNode::Base)->dump();
893
void VPointer::Tracer::ctor_3(Node* adr, int i) {
894
if (_is_trace_alignment) {
896
Node* offset = adr->in(AddPNode::Offset);
897
print_depth(); tty->print(" %d (offset) VPointer::VPointer: i = %d: ", offset->_idx, i); offset->dump();
901
void VPointer::Tracer::ctor_4(Node* adr, int i) {
902
if (_is_trace_alignment) {
904
print_depth(); tty->print(" %d (adr) VPointer::VPointer: i = %d: ", adr->_idx, i); adr->dump();
908
void VPointer::Tracer::ctor_5(Node* adr, Node* base, int i) {
909
if (_is_trace_alignment) {
912
print_depth(); tty->print_cr(" \\ %d (adr) == %d (base) VPointer::VPointer: breaking analysis at i = %d", adr->_idx, base->_idx, i);
913
} else if (!adr->is_AddP()) {
914
print_depth(); tty->print_cr(" \\ %d (adr) is NOT Addp VPointer::VPointer: breaking analysis at i = %d", adr->_idx, i);
919
void VPointer::Tracer::ctor_6(const Node* mem) {
920
if (_is_trace_alignment) {
922
print_depth(); tty->print_cr(" %d (adr) VPointer::VPointer: stop analysis", mem->_idx);
926
void VPointer::Tracer::scaled_iv_plus_offset_1(Node* n) {
927
if (_is_trace_alignment) {
928
print_depth(); tty->print(" %d VPointer::scaled_iv_plus_offset testing node: ", n->_idx);
933
void VPointer::Tracer::scaled_iv_plus_offset_2(Node* n) {
934
if (_is_trace_alignment) {
935
print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: PASSED", n->_idx);
939
void VPointer::Tracer::scaled_iv_plus_offset_3(Node* n) {
940
if (_is_trace_alignment) {
941
print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: PASSED", n->_idx);
945
void VPointer::Tracer::scaled_iv_plus_offset_4(Node* n) {
946
if (_is_trace_alignment) {
947
print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
948
print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
949
print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
953
void VPointer::Tracer::scaled_iv_plus_offset_5(Node* n) {
954
if (_is_trace_alignment) {
955
print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
956
print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
957
print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
961
void VPointer::Tracer::scaled_iv_plus_offset_6(Node* n) {
962
if (_is_trace_alignment) {
963
print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_%s PASSED", n->_idx, n->Name());
964
print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
965
print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
969
void VPointer::Tracer::scaled_iv_plus_offset_7(Node* n) {
970
if (_is_trace_alignment) {
971
print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_%s PASSED", n->_idx, n->Name());
972
print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
973
print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
977
void VPointer::Tracer::scaled_iv_plus_offset_8(Node* n) {
978
if (_is_trace_alignment) {
979
print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: FAILED", n->_idx);
983
void VPointer::Tracer::scaled_iv_1(Node* n) {
984
if (_is_trace_alignment) {
985
print_depth(); tty->print(" %d VPointer::scaled_iv: testing node: ", n->_idx); n->dump();
989
void VPointer::Tracer::scaled_iv_2(Node* n, int scale) {
990
if (_is_trace_alignment) {
991
print_depth(); tty->print_cr(" %d VPointer::scaled_iv: FAILED since another _scale has been detected before", n->_idx);
992
print_depth(); tty->print_cr(" \\ VPointer::scaled_iv: _scale (%d) != 0", scale);
996
void VPointer::Tracer::scaled_iv_3(Node* n, int scale) {
997
if (_is_trace_alignment) {
998
print_depth(); tty->print_cr(" %d VPointer::scaled_iv: is iv, setting _scale = %d", n->_idx, scale);
1002
void VPointer::Tracer::scaled_iv_4(Node* n, int scale) {
1003
if (_is_trace_alignment) {
1004
print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
1005
print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
1006
print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
1010
void VPointer::Tracer::scaled_iv_5(Node* n, int scale) {
1011
if (_is_trace_alignment) {
1012
print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
1013
print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(2) is iv: ", n->in(2)->_idx); n->in(2)->dump();
1014
print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
1018
void VPointer::Tracer::scaled_iv_6(Node* n, int scale) {
1019
if (_is_trace_alignment) {
1020
print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_LShiftI PASSED, setting _scale = %d", n->_idx, scale);
1021
print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
1022
print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
1026
// Trace: ConvI2L matched; recurse into in(1) which is a scaled_iv_plus_offset.
void VPointer::Tracer::scaled_iv_7(Node* n) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_ConvI2L PASSED", n->_idx);
    print_depth(); tty->print_cr(" \\ VPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset: ", n->in(1)->_idx);
    inc_depth(); inc_depth();
    print_depth(); n->in(1)->dump();
    dec_depth(); dec_depth();
  }
}
// Trace: LShiftL matched; a temporary VPointer is created for the sub-expression.
void VPointer::Tracer::scaled_iv_8(Node* n, VPointer* tmp) {
  if (_is_trace_alignment) {
    print_depth(); tty->print(" %d VPointer::scaled_iv: Op_LShiftL, creating tmp VPointer: ", n->_idx); tmp->print();
  }
}
// Trace: LShiftL fully matched; reports resulting scale/offset and any scaled invariant.
void VPointer::Tracer::scaled_iv_9(Node* n, int scale, int offset, Node* invar) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_LShiftL PASSED, setting _scale = %d, _offset = %d", n->_idx, scale, offset);
    print_depth(); tty->print_cr(" \\ VPointer::scaled_iv: in(1) [%d] is scaled_iv_plus_offset, in(2) [%d] used to scale: _scale = %d, _offset = %d",
                                 n->in(1)->_idx, n->in(2)->_idx, scale, offset);
    if (invar != nullptr) {
      print_depth(); tty->print_cr(" \\ VPointer::scaled_iv: scaled invariant: [%d]", invar->_idx);
    }
    inc_depth(); inc_depth();
    print_depth(); n->in(1)->dump();
    print_depth(); n->in(2)->dump();
    if (invar != nullptr) {
      print_depth(); invar->dump();
    }
    dec_depth(); dec_depth();
  }
}
// Trace: scaled_iv analysis failed for node n.
void VPointer::Tracer::scaled_iv_10(Node* n) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv: FAILED", n->_idx);
  }
}
// Trace: offset_plus_k analysis begins inspecting node n.
void VPointer::Tracer::offset_plus_k_1(Node* n) {
  if (_is_trace_alignment) {
    print_depth(); tty->print(" %d VPointer::offset_plus_k: testing node: ", n->_idx); n->dump();
  }
}
// Trace: integer constant matched; offset recorded.
void VPointer::Tracer::offset_plus_k_2(Node* n, int _offset) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_ConI PASSED, setting _offset = %d", n->_idx, _offset);
  }
}
// Trace: long constant matched; offset recorded.
void VPointer::Tracer::offset_plus_k_3(Node* n, int _offset) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_ConL PASSED, setting _offset = %d", n->_idx, _offset);
  }
}
// Trace: long constant too large to be used as an int offset.
void VPointer::Tracer::offset_plus_k_4(Node* n) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: FAILED", n->_idx);
    print_depth(); tty->print_cr(" \\ " JLONG_FORMAT " VPointer::offset_plus_k: Op_ConL FAILED, k is too big", n->get_long());
  }
}
// Trace: failed because an invariant was already recorded earlier.
void VPointer::Tracer::offset_plus_k_5(Node* n, Node* _invar) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: FAILED since another invariant has been detected before", n->_idx);
    print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: _invar is not null: ", _invar->_idx); _invar->dump();
  }
}
// Trace: AddI matched with invariant in slot 1 and constant in slot 2.
void VPointer::Tracer::offset_plus_k_6(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_AddI PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d",
                                 n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
  }
}
// Trace: AddI matched with the operands swapped (constant in slot 1, invariant in slot 2).
void VPointer::Tracer::offset_plus_k_7(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_AddI PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d",
                                 n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
  }
}
// Trace: SubI matched with invariant in slot 1 and constant in slot 2.
void VPointer::Tracer::offset_plus_k_8(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_SubI is PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d",
                                 n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
  }
}
// Trace: SubI matched with the operands swapped (constant in slot 1, invariant in slot 2).
void VPointer::Tracer::offset_plus_k_9(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_SubI PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
  }
}
// Trace: node n itself is the invariant term.
void VPointer::Tracer::offset_plus_k_10(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print_cr(" \\ %d VPointer::offset_plus_k: is invariant", n->_idx);
  }
}
// Trace: offset_plus_k analysis failed for node n.
void VPointer::Tracer::offset_plus_k_11(Node* n) {
  if (_is_trace_alignment) {
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: FAILED", n->_idx);
  }
}
// Compute the AlignmentSolution for the vector memory access "mem_ref": find the
// pre-loop iteration counts ("pre_iter") for which the access is guaranteed to be
// aw-aligned in every main-loop iteration. The result is either trivial (any
// pre_iter works), empty (alignment cannot be guaranteed statically), or
// constrained (pre_iter = m * q + r, possibly adjusted by invar/init terms).
AlignmentSolution* AlignmentSolver::solve() const {
  DEBUG_ONLY( trace_start_solve(); )

  // Out of simplicity: non power-of-2 stride not supported.
  if (!is_power_of_2(abs(_pre_stride))) {
    return new EmptyAlignmentSolution("non power-of-2 stride not supported");
  }
  assert(is_power_of_2(abs(_main_stride)), "main_stride is power of 2");
  assert(_aw > 0 && is_power_of_2(_aw), "aw must be power of 2");

  // Out of simplicity: non power-of-2 scale not supported.
  if (abs(_scale) == 0 || !is_power_of_2(abs(_scale))) {
    return new EmptyAlignmentSolution("non power-of-2 scale not supported");
  }

  // We analyze the address of mem_ref. The idea is to disassemble it into a linear
  // expression, where we can use the constant factors as the basis for ensuring the
  // alignment of vector memory accesses.
  //
  // The Simple form of the address is disassembled by VPointer into:
  //
  //   adr = base + offset + invar + scale * iv
  //
  // Where the iv can be written as:
  //
  //   iv = init + pre_stride * pre_iter + main_stride * main_iter
  //
  //   init:        value before pre-loop
  //   pre_stride:  increment per pre-loop iteration
  //   pre_iter:    number of pre-loop iterations (adjustable via pre-loop limit)
  //   main_stride: increment per main-loop iteration (= pre_stride * unroll_factor)
  //   main_iter:   number of main-loop iterations (main_iter >= 0)
  //
  // In the following, we restate the Simple form of the address expression, by first
  // expanding the iv variable. In a second step, we reshape the expression again, and
  // state it as a linear expression, consisting of 6 terms.
  //
  //          Simple form           Expansion of iv variable              Reshaped with constants   Comments for terms
  //          -----------           ------------------------              -----------------------   ------------------
  //   adr =  base                = base                                = base                      (base % aw = 0)
  //        + offset              + offset                              + C_const                   (sum of constant terms)
  //        + invar               + invar_factor * var_invar            + C_invar * var_invar       (term for invariant)
  //                          /   + scale * init                        + C_init * var_init         (term for variable init)
  //        + scale * iv  ->  |   + scale * pre_stride * pre_iter       + C_pre * pre_iter          (adjustable pre-loop term)
  //                          \   + scale * main_stride * main_iter     + C_main * main_iter        (main-loop term)
  //
  // We describe the 6 terms:
  //   1) The "base" of the address is the address of a Java object (e.g. array),
  //      and as such ObjectAlignmentInBytes (a power of 2) aligned. We have
  //      defined aw = MIN(vector_width, ObjectAlignmentInBytes), which is also
  //      a power of 2. And hence we know that "base" is thus also aw-aligned:
  //
  //        base % ObjectAlignmentInBytes = 0     ==>     base % aw = 0
  //
  //   2) The "C_const" term is the sum of all constant terms. This is "offset",
  //      plus "scale * init" if it is constant.
  //   3) The "C_invar * var_invar" is the factorization of "invar" into a constant
  //      and variable term. If there is no invariant, then "C_invar" is zero.
  //
  //        invar = C_invar * var_invar                                            (FAC_INVAR)
  //
  //   4) The "C_init * var_init" is the factorization of "scale * init" into a
  //      constant and a variable term. If "init" is constant, then "C_init" is
  //      zero, and "C_const" accounts for "init" instead.
  //
  //        scale * init = C_init * var_init + scale * C_const_init                (FAC_INIT)
  //        C_init       = (init is constant) ? 0    : scale
  //        C_const_init = (init is constant) ? init : 0
  //
  //   5) The "C_pre * pre_iter" term represents how much the iv is incremented
  //      during the "pre_iter" pre-loop iterations. This term can be adjusted
  //      by changing the pre-loop limit, which defines how many pre-loop iterations
  //      are executed. This allows us to adjust the alignment of the main-loop
  //      memory reference.
  //   6) The "C_main * main_iter" term represents how much the iv is increased
  //      during "main_iter" main-loop iterations.

  // Attribute init (i.e. _init_node) either to C_const or to C_init term.
  const int C_const_init = _init_node->is_ConI() ? _init_node->as_ConI()->get_int() : 0;
  const int C_const = _offset + C_const_init * _scale;

  // Set C_invar depending on if invar is present
  const int C_invar = (_invar == nullptr) ? 0 : abs(_invar_factor);

  const int C_init = _init_node->is_ConI() ? 0 : _scale;
  const int C_pre  = _scale * _pre_stride;
  const int C_main = _scale * _main_stride;

  DEBUG_ONLY( trace_reshaped_form(C_const, C_const_init, C_invar, C_init, C_pre, C_main); )

  // We must find a pre_iter, such that adr is aw aligned: adr % aw = 0. Note, that we are defining the
  // modulo operator "%" such that the remainder is always positive, see AlignmentSolution::mod(i, q).
  //
  // Since "base % aw = 0", we only need to ensure alignment of the other 5 terms:
  //
  //   (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter + C_main * main_iter) % aw = 0      (1)
  //
  // Alignment must be maintained over all main-loop iterations, i.e. for any main_iter >= 0, we require:
  //
  //   C_main % aw = 0                                                                                           (2)
  //
  const int C_main_mod_aw = AlignmentSolution::mod(C_main, _aw);

  DEBUG_ONLY( trace_main_iteration_alignment(C_const, C_invar, C_init, C_pre, C_main, C_main_mod_aw); )

  if (C_main_mod_aw != 0) {
    return new EmptyAlignmentSolution("EQ(2) not satisfied (cannot align across main-loop iterations)");
  }

  // In what follows, we need to show that the C_const, init and invar terms can be aligned by
  // adjusting the pre-loop iteration count (pre_iter), which is controlled by the pre-loop
  // limit.
  //
  //   (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter) % aw = 0                           (3)
  //
  // We strengthen the constraints by splitting the equation into 3 equations, where we
  // want to find integer solutions for pre_iter_C_const, pre_iter_C_invar, and
  // pre_iter_C_init, which means that the C_const, init and invar terms can be aligned
  // individually:
  //
  //   (C_const             + C_pre * pre_iter_C_const) % aw = 0                 (4a)
  //   (C_invar * var_invar + C_pre * pre_iter_C_invar) % aw = 0                 (4b)
  //   (C_init  * var_init  + C_pre * pre_iter_C_init ) % aw = 0                 (4c)
  //
  // We now prove that (4a, b, c) are sufficient as well as necessary to guarantee (3)
  // for any runtime value of var_invar and var_init (i.e. for any invar and init).
  // This tells us that the "strengthening" does not restrict the algorithm more than
  // necessary.
  //
  // Sufficient (i.e (4a, b, c) imply (3)):
  //
  //   pre_iter = pre_iter_C_const + pre_iter_C_invar + pre_iter_C_init
  //
  // Adding up (4a, b, c):
  //
  //   0 = (  C_const             + C_pre * pre_iter_C_const
  //        + C_invar * var_invar + C_pre * pre_iter_C_invar
  //        + C_init  * var_init  + C_pre * pre_iter_C_init  ) % aw
  //
  //     = (  C_const + C_invar * var_invar + C_init * var_init
  //        + C_pre * (pre_iter_C_const + pre_iter_C_invar + pre_iter_C_init)) % aw
  //
  //     = (  C_const + C_invar * var_invar + C_init * var_init
  //        + C_pre * pre_iter) % aw
  //
  // Necessary (i.e. (3) implies (4a, b, c)):
  //  (4a): Set var_invar = var_init = 0 at runtime. Applying this to (3), we get:
  //
  //        0 =
  //          = (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter) % aw
  //          = (C_const + C_invar * 0         + C_init * 0        + C_pre * pre_iter) % aw
  //          = (C_const                                           + C_pre * pre_iter) % aw
  //
  //        This is of the same form as (4a), and we have a solution:
  //        pre_iter_C_const = pre_iter
  //
  //  (4b): Set var_init = 0, and assume (4a), which we just proved is implied by (3).
  //        Subtract (4a) from (3):
  //
  //        0 =
  //          =  (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter) % aw
  //           - (C_const + C_pre * pre_iter_C_const) % aw
  //          =  (C_invar * var_invar + C_init * var_init + C_pre * pre_iter - C_pre * pre_iter_C_const) % aw
  //          =  (C_invar * var_invar + C_init * 0 + C_pre * (pre_iter - pre_iter_C_const)) % aw
  //          =  (C_invar * var_invar +            + C_pre * (pre_iter - pre_iter_C_const)) % aw
  //
  //        This is of the same form as (4b), and we have a solution:
  //        pre_iter_C_invar = pre_iter - pre_iter_C_const
  //
  //  (4c): Set var_invar = 0, and assume (4a), which we just proved is implied by (3).
  //        Subtract (4a) from (3):
  //
  //        0 =
  //          =  (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter) % aw
  //           - (C_const + C_pre * pre_iter_C_const) % aw
  //          =  (C_invar * var_invar + C_init * var_init + C_pre * pre_iter - C_pre * pre_iter_C_const) % aw
  //          =  (C_invar * 0 + C_init * var_init + C_pre * (pre_iter - pre_iter_C_const)) % aw
  //          =  (            + C_init * var_init + C_pre * (pre_iter - pre_iter_C_const)) % aw
  //
  //        This is of the same form as (4c), and we have a solution:
  //        pre_iter_C_init = pre_iter - pre_iter_C_const
  //        (fixed: the original comment said "pre_iter_C_invar" here - copy-paste typo)
  //
  // The solutions of Equations (4a, b, c) for pre_iter_C_const, pre_iter_C_invar, and pre_iter_C_init
  // respectively, can have one of these states:
  //
  //   trivial:     The solution can be any integer.
  //   constrained: There is a (periodic) solution, but it is not trivial.
  //   empty:       Statically we cannot guarantee a solution for all var_invar and var_init.
  //
  // We discuss (4a); the analysis for (4b) and (4c) is analogous.
  //
  // Case: abs(C_pre) >= aw
  // -> Since abs(C_pre) is a power of two, we have C_pre % aw = 0. Therefore:
  //
  //      For any pre_iter_C_const: (C_pre * pre_iter_C_const) % aw = 0
  //      ==>
  //      (C_const + C_pre * pre_iter_C_const) % aw = 0
  //
  //    Hence, we can only satisfy (4a) if C_Const is aw aligned:
  //
  //    C_const % aw == 0:
  //    -> (4a) has a trivial solution since we can choose any value for pre_iter_C_const.
  //
  //    C_const % aw != 0:
  //    -> (4a) has an empty solution since no pre_iter_C_const can achieve aw alignment.
  //
  // Case: abs(C_pre) < aw
  // -> Since both abs(C_pre) and aw are powers of two, we know:
  //
  //      There exists integer x > 1: aw = abs(C_pre) * x
  //
  //    C_const % abs(C_pre) == 0:
  //    -> There exists integer z: C_const = C_pre * z
  //
  //        (C_const   + C_pre * pre_iter_C_const) % aw               = 0
  //        ==>
  //        (C_pre * z + C_pre * pre_iter_C_const) % aw               = 0
  //        ==>
  //        (C_pre * z + C_pre * pre_iter_C_const) % (abs(C_pre) * x) = 0
  //        ==>
  //        (        z +         pre_iter_C_const) % x                = 0
  //        ==>
  //        for any m: pre_iter_C_const = m * x - z
  //
  //       Hence, pre_iter_C_const has a non-trivial (because x > 1) periodic (periodicity x)
  //       solution, i.e. it has a constrained solution.
  //
  //    C_const % abs(C_pre) != 0:
  //      There exists integer x > 1: aw = abs(C_pre) * x
  //
  //        C_const % abs(C_pre) != 0
  //        ==>
  //        (C_const + C_pre * pre_iter_C_const) % abs(C_pre) != 0
  //        ==>
  //        (C_const + C_pre * pre_iter_C_const) % (abs(C_pre) * x) != 0
  //        ==>
  //        (C_const + C_pre * pre_iter_C_const) % aw != 0
  //
  //      This is in contradiction with (4a), and therefore there cannot be any solution,
  //      i.e. we have an empty solution.
  //
  // In summary, for (4a):
  //
  //   abs(C_pre) >= aw  AND  C_const % aw == 0          -> trivial
  //   abs(C_pre) >= aw  AND  C_const % aw != 0          -> empty
  //   abs(C_pre) <  aw  AND  C_const % abs(C_pre) == 0  -> constrained
  //   abs(C_pre) <  aw  AND  C_const % abs(C_pre) != 0  -> empty
  //
  // With analogue argumentation for (4b):
  //
  //   abs(C_pre) >= aw  AND  C_invar % aw == 0          -> trivial
  //   abs(C_pre) >= aw  AND  C_invar % aw != 0          -> empty
  //   abs(C_pre) <  aw  AND  C_invar % abs(C_pre) == 0  -> constrained
  //   abs(C_pre) <  aw  AND  C_invar % abs(C_pre) != 0  -> empty
  //
  // With analogue argumentation for (4c):
  //
  //   abs(C_pre) >= aw  AND  C_init % aw == 0           -> trivial
  //   abs(C_pre) >= aw  AND  C_init % aw != 0           -> empty
  //   abs(C_pre) <  aw  AND  C_init % abs(C_pre) == 0   -> constrained
  //   abs(C_pre) <  aw  AND  C_init % abs(C_pre) != 0   -> empty
  //
  // Out of these states follows the state for the solution of pre_iter:
  //
  //   Trivial:     If (4a, b, c) are all trivial.
  //   Empty:       If any of (4a, b, c) is empty, because then we cannot guarantee a solution
  //                for pre_iter, for all possible invar and init values.
  //   Constrained: Else. Incidentally, (4a, b, c) are all constrained themselves, as we argue below.

  const EQ4 eq4(C_const, C_invar, C_init, C_pre, _aw);
  const EQ4::State eq4a_state = eq4.eq4a_state();
  const EQ4::State eq4b_state = eq4.eq4b_state();
  const EQ4::State eq4c_state = eq4.eq4c_state();

  // NOTE(review): this debug trace block was reconstructed from dropped lines -
  // confirm the exact guard against the original file.
#ifdef ASSERT
  if (is_trace()) {
    eq4.trace();
  }
#endif

  // If (4a, b, c) are all trivial, then also the solution for pre_iter is trivial:
  if (eq4a_state == EQ4::State::TRIVIAL &&
      eq4b_state == EQ4::State::TRIVIAL &&
      eq4c_state == EQ4::State::TRIVIAL) {
    return new TrivialAlignmentSolution();
  }

  // If any of (4a, b, c) is empty, then we also cannot guarantee a solution for pre_iter, for
  // any init and invar, hence the solution for pre_iter is empty:
  if (eq4a_state == EQ4::State::EMPTY ||
      eq4b_state == EQ4::State::EMPTY ||
      eq4c_state == EQ4::State::EMPTY) {
    return new EmptyAlignmentSolution("EQ(4a, b, c) not all non-empty: cannot align const, invar and init terms individually");
  }

  // If abs(C_pre) >= aw, then the solutions to (4a, b, c) are all either trivial or empty, and
  // hence we would have found the solution to pre_iter above as either trivial or empty. Thus
  // we now know that:
  //
  //   abs(C_pre) < aw
  //
  assert(abs(C_pre) < _aw, "implied by constrained case");

  // And since abs(C_pre) < aw, the solutions of (4a, b, c) can now only be constrained or empty.
  // But since we already handled the empty case, the solutions are now all constrained.
  // Fix: the original assert tested eq4a_state three times, never checking (4b) and (4c).
  assert(eq4a_state == EQ4::State::CONSTRAINED &&
         eq4b_state == EQ4::State::CONSTRAINED &&
         eq4c_state == EQ4::State::CONSTRAINED, "all must be constrained now");

  // And since they are all constrained, we must have:
  //
  //   C_const % abs(C_pre) = 0                                              (5a)
  //   C_invar % abs(C_pre) = 0                                              (5b)
  //   C_init  % abs(C_pre) = 0                                              (5c)
  //
  assert(AlignmentSolution::mod(C_const, abs(C_pre)) == 0, "EQ(5a): C_const must be alignable");
  assert(AlignmentSolution::mod(C_invar, abs(C_pre)) == 0, "EQ(5b): C_invar must be alignable");
  assert(AlignmentSolution::mod(C_init,  abs(C_pre)) == 0, "EQ(5c): C_init must be alignable");

  // With (5a, b, c), we know that there are integers X, Y, Z:
  //
  //   C_const = X * abs(C_pre)   ==>   X = C_const / abs(C_pre)             (6a)
  //   C_invar = Y * abs(C_pre)   ==>   Y = C_invar / abs(C_pre)             (6b)
  //   C_init  = Z * abs(C_pre)   ==>   Z = C_init  / abs(C_pre)             (6c)
  //
  // Further, we define:
  //
  //   sign(C_pre) = C_pre / abs(C_pre) = (C_pre > 0) ? 1 : -1,              (7)
  //
  // We know that abs(C_pre) as well as aw are powers of 2, and since (5) we can define integer q:
  //
  //   q = aw / abs(C_pre)                                                   (8)
  //
  const int q = _aw / abs(C_pre);

  assert(q >= 2, "implied by constrained solution");

  // We now know that all terms in (4a, b, c) are divisible by abs(C_pre):
  //
  //   (C_const                    / abs(C_pre) + C_pre * pre_iter_C_const /  abs(C_pre)) % (aw / abs(C_pre)) =
  //   (X * abs(C_pre)             / abs(C_pre) + C_pre * pre_iter_C_const /  abs(C_pre)) % (aw / abs(C_pre)) =
  //   (X                                       +         pre_iter_C_const * sign(C_pre)) % q = 0               (9a)
  //
  //   -> pre_iter_C_const * sign(C_pre) = mx1 * q -               X
  //   -> pre_iter_C_const               = mx2 * q - sign(C_pre) * X                                            (10a)
  //      (for any integers mx1, mx2)
  //
  //   (C_invar        * var_invar / abs(C_pre) + C_pre * pre_iter_C_invar /  abs(C_pre)) % (aw / abs(C_pre)) =
  //   (Y * abs(C_pre) * var_invar / abs(C_pre) + C_pre * pre_iter_C_invar /  abs(C_pre)) % (aw / abs(C_pre)) =
  //   (Y              * var_invar              +         pre_iter_C_invar * sign(C_pre)) % q = 0               (9b)
  //
  //   -> pre_iter_C_invar * sign(C_pre) = my1 * q -               Y * var_invar
  //   -> pre_iter_C_invar               = my2 * q - sign(C_pre) * Y * var_invar                                (10b)
  //      (for any integers my1, my2)
  //
  //   (C_init         * var_init  / abs(C_pre) + C_pre * pre_iter_C_init  /  abs(C_pre)) % (aw / abs(C_pre)) =
  //   (Z * abs(C_pre) * var_init  / abs(C_pre) + C_pre * pre_iter_C_init  /  abs(C_pre)) % (aw / abs(C_pre)) =
  //   (Z              * var_init               +         pre_iter_C_init  * sign(C_pre)) % q = 0               (9c)
  //
  //   -> pre_iter_C_init  * sign(C_pre) = mz1 * q -               Z * var_init
  //   -> pre_iter_C_init                = mz2 * q - sign(C_pre) * Z * var_init                                 (10c)
  //      (for any integers mz1, mz2)
  //
  // Having solved the equations using the division, we can re-substitute X, Y, and Z, and apply (FAC_INVAR) as
  // well as (FAC_INIT). We use the fact that sign(x) == 1 / sign(x) and sign(x) * abs(x) == x:
  //
  //   pre_iter_C_const = mx2 * q - sign(C_pre) * X
  //                    = mx2 * q - sign(C_pre) * C_const / abs(C_pre)
  //                    = mx2 * q - C_const / C_pre
  //                    = mx2 * q - C_const / (scale * pre_stride)                               (11a)
  //
  // If there is an invariant:
  //
  //   pre_iter_C_invar = my2 * q - sign(C_pre) * Y * var_invar
  //                    = my2 * q - sign(C_pre) * C_invar * var_invar / abs(C_pre)
  //                    = my2 * q - sign(C_pre) * invar / abs(C_pre)
  //                    = my2 * q - invar / C_pre
  //                    = my2 * q - invar / (scale * pre_stride)                                 (11b, with invar)
  //
  // If there is no invariant (i.e. C_invar = 0 ==> Y = 0):
  //
  //   pre_iter_C_invar = my2 * q                                                                (11b, no invar)
  //
  // If init is variable (i.e. C_init = scale, init = var_init):
  //
  //   pre_iter_C_init  = mz2 * q - sign(C_pre) * Z * var_init
  //                    = mz2 * q - sign(C_pre) * C_init * var_init / abs(C_pre)
  //                    = mz2 * q - sign(C_pre) * scale * init / abs(C_pre)
  //                    = mz2 * q - scale * init / C_pre
  //                    = mz2 * q - scale * init / (scale * pre_stride)
  //                    = mz2 * q - init / pre_stride                                            (11c, variable init)
  //
  // If init is constant (i.e. C_init = 0 ==> Z = 0):
  //
  //   pre_iter_C_init  = mz2 * q                                                                (11c, constant init)
  //
  // Note, that the solutions found by (11a, b, c) are all periodic with periodicity q. We combine them,
  // with m = mx2 + my2 + mz2:
  //
  //   pre_iter =   pre_iter_C_const + pre_iter_C_invar + pre_iter_C_init
  //            =   mx2 * q  - C_const / (scale * pre_stride)
  //              + my2 * q [- invar / (scale * pre_stride) ]
  //              + mz2 * q [- init / pre_stride            ]
  //
  //            =   m * q                                 (periodic part)
  //              - C_const / (scale * pre_stride)        (align constant term)
  //             [- invar / (scale * pre_stride)   ]      (align invariant term, if present)
  //             [- init / pre_stride              ]      (align variable init term, if present)   (12)
  //
  // We can further simplify this solution by introducing integer 0 <= r < q:
  //
  //   r = (-C_const / (scale * pre_stride)) % q                                                  (13)
  //
  const int r = AlignmentSolution::mod(-C_const / (_scale * _pre_stride), q);
  //
  //   pre_iter = m * q + r
  //             [- invar / (scale * pre_stride)  ]
  //             [- init / pre_stride             ]                                               (14)
  //
  // We thus get a solution that can be stated in terms of:
  //
  //   q (periodicity), r (constant alignment), invar, scale, pre_stride, init
  //
  // However, pre_stride and init are shared by all mem_ref in the loop, hence we do not need to provide
  // them in the solution description.

  DEBUG_ONLY( trace_constrained_solution(C_const, C_invar, C_init, C_pre, q, r); )

  return new ConstrainedAlignmentSolution(_mem_ref, q, r, _invar, _scale);

  // We can now verify the success of the solution given by (12):
  //
  //   adr % aw =
  //
  //   -> Simple form
  //   (base + offset + invar + scale * iv) % aw =
  //
  //   -> Expand iv
  //   (base + offset + invar + scale * (init + pre_stride * pre_iter + main_stride * main_iter)) % aw =
  //
  //   -> Distribute scale
  //   (base + offset + invar
  //         + scale * init
  //         + scale * pre_stride * pre_iter
  //         + scale * main_stride * main_iter)) % aw =
  //
  //   -> base aligned: base % aw = 0
  //   -> main-loop iterations aligned (2): C_main % aw = (scale * main_stride) % aw = 0
  //   (offset + invar + scale * init + scale * pre_stride * pre_iter) % aw =
  //
  //   -> apply (12)
  //   (offset + invar + scale * init
  //           + scale * pre_stride * (m * q - C_const / (scale * pre_stride)
  //                                         [- invar / (scale * pre_stride) ]
  //                                         [- init / pre_stride            ]
  //                                  )
  //   ) % aw =
  //
  //   -> expand C_const = offset [+ init * scale]  (if init const)
  //   (offset + invar + scale * init
  //           + scale * pre_stride * (m * q - offset / (scale * pre_stride)
  //                                         [- init / pre_stride            ]  (if init constant)
  //                                         [- invar / (scale * pre_stride) ]  (if invar present)
  //                                         [- init / pre_stride            ]  (if init variable)
  //                                  )
  //   ) % aw =
  //
  //   -> assuming invar = 0 if it is not present
  //   -> merge the two init terms (variable or constant)
  //   -> apply (8): q = aw / (abs(C_pre)) = aw / abs(scale * pre_stride)
  //   -> and hence: (scale * pre_stride * q) % aw = 0
  //   -> all terms are canceled out
  //   (offset + invar + scale * init
  //           + scale * pre_stride * m * q                          -> aw aligned
  //           - scale * pre_stride * offset / (scale * pre_stride)  -> = offset
  //           - scale * pre_stride * init / pre_stride              -> = scale * init
  //           - scale * pre_stride * invar / (scale * pre_stride)   -> = invar
  //   ) % aw = 0
  //
  // The solution given by (12) does indeed guarantee alignment.
}
void AlignmentSolver::trace_start_solve() const {
1636
tty->print(" vector mem_ref:");
1638
tty->print_cr(" vector_width = vector_length(%d) * element_size(%d) = %d",
1639
_vector_length, _element_size, _vector_width);
1640
tty->print_cr(" aw = alignment_width = min(vector_width(%d), ObjectAlignmentInBytes(%d)) = %d",
1641
_vector_width, ObjectAlignmentInBytes, _aw);
1643
if (!_init_node->is_ConI()) {
1644
tty->print(" init:");
1648
if (_invar != nullptr) {
1649
tty->print(" invar:");
1653
tty->print_cr(" invar_factor = %d", _invar_factor);
1655
// iv = init + pre_iter * pre_stride + main_iter * main_stride
1656
tty->print(" iv = init");
1657
print_con_or_idx(_init_node);
1658
tty->print_cr(" + pre_iter * pre_stride(%d) + main_iter * main_stride(%d)",
1659
_pre_stride, _main_stride);
1661
// adr = base + offset + invar + scale * iv
1662
tty->print(" adr = base");
1663
print_con_or_idx(_base);
1664
tty->print(" + offset(%d) + invar", _offset);
1665
print_con_or_idx(_invar);
1666
tty->print_cr(" + scale(%d) * iv", _scale);
1670
void AlignmentSolver::trace_reshaped_form(const int C_const,
1671
const int C_const_init,
1675
const int C_main) const
1678
tty->print(" = base[%d] + ", _base->_idx);
1679
tty->print_cr("C_const(%d) + C_invar(%d) * var_invar + C_init(%d) * var_init + C_pre(%d) * pre_iter + C_main(%d) * main_iter",
1680
C_const, C_invar, C_init, C_pre, C_main);
1681
if (_init_node->is_ConI()) {
1682
tty->print_cr(" init is constant:");
1683
tty->print_cr(" C_const_init = %d", C_const_init);
1684
tty->print_cr(" C_init = %d", C_init);
1686
tty->print_cr(" init is variable:");
1687
tty->print_cr(" C_const_init = %d", C_const_init);
1688
tty->print_cr(" C_init = abs(scale)= %d", C_init);
1690
if (_invar != nullptr) {
1691
tty->print_cr(" invariant present:");
1692
tty->print_cr(" C_invar = abs(invar_factor) = %d", C_invar);
1694
tty->print_cr(" no invariant:");
1695
tty->print_cr(" C_invar = %d", C_invar);
1697
tty->print_cr(" C_const = offset(%d) + scale(%d) * C_const_init(%d) = %d",
1698
_offset, _scale, C_const_init, C_const);
1699
tty->print_cr(" C_pre = scale(%d) * pre_stride(%d) = %d",
1700
_scale, _pre_stride, C_pre);
1701
tty->print_cr(" C_main = scale(%d) * main_stride(%d) = %d",
1702
_scale, _main_stride, C_main);
1706
void AlignmentSolver::trace_main_iteration_alignment(const int C_const,
1711
const int C_main_mod_aw) const
1714
tty->print(" EQ(1 ): (C_const(%d) + C_invar(%d) * var_invar + C_init(%d) * var_init",
1715
C_const, C_invar, C_init);
1716
tty->print(" + C_pre(%d) * pre_iter + C_main(%d) * main_iter) %% aw(%d) = 0",
1717
C_pre, C_main, _aw);
1718
tty->print_cr(" (given base aligned -> align rest)");
1719
tty->print(" EQ(2 ): C_main(%d) %% aw(%d) = %d = 0",
1720
C_main, _aw, C_main_mod_aw);
1721
tty->print_cr(" (alignment across iterations)");
1725
void AlignmentSolver::EQ4::trace() const {
1726
tty->print_cr(" EQ(4a): (C_const(%3d) + C_pre(%d) * pre_iter_C_const) %% aw(%d) = 0 (align const term individually)",
1727
_C_const, _C_pre, _aw);
1728
tty->print_cr(" -> %s", state_to_str(eq4a_state()));
1730
tty->print_cr(" EQ(4b): (C_invar(%3d) * var_invar + C_pre(%d) * pre_iter_C_invar) %% aw(%d) = 0 (align invar term individually)",
1731
_C_invar, _C_pre, _aw);
1732
tty->print_cr(" -> %s", state_to_str(eq4b_state()));
1734
tty->print_cr(" EQ(4c): (C_init( %3d) * var_init + C_pre(%d) * pre_iter_C_init ) %% aw(%d) = 0 (align init term individually)",
1735
_C_init, _C_pre, _aw);
1736
tty->print_cr(" -> %s", state_to_str(eq4c_state()));
1739
void AlignmentSolver::trace_constrained_solution(const int C_const,
1747
tty->print_cr(" EQ(4a, b, c) all constrained, hence:");
1748
tty->print_cr(" EQ(5a): C_const(%3d) %% abs(C_pre(%d)) = 0", C_const, C_pre);
1749
tty->print_cr(" EQ(5b): C_invar(%3d) %% abs(C_pre(%d)) = 0", C_invar, C_pre);
1750
tty->print_cr(" EQ(5c): C_init( %3d) %% abs(C_pre(%d)) = 0", C_init, C_pre);
1752
tty->print_cr(" All terms in EQ(4a, b, c) are divisible by abs(C_pre(%d)).", C_pre);
1753
const int X = C_const / abs(C_pre);
1754
const int Y = C_invar / abs(C_pre);
1755
const int Z = C_init / abs(C_pre);
1756
const int sign = (C_pre > 0) ? 1 : -1;
1757
tty->print_cr(" X = C_const(%3d) / abs(C_pre(%d)) = %d (6a)", C_const, C_pre, X);
1758
tty->print_cr(" Y = C_invar(%3d) / abs(C_pre(%d)) = %d (6b)", C_invar, C_pre, Y);
1759
tty->print_cr(" Z = C_init( %3d) / abs(C_pre(%d)) = %d (6c)", C_init , C_pre, Z);
1760
tty->print_cr(" q = aw( %3d) / abs(C_pre(%d)) = %d (8)", _aw, C_pre, q);
1761
tty->print_cr(" sign(C_pre) = (C_pre(%d) > 0) ? 1 : -1 = %d (7)", C_pre, sign);
1763
tty->print_cr(" EQ(9a): (X(%3d) + pre_iter_C_const * sign(C_pre)) %% q(%d) = 0", X, q);
1764
tty->print_cr(" EQ(9b): (Y(%3d) * var_invar + pre_iter_C_invar * sign(C_pre)) %% q(%d) = 0", Y, q);
1765
tty->print_cr(" EQ(9c): (Z(%3d) * var_init + pre_iter_C_init * sign(C_pre)) %% q(%d) = 0", Z, q);
1767
tty->print_cr(" EQ(10a): pre_iter_C_const = mx2 * q(%d) - sign(C_pre) * X(%d)", q, X);
1768
tty->print_cr(" EQ(10b): pre_iter_C_invar = my2 * q(%d) - sign(C_pre) * Y(%d) * var_invar", q, Y);
1769
tty->print_cr(" EQ(10c): pre_iter_C_init = mz2 * q(%d) - sign(C_pre) * Z(%d) * var_init ", q, Z);
1771
tty->print_cr(" r = (-C_const(%d) / (scale(%d) * pre_stride(%d)) %% q(%d) = %d",
1772
C_const, _scale, _pre_stride, q, r);
1774
tty->print_cr(" EQ(14): pre_iter = m * q(%3d) - r(%d)", q, r);
1775
if (_invar != nullptr) {
1776
tty->print_cr(" - invar / (scale(%d) * pre_stride(%d))",
1777
_scale, _pre_stride);
1779
if (!_init_node->is_ConI()) {
1780
tty->print_cr(" - init / pre_stride(%d)",