jdk — vectorization.cpp (fork 0; 1785 lines, 70.6 KB)
1
/*
2
 * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved.
3
 * Copyright (c) 2023, Arm Limited. All rights reserved.
4
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5
 *
6
 * This code is free software; you can redistribute it and/or modify it
7
 * under the terms of the GNU General Public License version 2 only, as
8
 * published by the Free Software Foundation.
9
 *
10
 * This code is distributed in the hope that it will be useful, but WITHOUT
11
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13
 * version 2 for more details (a copy is included in the LICENSE file that
14
 * accompanied this code).
15
 *
16
 * You should have received a copy of the GNU General Public License version
17
 * 2 along with this work; if not, write to the Free Software Foundation,
18
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19
 *
20
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21
 * or visit www.oracle.com if you need additional information or have any
22
 * questions.
23
 */
24

25
#include "precompiled.hpp"
26
#include "opto/addnode.hpp"
27
#include "opto/connode.hpp"
28
#include "opto/convertnode.hpp"
29
#include "opto/mulnode.hpp"
30
#include "opto/rootnode.hpp"
31
#include "opto/vectorization.hpp"
32

33
#ifndef PRODUCT
34
static void print_con_or_idx(const Node* n) {
35
  if (n == nullptr) {
36
    tty->print("(   0)");
37
  } else if (n->is_ConI()) {
38
    jint val = n->as_ConI()->get_int();
39
    tty->print("(%4d)", val);
40
  } else {
41
    tty->print("[%4d]", n->_idx);
42
  }
43
}
44
#endif
45

46
// Check whether this loop is eligible for vectorization at all (see the
// helper for the individual checks). Returns true on success; on failure,
// optionally prints the failure reason when tracing is enabled.
bool VLoop::check_preconditions() {
#ifndef PRODUCT
  if (is_trace_preconditions()) {
    tty->print_cr("\nVLoop::check_preconditions");
    lpt()->dump_head();
    lpt()->head()->dump();
  }
#endif

  VStatus status = check_preconditions_helper();
  if (!status.is_success()) {
#ifndef PRODUCT
    if (is_trace_preconditions()) {
      tty->print_cr("VLoop::check_preconditions: failed: %s", status.failure_reason());
    }
#endif
    return false; // failure
  }
  return true; // success
}
66

67
// Check all static preconditions for vectorization: usable vector width,
// valid int counted loop, no (disallowed) control flow in the body, a single
// backedge user, and (for main loops) an adjustable pre-loop limit.
// On success, caches _cl, _iv, _cl_exit and (for main loops) _pre_loop_end.
VStatus VLoop::check_preconditions_helper() {
  // Only accept vector width that is power of 2
  int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
  if (vector_width < 2 || !is_power_of_2(vector_width)) {
    return VStatus::make_failure(VLoop::FAILURE_VECTOR_WIDTH);
  }

  // Only accept valid counted loops (int)
  if (!_lpt->_head->as_Loop()->is_valid_counted_loop(T_INT)) {
    return VStatus::make_failure(VLoop::FAILURE_VALID_COUNTED_LOOP);
  }
  _cl = _lpt->_head->as_CountedLoop();
  _iv = _cl->phi()->as_Phi();

  if (_cl->is_vectorized_loop()) {
    return VStatus::make_failure(VLoop::FAILURE_ALREADY_VECTORIZED);
  }

  if (_cl->is_unroll_only()) {
    return VStatus::make_failure(VLoop::FAILURE_UNROLL_ONLY);
  }

  // Check for control flow in the body: the loop exit test must be directly
  // controlled by the loop head, otherwise there is intervening CFG.
  _cl_exit = _cl->loopexit();
  bool has_cfg = _cl_exit->in(0) != _cl;
  if (has_cfg && !is_allow_cfg()) {
#ifndef PRODUCT
    if (is_trace_preconditions()) {
      tty->print_cr("VLoop::check_preconditions: fails because of control flow.");
      tty->print("  cl_exit %d", _cl_exit->_idx); _cl_exit->dump();
      tty->print("  cl_exit->in(0) %d", _cl_exit->in(0)->_idx); _cl_exit->in(0)->dump();
      tty->print("  lpt->_head %d", _cl->_idx); _cl->dump();
      _lpt->dump_head();
    }
#endif
    return VStatus::make_failure(VLoop::FAILURE_CONTROL_FLOW);
  }

  // Make sure there are no extra control users of the loop backedge
  if (_cl->back_control()->outcnt() != 1) {
    return VStatus::make_failure(VLoop::FAILURE_BACKEDGE);
  }

  // To align vector memory accesses in the main-loop, we will have to adjust
  // the pre-loop limit.
  if (_cl->is_main_loop()) {
    CountedLoopEndNode* pre_end = _cl->find_pre_loop_end();
    if (pre_end == nullptr) {
      return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
    }
    // The pre-loop limit must still be wrapped in an Opaque1, otherwise
    // it cannot be adjusted later for alignment.
    Node* pre_opaq1 = pre_end->limit();
    if (pre_opaq1->Opcode() != Op_Opaque1) {
      return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
    }
    _pre_loop_end = pre_end;
  }

  return VStatus::make_success();
}
126

127
// Return true iff all submodules are loaded successfully
128
bool VLoopAnalyzer::setup_submodules() {
129
#ifndef PRODUCT
130
  if (_vloop.is_trace_loop_analyzer()) {
131
    tty->print_cr("\nVLoopAnalyzer::setup_submodules");
132
    _vloop.lpt()->dump_head();
133
    _vloop.cl()->dump();
134
  }
135
#endif
136

137
  VStatus status = setup_submodules_helper();
138
  if (!status.is_success()) {
139
#ifndef PRODUCT
140
    if (_vloop.is_trace_loop_analyzer()) {
141
      tty->print_cr("\nVLoopAnalyze::setup_submodules: failed: %s", status.failure_reason());
142
    }
143
#endif
144
    return false; // failed
145
  }
146
  return true; // success
147
}
148

149
// Run all analysis submodules in order. Each step may bail out with a
// failure status; later submodules depend on the results of earlier ones.
VStatus VLoopAnalyzer::setup_submodules_helper() {
  // Skip any loop that has not been assigned max unroll by analysis.
  if (SuperWordLoopUnrollAnalysis && _vloop.cl()->slp_max_unroll() == 0) {
    return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_MAX_UNROLL);
  }

  if (SuperWordReductions) {
    _reductions.mark_reductions();
  }

  _memory_slices.find_memory_slices();

  // If there is no memory slice detected, it means there is no store.
  // If there is no reduction and no store, then we give up, because
  // vectorization is not possible anyway (given current limitations).
  if (!_reductions.is_marked_reduction_loop() &&
      _memory_slices.heads().is_empty()) {
    return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE);
  }

  VStatus body_status = _body.construct();
  if (!body_status.is_success()) {
    return body_status; // propagate the body's own failure reason
  }

  _types.compute_vector_element_type();

  _vpointers.compute_vpointers();

  _dependency_graph.construct();

  return VStatus::make_success();
}
182

183
// Compute and cache a VPointer for every memory node in the loop body:
// first count them, then allocate one flat array, then construct in place.
void VLoopVPointers::compute_vpointers() {
  count_vpointers();
  allocate_vpointers_array();
  compute_and_cache_vpointers();
  NOT_PRODUCT( if (_vloop.is_trace_vpointers()) { print(); } )
}
189

190
void VLoopVPointers::count_vpointers() {
191
  _vpointers_length = 0;
192
  _body.for_each_mem([&] (const MemNode* mem, int bb_idx) {
193
    _vpointers_length++;
194
  });
195
}
196

197
void VLoopVPointers::allocate_vpointers_array() {
198
  uint bytes = _vpointers_length * sizeof(VPointer);
199
  _vpointers = (VPointer*)_arena->Amalloc(bytes);
200
}
201

202
// Construct a VPointer for every memory node directly into the pre-allocated
// array, and record the mapping from body index (bb_idx) to array index.
void VLoopVPointers::compute_and_cache_vpointers() {
  int pointers_idx = 0;
  _body.for_each_mem([&] (MemNode* const mem, int bb_idx) {
    // Placement new: construct directly into the array.
    ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop);
    _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx);
    pointers_idx++;
  });
}
211

212
// Look up the cached VPointer for a memory node in the loop body,
// via the bb_idx -> array-index mapping built during caching.
const VPointer& VLoopVPointers::vpointer(const MemNode* mem) const {
  assert(mem != nullptr && _vloop.in_bb(mem), "only mem in loop");
  int bb_idx = _body.bb_idx(mem);
  int pointers_idx = _bb_idx_to_vpointer.at(bb_idx);
  assert(0 <= pointers_idx && pointers_idx < _vpointers_length, "valid range");
  return _vpointers[pointers_idx];
}
219

220
#ifndef PRODUCT
221
// Debug printing: one line per cached VPointer, in body order.
void VLoopVPointers::print() const {
  tty->print_cr("\nVLoopVPointers::print:");

  _body.for_each_mem([&] (const MemNode* mem, int bb_idx) {
    const VPointer& p = vpointer(mem);
    tty->print("  ");
    p.print();
  });
}
230
#endif
231

232
// Construct the dependency graph:
//  - Data-dependencies: implicit (taken from C2 node inputs).
//  - Memory-dependencies:
//    - No edges between different slices.
//    - No Load-Load edges.
//    - Inside a slice, add all Store-Load, Load-Store, Store-Store edges,
//      except if we can prove that the memory does not overlap.
void VLoopDependencyGraph::construct() {
  const GrowableArray<PhiNode*>& mem_slice_heads = _memory_slices.heads();
  const GrowableArray<MemNode*>& mem_slice_tails = _memory_slices.tails();

  ResourceMark rm;
  // Scratch buffers, reused across all slices (cleared per iteration).
  GrowableArray<MemNode*> slice_nodes;
  GrowableArray<int> memory_pred_edges;

  // For each memory slice, create the memory subgraph
  for (int i = 0; i < mem_slice_heads.length(); i++) {
    PhiNode* head = mem_slice_heads.at(i);
    MemNode* tail = mem_slice_tails.at(i);

    _memory_slices.get_slice_in_reverse_order(head, tail, slice_nodes);

    // In forward order (reverse of reverse), visit all memory nodes in the slice.
    for (int j = slice_nodes.length() - 1; j >= 0 ; j--) {
      MemNode* n1 = slice_nodes.at(j);
      memory_pred_edges.clear();

      const VPointer& p1 = _vpointers.vpointer(n1);
      // For all memory nodes before it, check if we need to add a memory edge.
      for (int k = slice_nodes.length() - 1; k > j; k--) {
        MemNode* n2 = slice_nodes.at(k);

        // Ignore Load-Load dependencies:
        if (n1->is_Load() && n2->is_Load()) { continue; }

        const VPointer& p2 = _vpointers.vpointer(n2);
        if (!VPointer::not_equal(p1.cmp(p2))) {
          // Possibly overlapping memory: must keep a dependency edge.
          memory_pred_edges.append(_body.bb_idx(n2));
        }
      }
      if (memory_pred_edges.is_nonempty()) {
        // Data edges are taken implicitly from the C2 graph, thus we only add
        // a dependency node if we have memory edges.
        add_node(n1, memory_pred_edges);
      }
    }
    slice_nodes.clear();
  }

  compute_depth();

  NOT_PRODUCT( if (_vloop.is_trace_dependency_graph()) { print(); } )
}
286

287
// Create the DependencyNode for n, recording its explicit memory predecessor
// edges. Called at most once per memory node, and only when edges exist.
void VLoopDependencyGraph::add_node(MemNode* n, GrowableArray<int>& memory_pred_edges) {
  assert(_dependency_nodes.at_grow(_body.bb_idx(n), nullptr) == nullptr, "not yet created");
  assert(!memory_pred_edges.is_empty(), "no need to create a node without edges");
  DependencyNode* dn = new (_arena) DependencyNode(n, memory_pred_edges, _arena);
  _dependency_nodes.at_put_grow(_body.bb_idx(n), dn, nullptr);
}
293

294
// Return the maximum depth over all in-loop predecessors (both implicit data
// edges and explicit memory edges). Phis are skipped so the loop backedge
// does not feed back into the depth computation.
int VLoopDependencyGraph::find_max_pred_depth(const Node* n) const {
  int max_pred_depth = 0;
  if (!n->is_Phi()) { // ignore backedge
    for (PredsIterator it(*this, n); !it.done(); it.next()) {
      Node* pred = it.current();
      if (_vloop.in_bb(pred)) {
        max_pred_depth = MAX2(max_pred_depth, depth(pred));
      }
    }
  }
  return max_pred_depth;
}
306

307
// We iterate over the body, which is already ordered by the dependencies, i.e. pred comes
// before use. With a single pass, we can compute the depth of every node, since we can
// assume that the depth of all preds is already computed when we compute the depth of use.
void VLoopDependencyGraph::compute_depth() {
  for (int i = 0; i < _body.body().length(); i++) {
    Node* n = _body.body().at(i);
    set_depth(n, find_max_pred_depth(n) + 1);
  }

#ifdef ASSERT
  // Verify: recomputing the depth of every node must reproduce the value
  // assigned in the single forward pass above.
  for (int i = 0; i < _body.body().length(); i++) {
    Node* n = _body.body().at(i);
    int max_pred_depth = find_max_pred_depth(n);
    if (depth(n) != max_pred_depth + 1) {
      print();
      tty->print_cr("Incorrect depth: %d vs %d", depth(n), max_pred_depth + 1);
      n->dump();
    }
    assert(depth(n) == max_pred_depth + 1, "must have correct depth");
  }
#endif
}
329

330
#ifndef PRODUCT
331
// Debug printing: first only the explicit memory edges, then the complete
// predecessor set (data + memory) with the computed depth of each node.
void VLoopDependencyGraph::print() const {
  tty->print_cr("\nVLoopDependencyGraph::print:");

  tty->print_cr(" Memory pred edges:");
  for (int i = 0; i < _body.body().length(); i++) {
    Node* n = _body.body().at(i);
    const DependencyNode* dn = dependency_node(n);
    if (dn != nullptr) {
      tty->print("  DependencyNode[%d %s:", n->_idx, n->Name());
      for (uint j = 0; j < dn->memory_pred_edges_length(); j++) {
        Node* pred = _body.body().at(dn->memory_pred_edge(j));
        tty->print("  %d %s", pred->_idx, pred->Name());
      }
      tty->print_cr("]");
    }
  }
  tty->cr();

  tty->print_cr(" Complete dependency graph:");
  for (int i = 0; i < _body.body().length(); i++) {
    Node* n = _body.body().at(i);
    tty->print("  d%02d Dependencies[%d %s:", depth(n), n->_idx, n->Name());
    for (PredsIterator it(*this, n); !it.done(); it.next()) {
      Node* pred = it.current();
      tty->print("  %d %s", pred->_idx, pred->Name());
    }
    tty->print_cr("]");
  }
}
360
#endif
361

362
// Snapshot the memory predecessor edges into an arena-allocated array, so
// the DependencyNode is independent of the caller's (reused) GrowableArray.
VLoopDependencyGraph::DependencyNode::DependencyNode(MemNode* n,
                                                     GrowableArray<int>& memory_pred_edges,
                                                     Arena* arena) :
    _node(n),
    _memory_pred_edges_length(memory_pred_edges.length()),
    _memory_pred_edges(nullptr)
{
  assert(memory_pred_edges.is_nonempty(), "not empty");
  uint bytes = memory_pred_edges.length() * sizeof(int);
  _memory_pred_edges = (int*)arena->Amalloc(bytes);
  memcpy(_memory_pred_edges, memory_pred_edges.adr_at(0), bytes);
}
374

375
// Iterate over all predecessors of a node: first the relevant C2 input edges
// (skipping control, and for mem nodes also the memory input), then the
// explicit memory predecessor edges from the dependency graph (if any).
VLoopDependencyGraph::PredsIterator::PredsIterator(const VLoopDependencyGraph& dependency_graph,
                                                   const Node* node) :
    _dependency_graph(dependency_graph),
    _node(node),
    _dependency_node(dependency_graph.dependency_node(node)),
    _current(nullptr),
    _next_pred(0),
    _end_pred(node->req()),
    _next_memory_pred(0),
    _end_memory_pred((_dependency_node != nullptr) ? _dependency_node->memory_pred_edges_length() : 0)
{
  if (_node->is_Store() || _node->is_Load()) {
    // Load: address
    // Store: address, value
    _next_pred = MemNode::Address;
  } else {
    assert(!_node->is_Mem(), "only loads and stores are expected mem nodes");
    _next_pred = 1; // skip control
  }
  // Position the iterator on the first predecessor.
  next();
}
396

397
void VLoopDependencyGraph::PredsIterator::next() {
398
  if (_next_pred < _end_pred) {
399
    _current = _node->in(_next_pred++);
400
  } else if (_next_memory_pred < _end_memory_pred) {
401
    int pred_bb_idx = _dependency_node->memory_pred_edge(_next_memory_pred++);
402
    _current = _dependency_graph._body.body().at(pred_bb_idx);
403
  } else {
404
    _current = nullptr; // done
405
  }
406
}
407

408
#ifndef PRODUCT
409
// Shared indentation depth for all pointer-analysis trace output.
int VPointer::Tracer::_depth = 0;
410
#endif
411

412
// Parse the address expression of mem into the canonical form
//   base + invar + scale * iv + offset
// by walking the chain of AddP nodes. If any step of the pattern match
// fails, the VPointer is left invalid (valid() == false) and we return.
VPointer::VPointer(MemNode* const mem, const VLoop& vloop,
                   Node_Stack* nstack, bool analyze_only) :
  _mem(mem), _vloop(vloop),
  _base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
#ifdef ASSERT
  _debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
#endif
  _nstack(nstack), _analyze_only(analyze_only), _stack_idx(0)
#ifndef PRODUCT
  , _tracer(vloop.is_trace_pointer_analysis())
#endif
{
  NOT_PRODUCT(_tracer.ctor_1(mem);)

  Node* adr = mem->in(MemNode::Address);
  if (!adr->is_AddP()) {
    assert(!valid(), "too complex");
    return;
  }
  // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
  Node* base = adr->in(AddPNode::Base);
  // The base address should be loop invariant
  if (is_loop_member(base)) {
    assert(!valid(), "base address is loop variant");
    return;
  }
  // unsafe references require misaligned vector access support
  if (base->is_top() && !Matcher::misaligned_vectors_ok()) {
    assert(!valid(), "unsafe access");
    return;
  }

  NOT_PRODUCT(if(_tracer._is_trace_alignment) _tracer.store_depth();)
  NOT_PRODUCT(_tracer.ctor_2(adr);)

  // Walk down the AddP chain, accumulating scale/offset/invar contributions
  // from each offset input, until we reach the base (or a non-AddP).
  int i;
  for (i = 0; ; i++) {
    NOT_PRODUCT(_tracer.ctor_3(adr, i);)

    if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
      assert(!valid(), "too complex");
      return;
    }
    adr = adr->in(AddPNode::Address);
    NOT_PRODUCT(_tracer.ctor_4(adr, i);)

    if (base == adr || !adr->is_AddP()) {
      NOT_PRODUCT(_tracer.ctor_5(adr, base, i);)
      break; // stop looking at addp's
    }
  }
  if (!invariant(adr)) {
    // The address must be invariant for the current loop. But if we are in a main-loop,
    // it must also be invariant of the pre-loop, otherwise we cannot use this address
    // for the pre-loop limit adjustment required for main-loop alignment.
    assert(!valid(), "adr is loop variant");
    return;
  }

  if (!base->is_top() && adr != base) {
    assert(!valid(), "adr and base differ");
    return;
  }

  NOT_PRODUCT(if(_tracer._is_trace_alignment) _tracer.restore_depth();)
  NOT_PRODUCT(_tracer.ctor_6(mem);)

  // In the pointer analysis, and especially the AlignVector analysis, we assume that
  // stride and scale are not too large. For example, we multiply "scale * stride",
  // and assume that this does not overflow the int range. We also take "abs(scale)"
  // and "abs(stride)", which would overflow for min_int = -(2^31). Still, we want
  // to at least allow small and moderately large stride and scale. Therefore, we
  // allow values up to 2^30, which is only a factor 2 smaller than the max/min int.
  // Normal performance relevant code will have much lower values. And the restriction
  // allows us to keep the rest of the autovectorization code much simpler, since we
  // do not have to deal with overflows.
  jlong long_scale  = _scale;
  jlong long_stride = _vloop.iv_stride();
  jlong max_val = 1 << 30;
  if (abs(long_scale) >= max_val ||
      abs(long_stride) >= max_val ||
      abs(long_scale * long_stride) >= max_val) {
    assert(!valid(), "adr stride*scale is too large");
    return;
  }

  // Setting _base and _adr (both non-null) marks this VPointer as valid.
  _base = base;
  _adr  = adr;
  assert(valid(), "Usable");
}
502

503
// Following is used to create a temporary object during
// the pattern match of an address expression. It shares the context of p
// (mem, loop, stack) but starts with a fresh scale/offset/invar state.
VPointer::VPointer(VPointer* p) :
  _mem(p->_mem), _vloop(p->_vloop),
  _base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
#ifdef ASSERT
  _debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
#endif
  _nstack(p->_nstack), _analyze_only(p->_analyze_only), _stack_idx(p->_stack_idx)
#ifndef PRODUCT
  , _tracer(p->_tracer._is_trace_alignment)
#endif
{}
516

517
// Biggest detectable factor of the invariant.
518
int VPointer::invar_factor() const {
519
  Node* n = invar();
520
  if (n == nullptr) {
521
    return 0;
522
  }
523
  int opc = n->Opcode();
524
  if (opc == Op_LShiftI && n->in(2)->is_Con()) {
525
    return 1 << n->in(2)->get_int();
526
  } else if (opc == Op_LShiftL && n->in(2)->is_Con()) {
527
    return 1 << n->in(2)->get_int();
528
  }
529
  // All our best-effort has failed.
530
  return 1;
531
}
532

533
// A node belongs to the loop iff the loop tree of its controlling node
// is a member of this loop.
bool VPointer::is_loop_member(Node* n) const {
  return lpt()->is_member(phase()->get_loop(phase()->get_ctrl(n)));
}
537

538
// A node is usable as an invariant if it is not a member of this loop, and,
// for main loops, additionally dominates the pre-loop head (so it can be
// used in the pre-loop limit adjustment).
bool VPointer::invariant(Node* n) const {
  NOT_PRODUCT(Tracer::Depth dd;)
  bool is_not_member = !is_loop_member(n);
  if (is_not_member) {
    CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
    if (cl->is_main_loop()) {
      // Check that n_c dominates the pre loop head node. If it does not, then
      // we cannot use n as invariant for the pre loop CountedLoopEndNode check
      // because n_c is either part of the pre loop or between the pre and the
      // main loop (Illegal invariant happens when n_c is a CastII node that
      // prevents data nodes to flow above the main loop).
      Node* n_c = phase()->get_ctrl(n);
      return phase()->is_dominator(n_c, _vloop.pre_loop_head());
    }
  }
  return is_not_member;
}
555

556
// Match: k*iv + offset
// where: k is a constant that maybe zero, and
//        offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
bool VPointer::scaled_iv_plus_offset(Node* n) {
  NOT_PRODUCT(Tracer::Depth ddd;)
  NOT_PRODUCT(_tracer.scaled_iv_plus_offset_1(n);)

  // Try the pure "k*iv" form first.
  if (scaled_iv(n)) {
    NOT_PRODUCT(_tracer.scaled_iv_plus_offset_2(n);)
    return true;
  }

  // Then the pure "offset" form (no iv at all).
  if (offset_plus_k(n)) {
    NOT_PRODUCT(_tracer.scaled_iv_plus_offset_3(n);)
    return true;
  }

  // Finally, decompose Add/Sub into a scaled-iv part and an offset part.
  int opc = n->Opcode();
  if (opc == Op_AddI) {
    if (offset_plus_k(n->in(2)) && scaled_iv_plus_offset(n->in(1))) {
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_4(n);)
      return true;
    }
    if (offset_plus_k(n->in(1)) && scaled_iv_plus_offset(n->in(2))) {
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_5(n);)
      return true;
    }
  } else if (opc == Op_SubI || opc == Op_SubL) {
    if (offset_plus_k(n->in(2), true) && scaled_iv_plus_offset(n->in(1))) {
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_6(n);)
      return true;
    }
    if (offset_plus_k(n->in(1)) && scaled_iv_plus_offset(n->in(2))) {
      // iv is on the subtrahend side: negate the scale.
      _scale *= -1;
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_7(n);)
      return true;
    }
  }

  NOT_PRODUCT(_tracer.scaled_iv_plus_offset_8(n);)
  return false;
}
598

599
// Match: k*iv where k is a constant that's not zero
bool VPointer::scaled_iv(Node* n) {
  NOT_PRODUCT(Tracer::Depth ddd;)
  NOT_PRODUCT(_tracer.scaled_iv_1(n);)

  if (_scale != 0) { // already found a scale
    NOT_PRODUCT(_tracer.scaled_iv_2(n, _scale);)
    return false;
  }

  // The iv itself: scale 1.
  if (n == iv()) {
    _scale = 1;
    NOT_PRODUCT(_tracer.scaled_iv_3(n, _scale);)
    return true;
  }
  if (_analyze_only && (is_loop_member(n))) {
    _nstack->push(n, _stack_idx++);
  }

  int opc = n->Opcode();
  if (opc == Op_MulI) {
    // iv * con or con * iv
    if (n->in(1) == iv() && n->in(2)->is_Con()) {
      _scale = n->in(2)->get_int();
      NOT_PRODUCT(_tracer.scaled_iv_4(n, _scale);)
      return true;
    } else if (n->in(2) == iv() && n->in(1)->is_Con()) {
      _scale = n->in(1)->get_int();
      NOT_PRODUCT(_tracer.scaled_iv_5(n, _scale);)
      return true;
    }
  } else if (opc == Op_LShiftI) {
    // iv << con is iv * 2^con
    if (n->in(1) == iv() && n->in(2)->is_Con()) {
      _scale = 1 << n->in(2)->get_int();
      NOT_PRODUCT(_tracer.scaled_iv_6(n, _scale);)
      return true;
    }
  } else if (opc == Op_ConvI2L || opc == Op_CastII) {
    // Look through int-to-long conversions and int casts.
    if (scaled_iv_plus_offset(n->in(1))) {
      NOT_PRODUCT(_tracer.scaled_iv_7(n);)
      return true;
    }
  } else if (opc == Op_LShiftL && n->in(2)->is_Con()) {
    if (!has_iv()) {
      // Need to preserve the current _offset value, so
      // create a temporary object for this expression subtree.
      // Hacky, so should re-engineer the address pattern match.
      NOT_PRODUCT(Tracer::Depth dddd;)
      VPointer tmp(this);
      NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);)

      if (tmp.scaled_iv_plus_offset(n->in(1))) {
        // Fold the shift into the scale, offset and (if present) invariant
        // that were matched in the subtree.
        int scale = n->in(2)->get_int();
        _scale   = tmp._scale  << scale;
        _offset += tmp._offset << scale;
        if (tmp._invar != nullptr) {
          BasicType bt = tmp._invar->bottom_type()->basic_type();
          assert(bt == T_INT || bt == T_LONG, "");
          maybe_add_to_invar(register_if_new(LShiftNode::make(tmp._invar, n->in(2), bt)), false);
#ifdef ASSERT
          _debug_invar_scale = n->in(2);
#endif
        }
        NOT_PRODUCT(_tracer.scaled_iv_9(n, _scale, _offset, _invar);)
        return true;
      }
    }
  }
  NOT_PRODUCT(_tracer.scaled_iv_10(n);)
  return false;
}
669

670
// Match: offset is (k [+/- invariant])
671
// where k maybe zero and invariant is optional, but not both.
672
bool VPointer::offset_plus_k(Node* n, bool negate) {
673
  NOT_PRODUCT(Tracer::Depth ddd;)
674
  NOT_PRODUCT(_tracer.offset_plus_k_1(n);)
675

676
  int opc = n->Opcode();
677
  if (opc == Op_ConI) {
678
    _offset += negate ? -(n->get_int()) : n->get_int();
679
    NOT_PRODUCT(_tracer.offset_plus_k_2(n, _offset);)
680
    return true;
681
  } else if (opc == Op_ConL) {
682
    // Okay if value fits into an int
683
    const TypeLong* t = n->find_long_type();
684
    if (t->higher_equal(TypeLong::INT)) {
685
      jlong loff = n->get_long();
686
      jint  off  = (jint)loff;
687
      _offset += negate ? -off : loff;
688
      NOT_PRODUCT(_tracer.offset_plus_k_3(n, _offset);)
689
      return true;
690
    }
691
    NOT_PRODUCT(_tracer.offset_plus_k_4(n);)
692
    return false;
693
  }
694
  assert((_debug_invar == nullptr) == (_invar == nullptr), "");
695

696
  if (_analyze_only && is_loop_member(n)) {
697
    _nstack->push(n, _stack_idx++);
698
  }
699
  if (opc == Op_AddI) {
700
    if (n->in(2)->is_Con() && invariant(n->in(1))) {
701
      maybe_add_to_invar(n->in(1), negate);
702
      _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
703
      NOT_PRODUCT(_tracer.offset_plus_k_6(n, _invar, negate, _offset);)
704
      return true;
705
    } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
706
      _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
707
      maybe_add_to_invar(n->in(2), negate);
708
      NOT_PRODUCT(_tracer.offset_plus_k_7(n, _invar, negate, _offset);)
709
      return true;
710
    }
711
  }
712
  if (opc == Op_SubI) {
713
    if (n->in(2)->is_Con() && invariant(n->in(1))) {
714
      maybe_add_to_invar(n->in(1), negate);
715
      _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
716
      NOT_PRODUCT(_tracer.offset_plus_k_8(n, _invar, negate, _offset);)
717
      return true;
718
    } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
719
      _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
720
      maybe_add_to_invar(n->in(2), !negate);
721
      NOT_PRODUCT(_tracer.offset_plus_k_9(n, _invar, !negate, _offset);)
722
      return true;
723
    }
724
  }
725

726
  if (!is_loop_member(n)) {
727
    // 'n' is loop invariant. Skip ConvI2L and CastII nodes before checking if 'n' is dominating the pre loop.
728
    if (opc == Op_ConvI2L) {
729
      n = n->in(1);
730
    }
731
    if (n->Opcode() == Op_CastII) {
732
      // Skip CastII nodes
733
      assert(!is_loop_member(n), "sanity");
734
      n = n->in(1);
735
    }
736
    // Check if 'n' can really be used as invariant (not in main loop and dominating the pre loop).
737
    if (invariant(n)) {
738
      maybe_add_to_invar(n, negate);
739
      NOT_PRODUCT(_tracer.offset_plus_k_10(n, _invar, negate, _offset);)
740
      return true;
741
    }
742
  }
743

744
  NOT_PRODUCT(_tracer.offset_plus_k_11(n);)
745
  return false;
746
}
747

748
// Return invar, or (0 - invar) if negate is set, registering the newly
// created subtraction node in the graph.
Node* VPointer::maybe_negate_invar(bool negate, Node* invar) {
#ifdef ASSERT
  _debug_negate_invar = negate;
#endif
  if (negate) {
    BasicType bt = invar->bottom_type()->basic_type();
    assert(bt == T_INT || bt == T_LONG, "");
    PhaseIterGVN& igvn = phase()->igvn();
    Node* zero = igvn.zerocon(bt);
    phase()->set_ctrl(zero, phase()->C->root());
    Node* sub = SubNode::make(zero, invar, bt);
    invar = register_if_new(sub);
  }
  return invar;
}
763

764
// GVN-style registration: if an identical node already exists, destroy n and
// return the existing one; otherwise register n at its early control.
Node* VPointer::register_if_new(Node* n) const {
  PhaseIterGVN& igvn = phase()->igvn();
  Node* prev = igvn.hash_find_insert(n);
  if (prev != nullptr) {
    n->destruct(&igvn);
    n = prev;
  } else {
    Node* c = phase()->get_early_ctrl(n);
    phase()->register_new_node(n, c);
  }
  return n;
}
776

777
// Accumulate new_invar (possibly negated) into _invar. If an invariant is
// already present, the two are added; int operands are widened to long with
// ConvI2L when the types differ.
void VPointer::maybe_add_to_invar(Node* new_invar, bool negate) {
  new_invar = maybe_negate_invar(negate, new_invar);
  if (_invar == nullptr) {
    // First invariant: just store it.
    _invar = new_invar;
#ifdef ASSERT
    _debug_invar = new_invar;
#endif
    return;
  }
#ifdef ASSERT
  // Mark that the invariant is now a combination of several nodes.
  _debug_invar = NodeSentinel;
#endif
  BasicType new_invar_bt = new_invar->bottom_type()->basic_type();
  assert(new_invar_bt == T_INT || new_invar_bt == T_LONG, "");
  BasicType invar_bt = _invar->bottom_type()->basic_type();
  assert(invar_bt == T_INT || invar_bt == T_LONG, "");

  // Result type: long if either operand is long, otherwise int.
  BasicType bt = (new_invar_bt == T_LONG || invar_bt == T_LONG) ? T_LONG : T_INT;
  Node* current_invar = _invar;
  if (invar_bt != bt) {
    assert(bt == T_LONG && invar_bt == T_INT, "");
    assert(new_invar_bt == bt, "");
    current_invar = register_if_new(new ConvI2LNode(current_invar));
  } else if (new_invar_bt != bt) {
    assert(bt == T_LONG && new_invar_bt == T_INT, "");
    assert(invar_bt == bt, "");
    new_invar = register_if_new(new ConvI2LNode(new_invar));
  }
  Node* add = AddNode::make(current_invar, new_invar, bt);
  _invar = register_if_new(add);
}
808

809
// We use two comparisons, because a subtraction could underflow.
810
#define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \
811
  if (a < b) { return -1; }                 \
812
  if (a > b) { return  1; }
813

814
// To be in the same group, two VPointers must be the same,
// except for the offset. Compares base, mem opcode, scale, and invariant
// (by node idx, 0 when absent).
int VPointer::cmp_for_sort_by_group(const VPointer** p1, const VPointer** p2) {
  const VPointer* a = *p1;
  const VPointer* b = *p2;

  RETURN_CMP_VALUE_IF_NOT_EQUAL(a->base()->_idx,     b->base()->_idx);
  RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->Opcode(),  b->mem()->Opcode());
  RETURN_CMP_VALUE_IF_NOT_EQUAL(a->scale_in_bytes(), b->scale_in_bytes());

  int a_inva_idx = a->invar() == nullptr ? 0 : a->invar()->_idx;
  int b_inva_idx = b->invar() == nullptr ? 0 : b->invar()->_idx;
  RETURN_CMP_VALUE_IF_NOT_EQUAL(a_inva_idx,          b_inva_idx);

  return 0; // equal
}
830

831
// We compare by group, then by offset, and finally by node idx.
832
int VPointer::cmp_for_sort(const VPointer** p1, const VPointer** p2) {
833
  int cmp_group = cmp_for_sort_by_group(p1, p2);
834
  if (cmp_group != 0) { return cmp_group; }
835

836
  const VPointer* a = *p1;
837
  const VPointer* b = *p2;
838

839
  RETURN_CMP_VALUE_IF_NOT_EQUAL(a->offset_in_bytes(), b->offset_in_bytes());
840
  RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->_idx,       b->mem()->_idx);
841
  return 0; // equal
842
}
843

844
#ifndef PRODUCT
845
// Function for printing the fields of a VPointer
void VPointer::print() const {
  tty->print("VPointer[mem: %4d %10s, ", _mem->_idx, _mem->Name());

  if (!valid()) {
    tty->print_cr("invalid]");
    return;
  }

  tty->print("base: %4d, ", _base != nullptr ? _base->_idx : 0);
  tty->print("adr: %4d, ", _adr != nullptr ? _adr->_idx : 0);

  // Canonical form: base + offset + invar + scale * iv
  tty->print(" base");
  print_con_or_idx(_base);

  tty->print(" + offset(%4d)", _offset);

  tty->print(" + invar");
  print_con_or_idx(_invar);

  tty->print_cr(" + scale(%4d) * iv]", _scale);
}
867
#endif
868

869
// Following are functions for tracing VPointer match
870
#ifndef PRODUCT
871
void VPointer::Tracer::print_depth() const {
872
  for (int ii = 0; ii < _depth; ++ii) {
873
    tty->print("  ");
874
  }
875
}
876

877
void VPointer::Tracer::ctor_1(const Node* mem) {
878
  if (_is_trace_alignment) {
879
    print_depth(); tty->print(" %d VPointer::VPointer: start alignment analysis", mem->_idx); mem->dump();
880
  }
881
}
882

883
void VPointer::Tracer::ctor_2(Node* adr) {
884
  if (_is_trace_alignment) {
885
    //store_depth();
886
    inc_depth();
887
    print_depth(); tty->print(" %d (adr) VPointer::VPointer: ", adr->_idx); adr->dump();
888
    inc_depth();
889
    print_depth(); tty->print(" %d (base) VPointer::VPointer: ", adr->in(AddPNode::Base)->_idx); adr->in(AddPNode::Base)->dump();
890
  }
891
}
892

893
void VPointer::Tracer::ctor_3(Node* adr, int i) {
894
  if (_is_trace_alignment) {
895
    inc_depth();
896
    Node* offset = adr->in(AddPNode::Offset);
897
    print_depth(); tty->print(" %d (offset) VPointer::VPointer: i = %d: ", offset->_idx, i); offset->dump();
898
  }
899
}
900

901
void VPointer::Tracer::ctor_4(Node* adr, int i) {
902
  if (_is_trace_alignment) {
903
    inc_depth();
904
    print_depth(); tty->print(" %d (adr) VPointer::VPointer: i = %d: ", adr->_idx, i); adr->dump();
905
  }
906
}
907

908
void VPointer::Tracer::ctor_5(Node* adr, Node* base, int i) {
909
  if (_is_trace_alignment) {
910
    inc_depth();
911
    if (base == adr) {
912
      print_depth(); tty->print_cr("  \\ %d (adr) == %d (base) VPointer::VPointer: breaking analysis at i = %d", adr->_idx, base->_idx, i);
913
    } else if (!adr->is_AddP()) {
914
      print_depth(); tty->print_cr("  \\ %d (adr) is NOT Addp VPointer::VPointer: breaking analysis at i = %d", adr->_idx, i);
915
    }
916
  }
917
}
918

919
void VPointer::Tracer::ctor_6(const Node* mem) {
920
  if (_is_trace_alignment) {
921
    //restore_depth();
922
    print_depth(); tty->print_cr(" %d (adr) VPointer::VPointer: stop analysis", mem->_idx);
923
  }
924
}
925

926
void VPointer::Tracer::scaled_iv_plus_offset_1(Node* n) {
927
  if (_is_trace_alignment) {
928
    print_depth(); tty->print(" %d VPointer::scaled_iv_plus_offset testing node: ", n->_idx);
929
    n->dump();
930
  }
931
}
932

933
void VPointer::Tracer::scaled_iv_plus_offset_2(Node* n) {
934
  if (_is_trace_alignment) {
935
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: PASSED", n->_idx);
936
  }
937
}
938

939
void VPointer::Tracer::scaled_iv_plus_offset_3(Node* n) {
940
  if (_is_trace_alignment) {
941
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: PASSED", n->_idx);
942
  }
943
}
944

945
void VPointer::Tracer::scaled_iv_plus_offset_4(Node* n) {
946
  if (_is_trace_alignment) {
947
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
948
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
949
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
950
  }
951
}
952

953
void VPointer::Tracer::scaled_iv_plus_offset_5(Node* n) {
954
  if (_is_trace_alignment) {
955
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
956
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
957
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
958
  }
959
}
960

961
void VPointer::Tracer::scaled_iv_plus_offset_6(Node* n) {
962
  if (_is_trace_alignment) {
963
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_%s PASSED", n->_idx, n->Name());
964
    print_depth(); tty->print("  \\  %d VPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
965
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
966
  }
967
}
968

969
void VPointer::Tracer::scaled_iv_plus_offset_7(Node* n) {
970
  if (_is_trace_alignment) {
971
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_%s PASSED", n->_idx, n->Name());
972
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
973
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
974
  }
975
}
976

977
void VPointer::Tracer::scaled_iv_plus_offset_8(Node* n) {
978
  if (_is_trace_alignment) {
979
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: FAILED", n->_idx);
980
  }
981
}
982

983
void VPointer::Tracer::scaled_iv_1(Node* n) {
984
  if (_is_trace_alignment) {
985
    print_depth(); tty->print(" %d VPointer::scaled_iv: testing node: ", n->_idx); n->dump();
986
  }
987
}
988

989
void VPointer::Tracer::scaled_iv_2(Node* n, int scale) {
990
  if (_is_trace_alignment) {
991
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv: FAILED since another _scale has been detected before", n->_idx);
992
    print_depth(); tty->print_cr("  \\ VPointer::scaled_iv: _scale (%d) != 0", scale);
993
  }
994
}
995

996
void VPointer::Tracer::scaled_iv_3(Node* n, int scale) {
997
  if (_is_trace_alignment) {
998
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv: is iv, setting _scale = %d", n->_idx, scale);
999
  }
1000
}
1001

1002
void VPointer::Tracer::scaled_iv_4(Node* n, int scale) {
1003
  if (_is_trace_alignment) {
1004
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
1005
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
1006
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
1007
  }
1008
}
1009

1010
void VPointer::Tracer::scaled_iv_5(Node* n, int scale) {
1011
  if (_is_trace_alignment) {
1012
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
1013
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv: in(2) is iv: ", n->in(2)->_idx); n->in(2)->dump();
1014
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
1015
  }
1016
}
1017

1018
void VPointer::Tracer::scaled_iv_6(Node* n, int scale) {
1019
  if (_is_trace_alignment) {
1020
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_LShiftI PASSED, setting _scale = %d", n->_idx, scale);
1021
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
1022
    print_depth(); tty->print("  \\ %d VPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
1023
  }
1024
}
1025

1026
void VPointer::Tracer::scaled_iv_7(Node* n) {
1027
  if (_is_trace_alignment) {
1028
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_ConvI2L PASSED", n->_idx);
1029
    print_depth(); tty->print_cr("  \\ VPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset: ", n->in(1)->_idx);
1030
    inc_depth(); inc_depth();
1031
    print_depth(); n->in(1)->dump();
1032
    dec_depth(); dec_depth();
1033
  }
1034
}
1035

1036
void VPointer::Tracer::scaled_iv_8(Node* n, VPointer* tmp) {
1037
  if (_is_trace_alignment) {
1038
    print_depth(); tty->print(" %d VPointer::scaled_iv: Op_LShiftL, creating tmp VPointer: ", n->_idx); tmp->print();
1039
  }
1040
}
1041

1042
void VPointer::Tracer::scaled_iv_9(Node* n, int scale, int offset, Node* invar) {
1043
  if (_is_trace_alignment) {
1044
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_LShiftL PASSED, setting _scale = %d, _offset = %d", n->_idx, scale, offset);
1045
    print_depth(); tty->print_cr("  \\ VPointer::scaled_iv: in(1) [%d] is scaled_iv_plus_offset, in(2) [%d] used to scale: _scale = %d, _offset = %d",
1046
    n->in(1)->_idx, n->in(2)->_idx, scale, offset);
1047
    if (invar != nullptr) {
1048
      print_depth(); tty->print_cr("  \\ VPointer::scaled_iv: scaled invariant: [%d]", invar->_idx);
1049
    }
1050
    inc_depth(); inc_depth();
1051
    print_depth(); n->in(1)->dump();
1052
    print_depth(); n->in(2)->dump();
1053
    if (invar != nullptr) {
1054
      print_depth(); invar->dump();
1055
    }
1056
    dec_depth(); dec_depth();
1057
  }
1058
}
1059

1060
void VPointer::Tracer::scaled_iv_10(Node* n) {
1061
  if (_is_trace_alignment) {
1062
    print_depth(); tty->print_cr(" %d VPointer::scaled_iv: FAILED", n->_idx);
1063
  }
1064
}
1065

1066
void VPointer::Tracer::offset_plus_k_1(Node* n) {
1067
  if (_is_trace_alignment) {
1068
    print_depth(); tty->print(" %d VPointer::offset_plus_k: testing node: ", n->_idx); n->dump();
1069
  }
1070
}
1071

1072
void VPointer::Tracer::offset_plus_k_2(Node* n, int _offset) {
1073
  if (_is_trace_alignment) {
1074
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_ConI PASSED, setting _offset = %d", n->_idx, _offset);
1075
  }
1076
}
1077

1078
void VPointer::Tracer::offset_plus_k_3(Node* n, int _offset) {
1079
  if (_is_trace_alignment) {
1080
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_ConL PASSED, setting _offset = %d", n->_idx, _offset);
1081
  }
1082
}
1083

1084
void VPointer::Tracer::offset_plus_k_4(Node* n) {
1085
  if (_is_trace_alignment) {
1086
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: FAILED", n->_idx);
1087
    print_depth(); tty->print_cr("  \\ " JLONG_FORMAT " VPointer::offset_plus_k: Op_ConL FAILED, k is too big", n->get_long());
1088
  }
1089
}
1090

1091
void VPointer::Tracer::offset_plus_k_5(Node* n, Node* _invar) {
1092
  if (_is_trace_alignment) {
1093
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: FAILED since another invariant has been detected before", n->_idx);
1094
    print_depth(); tty->print("  \\ %d VPointer::offset_plus_k: _invar is not null: ", _invar->_idx); _invar->dump();
1095
  }
1096
}
1097

1098
void VPointer::Tracer::offset_plus_k_6(Node* n, Node* _invar, bool _negate_invar, int _offset) {
1099
  if (_is_trace_alignment) {
1100
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_AddI PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d",
1101
    n->_idx, _negate_invar, _invar->_idx, _offset);
1102
    print_depth(); tty->print("  \\ %d VPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
1103
    print_depth(); tty->print("  \\ %d VPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
1104
  }
1105
}
1106

1107
void VPointer::Tracer::offset_plus_k_7(Node* n, Node* _invar, bool _negate_invar, int _offset) {
1108
  if (_is_trace_alignment) {
1109
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_AddI PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d",
1110
    n->_idx, _negate_invar, _invar->_idx, _offset);
1111
    print_depth(); tty->print("  \\ %d VPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
1112
    print_depth(); tty->print("  \\ %d VPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
1113
  }
1114
}
1115

1116
void VPointer::Tracer::offset_plus_k_8(Node* n, Node* _invar, bool _negate_invar, int _offset) {
1117
  if (_is_trace_alignment) {
1118
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_SubI is PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d",
1119
    n->_idx, _negate_invar, _invar->_idx, _offset);
1120
    print_depth(); tty->print("  \\ %d VPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
1121
    print_depth(); tty->print("  \\ %d VPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
1122
  }
1123
}
1124

1125
void VPointer::Tracer::offset_plus_k_9(Node* n, Node* _invar, bool _negate_invar, int _offset) {
1126
  if (_is_trace_alignment) {
1127
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_SubI PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
1128
    print_depth(); tty->print("  \\ %d VPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
1129
    print_depth(); tty->print("  \\ %d VPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
1130
  }
1131
}
1132

1133
void VPointer::Tracer::offset_plus_k_10(Node* n, Node* _invar, bool _negate_invar, int _offset) {
1134
  if (_is_trace_alignment) {
1135
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
1136
    print_depth(); tty->print_cr("  \\ %d VPointer::offset_plus_k: is invariant", n->_idx);
1137
  }
1138
}
1139

1140
void VPointer::Tracer::offset_plus_k_11(Node* n) {
1141
  if (_is_trace_alignment) {
1142
    print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: FAILED", n->_idx);
1143
  }
1144
}
1145
#endif
1146

1147

1148
AlignmentSolution* AlignmentSolver::solve() const {
1149
  DEBUG_ONLY( trace_start_solve(); )
1150

1151
  // Out of simplicity: non power-of-2 stride not supported.
1152
  if (!is_power_of_2(abs(_pre_stride))) {
1153
    return new EmptyAlignmentSolution("non power-of-2 stride not supported");
1154
  }
1155
  assert(is_power_of_2(abs(_main_stride)), "main_stride is power of 2");
1156
  assert(_aw > 0 && is_power_of_2(_aw), "aw must be power of 2");
1157

1158
  // Out of simplicity: non power-of-2 scale not supported.
1159
  if (abs(_scale) == 0 || !is_power_of_2(abs(_scale))) {
1160
    return new EmptyAlignmentSolution("non power-of-2 scale not supported");
1161
  }
1162

1163
  // We analyze the address of mem_ref. The idea is to disassemble it into a linear
1164
  // expression, where we can use the constant factors as the basis for ensuring the
1165
  // alignment of vector memory accesses.
1166
  //
1167
  // The Simple form of the address is disassembled by VPointer into:
1168
  //
1169
  //   adr = base + offset + invar + scale * iv
1170
  //
1171
  // Where the iv can be written as:
1172
  //
1173
  //   iv = init + pre_stride * pre_iter + main_stride * main_iter
1174
  //
1175
  // init:        value before pre-loop
1176
  // pre_stride:  increment per pre-loop iteration
1177
  // pre_iter:    number of pre-loop iterations (adjustable via pre-loop limit)
1178
  // main_stride: increment per main-loop iteration (= pre_stride * unroll_factor)
1179
  // main_iter:   number of main-loop iterations (main_iter >= 0)
1180
  //
1181
  // In the following, we restate the Simple form of the address expression, by first
1182
  // expanding the iv variable. In a second step, we reshape the expression again, and
1183
  // state it as a linear expression, consisting of 6 terms.
1184
  //
1185
  //          Simple form           Expansion of iv variable                  Reshaped with constants   Comments for terms
1186
  //          -----------           ------------------------                  -----------------------   ------------------
1187
  //   adr =  base               =  base                                   =  base                      (base % aw = 0)
1188
  //        + offset              + offset                                  + C_const                   (sum of constant terms)
1189
  //        + invar               + invar_factor * var_invar                + C_invar * var_invar       (term for invariant)
1190
  //                          /   + scale * init                            + C_init  * var_init        (term for variable init)
1191
  //        + scale * iv   -> |   + scale * pre_stride * pre_iter           + C_pre   * pre_iter        (adjustable pre-loop term)
1192
  //                          \   + scale * main_stride * main_iter         + C_main  * main_iter       (main-loop term)
1193
  //
1194
  // We describe the 6 terms:
1195
  //   1) The "base" of the address is the address of a Java object (e.g. array),
1196
  //      and as such ObjectAlignmentInBytes (a power of 2) aligned. We have
1197
  //      defined aw = MIN(vector_width, ObjectAlignmentInBytes), which is also
1198
  //      a power of 2. And hence we know that "base" is thus also aw-aligned:
1199
  //
1200
  //        base % ObjectAlignmentInBytes = 0     ==>    base % aw = 0
1201
  //
1202
  //   2) The "C_const" term is the sum of all constant terms. This is "offset",
1203
  //      plus "scale * init" if it is constant.
1204
  //   3) The "C_invar * var_invar" is the factorization of "invar" into a constant
1205
  //      and variable term. If there is no invariant, then "C_invar" is zero.
1206
  //
1207
  //        invar = C_invar * var_invar                                             (FAC_INVAR)
1208
  //
1209
  //   4) The "C_init * var_init" is the factorization of "scale * init" into a
1210
  //      constant and a variable term. If "init" is constant, then "C_init" is
1211
  //      zero, and "C_const" accounts for "init" instead.
1212
  //
1213
  //        scale * init = C_init * var_init + scale * C_const_init                 (FAC_INIT)
1214
  //        C_init       = (init is constant) ? 0    : scale
1215
  //        C_const_init = (init is constant) ? init : 0
1216
  //
1217
  //   5) The "C_pre * pre_iter" term represents how much the iv is incremented
1218
  //      during the "pre_iter" pre-loop iterations. This term can be adjusted
1219
  //      by changing the pre-loop limit, which defines how many pre-loop iterations
1220
  //      are executed. This allows us to adjust the alignment of the main-loop
1221
  //      memory reference.
1222
  //   6) The "C_main * main_iter" term represents how much the iv is increased
1223
  //      during "main_iter" main-loop iterations.
1224

1225
  // Attribute init (i.e. _init_node) either to C_const or to C_init term.
1226
  const int C_const_init = _init_node->is_ConI() ? _init_node->as_ConI()->get_int() : 0;
1227
  const int C_const =      _offset + C_const_init * _scale;
1228

1229
  // Set C_invar depending on if invar is present
1230
  const int C_invar = (_invar == nullptr) ? 0 : abs(_invar_factor);
1231

1232
  const int C_init = _init_node->is_ConI() ? 0 : _scale;
1233
  const int C_pre =  _scale * _pre_stride;
1234
  const int C_main = _scale * _main_stride;
1235

1236
  DEBUG_ONLY( trace_reshaped_form(C_const, C_const_init, C_invar, C_init, C_pre, C_main); )
1237

1238
  // We must find a pre_iter, such that adr is aw aligned: adr % aw = 0. Note, that we are defining the
1239
  // modulo operator "%" such that the remainder is always positive, see AlignmentSolution::mod(i, q).
1240
  //
1241
  // Since "base % aw = 0", we only need to ensure alignment of the other 5 terms:
1242
  //
1243
  //   (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter + C_main * main_iter) % aw = 0      (1)
1244
  //
1245
  // Alignment must be maintained over all main-loop iterations, i.e. for any main_iter >= 0, we require:
1246
  //
1247
  //   C_main % aw = 0                                                                                           (2)
1248
  //
1249
  const int C_main_mod_aw = AlignmentSolution::mod(C_main, _aw);
1250

1251
  DEBUG_ONLY( trace_main_iteration_alignment(C_const, C_invar, C_init, C_pre, C_main, C_main_mod_aw); )
1252

1253
  if (C_main_mod_aw != 0) {
1254
    return new EmptyAlignmentSolution("EQ(2) not satisfied (cannot align across main-loop iterations)");
1255
  }
1256

1257
  // In what follows, we need to show that the C_const, init and invar terms can be aligned by
1258
  // adjusting the pre-loop iteration count (pre_iter), which is controlled by the pre-loop
1259
  // limit.
1260
  //
1261
  //     (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter) % aw = 0                         (3)
1262
  //
1263
  // We strengthen the constraints by splitting the equation into 3 equations, where we
1264
  // want to find integer solutions for pre_iter_C_const, pre_iter_C_invar, and
1265
  // pre_iter_C_init, which means that the C_const, init and invar terms can be aligned
1266
  // independently:
1267
  //
1268
  //   (C_const             + C_pre * pre_iter_C_const) % aw = 0                 (4a)
1269
  //   (C_invar * var_invar + C_pre * pre_iter_C_invar) % aw = 0                 (4b)
1270
  //   (C_init  * var_init  + C_pre * pre_iter_C_init ) % aw = 0                 (4c)
1271
  //
1272
  // We now prove that (4a, b, c) are sufficient as well as necessary to guarantee (3)
1273
  // for any runtime value of var_invar and var_init (i.e. for any invar and init).
1274
  // This tells us that the "strengthening" does not restrict the algorithm more than
1275
  // necessary.
1276
  //
1277
  // Sufficient (i.e (4a, b, c) imply (3)):
1278
  //
1279
  //   pre_iter = pre_iter_C_const + pre_iter_C_invar + pre_iter_C_init
1280
  //
1281
  // Adding up (4a, b, c):
1282
  //
1283
  //   0 = (  C_const             + C_pre * pre_iter_C_const
1284
  //        + C_invar * var_invar + C_pre * pre_iter_C_invar
1285
  //        + C_init  * var_init  + C_pre * pre_iter_C_init  ) % aw
1286
  //
1287
  //     = (  C_const + C_invar * var_invar + C_init * var_init
1288
  //        + C_pre * (pre_iter_C_const + pre_iter_C_invar + pre_iter_C_init)) % aw
1289
  //
1290
  //     = (  C_const + C_invar * var_invar + C_init * var_init
1291
  //        + C_pre * pre_iter) % aw
1292
  //
1293
  // Necessary (i.e. (3) implies (4a, b, c)):
1294
  //  (4a): Set var_invar = var_init = 0 at runtime. Applying this to (3), we get:
1295
  //
1296
  //        0 =
1297
  //          = (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter) % aw
1298
  //          = (C_const + C_invar * 0         + C_init * 0        + C_pre * pre_iter) % aw
1299
  //          = (C_const                                           + C_pre * pre_iter) % aw
1300
  //
1301
  //        This is of the same form as (4a), and we have a solution:
1302
  //        pre_iter_C_const = pre_iter
1303
  //
1304
  //  (4b): Set var_init = 0, and assume (4a), which we just proved is implied by (3).
1305
  //        Subtract (4a) from (3):
1306
  //
1307
  //        0 =
1308
  //          =  (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter) % aw
1309
  //           - (C_const + C_pre * pre_iter_C_const) % aw
1310
  //          =  (C_invar * var_invar + C_init * var_init + C_pre * pre_iter - C_pre * pre_iter_C_const) % aw
1311
  //          =  (C_invar * var_invar + C_init * 0        + C_pre * (pre_iter - pre_iter_C_const)) % aw
1312
  //          =  (C_invar * var_invar +                   + C_pre * (pre_iter - pre_iter_C_const)) % aw
1313
  //
1314
  //        This is of the same form as (4b), and we have a solution:
1315
  //        pre_iter_C_invar = pre_iter - pre_iter_C_const
1316
  //
1317
  //  (4c): Set var_invar = 0, and assume (4a), which we just proved is implied by (3).
1318
  //        Subtract (4a) from (3):
1319
  //
1320
  //        0 =
1321
  //          =  (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter) % aw
1322
  //           - (C_const + C_pre * pre_iter_C_const) % aw
1323
  //          =  (C_invar * var_invar + C_init * var_init + C_pre * pre_iter - C_pre * pre_iter_C_const) % aw
1324
  //          =  (C_invar * 0         + C_init * var_init + C_pre * (pre_iter - pre_iter_C_const)) % aw
1325
  //          =  (                    + C_init * var_init + C_pre * (pre_iter - pre_iter_C_const)) % aw
1326
  //
1327
  //        This is of the same form as (4c), and we have a solution:
1328
  //        pre_iter_C_init = pre_iter - pre_iter_C_const
1329
  //
1330
  // The solutions of Equations (4a, b, c) for pre_iter_C_const, pre_iter_C_invar, and pre_iter_C_init
1331
  // respectively, can have one of these states:
1332
  //
1333
  //   trivial:     The solution can be any integer.
1334
  //   constrained: There is a (periodic) solution, but it is not trivial.
1335
  //   empty:       Statically we cannot guarantee a solution for all var_invar and var_init.
1336
  //
1337
  // We look at (4a):
1338
  //
1339
  //   abs(C_pre) >= aw
1340
  //   -> Since abs(C_pre) is a power of two, we have C_pre % aw = 0. Therefore:
1341
  //
1342
  //        For any pre_iter_C_const: (C_pre * pre_iter_C_const) % aw = 0
1343
  //
1344
  //        (C_const + C_pre * pre_iter_C_const) % aw = 0
1345
  //         C_const                             % aw = 0
1346
  //
1347
  //      Hence, we can only satisfy (4a) if C_Const is aw aligned:
1348
  //
1349
  //      C_const % aw == 0:
1350
  //      -> (4a) has a trivial solution since we can choose any value for pre_iter_C_const.
1351
  //
1352
  //      C_const % aw != 0:
1353
  //      -> (4a) has an empty solution since no pre_iter_C_const can achieve aw alignment.
1354
  //
1355
  //   abs(C_pre) < aw:
1356
  //   -> Since both abs(C_pre) and aw are powers of two, we know:
1357
  //
1358
  //        There exists integer x > 1: aw = abs(C_pre) * x
1359
  //
1360
  //      C_const % abs(C_pre) == 0:
1361
  //      -> There exists integer z: C_const = C_pre * z
1362
  //
1363
  //          (C_const   + C_pre * pre_iter_C_const) % aw               = 0
1364
  //          ==>
1365
  //          (C_pre * z + C_pre * pre_iter_C_const) % aw               = 0
1366
  //          ==>
1367
  //          (C_pre * z + C_pre * pre_iter_C_const) % (abs(C_pre) * x) = 0
1368
  //          ==>
1369
  //          (        z +         pre_iter_C_const) %               x  = 0
1370
  //          ==>
1371
  //          for any m: pre_iter_C_const = m * x - z
1372
  //
1373
  //        Hence, pre_iter_C_const has a non-trivial (because x > 1) periodic (periodicity x)
1374
  //        solution, i.e. it has a constrained solution.
1375
  //
1376
  //      C_const % abs(C_pre) != 0:
1377
  //        There exists integer x > 1: aw = abs(C_pre) * x
1378
  //
1379
  //           C_const                             %  abs(C_pre)      != 0
1380
  //          ==>
1381
  //          (C_const + C_pre * pre_iter_C_const) %  abs(C_pre)      != 0
1382
  //          ==>
1383
  //          (C_const + C_pre * pre_iter_C_const) % (abs(C_pre) * x) != 0
1384
  //          ==>
1385
  //          (C_const + C_pre * pre_iter_C_const) % aw               != 0
1386
  //
1387
  //        This is in contradiction with (4a), and therefore there cannot be any solution,
1388
  //        i.e. we have an empty solution.
1389
  //
1390
  // In summary, for (4a):
1391
  //
1392
  //   abs(C_pre) >= aw  AND  C_const % aw == 0          -> trivial
1393
  //   abs(C_pre) >= aw  AND  C_const % aw != 0          -> empty
1394
  //   abs(C_pre) <  aw  AND  C_const % abs(C_pre) == 0  -> constrained
1395
  //   abs(C_pre) <  aw  AND  C_const % abs(C_pre) != 0  -> empty
1396
  //
1397
  // With analogue argumentation for (4b):
1398
  //
1399
  //   abs(C_pre) >= aw  AND  C_invar % aw == 0           -> trivial
1400
  //   abs(C_pre) >= aw  AND  C_invar % aw != 0           -> empty
1401
  //   abs(C_pre) <  aw  AND  C_invar % abs(C_pre) == 0   -> constrained
1402
  //   abs(C_pre) <  aw  AND  C_invar % abs(C_pre) != 0   -> empty
1403
  //
1404
  // With analogue argumentation for (4c):
1405
  //
1406
  //   abs(C_pre) >= aw  AND  C_init  % aw == 0           -> trivial
1407
  //   abs(C_pre) >= aw  AND  C_init  % aw != 0           -> empty
1408
  //   abs(C_pre) <  aw  AND  C_init  % abs(C_pre) == 0   -> constrained
1409
  //   abs(C_pre) <  aw  AND  C_init  % abs(C_pre) != 0   -> empty
1410
  //
1411
  // Out of these states follows the state for the solution of pre_iter:
1412
  //
1413
  //   Trivial:     If (4a, b, c) are all trivial.
1414
  //   Empty:       If any of (4a, b, c) is empty, because then we cannot guarantee a solution
1415
  //                for pre_iter, for all possible invar and init values.
1416
  //   Constrained: Else. Incidentally, (4a, b, c) are all constrained themselves, as we argue below.
1417

1418
  const EQ4 eq4(C_const, C_invar, C_init, C_pre, _aw);
1419
  const EQ4::State eq4a_state = eq4.eq4a_state();
1420
  const EQ4::State eq4b_state = eq4.eq4b_state();
1421
  const EQ4::State eq4c_state = eq4.eq4c_state();
1422

1423
#ifdef ASSERT
1424
  if (is_trace()) {
1425
    eq4.trace();
1426
  }
1427
#endif
1428

1429
  // If (4a, b, c) are all trivial, then also the solution for pre_iter is trivial:
1430
  if (eq4a_state == EQ4::State::TRIVIAL &&
1431
      eq4b_state == EQ4::State::TRIVIAL &&
1432
      eq4c_state == EQ4::State::TRIVIAL) {
1433
    return new TrivialAlignmentSolution();
1434
  }
1435

1436
  // If any of (4a, b, c) is empty, then we also cannot guarantee a solution for pre_iter, for
1437
  // any init and invar, hence the solution for pre_iter is empty:
1438
  if (eq4a_state == EQ4::State::EMPTY ||
1439
      eq4b_state == EQ4::State::EMPTY ||
1440
      eq4c_state == EQ4::State::EMPTY) {
1441
    return new EmptyAlignmentSolution("EQ(4a, b, c) not all non-empty: cannot align const, invar and init terms individually");
1442
  }
1443

1444
  // If abs(C_pre) >= aw, then the solutions to (4a, b, c) are all either trivial or empty, and
1445
  // hence we would have found the solution to pre_iter above as either trivial or empty. Thus
1446
  // we now know that:
1447
  //
1448
  //   abs(C_pre) < aw
1449
  //
1450
  assert(abs(C_pre) < _aw, "implied by constrained case");
1451

1452
  // And since abs(C_pre) < aw, the solutions of (4a, b, c) can now only be constrained or empty.
1453
  // But since we already handled the empty case, the solutions are now all constrained.
1454
  assert(eq4a_state == EQ4::State::CONSTRAINED &&
1455
         eq4a_state == EQ4::State::CONSTRAINED &&
1456
         eq4a_state == EQ4::State::CONSTRAINED, "all must be constrained now");
1457

1458
  // And since they are all constrained, we must have:
1459
  //
1460
  //   C_const % abs(C_pre) = 0                                                  (5a)
1461
  //   C_invar % abs(C_pre) = 0                                                  (5b)
1462
  //   C_init  % abs(C_pre) = 0                                                  (5c)
1463
  //
1464
  assert(AlignmentSolution::mod(C_const, abs(C_pre)) == 0, "EQ(5a): C_const must be alignable");
1465
  assert(AlignmentSolution::mod(C_invar, abs(C_pre)) == 0, "EQ(5b): C_invar must be alignable");
1466
  assert(AlignmentSolution::mod(C_init,  abs(C_pre)) == 0, "EQ(5c): C_init  must be alignable");
1467

1468
  // With (5a, b, c), we know that there are integers X, Y, Z:
1469
  //
1470
  //   C_const = X * abs(C_pre)   ==>   X = C_const / abs(C_pre)                 (6a)
1471
  //   C_invar = Y * abs(C_pre)   ==>   Y = C_invar / abs(C_pre)                 (6b)
1472
  //   C_init  = Z * abs(C_pre)   ==>   Z = C_init  / abs(C_pre)                 (6c)
1473
  //
1474
  // Further, we define:
1475
  //
1476
  //   sign(C_pre) = C_pre / abs(C_pre) = (C_pre > 0) ? 1 : -1,                  (7)
1477
  //
1478
  // We know that abs(C_pre) as well as aw are powers of 2, and since (5) we can define integer q:
1479
  //
1480
  //   q = aw / abs(C_pre)                                                       (8)
1481
  //
1482
  const int q = _aw / abs(C_pre);
1483

1484
  assert(q >= 2, "implied by constrained solution");
1485

1486
  // We now know that all terms in (4a, b, c) are divisible by abs(C_pre):
1487
  //
1488
  //   (C_const                    / abs(C_pre) + C_pre * pre_iter_C_const /  abs(C_pre)) % (aw / abs(C_pre)) =
1489
  //   (X * abs(C_pre)             / abs(C_pre) + C_pre * pre_iter_C_const /  abs(C_pre)) % (aw / abs(C_pre)) =
1490
  //   (X                                       +         pre_iter_C_const * sign(C_pre)) % q                 = 0  (9a)
1491
  //
1492
  //   -> pre_iter_C_const * sign(C_pre) = mx1 * q -               X
1493
  //   -> pre_iter_C_const               = mx2 * q - sign(C_pre) * X                                               (10a)
1494
  //      (for any integers mx1, mx2)
1495
  //
1496
  //   (C_invar        * var_invar / abs(C_pre) + C_pre * pre_iter_C_invar /  abs(C_pre)) % (aw / abs(C_pre)) =
1497
  //   (Y * abs(C_pre) * var_invar / abs(C_pre) + C_pre * pre_iter_C_invar /  abs(C_pre)) % (aw / abs(C_pre)) =
1498
  //   (Y              * var_invar              +         pre_iter_C_invar * sign(C_pre)) % q                 = 0  (9b)
1499
  //
1500
  //   -> pre_iter_C_invar * sign(C_pre) = my1 * q -               Y * var_invar
1501
  //   -> pre_iter_C_invar               = my2 * q - sign(C_pre) * Y * var_invar                                   (10b)
1502
  //      (for any integers my1, my2)
1503
  //
1504
  //   (C_init          * var_init  / abs(C_pre) + C_pre * pre_iter_C_init /  abs(C_pre)) % (aw / abs(C_pre)) =
1505
  //   (Z * abs(C_pre)  * var_init  / abs(C_pre) + C_pre * pre_iter_C_init /  abs(C_pre)) % (aw / abs(C_pre)) =
1506
  //   (Z * var_init                             +         pre_iter_C_init * sign(C_pre)) % q                 = 0  (9c)
1507
  //
1508
  //   -> pre_iter_C_init  * sign(C_pre) = mz1 * q -               Z * var_init
1509
  //   -> pre_iter_C_init                = mz2 * q - sign(C_pre) * Z * var_init                                    (10c)
1510
  //      (for any integers mz1, mz2)
1511
  //
1512
  //
1513
  // Having solved the equations using the division, we can re-substitute X, Y, and Z, and apply (FAC_INVAR) as
1514
  // well as (FAC_INIT). We use the fact that sign(x) == 1 / sign(x) and sign(x) * abs(x) == x:
1515
  //
1516
  //   pre_iter_C_const = mx2 * q - sign(C_pre) * X
1517
  //                    = mx2 * q - sign(C_pre) * C_const             / abs(C_pre)
1518
  //                    = mx2 * q - C_const / C_pre
1519
  //                    = mx2 * q - C_const / (scale * pre_stride)                                  (11a)
1520
  //
1521
  // If there is an invariant:
1522
  //
1523
  //   pre_iter_C_invar = my2 * q - sign(C_pre) * Y       * var_invar
1524
  //                    = my2 * q - sign(C_pre) * C_invar * var_invar / abs(C_pre)
1525
  //                    = my2 * q - sign(C_pre) * invar               / abs(C_pre)
1526
  //                    = my2 * q - invar / C_pre
1527
  //                    = my2 * q - invar / (scale * pre_stride)                                    (11b, with invar)
1528
  //
1529
  // If there is no invariant (i.e. C_invar = 0 ==> Y = 0):
1530
  //
1531
  //   pre_iter_C_invar = my2 * q                                                                   (11b, no invar)
1532
  //
1533
  // If init is variable (i.e. C_init = scale, init = var_init):
1534
  //
1535
  //   pre_iter_C_init  = mz2 * q - sign(C_pre) * Z       * var_init
1536
  //                    = mz2 * q - sign(C_pre) * C_init  * var_init  / abs(C_pre)
1537
  //                    = mz2 * q - sign(C_pre) * scale   * init      / abs(C_pre)
1538
  //                    = mz2 * q - scale * init / C_pre
1539
  //                    = mz2 * q - scale * init / (scale * pre_stride)
1540
  //                    = mz2 * q - init / pre_stride                                               (11c, variable init)
1541
  //
1542
  // If init is constant (i.e. C_init = 0 ==> Z = 0):
1543
  //
1544
  //   pre_iter_C_init  = mz2 * q                                                                   (11c, constant init)
1545
  //
1546
  // Note, that the solutions found by (11a, b, c) are all periodic with periodicity q. We combine them,
1547
  // with m = mx2 + my2 + mz2:
1548
  //
1549
  //   pre_iter =   pre_iter_C_const + pre_iter_C_invar + pre_iter_C_init
1550
  //            =   mx2 * q  - C_const / (scale * pre_stride)
1551
  //              + my2 * q [- invar / (scale * pre_stride) ]
1552
  //              + mz2 * q [- init / pre_stride            ]
1553
  //
1554
  //            =   m * q                                 (periodic part)
1555
  //              - C_const / (scale * pre_stride)        (align constant term)
1556
  //             [- invar / (scale * pre_stride)   ]      (align invariant term, if present)
1557
  //             [- init / pre_stride              ]      (align variable init term, if present)    (12)
1558
  //
1559
  // We can further simplify this solution by introducing integer 0 <= r < q:
1560
  //
1561
  //   r = (-C_const / (scale * pre_stride)) % q                                                    (13)
1562
  //
1563
  const int r = AlignmentSolution::mod(-C_const / (_scale * _pre_stride), q);
1564
  //
1565
  //   pre_iter = m * q + r
1566
  //                   [- invar / (scale * pre_stride)  ]
1567
  //                   [- init / pre_stride             ]                                           (14)
1568
  //
1569
  // We thus get a solution that can be stated in terms of:
1570
  //
1571
  //   q (periodicity), r (constant alignment), invar, scale, pre_stride, init
1572
  //
1573
  // However, pre_stride and init are shared by all mem_ref in the loop, hence we do not need to provide
1574
  // them in the solution description.
1575

1576
  DEBUG_ONLY( trace_constrained_solution(C_const, C_invar, C_init, C_pre, q, r); )
1577

1578
  return new ConstrainedAlignmentSolution(_mem_ref, q, r, _invar, _scale);
1579

1580
  // APPENDIX:
1581
  // We can now verify the success of the solution given by (12):
1582
  //
1583
  //   adr % aw =
1584
  //
1585
  //   -> Simple form
1586
  //   (base + offset + invar + scale * iv) % aw =
1587
  //
1588
  //   -> Expand iv
1589
  //   (base + offset + invar + scale * (init + pre_stride * pre_iter + main_stride * main_iter)) % aw =
1590
  //
1591
  //   -> Reshape
1592
  //   (base + offset + invar
1593
  //         + scale * init
1594
  //         + scale * pre_stride * pre_iter
1595
  //         + scale * main_stride * main_iter)) % aw =
1596
  //
1597
  //   -> base aligned: base % aw = 0
1598
  //   -> main-loop iterations aligned (2): C_main % aw = (scale * main_stride) % aw = 0
1599
  //   (offset + invar + scale * init + scale * pre_stride * pre_iter) % aw =
1600
  //
1601
  //   -> apply (12)
1602
  //   (offset + invar + scale * init
1603
  //           + scale * pre_stride * (m * q - C_const / (scale * pre_stride)
1604
  //                                        [- invar / (scale * pre_stride) ]
1605
  //                                        [- init / pre_stride            ]
1606
  //                                  )
1607
  //   ) % aw =
1608
  //
1609
  //   -> expand C_const = offset [+ init * scale]  (if init const)
1610
  //   (offset + invar + scale * init
1611
  //           + scale * pre_stride * (m * q - offset / (scale * pre_stride)
1612
  //                                        [- init / pre_stride            ]             (if init constant)
1613
  //                                        [- invar / (scale * pre_stride) ]             (if invar present)
1614
  //                                        [- init / pre_stride            ]             (if init variable)
1615
  //                                  )
1616
  //   ) % aw =
1617
  //
1618
  //   -> assuming invar = 0 if it is not present
1619
  //   -> merge the two init terms (variable or constant)
1620
  //   -> apply (8): q = aw / (abs(C_pre)) = aw / abs(scale * pre_stride)
1621
  //   -> and hence: (scale * pre_stride * q) % aw = 0
1622
  //   -> all terms are canceled out
1623
  //   (offset + invar + scale * init
1624
  //           + scale * pre_stride * m * q                             -> aw aligned
1625
  //           - scale * pre_stride * offset / (scale * pre_stride)     -> = offset
1626
  //           - scale * pre_stride * init / pre_stride                 -> = scale * init
1627
  //           - scale * pre_stride * invar / (scale * pre_stride)      -> = invar
1628
  //   ) % aw = 0
1629
  //
1630
  // The solution given by (12) does indeed guarantee alignment.
1631
}
1632

1633
#ifdef ASSERT
1634
// Debug tracing: print the solver's starting state, i.e. the decomposition
//   iv  = init + pre_iter * pre_stride + main_iter * main_stride
//   adr = base + offset + invar + scale * iv
// together with vector_width, aw and invar_factor. Output goes to tty and is
// only produced when tracing is enabled.
void AlignmentSolver::trace_start_solve() const {
  if (is_trace()) {
    tty->print(" vector mem_ref:");
    _mem_ref->dump();
    tty->print_cr("  vector_width = vector_length(%d) * element_size(%d) = %d",
                  _vector_length, _element_size, _vector_width);
    tty->print_cr("  aw = alignment_width = min(vector_width(%d), ObjectAlignmentInBytes(%d)) = %d",
                  _vector_width, ObjectAlignmentInBytes, _aw);

    // Only dump the init node when it is not a simple constant; a constant
    // init is already shown inline via print_con_or_idx below.
    if (!_init_node->is_ConI()) {
      tty->print("  init:");
      _init_node->dump();
    }

    // The invariant is optional; dump it only when present.
    if (_invar != nullptr) {
      tty->print("  invar:");
      _invar->dump();
    }

    tty->print_cr("  invar_factor = %d", _invar_factor);

    // iv = init + pre_iter * pre_stride + main_iter * main_stride
    tty->print("  iv = init");
    print_con_or_idx(_init_node);
    tty->print_cr(" + pre_iter * pre_stride(%d) + main_iter * main_stride(%d)",
                  _pre_stride, _main_stride);

    // adr = base + offset + invar + scale * iv
    tty->print("  adr = base");
    print_con_or_idx(_base);
    tty->print(" + offset(%d) + invar", _offset);
    print_con_or_idx(_invar);
    tty->print_cr(" + scale(%d) * iv", _scale);
  }
}
1669

1670
// Debug tracing: print the reshaped form of the address,
//   adr = base + C_const + C_invar * var_invar + C_init * var_init
//              + C_pre * pre_iter + C_main * main_iter
// and how each coefficient was derived (constant vs variable init, presence
// or absence of an invariant). Output only when tracing is enabled.
void AlignmentSolver::trace_reshaped_form(const int C_const,
                                          const int C_const_init,
                                          const int C_invar,
                                          const int C_init,
                                          const int C_pre,
                                          const int C_main) const
{
  if (is_trace()) {
    tty->print("      = base[%d] + ", _base->_idx);
    tty->print_cr("C_const(%d) + C_invar(%d) * var_invar + C_init(%d) * var_init + C_pre(%d) * pre_iter + C_main(%d) * main_iter",
                  C_const, C_invar, C_init,  C_pre, C_main);
    if (_init_node->is_ConI()) {
      // Constant init: its value is folded into the constant term via
      // C_const_init, and C_init carries the remaining coefficient.
      tty->print_cr("  init is constant:");
      tty->print_cr("    C_const_init = %d", C_const_init);
      tty->print_cr("    C_init = %d", C_init);
    } else {
      // Variable init: nothing folds into the constant term; C_init is
      // reported as abs(scale).
      tty->print_cr("  init is variable:");
      tty->print_cr("    C_const_init = %d", C_const_init);
      tty->print_cr("    C_init = abs(scale)= %d", C_init);
    }
    if (_invar != nullptr) {
      tty->print_cr("  invariant present:");
      tty->print_cr("    C_invar = abs(invar_factor) = %d", C_invar);
    } else {
      tty->print_cr("  no invariant:");
      tty->print_cr("    C_invar = %d", C_invar);
    }
    tty->print_cr("  C_const = offset(%d) + scale(%d) * C_const_init(%d) = %d",
                  _offset, _scale, C_const_init, C_const);
    tty->print_cr("  C_pre   = scale(%d) * pre_stride(%d) = %d",
                  _scale, _pre_stride, C_pre);
    tty->print_cr("  C_main  = scale(%d) * main_stride(%d) = %d",
                  _scale, _main_stride, C_main);
  }
}
1705

1706
// Debug tracing: print the alignment conditions EQ(1) (align the full address
// given that base is aligned) and EQ(2) (C_main % aw == 0, i.e. alignment is
// preserved across main-loop iterations). C_main_mod_aw is the already
// computed remainder shown in EQ(2). Output only when tracing is enabled.
void AlignmentSolver::trace_main_iteration_alignment(const int C_const,
                                                     const int C_invar,
                                                     const int C_init,
                                                     const int C_pre,
                                                     const int C_main,
                                                     const int C_main_mod_aw) const
{
  if (is_trace()) {
    // EQ(1) is split across two print calls; "%%" emits a literal '%'.
    tty->print("  EQ(1 ): (C_const(%d) + C_invar(%d) * var_invar + C_init(%d) * var_init",
                  C_const, C_invar, C_init);
    tty->print(" + C_pre(%d) * pre_iter + C_main(%d) * main_iter) %% aw(%d) = 0",
                  C_pre, C_main, _aw);
    tty->print_cr(" (given base aligned -> align rest)");
    tty->print("  EQ(2 ): C_main(%d) %% aw(%d) = %d = 0",
               C_main, _aw, C_main_mod_aw);
    tty->print_cr(" (alignment across iterations)");
  }
}
1724

1725
// Debug tracing: print equations EQ(4a, b, c) — aligning the const, invar and
// init terms individually — together with the state (trivial / constrained /
// empty) computed for each. Unlike the other trace helpers, this one does not
// check is_trace() itself; the caller guards the call.
void AlignmentSolver::EQ4::trace() const {
  tty->print_cr("  EQ(4a): (C_const(%3d)             + C_pre(%d) * pre_iter_C_const) %% aw(%d) = 0  (align const term individually)",
                _C_const, _C_pre, _aw);
  tty->print_cr("          -> %s", state_to_str(eq4a_state()));

  tty->print_cr("  EQ(4b): (C_invar(%3d) * var_invar + C_pre(%d) * pre_iter_C_invar) %% aw(%d) = 0  (align invar term individually)",
                _C_invar, _C_pre, _aw);
  tty->print_cr("          -> %s", state_to_str(eq4b_state()));

  tty->print_cr("  EQ(4c): (C_init( %3d) * var_init  + C_pre(%d) * pre_iter_C_init ) %% aw(%d) = 0  (align init term individually)",
                _C_init, _C_pre, _aw);
  tty->print_cr("          -> %s", state_to_str(eq4c_state()));
}
1738

1739
// Debug tracing: given that EQ(4a, b, c) are all constrained, print the
// derivation of the constrained solution: the divisibility facts EQ(5a, b, c),
// the quotients X, Y, Z of EQ(6a, b, c), sign(C_pre) and q of EQ(7, 8), the
// per-term solutions EQ(9, 10), and the final combined solution EQ(14) with
// periodicity q and constant alignment r. Output only when tracing is enabled.
void AlignmentSolver::trace_constrained_solution(const int C_const,
                                                 const int C_invar,
                                                 const int C_init,
                                                 const int C_pre,
                                                 const int q,
                                                 const int r) const
{
  if (is_trace()) {
    tty->print_cr("  EQ(4a, b, c) all constrained, hence:");
    tty->print_cr("  EQ(5a): C_const(%3d) %% abs(C_pre(%d)) = 0", C_const, C_pre);
    tty->print_cr("  EQ(5b): C_invar(%3d) %% abs(C_pre(%d)) = 0", C_invar, C_pre);
    tty->print_cr("  EQ(5c): C_init( %3d) %% abs(C_pre(%d)) = 0", C_init,  C_pre);

    tty->print_cr("  All terms in EQ(4a, b, c) are divisible by abs(C_pre(%d)).", C_pre);
    // X, Y, Z are integral because of EQ(5a, b, c) above; the names match the
    // equations (6a, b, c) in the solver's commentary.
    const int X    = C_const / abs(C_pre);
    const int Y    = C_invar / abs(C_pre);
    const int Z    = C_init  / abs(C_pre);
    const int sign = (C_pre > 0) ? 1 : -1;
    tty->print_cr("  X = C_const(%3d) / abs(C_pre(%d)) = %d       (6a)", C_const, C_pre, X);
    tty->print_cr("  Y = C_invar(%3d) / abs(C_pre(%d)) = %d       (6b)", C_invar, C_pre, Y);
    tty->print_cr("  Z = C_init( %3d) / abs(C_pre(%d)) = %d       (6c)", C_init , C_pre, Z);
    tty->print_cr("  q = aw(     %3d) / abs(C_pre(%d)) = %d       (8)",  _aw,     C_pre, q);
    tty->print_cr("  sign(C_pre) = (C_pre(%d) > 0) ? 1 : -1 = %d  (7)",  C_pre,   sign);

    tty->print_cr("  EQ(9a): (X(%3d)             + pre_iter_C_const * sign(C_pre)) %% q(%d) = 0", X, q);
    tty->print_cr("  EQ(9b): (Y(%3d) * var_invar + pre_iter_C_invar * sign(C_pre)) %% q(%d) = 0", Y, q);
    tty->print_cr("  EQ(9c): (Z(%3d) * var_init  + pre_iter_C_init  * sign(C_pre)) %% q(%d) = 0", Z, q);

    tty->print_cr("  EQ(10a): pre_iter_C_const = mx2 * q(%d) - sign(C_pre) * X(%d)",             q, X);
    tty->print_cr("  EQ(10b): pre_iter_C_invar = my2 * q(%d) - sign(C_pre) * Y(%d) * var_invar", q, Y);
    tty->print_cr("  EQ(10c): pre_iter_C_init  = mz2 * q(%d) - sign(C_pre) * Z(%d) * var_init ", q, Z);

    tty->print_cr("  r = (-C_const(%d) / (scale(%d) * pre_stride(%d)) %% q(%d) = %d",
                  C_const, _scale, _pre_stride, q, r);

    tty->print_cr("  EQ(14):  pre_iter = m * q(%3d) - r(%d)", q, r);
    // The invar and init correction terms of EQ(14) are only part of the
    // solution when an invariant / a variable init is present.
    if (_invar != nullptr) {
      tty->print_cr("                                 - invar / (scale(%d) * pre_stride(%d))",
                    _scale, _pre_stride);
    }
    if (!_init_node->is_ConI()) {
      tty->print_cr("                                 - init / pre_stride(%d)",
                    _pre_stride);
    }
  }
}
1785
#endif
1786

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.