/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
#define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_

#include "induction_var_range.h"
#include "nodes.h"
#include "optimization.h"

namespace art {

class CompilerDriver;

/**
 * Loop optimizations. Builds a loop hierarchy and applies optimizations to
 * the detected nested loops, such as removal of dead induction and empty loops
 * and inner loop vectorization.
 */
class HLoopOptimization : public HOptimization {
 public:
  HLoopOptimization(HGraph* graph,
                    CompilerDriver* compiler_driver,
                    HInductionVarAnalysis* induction_analysis);

  void Run() OVERRIDE;

  static constexpr const char* kLoopOptimizationPassName = "loop_optimization";

 private:
  /**
   * A single loop inside the loop hierarchy representation.
   */
  struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
    explicit LoopNode(HLoopInformation* lp_info)
        : loop_info(lp_info),
          outer(nullptr),
          inner(nullptr),
          previous(nullptr),
          next(nullptr) {}
    HLoopInformation* loop_info;
    LoopNode* outer;
    LoopNode* inner;
    LoopNode* previous;
    LoopNode* next;
  };

  /*
   * Vectorization restrictions (bit mask).
   */
  enum VectorRestrictions {
    kNone            = 0,    // no restrictions
    kNoMul           = 1,    // no multiplication
    kNoDiv           = 2,    // no division
    kNoShift         = 4,    // no shift
    kNoShr           = 8,    // no arithmetic shift right
    kNoHiBits        = 16,   // "wider" operations cannot bring in higher order bits
    kNoSignedHAdd    = 32,   // no signed halving add
    kNoUnroundedHAdd = 64,   // no unrounded halving add
    kNoAbs           = 128,  // no absolute value
  };

  /*
   * Vectorization mode during synthesis
   * (sequential peeling/cleanup loop or vector loop).
   */
  enum VectorMode {
    kSequential,
    kVector
  };

  /*
   * Representation of a unit-stride array reference.
   */
  struct ArrayReference {
    ArrayReference(HInstruction* b, HInstruction* o, Primitive::Type t, bool l)
        : base(b), offset(o), type(t), lhs(l) { }
    bool operator<(const ArrayReference& other) const {
      return
          (base < other.base) ||
          (base == other.base &&
           (offset < other.offset || (offset == other.offset &&
                                      (type < other.type ||
                                       (type == other.type && lhs < other.lhs)))));
    }
    HInstruction* base;    // base address
    HInstruction* offset;  // offset + i
    Primitive::Type type;  // component type
    bool lhs;              // def/use
  };

  // Loop setup and traversal.
  void LocalRun();
  void AddLoop(HLoopInformation* loop_info);
  void RemoveLoop(LoopNode* node);
  void TraverseLoopsInnerToOuter(LoopNode* node);

  // Optimization.
  void SimplifyInduction(LoopNode* node);
  void SimplifyBlocks(LoopNode* node);
  void OptimizeInnerLoop(LoopNode* node);

  // Vectorization analysis and synthesis.
  bool CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
  void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
  void GenerateNewLoop(LoopNode* node,
                       HBasicBlock* block,
                       HBasicBlock* new_preheader,
                       HInstruction* lo,
                       HInstruction* hi,
                       HInstruction* step);
  bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
  bool VectorizeUse(LoopNode* node,
                    HInstruction* instruction,
                    bool generate_code,
                    Primitive::Type type,
                    uint64_t restrictions);
  bool TrySetVectorType(Primitive::Type type, /*out*/ uint64_t* restrictions);
  bool TrySetVectorLength(uint32_t length);
  void GenerateVecInv(HInstruction* org, Primitive::Type type);
  void GenerateVecSub(HInstruction* org, HInstruction* off);
  void GenerateVecMem(HInstruction* org,
                      HInstruction* opa,
                      HInstruction* opb,
                      Primitive::Type type);
  void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, Primitive::Type type);

  // Vectorization idioms.
  bool VectorizeHalvingAddIdiom(LoopNode* node,
                                HInstruction* instruction,
                                bool generate_code,
                                Primitive::Type type,
                                uint64_t restrictions);

  // Helpers.
  bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
  bool TrySetSimpleLoopHeader(HBasicBlock* block);
  bool IsEmptyBody(HBasicBlock* block);
  bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
                           HInstruction* instruction,
                           bool collect_loop_uses,
                           /*out*/ int32_t* use_count);
  bool IsUsedOutsideLoop(HLoopInformation* loop_info,
                         HInstruction* instruction);
  bool TryReplaceWithLastValue(HLoopInformation* loop_info,
                               HInstruction* instruction,
                               HBasicBlock* block);
  bool TryAssignLastValue(HLoopInformation* loop_info,
                          HInstruction* instruction,
                          HBasicBlock* block,
                          bool collect_loop_uses);
  void RemoveDeadInstructions(const HInstructionList& list);
  bool CanRemoveCycle();  // Whether the current 'iset_' is removable.

  // Compiler driver (to query ISA features).
  const CompilerDriver* compiler_driver_;

  // Range information based on prior induction variable analysis.
  InductionVarRange induction_range_;

  // Phase-local heap memory allocator for the loop optimizer. Storage obtained
  // through this allocator is immediately released when the loop optimizer is done.
  ArenaAllocator* loop_allocator_;

  // Global heap memory allocator. Used to build HIR.
  ArenaAllocator* global_allocator_;

  // Entries into the loop hierarchy representation. The hierarchy resides
  // in phase-local heap memory.
  LoopNode* top_loop_;
  LoopNode* last_loop_;

  // Temporary bookkeeping of a set of instructions.
  // Contents reside in phase-local heap memory.
  ArenaSet<HInstruction*>* iset_;

  // Counter that tracks how many induction cycles have been simplified. Useful
  // to trigger incremental updates of induction variable analysis of outer loops
  // when the induction of inner loops has changed.
  uint32_t induction_simplication_count_;

  // Flag that tracks if any simplifications have occurred.
  bool simplified_;

  // Number of "lanes" for selected packed type.
  uint32_t vector_length_;

  // Set of array references in the vector loop.
  // Contents reside in phase-local heap memory.
  ArenaSet<ArrayReference>* vector_refs_;

  // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
  // loop (simd_ is false) and the actual vector loop (simd_ is true). The data
  // structure maps original instructions into the new instructions.
  // Contents reside in phase-local heap memory.
  ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;

  // Temporary vectorization bookkeeping.
  HBasicBlock* vector_preheader_;  // preheader of the new loop
  HBasicBlock* vector_header_;  // header of the new loop
  HBasicBlock* vector_body_;  // body of the new loop
  HInstruction* vector_runtime_test_a_;
  HInstruction* vector_runtime_test_b_;  // defines a != b runtime test
  HPhi* vector_phi_;  // the Phi representing the normalized loop index
  VectorMode vector_mode_;  // selects synthesis mode

  friend class LoopOptimizationTest;

  DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
};

}  // namespace art

#endif  // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_