//===- TensorOps.cpp - Implementation of the linalg TensorOps operation ---===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements a simple IR operation to create new tensor computation
// operations in the linalg dialect.
//
//===----------------------------------------------------------------------===//

#include "linalg1/Analysis.h"
#include "linalg1/Common.h"
#include "linalg3/Intrinsics.h"
#include "linalg3/Ops.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/StandardTypes.h"
#include "llvm/ADT/STLExtras.h"

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using namespace linalg;
using namespace linalg::intrinsics;

//////////////////////////////////////////////////////////////////////////////
// Implementation of DotOp.
//////////////////////////////////////////////////////////////////////////////
SmallVector<AffineMap, 8> linalg::DotOp::loopsToOperandRangeMaps() {
  // A(K), B(K), C()
  assert(getRanges(*this).size() == 2);
  auto *context = ScopedContext::getContext();
  auto d0 = getAffineDimExpr(0, context); // K
  // A(K), B(K), C()
  //   (d0) -> (d0, d0)(%k)
  return SmallVector<AffineMap, 8>{AffineMap::get(1, 0, {d0}, {}), // A(K)
                                   AffineMap::get(1, 0, {d0}, {}), // B(K)
                                   AffineMap()};                   // C()
}

void linalg::DotOp::emitScalarImplementation(
    llvm::ArrayRef<Value *> parallelIvs, llvm::ArrayRef<Value *> reductionIvs) {
  using IndexedValue = TemplatedIndexedValue<linalg::intrinsics::load,
                                             linalg::intrinsics::store>;
  assert(reductionIvs.size() == 1);
  auto innermostLoop = getForInductionVarOwner(reductionIvs.back());
  auto *body = innermostLoop.getBody();
  using edsc::op::operator+;
  using edsc::op::operator*;
  using edsc::op::operator==;
  using edsc::intrinsics::select;

  // Account for affine.terminator in loop.
  FuncBuilder builder(body, std::prev(body->end(), 1));
  ScopedContext scope(builder, innermostLoop.getLoc());
  FloatType fTy = getOperand(0)
                      ->getType()
                      .cast<ViewType>()
                      .getElementType()
                      .cast<FloatType>();
  IndexHandle zero(constant_index(0));
  ValueHandle zerof =
      constant_float(llvm::APFloat::getZero(fTy.getFloatSemantics()), fTy);
  IndexHandle r_i(reductionIvs[0]);
  IndexedValue A(getOperand(0)), B(getOperand(1)), C(getOperand(2));
  ValueHandle cond = (r_i == zero);
  ValueHandle scalarC = select(cond, zerof, *C());
  C() = scalarC + A(r_i) * B(r_i);
}

//////////////////////////////////////////////////////////////////////////////
// Implementation of MatvecOp.
//////////////////////////////////////////////////////////////////////////////
SmallVector<AffineMap, 8> linalg::MatvecOp::loopsToOperandRangeMaps() {
  // A(M, K), B(K), C(M)
  assert(getRanges(*this).size() == 4);
  auto *context = ScopedContext::getContext();
  auto d0 = getAffineDimExpr(0, context); // M
  auto d1 = getAffineDimExpr(1, context); // K
  // A(M, K), B(K), C(M)
  //   (d0, d1) -> (d0, d1, d1, d0)(%m, %k)
  return SmallVector<AffineMap, 8>{
      AffineMap::get(2, 0, {d0, d1}, {}), // A(M, K)
      AffineMap::get(2, 0, {d1}, {}),     // B(K)
      AffineMap::get(2, 0, {d0}, {})};    // C(M)
}

// The body expression for matvec is: C(i) = scalarC + A(i, r_j) * B(r_j)
// The body expression for dot is: C() = A(r_i) * B(r_i);
// So we must drop the `i` loop from the matvec.
void linalg::MatvecOp::writeAsFinerGrainTensorContraction() {
  auto *op = getOperation();
  auto *vA(getInputView(0)), *vB(getInputView(1)), *vC(getOutputView(0));
  auto indexingPosPair = getViewRootIndexing(vA, 0);
  assert(
      llvm::isa_and_nonnull<RangeOp>(indexingPosPair.first->getDefiningOp()));
  // clang-format off
  FuncBuilder builder(op);
  ScopedContext scope(builder, op->getLoc());
  IndexHandle i;
  using linalg::common::LoopNestRangeBuilder;
  LoopNestRangeBuilder(&i, ValueHandle(indexingPosPair.first))({
    [&i, &vA, &vB, &vC]() {
      ValueHandle sliceA = slice(vA, i, 0);
      ValueHandle sliceC = slice(vC, i, 0);
      dot(sliceA, vB, sliceC);
      /// NestedBuilders expect handles, we thus return an IndexHandle.
      return IndexHandle();
    }()
  });
  // clang-format on
}

void linalg::MatvecOp::emitScalarImplementation(
    llvm::ArrayRef<Value *> parallelIvs, llvm::ArrayRef<Value *> reductionIvs) {
  using IndexedValue = TemplatedIndexedValue<linalg::intrinsics::load,
                                             linalg::intrinsics::store>;
  assert(reductionIvs.size() == 1);
  auto innermostLoop = getForInductionVarOwner(reductionIvs.back());
  auto *body = innermostLoop.getBody();
  using edsc::op::operator+;
  using edsc::op::operator*;
  using edsc::op::operator==;
  using edsc::intrinsics::select;
  // Account for affine.terminator in loop.
  FuncBuilder builder(body, std::prev(body->end(), 1));
  ScopedContext scope(builder, innermostLoop.getLoc());
  FloatType fTy = getOperand(0)
                      ->getType()
                      .cast<ViewType>()
                      .getElementType()
                      .cast<FloatType>();
  IndexHandle i(parallelIvs[0]), r_j(reductionIvs[0]);
  IndexedValue A(getOperand(0)), B(getOperand(1)), C(getOperand(2));
  IndexHandle zero(constant_index(0));
  ValueHandle zerof =
      constant_float(llvm::APFloat::getZero(fTy.getFloatSemantics()), fTy);
  ValueHandle cond = (r_j == zero);
  ValueHandle scalarC = select(cond, zerof, *C(i));
  C(i) = scalarC + A(i, r_j) * B(r_j);
}

//////////////////////////////////////////////////////////////////////////////
// Implementation of Matmul.
//////////////////////////////////////////////////////////////////////////////
SmallVector<AffineMap, 8> linalg::MatmulOp::loopsToOperandRangeMaps() {
  // A(M, K), B(K, N), C(M, N)
  assert(getRanges(*this).size() == 6);
  auto *context = ScopedContext::getContext();
  auto d0 = getAffineDimExpr(0, context); // M
  auto d1 = getAffineDimExpr(1, context); // N
  auto d2 = getAffineDimExpr(2, context); // K
  // A(M, K), B(K, N), C(M, N):
  //   (d0, d1, d2) -> (d0, d2, d2, d1, d0, d1)(%m, %n, %k)
  return SmallVector<AffineMap, 8>{
      AffineMap::get(3, 0, {d0, d2}, {}), // A(M, K)
      AffineMap::get(3, 0, {d2, d1}, {}), // B(K, N)
      AffineMap::get(3, 0, {d0, d1}, {})  // C(M, N)
  };
}

// The body expression for matmul is: C(i, j) = scalarC + A(i, r_k) * B(r_k, j)
// The body expression for matvec is: C(i) = scalarC + A(i, r_j) * B(r_j)
// So we must drop the `j` loop from the matmul.
// This is fine because parallel dimensions permute: we can just do it
// declaratively.
void linalg::MatmulOp::writeAsFinerGrainTensorContraction() {
  auto *op = getOperation();
  auto *vA(getInputView(0)), *vB(getInputView(1)), *vC(getOutputView(0));
  auto indexingPosPair = getViewRootIndexing(vB, 1);
  assert(
      llvm::isa_and_nonnull<RangeOp>(indexingPosPair.first->getDefiningOp()));
  using linalg::common::LoopNestRangeBuilder;
  // clang-format off
  FuncBuilder builder(op);
  ScopedContext scope(builder, op->getLoc());
  IndexHandle j;
  LoopNestRangeBuilder(&j, ValueHandle(indexingPosPair.first))({
    [&j, &vA, &vB, &vC]() {
      ValueHandle sliceB = slice(vB, j, 1);
      ValueHandle sliceC = slice(vC, j, 1);
      matvec(vA, sliceB, sliceC);
      /// NestedBuilders expect handles, we thus return an IndexHandle.
      return IndexHandle();
    }()
  });
  // clang-format on
}

void linalg::MatmulOp::emitScalarImplementation(
    llvm::ArrayRef<Value *> parallelIvs, llvm::ArrayRef<Value *> reductionIvs) {
  using IndexedValue = TemplatedIndexedValue<linalg::intrinsics::load,
                                             linalg::intrinsics::store>;
  assert(reductionIvs.size() == 1);
  auto innermostLoop = getForInductionVarOwner(reductionIvs.back());
  auto *body = innermostLoop.getBody();
  using edsc::op::operator+;
  using edsc::op::operator*;
  using edsc::op::operator==;
  using edsc::intrinsics::select;
  // Account for affine.terminator in loop.
  FuncBuilder builder(body, std::prev(body->end(), 1));
  ScopedContext scope(builder, innermostLoop.getLoc());
  FloatType fTy = getOperand(0)
                      ->getType()
                      .cast<ViewType>()
                      .getElementType()
                      .cast<FloatType>();
  IndexHandle i(parallelIvs[0]), j(parallelIvs[1]), r_k(reductionIvs[0]);
  IndexedValue A(getOperand(0)), B(getOperand(1)), C(getOperand(2));
  IndexHandle zero(constant_index(0));
  ValueHandle zerof =
      constant_float(llvm::APFloat::getZero(fTy.getFloatSemantics()), fTy);
  ValueHandle cond = r_k == zero;
  ValueHandle scalarC = select(cond, zerof, *C(i, j));
  C(i, j) = scalarC + A(i, r_k) * B(r_k, j);
}