/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "common/math/mat.h"

#include <assert.h>
#include <float.h>

#ifdef _OS_BUILD_
#include <nanohub_math.h>
#include <seos.h>
#else
#include <math.h>
#ifndef UNROLLED
#define UNROLLED
#endif
#endif  // _OS_BUILD_

#include <stddef.h>
#include <string.h>

#define EPSILON 1E-5
#define CHOLESKY_TOLERANCE 1E-6

// Forward declarations.
static void mat33SwapRows(struct Mat33 *A, uint32_t i, uint32_t j);
static uint32_t mat33Maxind(const struct Mat33 *A, uint32_t k);
static void mat33Rotate(struct Mat33 *A, float c, float s, uint32_t k,
                        uint32_t l, uint32_t i, uint32_t j);

static void mat44SwapRows(struct Mat44 *A, uint32_t i, uint32_t j);

void initZeroMatrix(struct Mat33 *A) {
  ASSERT_NOT_NULL(A);
  memset(A->elem, 0.0f, sizeof(A->elem));
}

UNROLLED
void initDiagonalMatrix(struct Mat33 *A, float x) {
  ASSERT_NOT_NULL(A);
  initZeroMatrix(A);

  uint32_t i;
  for (i = 0; i < 3; ++i) {
    A->elem[i][i] = x;
  }
}

void initMatrixColumns(struct Mat33 *A, const struct Vec3 *v1,
                       const struct Vec3 *v2, const struct Vec3 *v3) {
  ASSERT_NOT_NULL(A);
  ASSERT_NOT_NULL(v1);
  ASSERT_NOT_NULL(v2);
  ASSERT_NOT_NULL(v3);
  A->elem[0][0] = v1->x;
  A->elem[0][1] = v2->x;
  A->elem[0][2] = v3->x;

  A->elem[1][0] = v1->y;
  A->elem[1][1] = v2->y;
  A->elem[1][2] = v3->y;

  A->elem[2][0] = v1->z;
  A->elem[2][1] = v2->z;
  A->elem[2][2] = v3->z;
}

void mat33Apply(struct Vec3 *out, const struct Mat33 *A, const struct Vec3 *v) {
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  ASSERT_NOT_NULL(v);
  out->x = A->elem[0][0] * v->x + A->elem[0][1] * v->y + A->elem[0][2] * v->z;
  out->y = A->elem[1][0] * v->x + A->elem[1][1] * v->y + A->elem[1][2] * v->z;
  out->z = A->elem[2][0] * v->x + A->elem[2][1] * v->y + A->elem[2][2] * v->z;
}

UNROLLED
void mat33Multiply(struct Mat33 *out, const struct Mat33 *A,
                   const struct Mat33 *B) {
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  ASSERT_NOT_NULL(B);
  ASSERT(out != A);
  ASSERT(out != B);

  uint32_t i;
  for (i = 0; i < 3; ++i) {
    uint32_t j;
    for (j = 0; j < 3; ++j) {
      uint32_t k;
      float sum = 0.0f;
      for (k = 0; k < 3; ++k) {
        sum += A->elem[i][k] * B->elem[k][j];
      }

      out->elem[i][j] = sum;
    }
  }
}

UNROLLED
void mat33ScalarMul(struct Mat33 *A, float c) {
  ASSERT_NOT_NULL(A);
  uint32_t i;
  for (i = 0; i < 3; ++i) {
    uint32_t j;
    for (j = 0; j < 3; ++j) {
      A->elem[i][j] *= c;
    }
  }
}

UNROLLED
void mat33Add(struct Mat33 *out, const struct Mat33 *A) {
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  uint32_t i;
  for (i = 0; i < 3; ++i) {
    uint32_t j;
    for (j = 0; j < 3; ++j) {
      out->elem[i][j] += A->elem[i][j];
    }
  }
}

UNROLLED
void mat33Sub(struct Mat33 *out, const struct Mat33 *A) {
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  uint32_t i;
  for (i = 0; i < 3; ++i) {
    uint32_t j;
    for (j = 0; j < 3; ++j) {
      out->elem[i][j] -= A->elem[i][j];
    }
  }
}

UNROLLED
int mat33IsPositiveSemidefinite(const struct Mat33 *A, float tolerance) {
  ASSERT_NOT_NULL(A);
  uint32_t i;
  for (i = 0; i < 3; ++i) {
    if (A->elem[i][i] < 0.0f) {
      return 0;
    }
  }

  for (i = 0; i < 3; ++i) {
    uint32_t j;
    for (j = i + 1; j < 3; ++j) {
      if (fabsf(A->elem[i][j] - A->elem[j][i]) > tolerance) {
        return 0;
      }
    }
  }

  return 1;
}

UNROLLED
void mat33MultiplyTransposed(struct Mat33 *out, const struct Mat33 *A,
                             const struct Mat33 *B) {
  ASSERT(out != A);
  ASSERT(out != B);
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  ASSERT_NOT_NULL(B);
  uint32_t i;
  for (i = 0; i < 3; ++i) {
    uint32_t j;
    for (j = 0; j < 3; ++j) {
      uint32_t k;
      float sum = 0.0f;
      for (k = 0; k < 3; ++k) {
        sum += A->elem[k][i] * B->elem[k][j];
      }

      out->elem[i][j] = sum;
    }
  }
}

UNROLLED
void mat33MultiplyTransposed2(struct Mat33 *out, const struct Mat33 *A,
                              const struct Mat33 *B) {
  ASSERT(out != A);
  ASSERT(out != B);
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  ASSERT_NOT_NULL(B);
  uint32_t i;
  for (i = 0; i < 3; ++i) {
    uint32_t j;
    for (j = 0; j < 3; ++j) {
      uint32_t k;
      float sum = 0.0f;
      for (k = 0; k < 3; ++k) {
        sum += A->elem[i][k] * B->elem[j][k];
      }

      out->elem[i][j] = sum;
    }
  }
}

UNROLLED
void mat33Invert(struct Mat33 *out, const struct Mat33 *A) {
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  float t;
  initDiagonalMatrix(out, 1.0f);

  struct Mat33 tmp = *A;

  uint32_t i, k;
  for (i = 0; i < 3; ++i) {
    uint32_t swap = i;
    uint32_t j;
    for (j = i + 1; j < 3; ++j) {
      if (fabsf(tmp.elem[j][i]) > fabsf(tmp.elem[i][i])) {
        swap = j;
      }
    }

    if (swap != i) {
      for (k = 0; k < 3; ++k) {
        t = tmp.elem[i][k];
        tmp.elem[i][k] = tmp.elem[swap][k];
        tmp.elem[swap][k] = t;

        t = out->elem[i][k];
        out->elem[i][k] = out->elem[swap][k];
        out->elem[swap][k] = t;
      }
    }
    // divide by zero guard.
    ASSERT(fabs(tmp.elem[i][i]) > 0);
    if(!(fabs(tmp.elem[i][i]) > 0)) {
      return;
    }
    t = 1.0f / tmp.elem[i][i];
    for (k = 0; k < 3; ++k) {
      tmp.elem[i][k] *= t;
      out->elem[i][k] *= t;
    }

    for (j = 0; j < 3; ++j) {
      if (j != i) {
        t = tmp.elem[j][i];
        for (k = 0; k < 3; ++k) {
          tmp.elem[j][k] -= tmp.elem[i][k] * t;
          out->elem[j][k] -= out->elem[i][k] * t;
        }
      }
    }
  }
}

UNROLLED
void mat33Transpose(struct Mat33 *out, const struct Mat33 *A) {
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  uint32_t i;
  for (i = 0; i < 3; ++i) {
    uint32_t j;
    for (j = 0; j < 3; ++j) {
      out->elem[i][j] = A->elem[j][i];
    }
  }
}

UNROLLED
void mat33SwapRows(struct Mat33 *A, const uint32_t i, const uint32_t j) {
  ASSERT_NOT_NULL(A);
  const uint32_t N = 3;
  uint32_t k;

  if (i == j) {
    return;
  }

  for (k = 0; k < N; ++k) {
    float tmp = A->elem[i][k];
    A->elem[i][k] = A->elem[j][k];
    A->elem[j][k] = tmp;
  }
}

UNROLLED
void mat33GetEigenbasis(struct Mat33 *S, struct Vec3 *eigenvals,
                        struct Mat33 *eigenvecs) {
  ASSERT_NOT_NULL(S);
  ASSERT_NOT_NULL(eigenvals);
  ASSERT_NOT_NULL(eigenvecs);
  const uint32_t N = 3;
  uint32_t i, j, k, l, m;

  float _eigenvals[N];

  uint32_t ind[N];
  for (k = 0; k < N; ++k) {
    ind[k] = mat33Maxind(S, k);
    _eigenvals[k] = S->elem[k][k];
  }

  initDiagonalMatrix(eigenvecs, 1.0f);

  for (;;) {
    m = 0;
    for (k = 1; k + 1 < N; ++k) {
      if (fabsf(S->elem[k][ind[k]]) > fabsf(S->elem[m][ind[m]])) {
        m = k;
      }
    }

    k = m;
    l = ind[m];
    float p = S->elem[k][l];

    if (fabsf(p) < EPSILON) {
      break;
    }

    float y = (_eigenvals[l] - _eigenvals[k]) * 0.5f;

    float t = fabsf(y) + sqrtf(p * p + y * y);
    float s = sqrtf(p * p + t * t);
    float c = t / s;
    s = p / s;
    t = p * p / t;

    if (y < 0.0f) {
      s = -s;
      t = -t;
    }

    S->elem[k][l] = 0.0f;

    _eigenvals[k] -= t;
    _eigenvals[l] += t;

    for (i = 0; i < k; ++i) {
      mat33Rotate(S, c, s, i, k, i, l);
    }

    for (i = k + 1; i < l; ++i) {
      mat33Rotate(S, c, s, k, i, i, l);
    }

    for (i = l + 1; i < N; ++i) {
      mat33Rotate(S, c, s, k, i, l, i);
    }

    for (i = 0; i < N; ++i) {
      float tmp = c * eigenvecs->elem[k][i] - s * eigenvecs->elem[l][i];
      eigenvecs->elem[l][i] =
          s * eigenvecs->elem[k][i] + c * eigenvecs->elem[l][i];
      eigenvecs->elem[k][i] = tmp;
    }

    ind[k] = mat33Maxind(S, k);
    ind[l] = mat33Maxind(S, l);

    float sum = 0.0f;
    for (i = 0; i < N; ++i) {
      for (j = i + 1; j < N; ++j) {
        sum += fabsf(S->elem[i][j]);
      }
    }

    if (sum < EPSILON) {
      break;
    }
  }

  for (k = 0; k < N; ++k) {
    m = k;
    for (l = k + 1; l < N; ++l) {
      if (_eigenvals[l] > _eigenvals[m]) {
        m = l;
      }
    }

    if (k != m) {
      float tmp = _eigenvals[k];
      _eigenvals[k] = _eigenvals[m];
      _eigenvals[m] = tmp;

      mat33SwapRows(eigenvecs, k, m);
    }
  }

  initVec3(eigenvals, _eigenvals[0], _eigenvals[1], _eigenvals[2]);
}

// index of largest off-diagonal element in row k
UNROLLED
uint32_t mat33Maxind(const struct Mat33 *A, uint32_t k) {
  ASSERT_NOT_NULL(A);
  const uint32_t N = 3;

  uint32_t m = k + 1;
  uint32_t i;

  for (i = k + 2; i < N; ++i) {
    if (fabsf(A->elem[k][i]) > fabsf(A->elem[k][m])) {
      m = i;
    }
  }

  return m;
}

void mat33Rotate(struct Mat33 *A, float c, float s, uint32_t k, uint32_t l,
                 uint32_t i, uint32_t j) {
  ASSERT_NOT_NULL(A);
  float tmp = c * A->elem[k][l] - s * A->elem[i][j];
  A->elem[i][j] = s * A->elem[k][l] + c * A->elem[i][j];
  A->elem[k][l] = tmp;
}

void mat44Apply(struct Vec4 *out, const struct Mat44 *A, const struct Vec4 *v) {
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  ASSERT_NOT_NULL(v);

  out->x = A->elem[0][0] * v->x + A->elem[0][1] * v->y + A->elem[0][2] * v->z +
           A->elem[0][3] * v->w;

  out->y = A->elem[1][0] * v->x + A->elem[1][1] * v->y + A->elem[1][2] * v->z +
           A->elem[1][3] * v->w;

  out->z = A->elem[2][0] * v->x + A->elem[2][1] * v->y + A->elem[2][2] * v->z +
           A->elem[2][3] * v->w;

  out->w = A->elem[3][0] * v->x + A->elem[3][1] * v->y + A->elem[3][2] * v->z +
           A->elem[3][3] * v->w;
}

UNROLLED
void mat44DecomposeLup(struct Mat44 *LU, struct Size4 *pivot) {
  ASSERT_NOT_NULL(LU);
  ASSERT_NOT_NULL(pivot);
  const uint32_t N = 4;
  uint32_t i, j, k;

  for (k = 0; k < N; ++k) {
    pivot->elem[k] = k;
    float max = fabsf(LU->elem[k][k]);
    for (j = k + 1; j < N; ++j) {
      if (max < fabsf(LU->elem[j][k])) {
        max = fabsf(LU->elem[j][k]);
        pivot->elem[k] = j;
      }
    }

    if (pivot->elem[k] != k) {
      mat44SwapRows(LU, k, pivot->elem[k]);
    }

    if (fabsf(LU->elem[k][k]) < EPSILON) {
      continue;
    }

    for (j = k + 1; j < N; ++j) {
      LU->elem[k][j] /= LU->elem[k][k];
    }

    for (i = k + 1; i < N; ++i) {
      for (j = k + 1; j < N; ++j) {
        LU->elem[i][j] -= LU->elem[i][k] * LU->elem[k][j];
      }
    }
  }
}

UNROLLED
void mat44SwapRows(struct Mat44 *A, const uint32_t i, const uint32_t j) {
  ASSERT_NOT_NULL(A);
  const uint32_t N = 4;
  uint32_t k;

  if (i == j) {
    return;
  }

  for (k = 0; k < N; ++k) {
    float tmp = A->elem[i][k];
    A->elem[i][k] = A->elem[j][k];
    A->elem[j][k] = tmp;
  }
}

UNROLLED
void mat44Solve(const struct Mat44 *A, struct Vec4 *x, const struct Vec4 *b,
                const struct Size4 *pivot) {
  ASSERT_NOT_NULL(A);
  ASSERT_NOT_NULL(x);
  ASSERT_NOT_NULL(b);
  ASSERT_NOT_NULL(pivot);
  const uint32_t N = 4;
  uint32_t i, k;

  float bCopy[N];
  bCopy[0] = b->x;
  bCopy[1] = b->y;
  bCopy[2] = b->z;
  bCopy[3] = b->w;

  float _x[N];
  for (k = 0; k < N; ++k) {
    if (pivot->elem[k] != k) {
      float tmp = bCopy[k];
      bCopy[k] = bCopy[pivot->elem[k]];
      bCopy[pivot->elem[k]] = tmp;
    }

    _x[k] = bCopy[k];
    for (i = 0; i < k; ++i) {
      _x[k] -= _x[i] * A->elem[k][i];
    }
    _x[k] /= A->elem[k][k];
  }

  for (k = N; k-- > 0;) {
    for (i = k + 1; i < N; ++i) {
      _x[k] -= _x[i] * A->elem[k][i];
    }
  }

  initVec4(x, _x[0], _x[1], _x[2], _x[3]);
}

float matMaxDiagonalElement(const float *square_mat, size_t n) {
  ASSERT_NOT_NULL(square_mat);
  ASSERT(n > 0);
  size_t i;
  float max = square_mat[0];
  const size_t n_square = n * n;
  const size_t offset = n + 1;
  for (i = offset; i < n_square; i += offset) {
    if (square_mat[i] > max) {
      max = square_mat[i];
    }
  }
  return max;
}

void matAddConstantDiagonal(float *square_mat, float u, size_t n) {
  ASSERT_NOT_NULL(square_mat);
  size_t i;
  const size_t n_square = n * n;
  const size_t offset = n + 1;
  for (i = 0; i < n_square; i += offset) {
    square_mat[i] += u;
  }
}

void matTransposeMultiplyMat(float *out, const float *A,
                             size_t nrows, size_t ncols) {
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  size_t i;
  size_t j;
  size_t k;
  memset(out, 0, sizeof(float) * ncols * ncols);
  for (i = 0; i < ncols; ++i) {
    for (j = 0; j < ncols; ++j) {
      // Since A' * A is symmetric, can use upper diagonal elements
      // to fill in the lower diagonal without recomputing.
      if (j < i) {
        out[i * ncols + j] = out[j * ncols + i];
      } else {
        // mat_out[i, j] = ai ' * aj
        out[i * ncols + j] = 0;
        for (k = 0; k < nrows; ++k) {
          out[i * ncols + j] += A[k * ncols + i] *
              A[k * ncols + j];
        }
      }
    }
  }
}

void matMultiplyVec(float *out, const float *A, const float *v,
                    size_t nrows, size_t ncols) {
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  ASSERT_NOT_NULL(v);
  size_t i;
  for (i = 0; i < nrows; ++i) {
    const float *row = &A[i * ncols];
    out[i] = vecDot(row, v, (int)ncols);
  }
}

void matTransposeMultiplyVec(float *out, const float *A, const float *v,
                             size_t nrows, size_t ncols) {
  ASSERT_NOT_NULL(out);
  ASSERT_NOT_NULL(A);
  ASSERT_NOT_NULL(v);
  size_t i, j;
  for (i = 0; i < ncols; ++i) {
    out[i] = 0;
    for (j = 0; j < nrows; ++j) {
      out[i] += A[j * ncols + i] * v[j];
    }
  }
}

bool matLinearSolveCholesky(float *x, const float *L, const float *b, size_t n) {
  ASSERT_NOT_NULL(x);
  ASSERT_NOT_NULL(L);
  ASSERT_NOT_NULL(b);
  ASSERT(n <= INT32_MAX);
  int32_t i, j;  // Loops below require signed integers.
  int32_t s_n = (int32_t)n; // Signed n.
  float sum = 0.0f;
  // 1. Solve Ly = b through forward substitution. Use x[] to store y.
  for (i = 0; i < s_n; ++i) {
    sum = 0.0f;
    for (j = 0; j < i; ++j) {
      sum += L[i * s_n + j] * x[j];
    }
    // Check for non-zero diagonals (don't divide by zero).
    if (L[i * s_n + i] < EPSILON) {
      return false;
    }
    x[i] = (b[i] - sum) / L[i * s_n + i];
  }

  // 2. Solve L'x = y through backwards substitution. Use x[] to store both
  // y and x.
  for (i = s_n - 1; i >= 0; --i) {
    sum = 0.0f;
    for (j = i + 1; j < s_n; ++j) {
      sum += L[j * s_n + i] * x[j];
    }
    x[i] = (x[i] - sum) / L[i * s_n + i];
  }

  return true;
}

bool matCholeskyDecomposition(float *L, const float *A, size_t n) {
  ASSERT_NOT_NULL(L);
  ASSERT_NOT_NULL(A);
  size_t i, j, k;
  float sum = 0.0f;
  // initialize L to zero.
  memset(L, 0, sizeof(float) * n * n);

  for (i = 0; i < n; ++i) {
    // compute L[i][i] = sqrt(A[i][i] - sum_k = 1:i-1 L^2[i][k])
    sum = 0.0f;
    for (k = 0; k < i; ++k) {
      sum += L[i * n + k] * L[i * n + k];
    }
    sum = A[i * n + i] - sum;
    // If diagonal element of L is too small, cholesky fails.
    if (sum < CHOLESKY_TOLERANCE) {
      return false;
    }
    L[i * n + i] = sqrtf(sum);

    // for j = i+1:N,  compute L[j][i] =
    //      (1/L[i][i]) * (A[i][j] - sum_k = 1:i-1 L[i][k] * L[j][k])
    for (j = i + 1; j < n; ++j) {
      sum = 0.0f;
      for (k = 0; k < i; ++k) {
        sum += L[i * n + k] * L[j * n + k];
      }
      // division okay because magnitude of L[i][i] already checked above.
      L[j * n + i] = (A[i * n + j] - sum) / L[i * n + i];
    }
  }

  return true;
}