// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008 Guillaume Saupin <guillaume.saupin@cea.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_SKYLINEINPLACELU_H
#define EIGEN_SKYLINEINPLACELU_H

namespace Eigen { 

/** \ingroup Skyline_Module
 *
 * \class SkylineInplaceLU
 *
 * \brief Inplace LU decomposition of a skyline matrix and associated features
 *
 * \param MatrixType the type of the matrix of which we are computing the LU factorization
 *
 */
template<typename MatrixType>
class SkylineInplaceLU {
protected:
    typedef typename MatrixType::Scalar Scalar;
    typedef typename MatrixType::Index Index;
    
    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;

public:

    /** Creates a LU object and compute the respective factorization of \a matrix using
     * flags \a flags. */
    SkylineInplaceLU(MatrixType& matrix, int flags = 0)
    : /*m_matrix(matrix.rows(), matrix.cols()),*/ m_flags(flags), m_status(0), m_lu(matrix) {
        m_precision = RealScalar(0.1) * Eigen::dummy_precision<RealScalar > ();
        m_lu.IsRowMajor ? computeRowMajor() : compute();
    }

    /** Sets the relative threshold value used to prune zero coefficients during the decomposition.
     *
     * Setting a value greater than zero speeds up computation, and yields to an imcomplete
     * factorization with fewer non zero coefficients. Such approximate factors are especially
     * useful to initialize an iterative solver.
     *
     * Note that the exact meaning of this parameter might depends on the actual
     * backend. Moreover, not all backends support this feature.
     *
     * \sa precision() */
    void setPrecision(RealScalar v) {
        m_precision = v;
    }

    /** \returns the current precision.
     *
     * \sa setPrecision() */
    RealScalar precision() const {
        return m_precision;
    }

    /** Sets the flags. Possible values are:
     *  - CompleteFactorization
     *  - IncompleteFactorization
     *  - MemoryEfficient
     *  - one of the ordering methods
     *  - etc...
     *
     * \sa flags() */
    void setFlags(int f) {
        m_flags = f;
    }

    /** \returns the current flags */
    int flags() const {
        return m_flags;
    }

    void setOrderingMethod(int m) {
        m_flags = m;
    }

    int orderingMethod() const {
        return m_flags;
    }

    /** Computes/re-computes the LU factorization */
    void compute();
    void computeRowMajor();

    /** \returns the lower triangular matrix L */
    //inline const MatrixType& matrixL() const { return m_matrixL; }

    /** \returns the upper triangular matrix U */
    //inline const MatrixType& matrixU() const { return m_matrixU; }

    template<typename BDerived, typename XDerived>
    bool solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x,
            const int transposed = 0) const;

    /** \returns true if the factorization succeeded */
    inline bool succeeded(void) const {
        return m_succeeded;
    }

protected:
    RealScalar m_precision;
    int m_flags;
    mutable int m_status;
    bool m_succeeded;
    MatrixType& m_lu;
};

/** Computes / recomputes the in place LU decomposition of the SkylineInplaceLU.
 * using the default algorithm.
 */
template<typename MatrixType>
//template<typename _Scalar>
void SkylineInplaceLU<MatrixType>::compute() {
    const size_t rows = m_lu.rows();
    const size_t cols = m_lu.cols();

    eigen_assert(rows == cols && "We do not (yet) support rectangular LU.");
    eigen_assert(!m_lu.IsRowMajor && "LU decomposition does not work with rowMajor Storage");

    for (Index row = 0; row < rows; row++) {
        const double pivot = m_lu.coeffDiag(row);

        //Lower matrix Columns update
        const Index& col = row;
        for (typename MatrixType::InnerLowerIterator lIt(m_lu, col); lIt; ++lIt) {
            lIt.valueRef() /= pivot;
        }

        //Upper matrix update -> contiguous memory access
        typename MatrixType::InnerLowerIterator lIt(m_lu, col);
        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
            typename MatrixType::InnerUpperIterator uIt(m_lu, rrow);
            const double coef = lIt.value();

            uItPivot += (rrow - row - 1);

            //update upper part  -> contiguous memory access
            for (++uItPivot; uIt && uItPivot;) {
                uIt.valueRef() -= uItPivot.value() * coef;

                ++uIt;
                ++uItPivot;
            }
            ++lIt;
        }

        //Upper matrix update -> non contiguous memory access
        typename MatrixType::InnerLowerIterator lIt3(m_lu, col);
        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
            const double coef = lIt3.value();

            //update lower part ->  non contiguous memory access
            for (Index i = 0; i < rrow - row - 1; i++) {
                m_lu.coeffRefLower(rrow, row + i + 1) -= uItPivot.value() * coef;
                ++uItPivot;
            }
            ++lIt3;
        }
        //update diag -> contiguous
        typename MatrixType::InnerLowerIterator lIt2(m_lu, col);
        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {

            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
            typename MatrixType::InnerUpperIterator uIt(m_lu, rrow);
            const double coef = lIt2.value();

            uItPivot += (rrow - row - 1);
            m_lu.coeffRefDiag(rrow) -= uItPivot.value() * coef;
            ++lIt2;
        }
    }
}

template<typename MatrixType>
void SkylineInplaceLU<MatrixType>::computeRowMajor() {
    const size_t rows = m_lu.rows();
    const size_t cols = m_lu.cols();

    eigen_assert(rows == cols && "We do not (yet) support rectangular LU.");
    eigen_assert(m_lu.IsRowMajor && "You're trying to apply rowMajor decomposition on a ColMajor matrix !");

    for (Index row = 0; row < rows; row++) {
        typename MatrixType::InnerLowerIterator llIt(m_lu, row);


        for (Index col = llIt.col(); col < row; col++) {
            if (m_lu.coeffExistLower(row, col)) {
                const double diag = m_lu.coeffDiag(col);

                typename MatrixType::InnerLowerIterator lIt(m_lu, row);
                typename MatrixType::InnerUpperIterator uIt(m_lu, col);


                const Index offset = lIt.col() - uIt.row();


                Index stop = offset > 0 ? col - lIt.col() : col - uIt.row();

                //#define VECTORIZE
#ifdef VECTORIZE
                Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
                Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);


                Scalar newCoeff = m_lu.coeffLower(row, col) - rowVal.dot(colVal);
#else
                if (offset > 0) //Skip zero value of lIt
                    uIt += offset;
                else //Skip zero values of uIt
                    lIt += -offset;
                Scalar newCoeff = m_lu.coeffLower(row, col);

                for (Index k = 0; k < stop; ++k) {
                    const Scalar tmp = newCoeff;
                    newCoeff = tmp - lIt.value() * uIt.value();
                    ++lIt;
                    ++uIt;
                }
#endif

                m_lu.coeffRefLower(row, col) = newCoeff / diag;
            }
        }

        //Upper matrix update
        const Index col = row;
        typename MatrixType::InnerUpperIterator uuIt(m_lu, col);
        for (Index rrow = uuIt.row(); rrow < col; rrow++) {

            typename MatrixType::InnerLowerIterator lIt(m_lu, rrow);
            typename MatrixType::InnerUpperIterator uIt(m_lu, col);
            const Index offset = lIt.col() - uIt.row();

            Index stop = offset > 0 ? rrow - lIt.col() : rrow - uIt.row();

#ifdef VECTORIZE
            Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
            Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);

            Scalar newCoeff = m_lu.coeffUpper(rrow, col) - rowVal.dot(colVal);
#else
            if (offset > 0) //Skip zero value of lIt
                uIt += offset;
            else //Skip zero values of uIt
                lIt += -offset;
            Scalar newCoeff = m_lu.coeffUpper(rrow, col);
            for (Index k = 0; k < stop; ++k) {
                const Scalar tmp = newCoeff;
                newCoeff = tmp - lIt.value() * uIt.value();

                ++lIt;
                ++uIt;
            }
#endif
            m_lu.coeffRefUpper(rrow, col) = newCoeff;
        }


        //Diag matrix update
        typename MatrixType::InnerLowerIterator lIt(m_lu, row);
        typename MatrixType::InnerUpperIterator uIt(m_lu, row);

        const Index offset = lIt.col() - uIt.row();


        Index stop = offset > 0 ? lIt.size() : uIt.size();
#ifdef VECTORIZE
        Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
        Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
        Scalar newCoeff = m_lu.coeffDiag(row) - rowVal.dot(colVal);
#else
        if (offset > 0) //Skip zero value of lIt
            uIt += offset;
        else //Skip zero values of uIt
            lIt += -offset;
        Scalar newCoeff = m_lu.coeffDiag(row);
        for (Index k = 0; k < stop; ++k) {
            const Scalar tmp = newCoeff;
            newCoeff = tmp - lIt.value() * uIt.value();
            ++lIt;
            ++uIt;
        }
#endif
        m_lu.coeffRefDiag(row) = newCoeff;
    }
}

/** Computes *x = U^-1 L^-1 b
 *
 * If \a transpose is set to SvTranspose or SvAdjoint, the solution
 * of the transposed/adjoint system is computed instead.
 *
 * Not all backends implement the solution of the transposed or
 * adjoint system.
 */
template<typename MatrixType>
template<typename BDerived, typename XDerived>
bool SkylineInplaceLU<MatrixType>::solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x, const int transposed) const {
    const size_t rows = m_lu.rows();
    const size_t cols = m_lu.cols();


    for (Index row = 0; row < rows; row++) {
        x->coeffRef(row) = b.coeff(row);
        Scalar newVal = x->coeff(row);
        typename MatrixType::InnerLowerIterator lIt(m_lu, row);

        Index col = lIt.col();
        while (lIt.col() < row) {

            newVal -= x->coeff(col++) * lIt.value();
            ++lIt;
        }

        x->coeffRef(row) = newVal;
    }


    for (Index col = rows - 1; col > 0; col--) {
        x->coeffRef(col) = x->coeff(col) / m_lu.coeffDiag(col);

        const Scalar x_col = x->coeff(col);

        typename MatrixType::InnerUpperIterator uIt(m_lu, col);
        uIt += uIt.size()-1;


        while (uIt) {
            x->coeffRef(uIt.row()) -= x_col * uIt.value();
            //TODO : introduce --operator
            uIt += -1;
        }


    }
    x->coeffRef(0) = x->coeff(0) / m_lu.coeffDiag(0);

    return true;
}

} // end namespace Eigen

#endif // EIGEN_SKYLINELU_H