/*-------------------------------------------------------------------------
 * drawElements Quality Program OpenGL ES 3.0 Module
 * -------------------------------------------------
 *
 * Copyright 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *//*!
 * \file
 * \brief Shader precision tests.
 *
 * \note Floating-point case uses R32UI render target and uses
 *		 floatBitsToUint() in shader to write out floating-point value bits.
 *		 This is done since ES3 core doesn't support FP render targets.
 *//*--------------------------------------------------------------------*/

#include "es3fShaderPrecisionTests.hpp"
#include "tcuVector.hpp"
#include "tcuTestLog.hpp"
#include "tcuVectorUtil.hpp"
#include "tcuFloat.hpp"
#include "tcuFormatUtil.hpp"
#include "gluRenderContext.hpp"
#include "gluShaderProgram.hpp"
#include "gluShaderUtil.hpp"
#include "gluDrawUtil.hpp"
#include "deRandom.hpp"
#include "deString.h"

#include "glwEnums.hpp"
#include "glwFunctions.hpp"

#include <algorithm>

namespace deqp
{
namespace gles3
{
namespace Functional
{

using std::string;
using std::vector;
using std::ostringstream;
using tcu::TestLog;

enum
{
	FRAMEBUFFER_WIDTH	= 32,
	FRAMEBUFFER_HEIGHT	= 32
};

static glu::ShaderProgram* createFloatPrecisionEvalProgram (const glu::RenderContext& context, glu::Precision precision, const char* evalOp, bool isVertexCase)
{
	glu::DataType	type		= glu::TYPE_FLOAT;
	glu::DataType	outType		= glu::TYPE_UINT;
	const char*		typeName	= glu::getDataTypeName(type);
	const char*		outTypeName	= glu::getDataTypeName(outType);
	const char*		precName	= glu::getPrecisionName(precision);
	ostringstream	vtx;
	ostringstream	frag;
	ostringstream&	op			= isVertexCase ? vtx : frag;

	vtx << "#version 300 es\n"
		<< "in highp vec4 a_position;\n"
		<< "in " << precName << " " << typeName << " a_in0;\n"
		<< "in " << precName << " " << typeName << " a_in1;\n";
	frag << "#version 300 es\n"
		 << "layout(location = 0) out highp " << outTypeName << " o_out;\n";

	if (isVertexCase)
	{
		vtx << "flat out " << precName << " " << typeName << " v_out;\n";
		frag << "flat in " << precName << " " << typeName << " v_out;\n";
	}
	else
	{
		vtx << "flat out " << precName << " " << typeName << " v_in0;\n"
			<< "flat out " << precName << " " << typeName << " v_in1;\n";
		frag << "flat in " << precName << " " << typeName << " v_in0;\n"
			 << "flat in " << precName << " " << typeName << " v_in1;\n";
	}

	vtx << "\nvoid main (void)\n{\n"
		<< "	gl_Position = a_position;\n";
	frag << "\nvoid main (void)\n{\n";

	op << "\t" << precName << " " << typeName << " in0 = " << (isVertexCase ? "a_" : "v_") << "in0;\n"
	   << "\t" << precName << " " << typeName << " in1 = " << (isVertexCase ? "a_" : "v_") << "in1;\n";

	if (!isVertexCase)
		op << "\t" << precName << " " << typeName << " res;\n";

	op << "\t" << (isVertexCase ? "v_out" : "res") << " = " << evalOp << ";\n";

	if (isVertexCase)
	{
		frag << "	o_out = floatBitsToUint(v_out);\n";
	}
	else
	{
		vtx << "	v_in0 = a_in0;\n"
			<< "	v_in1 = a_in1;\n";
		frag << "	o_out = floatBitsToUint(res);\n";
	}

	vtx << "}\n";
	frag << "}\n";

	return new glu::ShaderProgram(context, glu::makeVtxFragSources(vtx.str(), frag.str()));
}

static glu::ShaderProgram* createIntUintPrecisionEvalProgram (const glu::RenderContext& context, glu::DataType type, glu::Precision precision, const char* evalOp, bool isVertexCase)
{
	const char*		typeName	= glu::getDataTypeName(type);
	const char*		precName	= glu::getPrecisionName(precision);
	ostringstream	vtx;
	ostringstream	frag;
	ostringstream&	op			= isVertexCase ? vtx : frag;

	vtx << "#version 300 es\n"
		<< "in highp vec4 a_position;\n"
		<< "in " << precName << " " << typeName << " a_in0;\n"
		<< "in " << precName << " " << typeName << " a_in1;\n";
	frag << "#version 300 es\n"
		 << "layout(location = 0) out " << precName << " " << typeName << " o_out;\n";

	if (isVertexCase)
	{
		vtx << "flat out " << precName << " " << typeName << " v_out;\n";
		frag << "flat in " << precName << " " << typeName << " v_out;\n";
	}
	else
	{
		vtx << "flat out " << precName << " " << typeName << " v_in0;\n"
			<< "flat out " << precName << " " << typeName << " v_in1;\n";
		frag << "flat in " << precName << " " << typeName << " v_in0;\n"
			 << "flat in " << precName << " " << typeName << " v_in1;\n";
	}

	vtx << "\nvoid main (void)\n{\n"
		<< "	gl_Position = a_position;\n";
	frag << "\nvoid main (void)\n{\n";

	op << "\t" << precName << " " << typeName << " in0 = " << (isVertexCase ? "a_" : "v_") << "in0;\n"
	   << "\t" << precName << " " << typeName << " in1 = " << (isVertexCase ? "a_" : "v_") << "in1;\n";

	op << "\t" << (isVertexCase ? "v_" : "o_") << "out = " << evalOp << ";\n";

	if (isVertexCase)
	{
		frag << "	o_out = v_out;\n";
	}
	else
	{
		vtx << "	v_in0 = a_in0;\n"
			<< "	v_in1 = a_in1;\n";
	}

	vtx << "}\n";
	frag << "}\n";

	return new glu::ShaderProgram(context, glu::makeVtxFragSources(vtx.str(), frag.str()));
}

class ShaderFloatPrecisionCase : public TestCase
{
public:
	typedef double (*EvalFunc) (double in0, double in1);

								ShaderFloatPrecisionCase	(Context& context, const char* name, const char* desc, const char* op, EvalFunc evalFunc, glu::Precision precision, const tcu::Vec2& rangeA, const tcu::Vec2& rangeB, bool isVertexCase);
								~ShaderFloatPrecisionCase	(void);

	void						init						(void);
	void						deinit						(void);
	IterateResult				iterate						(void);

protected:
	bool						compare						(float in0, float in1, double reference, float result)
#if (DE_COMPILER == DE_COMPILER_GCC) && (DE_CPU == DE_CPU_ARM_64)
#	if (__GNUC__ == 4) && (__GNUC_MINOR__ == 9) && (__GNUC_PATCHLEVEL__ == 0)
		// Some prerelease GCC 4.9 versions have a bug in shift right when
		// targeting ARMv8.
		//
		// If compiler wants to perform logical shift by variable/register
		// in fp/vector registers it uses USHL that selects shift direction
		// based on shift operand value. Thus for right shifts the shift
		// operand needs to be negated.
		//
		// The bug is in right shift pattern; it doesn't mark shift operand
		// as clobbered and thus later code using that same register may
		// see the negated value.
		//
		// Workaround is to disable optimization for this function.
		//
		// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61633
		__attribute__((optimize(0)))
#	endif
#endif
	;

private:
								ShaderFloatPrecisionCase	(const ShaderFloatPrecisionCase& other);
	ShaderFloatPrecisionCase&	operator=					(const ShaderFloatPrecisionCase& other);

	// Case parameters.
	std::string					m_op;
	EvalFunc					m_evalFunc;
	glu::Precision				m_precision;
	tcu::Vec2					m_rangeA;
	tcu::Vec2					m_rangeB;
	bool						m_isVertexCase;

	int							m_numTestsPerIter;
	int							m_numIters;
	de::Random					m_rnd;

	// Iteration state.
	glu::ShaderProgram*			m_program;
	deUint32					m_framebuffer;
	deUint32					m_renderbuffer;
	int							m_iterNdx;
};

ShaderFloatPrecisionCase::ShaderFloatPrecisionCase (Context& context, const char* name, const char* desc, const char* op, EvalFunc evalFunc, glu::Precision precision, const tcu::Vec2& rangeA, const tcu::Vec2& rangeB, bool isVertexCase)
	: TestCase			(context, name, desc)
	, m_op				(op)
	, m_evalFunc		(evalFunc)
	, m_precision		(precision)
	, m_rangeA			(rangeA)
	, m_rangeB			(rangeB)
	, m_isVertexCase	(isVertexCase)
	, m_numTestsPerIter	(32)
	, m_numIters		(4)
	, m_rnd				(deStringHash(name))
	, m_program			(DE_NULL)
	, m_framebuffer		(0)
	, m_renderbuffer	(0)
	, m_iterNdx			(0)
{
}

ShaderFloatPrecisionCase::~ShaderFloatPrecisionCase (void)
{
	ShaderFloatPrecisionCase::deinit();
}

void ShaderFloatPrecisionCase::init (void)
{
	const glw::Functions&	gl	= m_context.getRenderContext().getFunctions();
	TestLog&				log	= m_testCtx.getLog();

	DE_ASSERT(!m_program && !m_framebuffer && !m_renderbuffer);

	// Create program.
	m_program = createFloatPrecisionEvalProgram(m_context.getRenderContext(), m_precision, m_op.c_str(), m_isVertexCase);
	log << *m_program;

	TCU_CHECK(m_program->isOk());

	// Create framebuffer.
	gl.genFramebuffers(1, &m_framebuffer);
	gl.genRenderbuffers(1, &m_renderbuffer);

	gl.bindRenderbuffer(GL_RENDERBUFFER, m_renderbuffer);
	gl.renderbufferStorage(GL_RENDERBUFFER, GL_R32UI, FRAMEBUFFER_WIDTH, FRAMEBUFFER_HEIGHT);

	gl.bindFramebuffer(GL_FRAMEBUFFER, m_framebuffer);
	gl.framebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, m_renderbuffer);

	GLU_EXPECT_NO_ERROR(gl.getError(), "Post framebuffer setup");
	TCU_CHECK(gl.checkFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE);

	gl.bindFramebuffer(GL_FRAMEBUFFER, m_context.getRenderContext().getDefaultFramebuffer());

	// Initialize test result to pass.
	m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
	m_iterNdx = 0;
}

void ShaderFloatPrecisionCase::deinit (void)
{
	delete m_program;

	if (m_framebuffer)
		m_context.getRenderContext().getFunctions().deleteFramebuffers(1, &m_framebuffer);

	if (m_renderbuffer)
		m_context.getRenderContext().getFunctions().deleteRenderbuffers(1, &m_renderbuffer);

	m_program		= DE_NULL;
	m_framebuffer	= 0;
	m_renderbuffer	= 0;
}

bool ShaderFloatPrecisionCase::compare (float in0, float in1, double reference, float result)
{
	// Comparison is done using 64-bit reference value to accurately evaluate rounding mode error.
	// If 32-bit reference value is used, 2 bits of rounding error must be allowed.

	// For mediump and lowp types the comparison currently allows 3 bits of rounding error:
	// two bits from conversions and one from actual operation.

	// \todo [2013-09-30 pyry] Make this more strict: determine if rounding can actually happen.

	const int		mantissaBits		= m_precision == glu::PRECISION_HIGHP ? 23 : 10;
	const int		numPrecBits			= 52 - mantissaBits;

	const int		in0Exp				= tcu::Float32(in0).exponent();
	const int		in1Exp				= tcu::Float32(in1).exponent();
	const int		resExp				= tcu::Float32(result).exponent();
	const int		numLostBits			= de::max(de::max(in0Exp-resExp, in1Exp-resExp), 0); // Lost due to mantissa shift.

	const int		roundingUlpError	= m_precision == glu::PRECISION_HIGHP ? 1 : 3;
	const int		maskBits			= numLostBits + numPrecBits;

	m_testCtx.getLog() << TestLog::Message << "Assuming " << mantissaBits << " mantissa bits, " << numLostBits << " bits lost in operation, and " << roundingUlpError << " ULP rounding error."
					   << TestLog::EndMessage;

	{
		const deUint64	refBits				= tcu::Float64(reference).bits();
		const deUint64	resBits				= tcu::Float64(result).bits();
		const deUint64	accurateRefBits		= maskBits < 64 ? refBits >> (deUint64)maskBits : 0u;
		const deUint64	accurateResBits		= maskBits < 64 ? resBits >> (deUint64)maskBits : 0u;
		const deUint64	ulpDiff				= (deUint64)de::abs((deInt64)accurateRefBits - (deInt64)accurateResBits);

		if (ulpDiff > (deUint64)roundingUlpError)
		{
			m_testCtx.getLog() << TestLog::Message << "ERROR: comparison failed! ULP diff (ignoring lost/undefined bits) = " << ulpDiff << TestLog::EndMessage;
			return false;
		}
		else
			return true;
	}
}

ShaderFloatPrecisionCase::IterateResult ShaderFloatPrecisionCase::iterate (void)
{
	// Constant data.
	const float position[] =
	{
		-1.0f, -1.0f, 0.0f, 1.0f,
		-1.0f,  1.0f, 0.0f, 1.0f,
		 1.0f, -1.0f, 0.0f, 1.0f,
		 1.0f,  1.0f, 0.0f, 1.0f
	};
	const deUint16					indices[]	= { 0, 1, 2, 2, 1, 3 };

	const int						numVertices	= 4;
	float							in0Arr[4]	= { 0.0f };
	float							in1Arr[4]	= { 0.0f };

	TestLog&						log			= m_testCtx.getLog();
	const glw::Functions&			gl			= m_context.getRenderContext().getFunctions();
	vector<glu::VertexArrayBinding>	vertexArrays;

	// Image read from GL.
	std::vector<float>	pixels		(FRAMEBUFFER_WIDTH*FRAMEBUFFER_HEIGHT*4);

	// \todo [2012-05-03 pyry] Could be cached.
	deUint32			prog		= m_program->getProgram();

	gl.useProgram(prog);
	gl.bindFramebuffer(GL_FRAMEBUFFER, m_framebuffer);

	vertexArrays.push_back(glu::va::Float("a_position", 4, numVertices, 0, &position[0]));
	vertexArrays.push_back(glu::va::Float("a_in0", 1, numVertices, 0, &in0Arr[0]));
	vertexArrays.push_back(glu::va::Float("a_in1", 1, numVertices, 0, &in1Arr[0]));

	GLU_EXPECT_NO_ERROR(gl.getError(), "After program setup");

	// Compute values and reference.
	for (int testNdx = 0; testNdx < m_numTestsPerIter; testNdx++)
	{
		const float		in0		= m_rnd.getFloat(m_rangeA.x(), m_rangeA.y());
		const float		in1		= m_rnd.getFloat(m_rangeB.x(), m_rangeB.y());
		const double	refD	= m_evalFunc((double)in0, (double)in1);
		const float		refF	= tcu::Float64(refD).asFloat(); // Uses RTE rounding mode.

		log << TestLog::Message << "iter " << m_iterNdx << ", test " << testNdx << ": "
								<< "in0 = " << in0 << " / " << tcu::toHex(tcu::Float32(in0).bits())
								<< ", in1 = " << in1 << " / " << tcu::toHex(tcu::Float32(in1).bits())
			<< TestLog::EndMessage
			<< TestLog::Message << "  reference = " << refF << " / " << tcu::toHex(tcu::Float32(refF).bits()) << TestLog::EndMessage;

		std::fill(&in0Arr[0], &in0Arr[0] + DE_LENGTH_OF_ARRAY(in0Arr), in0);
		std::fill(&in1Arr[0], &in1Arr[0] + DE_LENGTH_OF_ARRAY(in1Arr), in1);

		glu::draw(m_context.getRenderContext(), prog, (int)vertexArrays.size(), &vertexArrays[0],
				  glu::pr::Triangles(DE_LENGTH_OF_ARRAY(indices), &indices[0]));
		gl.readPixels(0, 0, FRAMEBUFFER_WIDTH, FRAMEBUFFER_HEIGHT, GL_RGBA_INTEGER, GL_UNSIGNED_INT, &pixels[0]);
		GLU_EXPECT_NO_ERROR(gl.getError(), "After render");

		log << TestLog::Message << "  result = " << pixels[0] << " / " << tcu::toHex(tcu::Float32(pixels[0]).bits()) << TestLog::EndMessage;

		// Verify results
		{
			const bool firstPixelOk = compare(in0, in1, refD, pixels[0]);

			if (firstPixelOk)
			{
				// Check that rest of pixels match to first one.
				const deUint32	firstPixelBits	= tcu::Float32(pixels[0]).bits();
				bool			allPixelsOk		= true;

				for (int y = 0; y < FRAMEBUFFER_HEIGHT; y++)
				{
					for (int x = 0; x < FRAMEBUFFER_WIDTH; x++)
					{
						const deUint32 pixelBits = tcu::Float32(pixels[(y*FRAMEBUFFER_WIDTH + x)*4]).bits();

						if (pixelBits != firstPixelBits)
						{
							log << TestLog::Message << "ERROR: Inconsistent results, got " << tcu::toHex(pixelBits) << " at (" << x << ", " << y << ")" << TestLog::EndMessage;
							allPixelsOk = false;
						}
					}

					if (!allPixelsOk)
						break;
				}

				if (!allPixelsOk)
					m_testCtx.setTestResult(QP_TEST_RESULT_FAIL, "Inconsistent values in framebuffer");
			}
			else
				m_testCtx.setTestResult(QP_TEST_RESULT_FAIL, "Result comparison failed");
		}

		if (m_testCtx.getTestResult() != QP_TEST_RESULT_PASS)
			break;
	}

	gl.bindFramebuffer(GL_FRAMEBUFFER, m_context.getRenderContext().getDefaultFramebuffer());
	GLU_EXPECT_NO_ERROR(gl.getError(), "After iteration");

	m_iterNdx += 1;
	return (m_iterNdx < m_numIters && m_testCtx.getTestResult() == QP_TEST_RESULT_PASS) ? CONTINUE : STOP;
}

class ShaderIntPrecisionCase : public TestCase
{
public:
	typedef int					(*EvalFunc)					(int a, int b);

								ShaderIntPrecisionCase		(Context& context, const char* name, const char* desc, const char* op, EvalFunc evalFunc, glu::Precision precision, int bits, const tcu::IVec2& rangeA, const tcu::IVec2& rangeB, bool isVertexCase);
								~ShaderIntPrecisionCase		(void);

	void						init						(void);
	void						deinit						(void);
	IterateResult				iterate						(void);

private:
								ShaderIntPrecisionCase		(const ShaderIntPrecisionCase& other);
	ShaderIntPrecisionCase&		operator=					(const ShaderIntPrecisionCase& other);

	// Case parameters.
	std::string					m_op;
	EvalFunc					m_evalFunc;
	glu::Precision				m_precision;
	int							m_bits;
	tcu::IVec2					m_rangeA;
	tcu::IVec2					m_rangeB;
	bool						m_isVertexCase;

	int							m_numTestsPerIter;
	int							m_numIters;
	de::Random					m_rnd;

	// Iteration state.
	glu::ShaderProgram*			m_program;
	deUint32					m_framebuffer;
	deUint32					m_renderbuffer;
	int							m_iterNdx;
};

ShaderIntPrecisionCase::ShaderIntPrecisionCase (Context& context, const char* name, const char* desc, const char* op, EvalFunc evalFunc, glu::Precision precision, int bits, const tcu::IVec2& rangeA, const tcu::IVec2& rangeB, bool isVertexCase)
	: TestCase			(context, name, desc)
	, m_op				(op)
	, m_evalFunc		(evalFunc)
	, m_precision		(precision)
	, m_bits			(bits)
	, m_rangeA			(rangeA)
	, m_rangeB			(rangeB)
	, m_isVertexCase	(isVertexCase)
	, m_numTestsPerIter	(32)
	, m_numIters		(4)
	, m_rnd				(deStringHash(name))
	, m_program			(DE_NULL)
	, m_framebuffer		(0)
	, m_renderbuffer	(0)
	, m_iterNdx			(0)
{
}

ShaderIntPrecisionCase::~ShaderIntPrecisionCase (void)
{
	ShaderIntPrecisionCase::deinit();
}

void ShaderIntPrecisionCase::init (void)
{
	const glw::Functions&	gl	= m_context.getRenderContext().getFunctions();
	TestLog&				log	= m_testCtx.getLog();

	DE_ASSERT(!m_program && !m_framebuffer && !m_renderbuffer);

	// Create program.
	m_program = createIntUintPrecisionEvalProgram(m_context.getRenderContext(), glu::TYPE_INT, m_precision, m_op.c_str(), m_isVertexCase);
	log << *m_program;

	TCU_CHECK(m_program->isOk());

	// Create framebuffer.
	gl.genFramebuffers(1, &m_framebuffer);
	gl.genRenderbuffers(1, &m_renderbuffer);

	gl.bindRenderbuffer(GL_RENDERBUFFER, m_renderbuffer);
	gl.renderbufferStorage(GL_RENDERBUFFER, GL_R32I, FRAMEBUFFER_WIDTH, FRAMEBUFFER_HEIGHT);

	gl.bindFramebuffer(GL_FRAMEBUFFER, m_framebuffer);
	gl.framebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, m_renderbuffer);

	GLU_EXPECT_NO_ERROR(gl.getError(), "Post framebuffer setup");
	TCU_CHECK(gl.checkFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE);

	gl.bindFramebuffer(GL_FRAMEBUFFER, m_context.getRenderContext().getDefaultFramebuffer());

	// Initialize test result to pass.
	m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
	m_iterNdx = 0;

	log << TestLog::Message << "Number of accurate bits assumed = " << m_bits << TestLog::EndMessage;
}

void ShaderIntPrecisionCase::deinit (void)
{
	delete m_program;

	if (m_framebuffer)
		m_context.getRenderContext().getFunctions().deleteFramebuffers(1, &m_framebuffer);

	if (m_renderbuffer)
		m_context.getRenderContext().getFunctions().deleteRenderbuffers(1, &m_renderbuffer);

	m_program		= DE_NULL;
	m_framebuffer	= 0;
	m_renderbuffer	= 0;
}

ShaderIntPrecisionCase::IterateResult ShaderIntPrecisionCase::iterate (void)
{
	// Constant data.
	const float position[] =
	{
		-1.0f, -1.0f, 0.0f, 1.0f,
		-1.0f,  1.0f, 0.0f, 1.0f,
		 1.0f, -1.0f, 0.0f, 1.0f,
		 1.0f,  1.0f, 0.0f, 1.0f
	};
	const deUint16					indices[]	= { 0, 1, 2, 2, 1, 3 };

	const int						numVertices	= 4;
	int								in0Arr[4]	= { 0 };
	int								in1Arr[4]	= { 0 };

	TestLog&						log			= m_testCtx.getLog();
	const glw::Functions&			gl			= m_context.getRenderContext().getFunctions();
	deUint32						mask		= m_bits == 32 ? 0xffffffffu : ((1u<<m_bits)-1);
	vector<int>						pixels		(FRAMEBUFFER_WIDTH*FRAMEBUFFER_HEIGHT*4);
	vector<glu::VertexArrayBinding>	vertexArrays;

	deUint32						prog		= m_program->getProgram();

	// \todo [2012-05-03 pyry] A bit hacky. getInt() should work fine with ranges like this.
	bool							isMaxRangeA	= m_rangeA.x() == (int)0x80000000 && m_rangeA.y() == (int)0x7fffffff;
	bool							isMaxRangeB	= m_rangeB.x() == (int)0x80000000 && m_rangeB.y() == (int)0x7fffffff;

	gl.useProgram(prog);
	gl.bindFramebuffer(GL_FRAMEBUFFER, m_framebuffer);

	vertexArrays.push_back(glu::va::Float("a_position", 4, numVertices, 0, &position[0]));
	vertexArrays.push_back(glu::va::Int32("a_in0", 1, numVertices, 0, &in0Arr[0]));
	vertexArrays.push_back(glu::va::Int32("a_in1", 1, numVertices, 0, &in1Arr[0]));

	GLU_EXPECT_NO_ERROR(gl.getError(), "After program setup");

	// Compute values and reference.
	for (int testNdx = 0; testNdx < m_numTestsPerIter; testNdx++)
	{
		int		in0			= deSignExtendTo32(((isMaxRangeA ? (int)m_rnd.getUint32() : m_rnd.getInt(m_rangeA.x(), m_rangeA.y())) & mask), m_bits);
		int		in1			= deSignExtendTo32(((isMaxRangeB ? (int)m_rnd.getUint32() : m_rnd.getInt(m_rangeB.x(), m_rangeB.y())) & mask), m_bits);
		int		refMasked	= m_evalFunc(in0, in1) & mask;
		int		refOut		= deSignExtendTo32(refMasked, m_bits);

		log << TestLog::Message << "iter " << m_iterNdx << ", test " << testNdx << ": "
								<< "in0 = " << in0 << ", in1 = " << in1 << ", ref out = " << refOut << " / " << tcu::toHex(refMasked)
			<< TestLog::EndMessage;

		std::fill(&in0Arr[0], &in0Arr[0] + DE_LENGTH_OF_ARRAY(in0Arr), in0);
		std::fill(&in1Arr[0], &in1Arr[0] + DE_LENGTH_OF_ARRAY(in1Arr), in1);

		glu::draw(m_context.getRenderContext(), prog, (int)vertexArrays.size(), &vertexArrays[0],
				  glu::pr::Triangles(DE_LENGTH_OF_ARRAY(indices), &indices[0]));
		gl.readPixels(0, 0, FRAMEBUFFER_WIDTH, FRAMEBUFFER_HEIGHT, GL_RGBA_INTEGER, GL_INT, &pixels[0]);
		GLU_EXPECT_NO_ERROR(gl.getError(), "After render");

		// Compare pixels.
		for (int y = 0; y < FRAMEBUFFER_HEIGHT; y++)
		{
			for (int x = 0; x < FRAMEBUFFER_WIDTH; x++)
			{
				int			cmpOut		= pixels[(y*FRAMEBUFFER_WIDTH + x)*4];
				int			cmpMasked	= cmpOut & mask;

				if (cmpMasked != refMasked)
				{
					log << TestLog::Message << "Comparison failed (at " << x << ", " << y << "): "
											<< "got " << cmpOut << " / " << tcu::toHex(cmpOut)
						<< TestLog::EndMessage;
					m_testCtx.setTestResult(QP_TEST_RESULT_FAIL, "Fail");
					return STOP;
				}
			}
		}
	}

	gl.bindFramebuffer(GL_FRAMEBUFFER, m_context.getRenderContext().getDefaultFramebuffer());
	GLU_EXPECT_NO_ERROR(gl.getError(), "After iteration");

	m_iterNdx += 1;
	return (m_iterNdx < m_numIters) ? CONTINUE : STOP;
}

class ShaderUintPrecisionCase : public TestCase
{
public:
	typedef deUint32			(*EvalFunc)					(deUint32 a, deUint32 b);

								ShaderUintPrecisionCase		(Context& context, const char* name, const char* desc, const char* op, EvalFunc evalFunc, glu::Precision precision, int bits, const tcu::UVec2& rangeA, const tcu::UVec2& rangeB, bool isVertexCase);
								~ShaderUintPrecisionCase	(void);

	void						init						(void);
	void						deinit						(void);
	IterateResult				iterate						(void);

private:
								ShaderUintPrecisionCase		(const ShaderUintPrecisionCase& other);
	ShaderUintPrecisionCase&	operator=					(const ShaderUintPrecisionCase& other);

	// Case parameters.
	std::string					m_op;
	EvalFunc					m_evalFunc;
	glu::Precision				m_precision;
	int							m_bits;
	tcu::UVec2					m_rangeA;
	tcu::UVec2					m_rangeB;
	bool						m_isVertexCase;

	int							m_numTestsPerIter;
	int							m_numIters;
	de::Random					m_rnd;

	// Iteration state.
	glu::ShaderProgram*			m_program;
	deUint32					m_framebuffer;
	deUint32					m_renderbuffer;
	int							m_iterNdx;
};

ShaderUintPrecisionCase::ShaderUintPrecisionCase (Context& context, const char* name, const char* desc, const char* op, EvalFunc evalFunc, glu::Precision precision, int bits, const tcu::UVec2& rangeA, const tcu::UVec2& rangeB, bool isVertexCase)
	: TestCase			(context, name, desc)
	, m_op				(op)
	, m_evalFunc		(evalFunc)
	, m_precision		(precision)
	, m_bits			(bits)
	, m_rangeA			(rangeA)
	, m_rangeB			(rangeB)
	, m_isVertexCase	(isVertexCase)
	, m_numTestsPerIter	(32)
	, m_numIters		(4)
	, m_rnd				(deStringHash(name))
	, m_program			(DE_NULL)
	, m_framebuffer		(0)
	, m_renderbuffer	(0)
	, m_iterNdx			(0)
{
}

ShaderUintPrecisionCase::~ShaderUintPrecisionCase (void)
{
	ShaderUintPrecisionCase::deinit();
}

void ShaderUintPrecisionCase::init (void)
{
	const glw::Functions&	gl	= m_context.getRenderContext().getFunctions();
	TestLog&				log	= m_testCtx.getLog();

	DE_ASSERT(!m_program && !m_framebuffer && !m_renderbuffer);

	// Create program.
	m_program = createIntUintPrecisionEvalProgram(m_context.getRenderContext(), glu::TYPE_UINT, m_precision, m_op.c_str(), m_isVertexCase);
	log << *m_program;

	TCU_CHECK(m_program->isOk());

	// Create framebuffer.
	gl.genFramebuffers(1, &m_framebuffer);
	gl.genRenderbuffers(1, &m_renderbuffer);

	gl.bindRenderbuffer(GL_RENDERBUFFER, m_renderbuffer);
	gl.renderbufferStorage(GL_RENDERBUFFER, GL_R32UI, FRAMEBUFFER_WIDTH, FRAMEBUFFER_HEIGHT);

	gl.bindFramebuffer(GL_FRAMEBUFFER, m_framebuffer);
	gl.framebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, m_renderbuffer);

	GLU_EXPECT_NO_ERROR(gl.getError(), "Post framebuffer setup");
	TCU_CHECK(gl.checkFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE);

	gl.bindFramebuffer(GL_FRAMEBUFFER, m_context.getRenderContext().getDefaultFramebuffer());

	// Initialize test result to pass.
	m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
	m_iterNdx = 0;

	log << TestLog::Message << "Number of accurate bits assumed = " << m_bits << TestLog::EndMessage;
}

void ShaderUintPrecisionCase::deinit (void)
{
	delete m_program;

	if (m_framebuffer)
		m_context.getRenderContext().getFunctions().deleteFramebuffers(1, &m_framebuffer);

	if (m_renderbuffer)
		m_context.getRenderContext().getFunctions().deleteRenderbuffers(1, &m_renderbuffer);

	m_program		= DE_NULL;
	m_framebuffer	= 0;
	m_renderbuffer	= 0;
}

ShaderUintPrecisionCase::IterateResult ShaderUintPrecisionCase::iterate (void)
{
	// Constant data.
	const float position[] =
	{
		-1.0f, -1.0f, 0.0f, 1.0f,
		-1.0f,  1.0f, 0.0f, 1.0f,
		 1.0f, -1.0f, 0.0f, 1.0f,
		 1.0f,  1.0f, 0.0f, 1.0f
	};
	const deUint16					indices[]	= { 0, 1, 2, 2, 1, 3 };

	const int						numVertices	= 4;
	deUint32						in0Arr[4]	= { 0 };
	deUint32						in1Arr[4]	= { 0 };

	TestLog&						log			= m_testCtx.getLog();
	const glw::Functions&			gl			= m_context.getRenderContext().getFunctions();
	deUint32						mask		= m_bits == 32 ? 0xffffffffu : ((1u<<m_bits)-1);
	vector<deUint32>				pixels		(FRAMEBUFFER_WIDTH*FRAMEBUFFER_HEIGHT*4);
	vector<glu::VertexArrayBinding>	vertexArrays;

	deUint32						prog		= m_program->getProgram();

	// \todo [2012-05-03 pyry] A bit hacky.
	bool							isMaxRangeA	= m_rangeA.x() == 0 && m_rangeA.y() == 0xffffffff;
	bool							isMaxRangeB	= m_rangeB.x() == 0 && m_rangeB.y() == 0xffffffff;

	gl.useProgram(prog);
	gl.bindFramebuffer(GL_FRAMEBUFFER, m_framebuffer);

	vertexArrays.push_back(glu::va::Float("a_position", 4, numVertices, 0, &position[0]));
	vertexArrays.push_back(glu::va::Uint32("a_in0", 1, numVertices, 0, &in0Arr[0]));
	vertexArrays.push_back(glu::va::Uint32("a_in1", 1, numVertices, 0, &in1Arr[0]));

	GLU_EXPECT_NO_ERROR(gl.getError(), "After program setup");

	// Compute values and reference.
	for (int testNdx = 0; testNdx < m_numTestsPerIter; testNdx++)
	{
		deUint32	in0		= (isMaxRangeA ? m_rnd.getUint32() : (m_rangeA.x() + m_rnd.getUint32()%(m_rangeA.y()-m_rangeA.x()+1))) & mask;
		deUint32	in1		= (isMaxRangeB ? m_rnd.getUint32() : (m_rangeB.x() + m_rnd.getUint32()%(m_rangeB.y()-m_rangeB.x()+1))) & mask;
		deUint32	refOut	= m_evalFunc(in0, in1) & mask;

		log << TestLog::Message << "iter " << m_iterNdx << ", test " << testNdx << ": "
								<< "in0 = " << tcu::toHex(in0) << ", in1 = " << tcu::toHex(in1) << ", ref out = " << tcu::toHex(refOut)
			<< TestLog::EndMessage;

		std::fill(&in0Arr[0], &in0Arr[0] + DE_LENGTH_OF_ARRAY(in0Arr), in0);
		std::fill(&in1Arr[0], &in1Arr[0] + DE_LENGTH_OF_ARRAY(in1Arr), in1);

		glu::draw(m_context.getRenderContext(), prog, (int)vertexArrays.size(), &vertexArrays[0],
				  glu::pr::Triangles(DE_LENGTH_OF_ARRAY(indices), &indices[0]));
		gl.readPixels(0, 0, FRAMEBUFFER_WIDTH, FRAMEBUFFER_HEIGHT, GL_RGBA_INTEGER, GL_UNSIGNED_INT, &pixels[0]);
		GLU_EXPECT_NO_ERROR(gl.getError(), "After render");

		// Compare pixels.
		for (int y = 0; y < FRAMEBUFFER_HEIGHT; y++)
		{
			for (int x = 0; x < FRAMEBUFFER_WIDTH; x++)
			{
				deUint32	cmpOut		= pixels[(y*FRAMEBUFFER_WIDTH + x)*4];
				deUint32	cmpMasked	= cmpOut & mask;

				if (cmpMasked != refOut)
				{
					log << TestLog::Message << "Comparison failed (at " << x << ", " << y << "): "
											<< "got " << tcu::toHex(cmpOut)
						<< TestLog::EndMessage;
					m_testCtx.setTestResult(QP_TEST_RESULT_FAIL, "Fail");
					return STOP;
				}
			}
		}
	}

	gl.bindFramebuffer(GL_FRAMEBUFFER, 0);
	GLU_EXPECT_NO_ERROR(gl.getError(), "After iteration");

	m_iterNdx += 1;
	return (m_iterNdx < m_numIters) ? CONTINUE : STOP;
}

ShaderPrecisionTests::ShaderPrecisionTests (Context& context)
	: TestCaseGroup(context, "precision", "Shader precision requirements validation tests")
{
}

ShaderPrecisionTests::~ShaderPrecisionTests (void)
{
}

void ShaderPrecisionTests::init (void)
{
	using tcu::add;
	using tcu::sub;
	using tcu::mul;
	using tcu::div;
	using tcu::Vec2;
	using tcu::IVec2;
	using tcu::UVec2;

	// Exp = Emax-2, Mantissa = 0
	float		minF32			= tcu::Float32((1u<<31) | (0xfdu<<23) | 0x0u).asFloat();
	float		maxF32			= tcu::Float32((0u<<31) | (0xfdu<<23) | 0x0u).asFloat();
	float		minF16			= tcu::Float16((deUint16)((1u<<15) | (0x1du<<10) | 0x0u)).asFloat();
	float		maxF16			= tcu::Float16((deUint16)((0u<<15) | (0x1du<<10) | 0x0u)).asFloat();
	tcu::Vec2	fullRange32F	(minF32, maxF32);
	tcu::Vec2	fullRange16F	(minF16, maxF16);
	tcu::IVec2	fullRange32I	(0x80000000, 0x7fffffff);
	tcu::IVec2	fullRange16I	(-(1<<15), (1<<15)-1);
	tcu::IVec2	fullRange8I		(-(1<<7), (1<<7)-1);
	tcu::UVec2	fullRange32U	(0u, 0xffffffffu);
	tcu::UVec2	fullRange16U	(0u, 0xffffu);
	tcu::UVec2	fullRange8U		(0u, 0xffu);

	// \note Right now it is not programmatically verified that the results shouldn't end up being inf/nan but
	//       actual values used are ok.

	static const struct
	{
		const char*							name;
		const char*							op;
		ShaderFloatPrecisionCase::EvalFunc	evalFunc;
		glu::Precision						precision;
		tcu::Vec2							rangeA;
		tcu::Vec2							rangeB;
	} floatCases[] =
	{
		// Name				Op				Eval			Precision				RangeA				RangeB
		{ "highp_add",		"in0 + in1",	add<double>,	glu::PRECISION_HIGHP,	fullRange32F,		fullRange32F		},
		{ "highp_sub",		"in0 - in1",	sub<double>,	glu::PRECISION_HIGHP,	fullRange32F,		fullRange32F		},
		{ "highp_mul",		"in0 * in1",	mul<double>,	glu::PRECISION_HIGHP,	Vec2(-1e5f, 1e5f),	Vec2(-1e5f, 1e5f)	},
		{ "highp_div",		"in0 / in1",	div<double>,	glu::PRECISION_HIGHP,	Vec2(-1e5f, 1e5f),	Vec2(-1e5f, 1e5f)	},
		{ "mediump_add",	"in0 + in1",	add<double>,	glu::PRECISION_MEDIUMP,	fullRange16F,		fullRange16F		},
		{ "mediump_sub",	"in0 - in1",	sub<double>,	glu::PRECISION_MEDIUMP,	fullRange16F,		fullRange16F		},
		{ "mediump_mul",	"in0 * in1",	mul<double>,	glu::PRECISION_MEDIUMP,	Vec2(-1e2f, 1e2f),	Vec2(-1e2f, 1e2f)	},
		{ "mediump_div",	"in0 / in1",	div<double>,	glu::PRECISION_MEDIUMP,	Vec2(-1e2f, 1e2f),	Vec2(-1e2f, 1e2f)	}
	};

	static const struct
	{
		const char*							name;
		const char*							op;
		ShaderIntPrecisionCase::EvalFunc	evalFunc;
		glu::Precision						precision;
		int									bits;
		tcu::IVec2							rangeA;
		tcu::IVec2							rangeB;
	} intCases[] =
	{
		// Name				Op				Eval				Precision				Bits	RangeA			RangeB
		{ "highp_add",		"in0 + in1",	add<int>,			glu::PRECISION_HIGHP,	32,		fullRange32I,	fullRange32I },
		{ "highp_sub",		"in0 - in1",	sub<int>,			glu::PRECISION_HIGHP,	32,		fullRange32I,	fullRange32I },
		{ "highp_mul",		"in0 * in1",	mul<int>,			glu::PRECISION_HIGHP,	32,		fullRange32I,	fullRange32I },
		{ "highp_div",		"in0 / in1",	div<int>,			glu::PRECISION_HIGHP,	32,		fullRange32I,	IVec2(-10000, -1) },
		{ "mediump_add",	"in0 + in1",	add<int>,			glu::PRECISION_MEDIUMP,	16,		fullRange16I,	fullRange16I },
		{ "mediump_sub",	"in0 - in1",	sub<int>,			glu::PRECISION_MEDIUMP,	16,		fullRange16I,	fullRange16I },
		{ "mediump_mul",	"in0 * in1",	mul<int>,			glu::PRECISION_MEDIUMP,	16,		fullRange16I,	fullRange16I },
		{ "mediump_div",	"in0 / in1",	div<int>,			glu::PRECISION_MEDIUMP,	16,		fullRange16I,	IVec2(1, 1000) },
		{ "lowp_add",		"in0 + in1",	add<int>,			glu::PRECISION_LOWP,	8,		fullRange8I,	fullRange8I },
		{ "lowp_sub",		"in0 - in1",	sub<int>,			glu::PRECISION_LOWP,	8,		fullRange8I,	fullRange8I },
		{ "lowp_mul",		"in0 * in1",	mul<int>,			glu::PRECISION_LOWP,	8,		fullRange8I,	fullRange8I },
		{ "lowp_div",		"in0 / in1",	div<int>,			glu::PRECISION_LOWP,	8,		fullRange8I,	IVec2(-50, -1) }
	};

	static const struct
	{
		const char*							name;
		const char*							op;
		ShaderUintPrecisionCase::EvalFunc	evalFunc;
		glu::Precision						precision;
		int									bits;
		tcu::UVec2							rangeA;
		tcu::UVec2							rangeB;
	} uintCases[] =
	{
		// Name				Op				Eval				Precision				Bits	RangeA			RangeB
		{ "highp_add",		"in0 + in1",	add<deUint32>,		glu::PRECISION_HIGHP,	32,		fullRange32U,	fullRange32U },
		{ "highp_sub",		"in0 - in1",	sub<deUint32>,		glu::PRECISION_HIGHP,	32,		fullRange32U,	fullRange32U },
		{ "highp_mul",		"in0 * in1",	mul<deUint32>,		glu::PRECISION_HIGHP,	32,		fullRange32U,	fullRange32U },
		{ "highp_div",		"in0 / in1",	div<deUint32>,		glu::PRECISION_HIGHP,	32,		fullRange32U,	UVec2(1u, 10000u) },
		{ "mediump_add",	"in0 + in1",	add<deUint32>,		glu::PRECISION_MEDIUMP,	16,		fullRange16U,	fullRange16U },
		{ "mediump_sub",	"in0 - in1",	sub<deUint32>,		glu::PRECISION_MEDIUMP,	16,		fullRange16U,	fullRange16U },
		{ "mediump_mul",	"in0 * in1",	mul<deUint32>,		glu::PRECISION_MEDIUMP,	16,		fullRange16U,	fullRange16U },
		{ "mediump_div",	"in0 / in1",	div<deUint32>,		glu::PRECISION_MEDIUMP,	16,		fullRange16U,	UVec2(1, 1000u) },
		{ "lowp_add",		"in0 + in1",	add<deUint32>,		glu::PRECISION_LOWP,	8,		fullRange8U,	fullRange8U },
		{ "lowp_sub",		"in0 - in1",	sub<deUint32>,		glu::PRECISION_LOWP,	8,		fullRange8U,	fullRange8U },
		{ "lowp_mul",		"in0 * in1",	mul<deUint32>,		glu::PRECISION_LOWP,	8,		fullRange8U,	fullRange8U },
		{ "lowp_div",		"in0 / in1",	div<deUint32>,		glu::PRECISION_LOWP,	8,		fullRange8U,	UVec2(1, 50u) }
	};

	tcu::TestCaseGroup* floatGroup = new tcu::TestCaseGroup(m_testCtx, "float", "Floating-point precision tests");
	addChild(floatGroup);
	for (int ndx = 0; ndx < DE_LENGTH_OF_ARRAY(floatCases); ndx++)
	{
		floatGroup->addChild(new ShaderFloatPrecisionCase(m_context,
														  (string(floatCases[ndx].name) + "_vertex").c_str(), "",
														  floatCases[ndx].op,
														  floatCases[ndx].evalFunc,
														  floatCases[ndx].precision,
														  floatCases[ndx].rangeA,
														  floatCases[ndx].rangeB,
														  true));
		floatGroup->addChild(new ShaderFloatPrecisionCase(m_context,
														  (string(floatCases[ndx].name) + "_fragment").c_str(), "",
														  floatCases[ndx].op,
														  floatCases[ndx].evalFunc,
														  floatCases[ndx].precision,
														  floatCases[ndx].rangeA,
														  floatCases[ndx].rangeB,
														  false));
	}

	tcu::TestCaseGroup* intGroup = new tcu::TestCaseGroup(m_testCtx, "int", "Integer precision tests");
	addChild(intGroup);
	for (int ndx = 0; ndx < DE_LENGTH_OF_ARRAY(intCases); ndx++)
	{
		intGroup->addChild(new ShaderIntPrecisionCase(m_context,
													  (string(intCases[ndx].name) + "_vertex").c_str(), "",
													  intCases[ndx].op,
													  intCases[ndx].evalFunc,
													  intCases[ndx].precision,
													  intCases[ndx].bits,
													  intCases[ndx].rangeA,
													  intCases[ndx].rangeB,
													  true));
		intGroup->addChild(new ShaderIntPrecisionCase(m_context,
													  (string(intCases[ndx].name) + "_fragment").c_str(), "",
													  intCases[ndx].op,
													  intCases[ndx].evalFunc,
													  intCases[ndx].precision,
													  intCases[ndx].bits,
													  intCases[ndx].rangeA,
													  intCases[ndx].rangeB,
													  false));
	}

	tcu::TestCaseGroup* uintGroup = new tcu::TestCaseGroup(m_testCtx, "uint", "Unsigned integer precision tests");
	addChild(uintGroup);
	for (int ndx = 0; ndx < DE_LENGTH_OF_ARRAY(uintCases); ndx++)
	{
		uintGroup->addChild(new ShaderUintPrecisionCase(m_context,
														(string(uintCases[ndx].name) + "_vertex").c_str(), "",
														uintCases[ndx].op,
														uintCases[ndx].evalFunc,
														uintCases[ndx].precision,
														uintCases[ndx].bits,
														uintCases[ndx].rangeA,
														uintCases[ndx].rangeB,
														true));
		uintGroup->addChild(new ShaderUintPrecisionCase(m_context,
														(string(uintCases[ndx].name) + "_fragment").c_str(), "",
														uintCases[ndx].op,
														uintCases[ndx].evalFunc,
														uintCases[ndx].precision,
														uintCases[ndx].bits,
														uintCases[ndx].rangeA,
														uintCases[ndx].rangeB,
														false));
	}
}

} // Functional
} // gles3
} // deqp