// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "Renderer.hpp"

#include "Clipper.hpp"
#include "Surface.hpp"
#include "Primitive.hpp"
#include "Polygon.hpp"
#include "Main/FrameBuffer.hpp"
#include "Main/SwiftConfig.hpp"
#include "Reactor/Reactor.hpp"
#include "Shader/Constants.hpp"
#include "Common/MutexLock.hpp"
#include "Common/CPUID.hpp"
#include "Common/Memory.hpp"
#include "Common/Resource.hpp"
#include "Common/Half.hpp"
#include "Common/Math.hpp"
#include "Common/Timer.hpp"
#include "Common/Debug.hpp"

#undef max

bool disableServer = true;

#ifndef NDEBUG
unsigned int minPrimitives = 1;
unsigned int maxPrimitives = 1 << 21;
#endif

namespace sw
{
	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
	extern bool booleanFaceRegister;
	extern bool fullPixelPositionRegister;
	extern bool leadingVertexFirst;         // Flat shading uses first vertex, else last
	extern bool secondaryColor;             // Specular lighting is applied after texturing
	extern bool colorsDefaultToZero;

	extern bool forceWindowed;
	extern bool complementaryDepthBuffer;
	extern bool postBlendSRGB;
	extern bool exactColorRounding;
	extern TransparencyAntialiasing transparencyAntialiasing;
	extern bool forceClearRegisters;

	extern bool precacheVertex;
	extern bool precacheSetup;
	extern bool precachePixel;

	static const int batchSize = 128;
	AtomicInt threadCount(1);
	AtomicInt Renderer::unitCount(1);
	AtomicInt Renderer::clusterCount(1);

	TranscendentalPrecision logPrecision = ACCURATE;
	TranscendentalPrecision expPrecision = ACCURATE;
	TranscendentalPrecision rcpPrecision = ACCURATE;
	TranscendentalPrecision rsqPrecision = ACCURATE;
	bool perspectiveCorrection = true;

	static void setGlobalRenderingSettings(Conventions conventions, bool exactColorRounding)
	{
		static bool initialized = false;

		if(!initialized)
		{
			sw::halfIntegerCoordinates = conventions.halfIntegerCoordinates;
			sw::symmetricNormalizedDepth = conventions.symmetricNormalizedDepth;
			sw::booleanFaceRegister = conventions.booleanFaceRegister;
			sw::fullPixelPositionRegister = conventions.fullPixelPositionRegister;
			sw::leadingVertexFirst = conventions.leadingVertexFirst;
			sw::secondaryColor = conventions.secondaryColor;
			sw::colorsDefaultToZero = conventions.colorsDefaultToZero;
			sw::exactColorRounding = exactColorRounding;
			initialized = true;
		}
	}

	struct Parameters
	{
		Renderer *renderer;
		int threadIndex;
	};

	Query::Query(Type type) : building(false), data(0), type(type), reference(1)
	{
	}

	void Query::addRef()
	{
		++reference; // Atomic
	}

	void Query::release()
	{
		int ref = reference--; // Atomic

		ASSERT(ref >= 0);

		if(ref == 0)
		{
			delete this;
		}
	}

	DrawCall::DrawCall()
	{
		queries = 0;

		vsDirtyConstF = VERTEX_UNIFORM_VECTORS + 1;
		vsDirtyConstI = 16;
		vsDirtyConstB = 16;

		psDirtyConstF = FRAGMENT_UNIFORM_VECTORS;
		psDirtyConstI = 16;
		psDirtyConstB = 16;

		references = -1;

		data = (DrawData*)allocate(sizeof(DrawData));
		data->constants = &constants;
	}

	DrawCall::~DrawCall()
	{
		delete queries;

		deallocate(data);
	}

	Renderer::Renderer(Context *context, Conventions conventions, bool exactColorRounding) : VertexProcessor(context), PixelProcessor(context), SetupProcessor(context), context(context), viewport()
	{
		setGlobalRenderingSettings(conventions, exactColorRounding);

		setRenderTarget(0, 0);
		clipper = new Clipper(symmetricNormalizedDepth);
		blitter = new Blitter;

		updateViewMatrix = true;
		updateBaseMatrix = true;
		updateProjectionMatrix = true;
		updateClipPlanes = true;

		#if PERF_HUD
			resetTimers();
		#endif

		for(int i = 0; i < 16; i++)
		{
			vertexTask[i] = 0;

			worker[i] = 0;
			resume[i] = 0;
			suspend[i] = 0;
		}

		threadsAwake = 0;
		resumeApp = new Event();

		currentDraw = 0;
		nextDraw = 0;

		qHead = 0;
		qSize = 0;

		for(int i = 0; i < 16; i++)
		{
			triangleBatch[i] = 0;
			primitiveBatch[i] = 0;
		}

		for(int draw = 0; draw < DRAW_COUNT; draw++)
		{
			drawCall[draw] = new DrawCall();
			drawList[draw] = drawCall[draw];
		}

		for(int unit = 0; unit < 16; unit++)
		{
			primitiveProgress[unit].init();
		}

		for(int cluster = 0; cluster < 16; cluster++)
		{
			pixelProgress[cluster].init();
		}

		clipFlags = 0;

		swiftConfig = new SwiftConfig(disableServer);
		updateConfiguration(true);

		sync = new Resource(0);
	}

	Renderer::~Renderer()
	{
		sync->destruct();

		delete clipper;
		clipper = nullptr;

		delete blitter;
		blitter = nullptr;

		terminateThreads();
		delete resumeApp;

		for(int draw = 0; draw < DRAW_COUNT; draw++)
		{
			delete drawCall[draw];
		}

		delete swiftConfig;
	}

	// This object has to be mem aligned
	void* Renderer::operator new(size_t size)
	{
		ASSERT(size == sizeof(Renderer)); // This operator can't be called from a derived class
		return sw::allocate(sizeof(Renderer), 16);
	}

	void Renderer::operator delete(void * mem)
	{
		sw::deallocate(mem);
	}

	void Renderer::draw(DrawType drawType, unsigned int indexOffset, unsigned int count, bool update)
	{
		#ifndef NDEBUG
			if(count < minPrimitives || count > maxPrimitives)
			{
				return;
			}
		#endif

		context->drawType = drawType;

		updateConfiguration();
		updateClipper();

		int ss = context->getSuperSampleCount();
		int ms = context->getMultiSampleCount();
		bool requiresSync = false;

		for(int q = 0; q < ss; q++)
		{
			unsigned int oldMultiSampleMask = context->multiSampleMask;
			context->multiSampleMask = (context->sampleMask >> (ms * q)) & ((unsigned)0xFFFFFFFF >> (32 - ms));

			if(!context->multiSampleMask)
			{
				continue;
			}

			sync->lock(sw::PRIVATE);

			if(update || oldMultiSampleMask != context->multiSampleMask)
			{
				vertexState = VertexProcessor::update(drawType);
				setupState = SetupProcessor::update();
				pixelState = PixelProcessor::update();

				vertexRoutine = VertexProcessor::routine(vertexState);
				setupRoutine = SetupProcessor::routine(setupState);
				pixelRoutine = PixelProcessor::routine(pixelState);
			}

			int batch = batchSize / ms;

			int (Renderer::*setupPrimitives)(int batch, int count);

			if(context->isDrawTriangle())
			{
				switch(context->fillMode)
				{
				case FILL_SOLID:
					setupPrimitives = &Renderer::setupSolidTriangles;
					break;
				case FILL_WIREFRAME:
					setupPrimitives = &Renderer::setupWireframeTriangle;
					batch = 1;
					break;
				case FILL_VERTEX:
					setupPrimitives = &Renderer::setupVertexTriangle;
					batch = 1;
					break;
				default:
					ASSERT(false);
					return;
				}
			}
			else if(context->isDrawLine())
			{
				setupPrimitives = &Renderer::setupLines;
			}
			else   // Point draw
			{
				setupPrimitives = &Renderer::setupPoints;
			}

			DrawCall *draw = nullptr;

			do
			{
				for(int i = 0; i < DRAW_COUNT; i++)
				{
					if(drawCall[i]->references == -1)
					{
						draw = drawCall[i];
						drawList[nextDraw & DRAW_COUNT_BITS] = draw;

						break;
					}
				}

				if(!draw)
				{
					resumeApp->wait();
				}
			}
			while(!draw);

			DrawData *data = draw->data;

			if(queries.size() != 0)
			{
				draw->queries = new std::list<Query*>();
				bool includePrimitivesWrittenQueries = vertexState.transformFeedbackQueryEnabled && vertexState.transformFeedbackEnabled;
				for(auto &query : queries)
				{
					if(includePrimitivesWrittenQueries || (query->type != Query::TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN))
					{
						query->addRef();
						draw->queries->push_back(query);
					}
				}
			}

			draw->drawType = drawType;
			draw->batchSize = batch;

			vertexRoutine->bind();
			setupRoutine->bind();
			pixelRoutine->bind();

			draw->vertexRoutine = vertexRoutine;
			draw->setupRoutine = setupRoutine;
			draw->pixelRoutine = pixelRoutine;
			draw->vertexPointer = (VertexProcessor::RoutinePointer)vertexRoutine->getEntry();
			draw->setupPointer = (SetupProcessor::RoutinePointer)setupRoutine->getEntry();
			draw->pixelPointer = (PixelProcessor::RoutinePointer)pixelRoutine->getEntry();
			draw->setupPrimitives = setupPrimitives;
			draw->setupState = setupState;

			for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
			{
				draw->vertexStream[i] = context->input[i].resource;
				data->input[i] = context->input[i].buffer;
				data->stride[i] = context->input[i].stride;

				if(draw->vertexStream[i])
				{
					draw->vertexStream[i]->lock(PUBLIC, PRIVATE);
				}
			}

			if(context->indexBuffer)
			{
				data->indices = (unsigned char*)context->indexBuffer->lock(PUBLIC, PRIVATE) + indexOffset;
			}

			draw->indexBuffer = context->indexBuffer;

			for(int sampler = 0; sampler < TOTAL_IMAGE_UNITS; sampler++)
			{
				draw->texture[sampler] = 0;
			}

			for(int sampler = 0; sampler < TEXTURE_IMAGE_UNITS; sampler++)
			{
				if(pixelState.sampler[sampler].textureType != TEXTURE_NULL)
				{
					draw->texture[sampler] = context->texture[sampler];
					draw->texture[sampler]->lock(PUBLIC, isReadWriteTexture(sampler) ? MANAGED : PRIVATE);   // If the texure is both read and written, use the same read/write lock as render targets

					data->mipmap[sampler] = context->sampler[sampler].getTextureData();

					requiresSync |= context->sampler[sampler].requiresSync();
				}
			}

			if(context->pixelShader)
			{
				if(draw->psDirtyConstF)
				{
					memcpy(&data->ps.cW, PixelProcessor::cW, sizeof(word4) * 4 * (draw->psDirtyConstF < 8 ? draw->psDirtyConstF : 8));
					memcpy(&data->ps.c, PixelProcessor::c, sizeof(float4) * draw->psDirtyConstF);
					draw->psDirtyConstF = 0;
				}

				if(draw->psDirtyConstI)
				{
					memcpy(&data->ps.i, PixelProcessor::i, sizeof(int4) * draw->psDirtyConstI);
					draw->psDirtyConstI = 0;
				}

				if(draw->psDirtyConstB)
				{
					memcpy(&data->ps.b, PixelProcessor::b, sizeof(bool) * draw->psDirtyConstB);
					draw->psDirtyConstB = 0;
				}

				PixelProcessor::lockUniformBuffers(data->ps.u, draw->pUniformBuffers);
			}
			else
			{
				for(int i = 0; i < MAX_UNIFORM_BUFFER_BINDINGS; i++)
				{
					draw->pUniformBuffers[i] = nullptr;
				}
			}

			if(context->pixelShaderModel() <= 0x0104)
			{
				for(int stage = 0; stage < 8; stage++)
				{
					if(pixelState.textureStage[stage].stageOperation != TextureStage::STAGE_DISABLE || context->pixelShader)
					{
						data->textureStage[stage] = context->textureStage[stage].uniforms;
					}
					else break;
				}
			}

			if(context->vertexShader)
			{
				if(context->vertexShader->getShaderModel() >= 0x0300)
				{
					for(int sampler = 0; sampler < VERTEX_TEXTURE_IMAGE_UNITS; sampler++)
					{
						if(vertexState.sampler[sampler].textureType != TEXTURE_NULL)
						{
							draw->texture[TEXTURE_IMAGE_UNITS + sampler] = context->texture[TEXTURE_IMAGE_UNITS + sampler];
							draw->texture[TEXTURE_IMAGE_UNITS + sampler]->lock(PUBLIC, PRIVATE);

							data->mipmap[TEXTURE_IMAGE_UNITS + sampler] = context->sampler[TEXTURE_IMAGE_UNITS + sampler].getTextureData();

							requiresSync |= context->sampler[TEXTURE_IMAGE_UNITS + sampler].requiresSync();
						}
					}
				}

				if(draw->vsDirtyConstF)
				{
					memcpy(&data->vs.c, VertexProcessor::c, sizeof(float4) * draw->vsDirtyConstF);
					draw->vsDirtyConstF = 0;
				}

				if(draw->vsDirtyConstI)
				{
					memcpy(&data->vs.i, VertexProcessor::i, sizeof(int4) * draw->vsDirtyConstI);
					draw->vsDirtyConstI = 0;
				}

				if(draw->vsDirtyConstB)
				{
					memcpy(&data->vs.b, VertexProcessor::b, sizeof(bool) * draw->vsDirtyConstB);
					draw->vsDirtyConstB = 0;
				}

				if(context->vertexShader->isInstanceIdDeclared())
				{
					data->instanceID = context->instanceID;
				}

				VertexProcessor::lockUniformBuffers(data->vs.u, draw->vUniformBuffers);
				VertexProcessor::lockTransformFeedbackBuffers(data->vs.t, data->vs.reg, data->vs.row, data->vs.col, data->vs.str, draw->transformFeedbackBuffers);
			}
			else
			{
				data->ff = ff;

				draw->vsDirtyConstF = VERTEX_UNIFORM_VECTORS + 1;
				draw->vsDirtyConstI = 16;
				draw->vsDirtyConstB = 16;

				for(int i = 0; i < MAX_UNIFORM_BUFFER_BINDINGS; i++)
				{
					draw->vUniformBuffers[i] = nullptr;
				}

				for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
				{
					draw->transformFeedbackBuffers[i] = nullptr;
				}
			}

			if(pixelState.stencilActive)
			{
				data->stencil[0] = stencil;
				data->stencil[1] = stencilCCW;
			}

			if(pixelState.fogActive)
			{
				data->fog = fog;
			}

			if(setupState.isDrawPoint)
			{
				data->point = point;
			}

			data->lineWidth = context->lineWidth;

			data->factor = factor;

			if(pixelState.transparencyAntialiasing == TRANSPARENCY_ALPHA_TO_COVERAGE)
			{
				float ref = context->alphaReference * (1.0f / 255.0f);
				float margin = sw::min(ref, 1.0f - ref);

				if(ms == 4)
				{
					data->a2c0 = replicate(ref - margin * 0.6f);
					data->a2c1 = replicate(ref - margin * 0.2f);
					data->a2c2 = replicate(ref + margin * 0.2f);
					data->a2c3 = replicate(ref + margin * 0.6f);
				}
				else if(ms == 2)
				{
					data->a2c0 = replicate(ref - margin * 0.3f);
					data->a2c1 = replicate(ref + margin * 0.3f);
				}
				else ASSERT(false);
			}

			if(pixelState.occlusionEnabled)
			{
				for(int cluster = 0; cluster < clusterCount; cluster++)
				{
					data->occlusion[cluster] = 0;
				}
			}

			#if PERF_PROFILE
				for(int cluster = 0; cluster < clusterCount; cluster++)
				{
					for(int i = 0; i < PERF_TIMERS; i++)
					{
						data->cycles[i][cluster] = 0;
					}
				}
			#endif

			// Viewport
			{
				float W = 0.5f * viewport.width;
				float H = 0.5f * viewport.height;
				float X0 = viewport.x0 + W;
				float Y0 = viewport.y0 + H;
				float N = viewport.minZ;
				float F = viewport.maxZ;
				float Z = F - N;

				if(context->isDrawTriangle(false))
				{
					N += context->depthBias;
				}

				if(complementaryDepthBuffer)
				{
					Z = -Z;
					N = 1 - N;
				}

				static const float X[5][16] =   // Fragment offsets
				{
					{+0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 1 sample
					{-0.2500f, +0.2500f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 2 samples
					{-0.3000f, +0.1000f, +0.3000f, -0.1000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 4 samples
					{+0.1875f, -0.3125f, +0.3125f, -0.4375f, -0.0625f, +0.4375f, +0.0625f, -0.1875f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 8 samples
					{+0.2553f, -0.1155f, +0.1661f, -0.1828f, +0.2293f, -0.4132f, -0.1773f, -0.0577f, +0.3891f, -0.4656f, +0.4103f, +0.4248f, -0.2109f, +0.3966f, -0.2664f, -0.3872f}    // 16 samples
				};

				static const float Y[5][16] =   // Fragment offsets
				{
					{+0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 1 sample
					{-0.2500f, +0.2500f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 2 samples
					{-0.1000f, -0.3000f, +0.1000f, +0.3000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 4 samples
					{-0.4375f, -0.3125f, -0.1875f, -0.0625f, +0.0625f, +0.1875f, +0.3125f, +0.4375f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 8 samples
					{-0.4503f, +0.1883f, +0.3684f, -0.4668f, -0.0690f, -0.1315f, +0.4999f, +0.0728f, +0.1070f, -0.3086f, +0.3725f, -0.1547f, -0.1102f, -0.3588f, +0.1789f, +0.0269f}    // 16 samples
				};

				int s = sw::log2(ss);

				data->Wx16 = replicate(W * 16);
				data->Hx16 = replicate(H * 16);
				data->X0x16 = replicate(X0 * 16 - 8);
				data->Y0x16 = replicate(Y0 * 16 - 8);
				data->XXXX = replicate(X[s][q] / W);
				data->YYYY = replicate(Y[s][q] / H);
				data->halfPixelX = replicate(0.5f / W);
				data->halfPixelY = replicate(0.5f / H);
				data->viewportHeight = abs(viewport.height);
				data->slopeDepthBias = context->slopeDepthBias;
				data->depthRange = Z;
				data->depthNear = N;
				draw->clipFlags = clipFlags;

				if(clipFlags)
				{
					if(clipFlags & Clipper::CLIP_PLANE0) data->clipPlane[0] = clipPlane[0];
					if(clipFlags & Clipper::CLIP_PLANE1) data->clipPlane[1] = clipPlane[1];
					if(clipFlags & Clipper::CLIP_PLANE2) data->clipPlane[2] = clipPlane[2];
					if(clipFlags & Clipper::CLIP_PLANE3) data->clipPlane[3] = clipPlane[3];
					if(clipFlags & Clipper::CLIP_PLANE4) data->clipPlane[4] = clipPlane[4];
					if(clipFlags & Clipper::CLIP_PLANE5) data->clipPlane[5] = clipPlane[5];
				}
			}

			// Target
			{
				for(int index = 0; index < RENDERTARGETS; index++)
				{
					draw->renderTarget[index] = context->renderTarget[index];

					if(draw->renderTarget[index])
					{
						unsigned int layer = context->renderTargetLayer[index];
						requiresSync |= context->renderTarget[index]->requiresSync();
						data->colorBuffer[index] = (unsigned int*)context->renderTarget[index]->lockInternal(0, 0, layer, LOCK_READWRITE, MANAGED);
						data->colorBuffer[index] += q * ms * context->renderTarget[index]->getSliceB(true);
						data->colorPitchB[index] = context->renderTarget[index]->getInternalPitchB();
						data->colorSliceB[index] = context->renderTarget[index]->getInternalSliceB();
					}
				}

				draw->depthBuffer = context->depthBuffer;
				draw->stencilBuffer = context->stencilBuffer;

				if(draw->depthBuffer)
				{
					unsigned int layer = context->depthBufferLayer;
					requiresSync |= context->depthBuffer->requiresSync();
					data->depthBuffer = (float*)context->depthBuffer->lockInternal(0, 0, layer, LOCK_READWRITE, MANAGED);
					data->depthBuffer += q * ms * context->depthBuffer->getSliceB(true);
					data->depthPitchB = context->depthBuffer->getInternalPitchB();
					data->depthSliceB = context->depthBuffer->getInternalSliceB();
				}

				if(draw->stencilBuffer)
				{
					unsigned int layer = context->stencilBufferLayer;
					requiresSync |= context->stencilBuffer->requiresSync();
					data->stencilBuffer = (unsigned char*)context->stencilBuffer->lockStencil(0, 0, layer, MANAGED);
					data->stencilBuffer += q * ms * context->stencilBuffer->getSliceB(true);
					data->stencilPitchB = context->stencilBuffer->getStencilPitchB();
					data->stencilSliceB = context->stencilBuffer->getStencilSliceB();
				}
			}

			// Scissor
			{
				data->scissorX0 = scissor.x0;
				data->scissorX1 = scissor.x1;
				data->scissorY0 = scissor.y0;
				data->scissorY1 = scissor.y1;
			}

			draw->primitive = 0;
			draw->count = count;

			draw->references = (count + batch - 1) / batch;

			schedulerMutex.lock();
			++nextDraw; // Atomic
			schedulerMutex.unlock();

			#ifndef NDEBUG
			if(threadCount == 1)   // Use main thread for draw execution
			{
				threadsAwake = 1;
				task[0].type = Task::RESUME;

				taskLoop(0);
			}
			else
			#endif
			{
				if(!threadsAwake)
				{
					suspend[0]->wait();

					threadsAwake = 1;
					task[0].type = Task::RESUME;

					resume[0]->signal();
				}
			}
		}

		// TODO(sugoi): This is a temporary brute-force workaround to ensure IOSurface synchronization.
		if(requiresSync)
		{
			synchronize();
		}
	}

	void Renderer::clear(void *value, Format format, Surface *dest, const Rect &clearRect, unsigned int rgbaMask)
	{
		blitter->clear(value, format, dest, clearRect, rgbaMask);
	}

	void Renderer::blit(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil, bool sRGBconversion)
	{
		blitter->blit(source, sRect, dest, dRect, {filter, isStencil, sRGBconversion});
	}

	void Renderer::blit3D(Surface *source, Surface *dest)
	{
		blitter->blit3D(source, dest);
	}

	void Renderer::threadFunction(void *parameters)
	{
		Renderer *renderer = static_cast<Parameters*>(parameters)->renderer;
		int threadIndex = static_cast<Parameters*>(parameters)->threadIndex;

		if(logPrecision < IEEE)
		{
			CPUID::setFlushToZero(true);
			CPUID::setDenormalsAreZero(true);
		}

		renderer->threadLoop(threadIndex);
	}

	void Renderer::threadLoop(int threadIndex)
	{
		while(!exitThreads)
		{
			taskLoop(threadIndex);

			suspend[threadIndex]->signal();
			resume[threadIndex]->wait();
		}
	}

	void Renderer::taskLoop(int threadIndex)
	{
		while(task[threadIndex].type != Task::SUSPEND)
		{
			scheduleTask(threadIndex);
			executeTask(threadIndex);
		}
	}

	void Renderer::findAvailableTasks()
	{
		// Find pixel tasks
		for(int cluster = 0; cluster < clusterCount; cluster++)
		{
			if(!pixelProgress[cluster].executing)
			{
				for(int unit = 0; unit < unitCount; unit++)
				{
					if(primitiveProgress[unit].references > 0)   // Contains processed primitives
					{
						if(pixelProgress[cluster].drawCall == primitiveProgress[unit].drawCall)
						{
							if(pixelProgress[cluster].processedPrimitives == primitiveProgress[unit].firstPrimitive)   // Previous primitives have been rendered
							{
								Task &task = taskQueue[qHead];
								task.type = Task::PIXELS;
								task.primitiveUnit = unit;
								task.pixelCluster = cluster;

								pixelProgress[cluster].executing = true;

								// Commit to the task queue
								qHead = (qHead + 1) & TASK_COUNT_BITS;
								qSize++;

								break;
							}
						}
					}
				}
			}
		}

		// Find primitive tasks
		if(currentDraw == nextDraw)
		{
			return;   // No more primitives to process
		}

		for(int unit = 0; unit < unitCount; unit++)
		{
			DrawCall *draw = drawList[currentDraw & DRAW_COUNT_BITS];

			int primitive = draw->primitive;
			int count = draw->count;

			if(primitive >= count)
			{
				++currentDraw; // Atomic

				if(currentDraw == nextDraw)
				{
					return;   // No more primitives to process
				}

				draw = drawList[currentDraw & DRAW_COUNT_BITS];
			}

			if(!primitiveProgress[unit].references)   // Task not already being executed and not still in use by a pixel unit
			{
				primitive = draw->primitive;
				count = draw->count;
				int batch = draw->batchSize;

				primitiveProgress[unit].drawCall = currentDraw;
				primitiveProgress[unit].firstPrimitive = primitive;
				primitiveProgress[unit].primitiveCount = count - primitive >= batch ? batch : count - primitive;

				draw->primitive += batch;

				Task &task = taskQueue[qHead];
				task.type = Task::PRIMITIVES;
				task.primitiveUnit = unit;

				primitiveProgress[unit].references = -1;

				// Commit to the task queue
				qHead = (qHead + 1) & TASK_COUNT_BITS;
				qSize++;
			}
		}
	}

	void Renderer::scheduleTask(int threadIndex)
	{
		schedulerMutex.lock();

		int curThreadsAwake = threadsAwake;

		if((int)qSize < threadCount - curThreadsAwake + 1)
		{
			findAvailableTasks();
		}

		if(qSize != 0)
		{
			task[threadIndex] = taskQueue[(qHead - qSize) & TASK_COUNT_BITS];
			qSize--;

			if(curThreadsAwake != threadCount)
			{
				int wakeup = qSize - curThreadsAwake + 1;

				for(int i = 0; i < threadCount && wakeup > 0; i++)
				{
					if(task[i].type == Task::SUSPEND)
					{
						suspend[i]->wait();
						task[i].type = Task::RESUME;
						resume[i]->signal();

						++threadsAwake; // Atomic
						wakeup--;
					}
				}
			}
		}
		else
		{
			task[threadIndex].type = Task::SUSPEND;

			--threadsAwake; // Atomic
		}

		schedulerMutex.unlock();
	}

	void Renderer::executeTask(int threadIndex)
	{
		#if PERF_HUD
			int64_t startTick = Timer::ticks();
		#endif

		switch(task[threadIndex].type)
		{
		case Task::PRIMITIVES:
			{
				int unit = task[threadIndex].primitiveUnit;

				int input = primitiveProgress[unit].firstPrimitive;
				int count = primitiveProgress[unit].primitiveCount;
				DrawCall *draw = drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
				int (Renderer::*setupPrimitives)(int batch, int count) = draw->setupPrimitives;

				processPrimitiveVertices(unit, input, count, draw->count, threadIndex);

				#if PERF_HUD
					int64_t time = Timer::ticks();
					vertexTime[threadIndex] += time - startTick;
					startTick = time;
				#endif

				int visible = 0;

				if(!draw->setupState.rasterizerDiscard)
				{
					visible = (this->*setupPrimitives)(unit, count);
				}

				primitiveProgress[unit].visible = visible;
				primitiveProgress[unit].references = clusterCount;

				#if PERF_HUD
					setupTime[threadIndex] += Timer::ticks() - startTick;
				#endif
			}
			break;
		case Task::PIXELS:
			{
				int unit = task[threadIndex].primitiveUnit;
				int visible = primitiveProgress[unit].visible;

				if(visible > 0)
				{
					int cluster = task[threadIndex].pixelCluster;
					Primitive *primitive = primitiveBatch[unit];
					DrawCall *draw = drawList[pixelProgress[cluster].drawCall & DRAW_COUNT_BITS];
					DrawData *data = draw->data;
					PixelProcessor::RoutinePointer pixelRoutine = draw->pixelPointer;

					pixelRoutine(primitive, visible, cluster, data);
				}

				finishRendering(task[threadIndex]);

				#if PERF_HUD
					pixelTime[threadIndex] += Timer::ticks() - startTick;
				#endif
			}
			break;
		case Task::RESUME:
			break;
		case Task::SUSPEND:
			break;
		default:
			ASSERT(false);
		}
	}

	void Renderer::synchronize()
	{
		sync->lock(sw::PUBLIC);
		sync->unlock();
	}

	void Renderer::finishRendering(Task &pixelTask)
	{
		int unit = pixelTask.primitiveUnit;
		int cluster = pixelTask.pixelCluster;

		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
		DrawData &data = *draw.data;
		int primitive = primitiveProgress[unit].firstPrimitive;
		int count = primitiveProgress[unit].primitiveCount;
		int processedPrimitives = primitive + count;

		pixelProgress[cluster].processedPrimitives = processedPrimitives;

		if(pixelProgress[cluster].processedPrimitives >= draw.count)
		{
			++pixelProgress[cluster].drawCall; // Atomic
			pixelProgress[cluster].processedPrimitives = 0;
		}

		int ref = primitiveProgress[unit].references--; // Atomic

		if(ref == 0)
		{
			ref = draw.references--; // Atomic

			if(ref == 0)
			{
				#if PERF_PROFILE
					for(int cluster = 0; cluster < clusterCount; cluster++)
					{
						for(int i = 0; i < PERF_TIMERS; i++)
						{
							profiler.cycles[i] += data.cycles[i][cluster];
						}
					}
				#endif

				if(draw.queries)
				{
					for(auto &query : *(draw.queries))
					{
						switch(query->type)
						{
						case Query::FRAGMENTS_PASSED:
							for(int cluster = 0; cluster < clusterCount; cluster++)
							{
								query->data += data.occlusion[cluster];
							}
							break;
						case Query::TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
							query->data += processedPrimitives;
							break;
						default:
							break;
						}

						query->release();
					}

					delete draw.queries;
					draw.queries = 0;
				}

				for(int i = 0; i < RENDERTARGETS; i++)
				{
					if(draw.renderTarget[i])
					{
						draw.renderTarget[i]->unlockInternal();
					}
				}

				if(draw.depthBuffer)
				{
					draw.depthBuffer->unlockInternal();
				}

				if(draw.stencilBuffer)
				{
					draw.stencilBuffer->unlockStencil();
				}

				for(int i = 0; i < TOTAL_IMAGE_UNITS; i++)
				{
					if(draw.texture[i])
					{
						draw.texture[i]->unlock();
					}
				}

				for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
				{
					if(draw.vertexStream[i])
					{
						draw.vertexStream[i]->unlock();
					}
				}

				if(draw.indexBuffer)
				{
					draw.indexBuffer->unlock();
				}

				for(int i = 0; i < MAX_UNIFORM_BUFFER_BINDINGS; i++)
				{
					if(draw.pUniformBuffers[i])
					{
						draw.pUniformBuffers[i]->unlock();
					}
					if(draw.vUniformBuffers[i])
					{
						draw.vUniformBuffers[i]->unlock();
					}
				}

				for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
				{
					if(draw.transformFeedbackBuffers[i])
					{
						draw.transformFeedbackBuffers[i]->unlock();
					}
				}

				draw.vertexRoutine->unbind();
				draw.setupRoutine->unbind();
				draw.pixelRoutine->unbind();

				sync->unlock();

				draw.references = -1;
				resumeApp->signal();
			}
		}

		pixelProgress[cluster].executing = false;
	}

	void Renderer::processPrimitiveVertices(int unit, unsigned int start, unsigned int triangleCount, unsigned int loop, int thread)
	{
		Triangle *triangle = triangleBatch[unit];
		int primitiveDrawCall = primitiveProgress[unit].drawCall;
		DrawCall *draw = drawList[primitiveDrawCall & DRAW_COUNT_BITS];
		DrawData *data = draw->data;
		VertexTask *task = vertexTask[thread];

		const void *indices = data->indices;
		VertexProcessor::RoutinePointer vertexRoutine = draw->vertexPointer;

		if(task->vertexCache.drawCall != primitiveDrawCall)
		{
			task->vertexCache.clear();
			task->vertexCache.drawCall = primitiveDrawCall;
		}

		unsigned int batch[128][3];   // FIXME: Adjust to dynamic batch size

		switch(draw->drawType)
		{
		case DRAW_POINTLIST:
			{
				unsigned int index = start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index;
					batch[i][1] = index;
					batch[i][2] = index;

					index += 1;
				}
			}
			break;
		case DRAW_LINELIST:
			{
				unsigned int index = 2 * start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index + 0;
					batch[i][1] = index + 1;
					batch[i][2] = index + 1;

					index += 2;
				}
			}
			break;
		case DRAW_LINESTRIP:
			{
				unsigned int index = start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index + 0;
					batch[i][1] = index + 1;
					batch[i][2] = index + 1;

					index += 1;
				}
			}
			break;
		case DRAW_LINELOOP:
			{
				unsigned int index = start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = (index + 0) % loop;
					batch[i][1] = (index + 1) % loop;
					batch[i][2] = (index + 1) % loop;

					index += 1;
				}
			}
			break;
		case DRAW_TRIANGLELIST:
			{
				unsigned int index = 3 * start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index + 0;
					batch[i][1] = index + 1;
					batch[i][2] = index + 2;

					index += 3;
				}
			}
			break;
		case DRAW_TRIANGLESTRIP:
			{
				unsigned int index = start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					if(leadingVertexFirst)
					{
						batch[i][0] = index + 0;
						batch[i][1] = index + (index & 1) + 1;
						batch[i][2] = index + (~index & 1) + 1;
					}
					else
					{
						batch[i][0] = index + (index & 1);
						batch[i][1] = index + (~index & 1);
						batch[i][2] = index + 2;
					}

					index += 1;
				}
			}
			break;
		case DRAW_TRIANGLEFAN:
			{
				unsigned int index = start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					if(leadingVertexFirst)
					{
						batch[i][0] = index + 1;
						batch[i][1] = index + 2;
						batch[i][2] = 0;
					}
					else
					{
						batch[i][0] = 0;
						batch[i][1] = index + 1;
						batch[i][2] = index + 2;
					}

					index += 1;
				}
			}
			break;
		case DRAW_INDEXEDPOINTLIST8:
			{
				const unsigned char *index = (const unsigned char*)indices + start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = *index;
					batch[i][1] = *index;
					batch[i][2] = *index;

					index += 1;
				}
			}
			break;
		case DRAW_INDEXEDPOINTLIST16:
			{
				const unsigned short *index = (const unsigned short*)indices + start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = *index;
					batch[i][1] = *index;
					batch[i][2] = *index;

					index += 1;
				}
			}
			break;
		case DRAW_INDEXEDPOINTLIST32:
			{
				const unsigned int *index = (const unsigned int*)indices + start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = *index;
					batch[i][1] = *index;
					batch[i][2] = *index;

					index += 1;
				}
			}
			break;
		case DRAW_INDEXEDLINELIST8:
			{
				const unsigned char *index = (const unsigned char*)indices + 2 * start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[1];
					batch[i][2] = index[1];

					index += 2;
				}
			}
			break;
		case DRAW_INDEXEDLINELIST16:
			{
				const unsigned short *index = (const unsigned short*)indices + 2 * start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[1];
					batch[i][2] = index[1];

					index += 2;
				}
			}
			break;
		case DRAW_INDEXEDLINELIST32:
			{
				const unsigned int *index = (const unsigned int*)indices + 2 * start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[1];
					batch[i][2] = index[1];

					index += 2;
				}
			}
			break;
		case DRAW_INDEXEDLINESTRIP8:
			{
				const unsigned char *index = (const unsigned char*)indices + start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[1];
					batch[i][2] = index[1];

					index += 1;
				}
			}
			break;
		case DRAW_INDEXEDLINESTRIP16:
			{
				const unsigned short *index = (const unsigned short*)indices + start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[1];
					batch[i][2] = index[1];

					index += 1;
				}
			}
			break;
		case DRAW_INDEXEDLINESTRIP32:
			{
				const unsigned int *index = (const unsigned int*)indices + start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[1];
					batch[i][2] = index[1];

					index += 1;
				}
			}
			break;
		case DRAW_INDEXEDLINELOOP8:
			{
				const unsigned char *index = (const unsigned char*)indices;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[(start + i + 0) % loop];
					batch[i][1] = index[(start + i + 1) % loop];
					batch[i][2] = index[(start + i + 1) % loop];
				}
			}
			break;
		case DRAW_INDEXEDLINELOOP16:
			{
				const unsigned short *index = (const unsigned short*)indices;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[(start + i + 0) % loop];
					batch[i][1] = index[(start + i + 1) % loop];
					batch[i][2] = index[(start + i + 1) % loop];
				}
			}
			break;
		case DRAW_INDEXEDLINELOOP32:
			{
				const unsigned int *index = (const unsigned int*)indices;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[(start + i + 0) % loop];
					batch[i][1] = index[(start + i + 1) % loop];
					batch[i][2] = index[(start + i + 1) % loop];
				}
			}
			break;
		case DRAW_INDEXEDTRIANGLELIST8:
			{
				const unsigned char *index = (const unsigned char*)indices + 3 * start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[1];
					batch[i][2] = index[2];

					index += 3;
				}
			}
			break;
		case DRAW_INDEXEDTRIANGLELIST16:
			{
				const unsigned short *index = (const unsigned short*)indices + 3 * start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[1];
					batch[i][2] = index[2];

					index += 3;
				}
			}
			break;
		case DRAW_INDEXEDTRIANGLELIST32:
			{
				const unsigned int *index = (const unsigned int*)indices + 3 * start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[1];
					batch[i][2] = index[2];

					index += 3;
				}
			}
			break;
		case DRAW_INDEXEDTRIANGLESTRIP8:
			{
				const unsigned char *index = (const unsigned char*)indices + start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[((start + i) & 1) + 1];
					batch[i][2] = index[(~(start + i) & 1) + 1];

					index += 1;
				}
			}
			break;
		case DRAW_INDEXEDTRIANGLESTRIP16:
			{
				const unsigned short *index = (const unsigned short*)indices + start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[((start + i) & 1) + 1];
					batch[i][2] = index[(~(start + i) & 1) + 1];

					index += 1;
				}
			}
			break;
		case DRAW_INDEXEDTRIANGLESTRIP32:
			{
				const unsigned int *index = (const unsigned int*)indices + start;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[0];
					batch[i][1] = index[((start + i) & 1) + 1];
					batch[i][2] = index[(~(start + i) & 1) + 1];

					index += 1;
				}
			}
			break;
		case DRAW_INDEXEDTRIANGLEFAN8:
			{
				const unsigned char *index = (const unsigned char*)indices;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[start + i + 1];
					batch[i][1] = index[start + i + 2];
					batch[i][2] = index[0];
				}
			}
			break;
		case DRAW_INDEXEDTRIANGLEFAN16:
			{
				const unsigned short *index = (const unsigned short*)indices;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[start + i + 1];
					batch[i][1] = index[start + i + 2];
					batch[i][2] = index[0];
				}
			}
			break;
		case DRAW_INDEXEDTRIANGLEFAN32:
			{
				const unsigned int *index = (const unsigned int*)indices;

				for(unsigned int i = 0; i < triangleCount; i++)
				{
					batch[i][0] = index[start + i + 1];
					batch[i][1] = index[start + i + 2];
					batch[i][2] = index[0];
				}
			}
			break;
		case DRAW_QUADLIST:
			{
				unsigned int index = 4 * start / 2;

				for(unsigned int i = 0; i < triangleCount; i += 2)
				{
					batch[i+0][0] = index + 0;
					batch[i+0][1] = index + 1;
					batch[i+0][2] = index + 2;

					batch[i+1][0] = index + 0;
					batch[i+1][1] = index + 2;
					batch[i+1][2] = index + 3;

					index += 4;
				}
			}
			break;
		default:
			ASSERT(false);
			return;
		}

		task->primitiveStart = start;
		task->vertexCount = triangleCount * 3;
		vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
	}

	int Renderer::setupSolidTriangles(int unit, int count)
	{
		Triangle *triangle = triangleBatch[unit];
		Primitive *primitive = primitiveBatch[unit];

		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
		SetupProcessor::State &state = draw.setupState;
		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;

		int ms = state.multiSample;
		int pos = state.positionRegister;
		const DrawData *data = draw.data;
		int visible = 0;

		for(int i = 0; i < count; i++, triangle++)
		{
			Vertex &v0 = triangle->v0;
			Vertex &v1 = triangle->v1;
			Vertex &v2 = triangle->v2;

			if((v0.clipFlags & v1.clipFlags & v2.clipFlags) == Clipper::CLIP_FINITE)
			{
				Polygon polygon(&v0.v[pos], &v1.v[pos], &v2.v[pos]);

				int clipFlagsOr = v0.clipFlags | v1.clipFlags | v2.clipFlags | draw.clipFlags;

				if(clipFlagsOr != Clipper::CLIP_FINITE)
				{
					if(!clipper->clip(polygon, clipFlagsOr, draw))
					{
						continue;
					}
				}

				if(setupRoutine(primitive, triangle, &polygon, data))
				{
					primitive += ms;
					visible++;
				}
			}
		}

		return visible;
	}

	int Renderer::setupWireframeTriangle(int unit, int count)
	{
		Triangle *triangle = triangleBatch[unit];
		Primitive *primitive = primitiveBatch[unit];
		int visible = 0;

		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
		SetupProcessor::State &state = draw.setupState;

		const Vertex &v0 = triangle[0].v0;
		const Vertex &v1 = triangle[0].v1;
		const Vertex &v2 = triangle[0].v2;

		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w + (v0.x * v2.y - v0.y * v2.x) * v1.w + (v2.x * v1.y - v1.x * v2.y) * v0.w;

		if(state.cullMode == CULL_CLOCKWISE)
		{
			if(d >= 0) return 0;
		}
		else if(state.cullMode == CULL_COUNTERCLOCKWISE)
		{
			if(d <= 0) return 0;
		}

		// Copy attributes
		triangle[1].v0 = v1;
		triangle[1].v1 = v2;
		triangle[2].v0 = v2;
		triangle[2].v1 = v0;

		if(state.color[0][0].flat)   // FIXME
		{
			for(int i = 0; i < 2; i++)
			{
				triangle[1].v0.C[i] = triangle[0].v0.C[i];
				triangle[1].v1.C[i] = triangle[0].v0.C[i];
				triangle[2].v0.C[i] = triangle[0].v0.C[i];
				triangle[2].v1.C[i] = triangle[0].v0.C[i];
			}
		}

		for(int i = 0; i < 3; i++)
		{
			if(setupLine(*primitive, *triangle, draw))
			{
				primitive->area = 0.5f * d;

				primitive++;
				visible++;
			}

			triangle++;
		}

		return visible;
	}

	int Renderer::setupVertexTriangle(int unit, int count)
	{
		Triangle *triangle = triangleBatch[unit];
		Primitive *primitive = primitiveBatch[unit];
		int visible = 0;

		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
		SetupProcessor::State &state = draw.setupState;

		const Vertex &v0 = triangle[0].v0;
		const Vertex &v1 = triangle[0].v1;
		const Vertex &v2 = triangle[0].v2;

		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w + (v0.x * v2.y - v0.y * v2.x) * v1.w + (v2.x * v1.y - v1.x * v2.y) * v0.w;

		if(state.cullMode == CULL_CLOCKWISE)
		{
			if(d >= 0) return 0;
		}
		else if(state.cullMode == CULL_COUNTERCLOCKWISE)
		{
			if(d <= 0) return 0;
		}

		// Copy attributes
		triangle[1].v0 = v1;
		triangle[2].v0 = v2;

		for(int i = 0; i < 3; i++)
		{
			if(setupPoint(*primitive, *triangle, draw))
			{
				primitive->area = 0.5f * d;

				primitive++;
				visible++;
			}

			triangle++;
		}

		return visible;
	}

	int Renderer::setupLines(int unit, int count)
	{
		Triangle *triangle = triangleBatch[unit];
		Primitive *primitive = primitiveBatch[unit];
		int visible = 0;

		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
		SetupProcessor::State &state = draw.setupState;

		int ms = state.multiSample;

		for(int i = 0; i < count; i++)
		{
			if(setupLine(*primitive, *triangle, draw))
			{
				primitive += ms;
				visible++;
			}

			triangle++;
		}

		return visible;
	}

	int Renderer::setupPoints(int unit, int count)
	{
		Triangle *triangle = triangleBatch[unit];
		Primitive *primitive = primitiveBatch[unit];
		int visible = 0;

		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
		SetupProcessor::State &state = draw.setupState;

		int ms = state.multiSample;

		for(int i = 0; i < count; i++)
		{
			if(setupPoint(*primitive, *triangle, draw))
			{
				primitive += ms;
				visible++;
			}

			triangle++;
		}

		return visible;
	}

	bool Renderer::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
	{
		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
		const SetupProcessor::State &state = draw.setupState;
		const DrawData &data = *draw.data;

		float lineWidth = data.lineWidth;

		Vertex &v0 = triangle.v0;
		Vertex &v1 = triangle.v1;

		int pos = state.positionRegister;

		const float4 &P0 = v0.v[pos];
		const float4 &P1 = v1.v[pos];

		if(P0.w <= 0 && P1.w <= 0)
		{
			return false;
		}

		const float W = data.Wx16[0] * (1.0f / 16.0f);
		const float H = data.Hx16[0] * (1.0f / 16.0f);

		float dx = W * (P1.x / P1.w - P0.x / P0.w);
		float dy = H * (P1.y / P1.w - P0.y / P0.w);

		if(dx == 0 && dy == 0)
		{
			return false;
		}

		if(state.multiSample > 1)   // Rectangle
		{
			float4 P[4];
			int C[4];

			P[0] = P0;
			P[1] = P1;
			P[2] = P1;
			P[3] = P0;

			float scale = lineWidth * 0.5f / sqrt(dx*dx + dy*dy);

			dx *= scale;
			dy *= scale;

			float dx0h = dx * P0.w / H;
			float dy0w = dy * P0.w / W;

			float dx1h = dx * P1.w / H;
			float dy1w = dy * P1.w / W;

			P[0].x += -dy0w;
			P[0].y += +dx0h;
			C[0] = clipper->computeClipFlags(P[0]);

			P[1].x += -dy1w;
			P[1].y += +dx1h;
			C[1] = clipper->computeClipFlags(P[1]);

			P[2].x += +dy1w;
			P[2].y += -dx1h;
			C[2] = clipper->computeClipFlags(P[2]);

			P[3].x += +dy0w;
			P[3].y += -dx0h;
			C[3] = clipper->computeClipFlags(P[3]);

			if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
			{
				Polygon polygon(P, 4);

				int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | draw.clipFlags;

				if(clipFlagsOr != Clipper::CLIP_FINITE)
				{
					if(!clipper->clip(polygon, clipFlagsOr, draw))
					{
						return false;
					}
				}

				return setupRoutine(&primitive, &triangle, &polygon, &data);
			}
		}
		else   // Diamond test convention
		{
			float4 P[8];
			int C[8];

			P[0] = P0;
			P[1] = P0;
			P[2] = P0;
			P[3] = P0;
			P[4] = P1;
			P[5] = P1;
			P[6] = P1;
			P[7] = P1;

			float dx0 = lineWidth * 0.5f * P0.w / W;
			float dy0 = lineWidth * 0.5f * P0.w / H;

			float dx1 = lineWidth * 0.5f * P1.w / W;
			float dy1 = lineWidth * 0.5f * P1.w / H;

			P[0].x += -dx0;
			C[0] = clipper->computeClipFlags(P[0]);

			P[1].y += +dy0;
			C[1] = clipper->computeClipFlags(P[1]);

			P[2].x += +dx0;
			C[2] = clipper->computeClipFlags(P[2]);

			P[3].y += -dy0;
			C[3] = clipper->computeClipFlags(P[3]);

			P[4].x += -dx1;
			C[4] = clipper->computeClipFlags(P[4]);

			P[5].y += +dy1;
			C[5] = clipper->computeClipFlags(P[5]);

			P[6].x += +dx1;
			C[6] = clipper->computeClipFlags(P[6]);

			P[7].y += -dy1;
			C[7] = clipper->computeClipFlags(P[7]);

			if((C[0] & C[1] & C[2] & C[3] & C[4] & C[5] & C[6] & C[7]) == Clipper::CLIP_FINITE)
			{
				float4 L[6];

				if(dx > -dy)
				{
					if(dx > dy)   // Right
					{
						L[0] = P[0];
						L[1] = P[1];
						L[2] = P[5];
						L[3] = P[6];
						L[4] = P[7];
						L[5] = P[3];
					}
					else   // Down
					{
						L[0] = P[0];
						L[1] = P[4];
						L[2] = P[5];
						L[3] = P[6];
						L[4] = P[2];
						L[5] = P[3];
					}
				}
				else
				{
					if(dx > dy)   // Up
					{
						L[0] = P[0];
						L[1] = P[1];
						L[2] = P[2];
						L[3] = P[6];
						L[4] = P[7];
						L[5] = P[4];
					}
					else   // Left
					{
						L[0] = P[1];
						L[1] = P[2];
						L[2] = P[3];
						L[3] = P[7];
						L[4] = P[4];
						L[5] = P[5];
					}
				}

				Polygon polygon(L, 6);

				int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | C[4] | C[5] | C[6] | C[7] | draw.clipFlags;

				if(clipFlagsOr != Clipper::CLIP_FINITE)
				{
					if(!clipper->clip(polygon, clipFlagsOr, draw))
					{
						return false;
					}
				}

				return setupRoutine(&primitive, &triangle, &polygon, &data);
			}
		}

		return false;
	}

	bool Renderer::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
	{
		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
		const SetupProcessor::State &state = draw.setupState;
		const DrawData &data = *draw.data;

		Vertex &v = triangle.v0;

		float pSize;

		int pts = state.pointSizeRegister;

		if(state.pointSizeRegister != Unused)
		{
			pSize = v.v[pts].y;
		}
		else
		{
			pSize = data.point.pointSize[0];
		}

		pSize = clamp(pSize, data.point.pointSizeMin, data.point.pointSizeMax);

		float4 P[4];
		int C[4];

		int pos = state.positionRegister;

		P[0] = v.v[pos];
		P[1] = v.v[pos];
		P[2] = v.v[pos];
		P[3] = v.v[pos];

		const float X = pSize * P[0].w * data.halfPixelX[0];
		const float Y = pSize * P[0].w * data.halfPixelY[0];

		P[0].x -= X;
		P[0].y += Y;
		C[0] = clipper->computeClipFlags(P[0]);

		P[1].x += X;
		P[1].y += Y;
		C[1] = clipper->computeClipFlags(P[1]);

		P[2].x += X;
		P[2].y -= Y;
		C[2] = clipper->computeClipFlags(P[2]);

		P[3].x -= X;
		P[3].y -= Y;
		C[3] = clipper->computeClipFlags(P[3]);

		triangle.v1 = triangle.v0;
		triangle.v2 = triangle.v0;

		triangle.v1.X += iround(16 * 0.5f * pSize);
		triangle.v2.Y -= iround(16 * 0.5f * pSize) * (data.Hx16[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner

		Polygon polygon(P, 4);

		if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
		{
			int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | draw.clipFlags;

			if(clipFlagsOr != Clipper::CLIP_FINITE)
			{
				if(!clipper->clip(polygon, clipFlagsOr, draw))
				{
					return false;
				}
			}

			return setupRoutine(&primitive, &triangle, &polygon, &data);
		}

		return false;
	}

	void Renderer::initializeThreads()
	{
		unitCount = ceilPow2(threadCount);
		clusterCount = ceilPow2(threadCount);

		for(int i = 0; i < unitCount; i++)
		{
			triangleBatch[i] = (Triangle*)allocate(batchSize * sizeof(Triangle));
			primitiveBatch[i] = (Primitive*)allocate(batchSize * sizeof(Primitive));
		}

		for(int i = 0; i < threadCount; i++)
		{
			vertexTask[i] = (VertexTask*)allocate(sizeof(VertexTask));
			vertexTask[i]->vertexCache.drawCall = -1;

			task[i].type = Task::SUSPEND;

			resume[i] = new Event();
			suspend[i] = new Event();

			Parameters parameters;
			parameters.threadIndex = i;
			parameters.renderer = this;

			exitThreads = false;
			worker[i] = new Thread(threadFunction, &parameters);

			suspend[i]->wait();
			suspend[i]->signal();
		}
	}

	void Renderer::terminateThreads()
	{
		while(threadsAwake != 0)
		{
			Thread::sleep(1);
		}

		for(int thread = 0; thread < threadCount; thread++)
		{
			if(worker[thread])
			{
				exitThreads = true;
				resume[thread]->signal();
				worker[thread]->join();

				delete worker[thread];
				worker[thread] = 0;
				delete resume[thread];
				resume[thread] = 0;
				delete suspend[thread];
				suspend[thread] = 0;
			}

			deallocate(vertexTask[thread]);
			vertexTask[thread] = 0;
		}

		for(int i = 0; i < 16; i++)
		{
			deallocate(triangleBatch[i]);
			triangleBatch[i] = 0;

			deallocate(primitiveBatch[i]);
			primitiveBatch[i] = 0;
		}
	}

	void Renderer::loadConstants(const VertexShader *vertexShader)
	{
		if(!vertexShader) return;

		size_t count = vertexShader->getLength();

		for(size_t i = 0; i < count; i++)
		{
			const Shader::Instruction *instruction = vertexShader->getInstruction(i);

			if(instruction->opcode == Shader::OPCODE_DEF)
			{
				int index = instruction->dst.index;
				float value[4];

				value[0] = instruction->src[0].value[0];
				value[1] = instruction->src[0].value[1];
				value[2] = instruction->src[0].value[2];
				value[3] = instruction->src[0].value[3];

				setVertexShaderConstantF(index, value);
			}
			else if(instruction->opcode == Shader::OPCODE_DEFI)
			{
				int index = instruction->dst.index;
				int integer[4];

				integer[0] = instruction->src[0].integer[0];
				integer[1] = instruction->src[0].integer[1];
				integer[2] = instruction->src[0].integer[2];
				integer[3] = instruction->src[0].integer[3];

				setVertexShaderConstantI(index, integer);
			}
			else if(instruction->opcode == Shader::OPCODE_DEFB)
			{
				int index = instruction->dst.index;
				int boolean = instruction->src[0].boolean[0];

				setVertexShaderConstantB(index, &boolean);
			}
		}
	}

	void Renderer::loadConstants(const PixelShader *pixelShader)
	{
		if(!pixelShader) return;

		size_t count = pixelShader->getLength();

		for(size_t i = 0; i < count; i++)
		{
			const Shader::Instruction *instruction = pixelShader->getInstruction(i);

			if(instruction->opcode == Shader::OPCODE_DEF)
			{
				int index = instruction->dst.index;
				float value[4];

				value[0] = instruction->src[0].value[0];
				value[1] = instruction->src[0].value[1];
				value[2] = instruction->src[0].value[2];
				value[3] = instruction->src[0].value[3];

				setPixelShaderConstantF(index, value);
			}
			else if(instruction->opcode == Shader::OPCODE_DEFI)
			{
				int index = instruction->dst.index;
				int integer[4];

				integer[0] = instruction->src[0].integer[0];
				integer[1] = instruction->src[0].integer[1];
				integer[2] = instruction->src[0].integer[2];
				integer[3] = instruction->src[0].integer[3];

				setPixelShaderConstantI(index, integer);
			}
			else if(instruction->opcode == Shader::OPCODE_DEFB)
			{
				int index = instruction->dst.index;
				int boolean = instruction->src[0].boolean[0];

				setPixelShaderConstantB(index, &boolean);
			}
		}
	}

	void Renderer::setIndexBuffer(Resource *indexBuffer)
	{
		context->indexBuffer = indexBuffer;
	}

	void Renderer::setMultiSampleMask(unsigned int mask)
	{
		context->sampleMask = mask;
	}

	void Renderer::setTransparencyAntialiasing(TransparencyAntialiasing transparencyAntialiasing)
	{
		sw::transparencyAntialiasing = transparencyAntialiasing;
	}

	bool Renderer::isReadWriteTexture(int sampler)
	{
		for(int index = 0; index < RENDERTARGETS; index++)
		{
			if(context->renderTarget[index] && context->texture[sampler] == context->renderTarget[index]->getResource())
			{
				return true;
			}
		}

		if(context->depthBuffer && context->texture[sampler] == context->depthBuffer->getResource())
		{
			return true;
		}

		return false;
	}

	void Renderer::updateClipper()
	{
		if(updateClipPlanes)
		{
			if(VertexProcessor::isFixedFunction())   // User plane in world space
			{
				const Matrix &scissorWorld = getViewTransform();

				if(clipFlags & Clipper::CLIP_PLANE0) clipPlane[0] = scissorWorld * userPlane[0];
				if(clipFlags & Clipper::CLIP_PLANE1) clipPlane[1] = scissorWorld * userPlane[1];
				if(clipFlags & Clipper::CLIP_PLANE2) clipPlane[2] = scissorWorld * userPlane[2];
				if(clipFlags & Clipper::CLIP_PLANE3) clipPlane[3] = scissorWorld * userPlane[3];
				if(clipFlags & Clipper::CLIP_PLANE4) clipPlane[4] = scissorWorld * userPlane[4];
				if(clipFlags & Clipper::CLIP_PLANE5) clipPlane[5] = scissorWorld * userPlane[5];
			}
			else   // User plane in clip space
			{
				if(clipFlags & Clipper::CLIP_PLANE0) clipPlane[0] = userPlane[0];
				if(clipFlags & Clipper::CLIP_PLANE1) clipPlane[1] = userPlane[1];
				if(clipFlags & Clipper::CLIP_PLANE2) clipPlane[2] = userPlane[2];
				if(clipFlags & Clipper::CLIP_PLANE3) clipPlane[3] = userPlane[3];
				if(clipFlags & Clipper::CLIP_PLANE4) clipPlane[4] = userPlane[4];
				if(clipFlags & Clipper::CLIP_PLANE5) clipPlane[5] = userPlane[5];
			}

			updateClipPlanes = false;
		}
	}

	void Renderer::setTextureResource(unsigned int sampler, Resource *resource)
	{
		ASSERT(sampler < TOTAL_IMAGE_UNITS);

		context->texture[sampler] = resource;
	}

	void Renderer::setTextureLevel(unsigned int sampler, unsigned int face, unsigned int level, Surface *surface, TextureType type)
	{
		ASSERT(sampler < TOTAL_IMAGE_UNITS && face < 6 && level < MIPMAP_LEVELS);

		context->sampler[sampler].setTextureLevel(face, level, surface, type);
	}

	void Renderer::setTextureFilter(SamplerType type, int sampler, FilterType textureFilter)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setTextureFilter(sampler, textureFilter);
		}
		else
		{
			VertexProcessor::setTextureFilter(sampler, textureFilter);
		}
	}

	void Renderer::setMipmapFilter(SamplerType type, int sampler, MipmapType mipmapFilter)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setMipmapFilter(sampler, mipmapFilter);
		}
		else
		{
			VertexProcessor::setMipmapFilter(sampler, mipmapFilter);
		}
	}

	void Renderer::setGatherEnable(SamplerType type, int sampler, bool enable)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setGatherEnable(sampler, enable);
		}
		else
		{
			VertexProcessor::setGatherEnable(sampler, enable);
		}
	}

	void Renderer::setAddressingModeU(SamplerType type, int sampler, AddressingMode addressMode)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setAddressingModeU(sampler, addressMode);
		}
		else
		{
			VertexProcessor::setAddressingModeU(sampler, addressMode);
		}
	}

	void Renderer::setAddressingModeV(SamplerType type, int sampler, AddressingMode addressMode)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setAddressingModeV(sampler, addressMode);
		}
		else
		{
			VertexProcessor::setAddressingModeV(sampler, addressMode);
		}
	}

	void Renderer::setAddressingModeW(SamplerType type, int sampler, AddressingMode addressMode)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setAddressingModeW(sampler, addressMode);
		}
		else
		{
			VertexProcessor::setAddressingModeW(sampler, addressMode);
		}
	}

	void Renderer::setReadSRGB(SamplerType type, int sampler, bool sRGB)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setReadSRGB(sampler, sRGB);
		}
		else
		{
			VertexProcessor::setReadSRGB(sampler, sRGB);
		}
	}

	void Renderer::setMipmapLOD(SamplerType type, int sampler, float bias)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setMipmapLOD(sampler, bias);
		}
		else
		{
			VertexProcessor::setMipmapLOD(sampler, bias);
		}
	}

	void Renderer::setBorderColor(SamplerType type, int sampler, const Color<float> &borderColor)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setBorderColor(sampler, borderColor);
		}
		else
		{
			VertexProcessor::setBorderColor(sampler, borderColor);
		}
	}

	void Renderer::setMaxAnisotropy(SamplerType type, int sampler, float maxAnisotropy)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setMaxAnisotropy(sampler, maxAnisotropy);
		}
		else
		{
			VertexProcessor::setMaxAnisotropy(sampler, maxAnisotropy);
		}
	}

	void Renderer::setHighPrecisionFiltering(SamplerType type, int sampler, bool highPrecisionFiltering)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setHighPrecisionFiltering(sampler, highPrecisionFiltering);
		}
		else
		{
			VertexProcessor::setHighPrecisionFiltering(sampler, highPrecisionFiltering);
		}
	}

	void Renderer::setSwizzleR(SamplerType type, int sampler, SwizzleType swizzleR)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setSwizzleR(sampler, swizzleR);
		}
		else
		{
			VertexProcessor::setSwizzleR(sampler, swizzleR);
		}
	}

	void Renderer::setSwizzleG(SamplerType type, int sampler, SwizzleType swizzleG)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setSwizzleG(sampler, swizzleG);
		}
		else
		{
			VertexProcessor::setSwizzleG(sampler, swizzleG);
		}
	}

	void Renderer::setSwizzleB(SamplerType type, int sampler, SwizzleType swizzleB)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setSwizzleB(sampler, swizzleB);
		}
		else
		{
			VertexProcessor::setSwizzleB(sampler, swizzleB);
		}
	}

	void Renderer::setSwizzleA(SamplerType type, int sampler, SwizzleType swizzleA)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setSwizzleA(sampler, swizzleA);
		}
		else
		{
			VertexProcessor::setSwizzleA(sampler, swizzleA);
		}
	}

	void Renderer::setCompareFunc(SamplerType type, int sampler, CompareFunc compFunc)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setCompareFunc(sampler, compFunc);
		}
		else
		{
			VertexProcessor::setCompareFunc(sampler, compFunc);
		}
	}

	void Renderer::setBaseLevel(SamplerType type, int sampler, int baseLevel)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setBaseLevel(sampler, baseLevel);
		}
		else
		{
			VertexProcessor::setBaseLevel(sampler, baseLevel);
		}
	}

	void Renderer::setMaxLevel(SamplerType type, int sampler, int maxLevel)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setMaxLevel(sampler, maxLevel);
		}
		else
		{
			VertexProcessor::setMaxLevel(sampler, maxLevel);
		}
	}

	void Renderer::setMinLod(SamplerType type, int sampler, float minLod)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setMinLod(sampler, minLod);
		}
		else
		{
			VertexProcessor::setMinLod(sampler, minLod);
		}
	}

	void Renderer::setMaxLod(SamplerType type, int sampler, float maxLod)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setMaxLod(sampler, maxLod);
		}
		else
		{
			VertexProcessor::setMaxLod(sampler, maxLod);
		}
	}

	void Renderer::setSyncRequired(SamplerType type, int sampler, bool syncRequired)
	{
		if(type == SAMPLER_PIXEL)
		{
			PixelProcessor::setSyncRequired(sampler, syncRequired);
		}
		else
		{
			VertexProcessor::setSyncRequired(sampler, syncRequired);
		}
	}

	void Renderer::setPointSpriteEnable(bool pointSpriteEnable)
	{
		context->setPointSpriteEnable(pointSpriteEnable);
	}

	void Renderer::setPointScaleEnable(bool pointScaleEnable)
	{
		context->setPointScaleEnable(pointScaleEnable);
	}

	void Renderer::setLineWidth(float width)
	{
		context->lineWidth = width;
	}

	void Renderer::setDepthBias(float bias)
	{
		context->depthBias = bias;
	}

	void Renderer::setSlopeDepthBias(float slopeBias)
	{
		context->slopeDepthBias = slopeBias;
	}

	void Renderer::setRasterizerDiscard(bool rasterizerDiscard)
	{
		context->rasterizerDiscard = rasterizerDiscard;
	}

	void Renderer::setPixelShader(const PixelShader *shader)
	{
		context->pixelShader = shader;

		loadConstants(shader);
	}

	void Renderer::setVertexShader(const VertexShader *shader)
	{
		context->vertexShader = shader;

		loadConstants(shader);
	}

	void Renderer::setPixelShaderConstantF(unsigned int index, const float value[4], unsigned int count)
	{
		for(unsigned int i = 0; i < DRAW_COUNT; i++)
		{
			if(drawCall[i]->psDirtyConstF < index + count)
			{
				drawCall[i]->psDirtyConstF = index + count;
			}
		}

		for(unsigned int i = 0; i < count; i++)
		{
			PixelProcessor::setFloatConstant(index + i, value);
			value += 4;
		}
	}

	void Renderer::setPixelShaderConstantI(unsigned int index, const int value[4], unsigned int count)
	{
		for(unsigned int i = 0; i < DRAW_COUNT; i++)
		{
			if(drawCall[i]->psDirtyConstI < index + count)
			{
				drawCall[i]->psDirtyConstI = index + count;
			}
		}

		for(unsigned int i = 0; i < count; i++)
		{
			PixelProcessor::setIntegerConstant(index + i, value);
			value += 4;
		}
	}

	void Renderer::setPixelShaderConstantB(unsigned int index, const int *boolean, unsigned int count)
	{
		for(unsigned int i = 0; i < DRAW_COUNT; i++)
		{
			if(drawCall[i]->psDirtyConstB < index + count)
			{
				drawCall[i]->psDirtyConstB = index + count;
			}
		}

		for(unsigned int i = 0; i < count; i++)
		{
			PixelProcessor::setBooleanConstant(index + i, *boolean);
			boolean++;
		}
	}

	void Renderer::setVertexShaderConstantF(unsigned int index, const float value[4], unsigned int count)
	{
		for(unsigned int i = 0; i < DRAW_COUNT; i++)
		{
			if(drawCall[i]->vsDirtyConstF < index + count)
			{
				drawCall[i]->vsDirtyConstF = index + count;
			}
		}

		for(unsigned int i = 0; i < count; i++)
		{
			VertexProcessor::setFloatConstant(index + i, value);
			value += 4;
		}
	}

	void Renderer::setVertexShaderConstantI(unsigned int index, const int value[4], unsigned int count)
	{
		for(unsigned int i = 0; i < DRAW_COUNT; i++)
		{
			if(drawCall[i]->vsDirtyConstI < index + count)
			{
				drawCall[i]->vsDirtyConstI = index + count;
			}
		}

		for(unsigned int i = 0; i < count; i++)
		{
			VertexProcessor::setIntegerConstant(index + i, value);
			value += 4;
		}
	}

	void Renderer::setVertexShaderConstantB(unsigned int index, const int *boolean, unsigned int count)
	{
		for(unsigned int i = 0; i < DRAW_COUNT; i++)
		{
			if(drawCall[i]->vsDirtyConstB < index + count)
			{
				drawCall[i]->vsDirtyConstB = index + count;
			}
		}

		for(unsigned int i = 0; i < count; i++)
		{
			VertexProcessor::setBooleanConstant(index + i, *boolean);
			boolean++;
		}
	}

	void Renderer::setModelMatrix(const Matrix &M, int i)
	{
		VertexProcessor::setModelMatrix(M, i);
	}

	void Renderer::setViewMatrix(const Matrix &V)
	{
		VertexProcessor::setViewMatrix(V);
		updateClipPlanes = true;
	}

	void Renderer::setBaseMatrix(const Matrix &B)
	{
		VertexProcessor::setBaseMatrix(B);
		updateClipPlanes = true;
	}

	void Renderer::setProjectionMatrix(const Matrix &P)
	{
		VertexProcessor::setProjectionMatrix(P);
		updateClipPlanes = true;
	}

	void Renderer::addQuery(Query *query)
	{
		queries.push_back(query);
	}

	void Renderer::removeQuery(Query *query)
	{
		queries.remove(query);
	}

	#if PERF_HUD
		int Renderer::getThreadCount()
		{
			return threadCount;
		}

		int64_t Renderer::getVertexTime(int thread)
		{
			return vertexTime[thread];
		}

		int64_t Renderer::getSetupTime(int thread)
		{
			return setupTime[thread];
		}

		int64_t Renderer::getPixelTime(int thread)
		{
			return pixelTime[thread];
		}

		void Renderer::resetTimers()
		{
			for(int thread = 0; thread < threadCount; thread++)
			{
				vertexTime[thread] = 0;
				setupTime[thread] = 0;
				pixelTime[thread] = 0;
			}
		}
	#endif

	void Renderer::setViewport(const Viewport &viewport)
	{
		this->viewport = viewport;
	}

	void Renderer::setScissor(const Rect &scissor)
	{
		this->scissor = scissor;
	}

	void Renderer::setClipFlags(int flags)
	{
		clipFlags = flags << 8;   // Bottom 8 bits used by legacy frustum
	}

	void Renderer::setClipPlane(unsigned int index, const float plane[4])
	{
		if(index < MAX_CLIP_PLANES)
		{
			userPlane[index] = plane;
		}
		else ASSERT(false);

		updateClipPlanes = true;
	}

	void Renderer::updateConfiguration(bool initialUpdate)
	{
		bool newConfiguration = swiftConfig->hasNewConfiguration();

		if(newConfiguration || initialUpdate)
		{
			terminateThreads();

			SwiftConfig::Configuration configuration = {};
			swiftConfig->getConfiguration(configuration);

			precacheVertex = !newConfiguration && configuration.precache;
			precacheSetup = !newConfiguration && configuration.precache;
			precachePixel = !newConfiguration && configuration.precache;

			VertexProcessor::setRoutineCacheSize(configuration.vertexRoutineCacheSize);
			PixelProcessor::setRoutineCacheSize(configuration.pixelRoutineCacheSize);
			SetupProcessor::setRoutineCacheSize(configuration.setupRoutineCacheSize);

			switch(configuration.textureSampleQuality)
			{
			case 0:  Sampler::setFilterQuality(FILTER_POINT);       break;
			case 1:  Sampler::setFilterQuality(FILTER_LINEAR);      break;
			case 2:  Sampler::setFilterQuality(FILTER_ANISOTROPIC); break;
			default: Sampler::setFilterQuality(FILTER_ANISOTROPIC); break;
			}

			switch(configuration.mipmapQuality)
			{
			case 0:  Sampler::setMipmapQuality(MIPMAP_POINT);  break;
			case 1:  Sampler::setMipmapQuality(MIPMAP_LINEAR); break;
			default: Sampler::setMipmapQuality(MIPMAP_LINEAR); break;
			}

			setPerspectiveCorrection(configuration.perspectiveCorrection);

			switch(configuration.transcendentalPrecision)
			{
			case 0:
				logPrecision = APPROXIMATE;
				expPrecision = APPROXIMATE;
				rcpPrecision = APPROXIMATE;
				rsqPrecision = APPROXIMATE;
				break;
			case 1:
				logPrecision = PARTIAL;
				expPrecision = PARTIAL;
				rcpPrecision = PARTIAL;
				rsqPrecision = PARTIAL;
				break;
			case 2:
				logPrecision = ACCURATE;
				expPrecision = ACCURATE;
				rcpPrecision = ACCURATE;
				rsqPrecision = ACCURATE;
				break;
			case 3:
				logPrecision = WHQL;
				expPrecision = WHQL;
				rcpPrecision = WHQL;
				rsqPrecision = WHQL;
				break;
			case 4:
				logPrecision = IEEE;
				expPrecision = IEEE;
				rcpPrecision = IEEE;
				rsqPrecision = IEEE;
				break;
			default:
				logPrecision = ACCURATE;
				expPrecision = ACCURATE;
				rcpPrecision = ACCURATE;
				rsqPrecision = ACCURATE;
				break;
			}

			switch(configuration.transparencyAntialiasing)
			{
			case 0:  transparencyAntialiasing = TRANSPARENCY_NONE;              break;
			case 1:  transparencyAntialiasing = TRANSPARENCY_ALPHA_TO_COVERAGE; break;
			default: transparencyAntialiasing = TRANSPARENCY_NONE;              break;
			}

			switch(configuration.threadCount)
			{
			case -1: threadCount = CPUID::coreCount();        break;
			case 0:  threadCount = CPUID::processAffinity();  break;
			default: threadCount = configuration.threadCount; break;
			}

			CPUID::setEnableSSE4_1(configuration.enableSSE4_1);
			CPUID::setEnableSSSE3(configuration.enableSSSE3);
			CPUID::setEnableSSE3(configuration.enableSSE3);
			CPUID::setEnableSSE2(configuration.enableSSE2);
			CPUID::setEnableSSE(configuration.enableSSE);

			for(int pass = 0; pass < 10; pass++)
			{
				optimization[pass] = configuration.optimization[pass];
			}

			forceWindowed = configuration.forceWindowed;
			complementaryDepthBuffer = configuration.complementaryDepthBuffer;
			postBlendSRGB = configuration.postBlendSRGB;
			exactColorRounding = configuration.exactColorRounding;
			forceClearRegisters = configuration.forceClearRegisters;

		#ifndef NDEBUG
			minPrimitives = configuration.minPrimitives;
			maxPrimitives = configuration.maxPrimitives;
		#endif
		}

		if(!initialUpdate && !worker[0])
		{
			initializeThreads();
		}
	}
}