// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "VertexRoutine.hpp"

#include "VertexShader.hpp"
#include "Vertex.hpp"
#include "Half.hpp"
#include "Renderer.hpp"
#include "Constants.hpp"
#include "Debug.hpp"

namespace sw
{
	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]

	VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader)
		: v(shader && shader->dynamicallyIndexedInput),
		  o(shader && shader->dynamicallyIndexedOutput),
		  state(state)
	{
	}

	VertexRoutine::~VertexRoutine()
	{
	}

	void VertexRoutine::generate()
	{
		const bool textureSampling = state.textureSampling;

		Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
		Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
		Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);

		UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
		UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
		UInt indexInPrimitive = 0;

		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));

		Do
		{
			UInt index = *Pointer<UInt>(batch);
			UInt tagIndex = index & 0x0000003C;
			UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.

			If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
			{
				*Pointer<UInt>(tagCache + tagIndex) = indexQ;

				readInput(indexQ);
				pipeline();
				postTransform();
				computeClipFlags();

				Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
				writeCache(cacheLine0);
			}

			UInt cacheIndex = index & 0x0000003F;
			Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
			writeVertex(vertex, cacheLine);

			if(state.transformFeedbackEnabled != 0)
			{
				transformFeedback(vertex, primitiveNumber, indexInPrimitive);

				indexInPrimitive++;
				If(indexInPrimitive == 3)
				{
					primitiveNumber++;
					indexInPrimitive = 0;
				}
			}

			vertex += sizeof(Vertex);
			batch += sizeof(unsigned int);
			vertexCount--;
		}
		Until(vertexCount == 0)

		Return();
	}

	void VertexRoutine::readInput(UInt &index)
	{
		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
		{
			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,input) + sizeof(void*) * i);
			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);

			v[i] = readStream(input, stride, state.input[i], index);
		}
	}

	void VertexRoutine::computeClipFlags()
	{
		int pos = state.positionRegister;

		Int4 maxX = CmpLT(o[pos].w, o[pos].x);
		Int4 maxY = CmpLT(o[pos].w, o[pos].y);
		Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
		Int4 minX = CmpNLE(-o[pos].w, o[pos].x);
		Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
		Int4 minZ = symmetricNormalizedDepth ? CmpNLE(-o[pos].w, o[pos].z) : CmpNLE(Float4(0.0f), o[pos].z);

		clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4);   // FIXME: Array indexing
		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4);
		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + SignMask(maxZ) * 4);
		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + SignMask(minX) * 4);
		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);

		Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
		Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
		Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));

		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4);

		if(state.preTransformed)
		{
			clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
		}
	}

	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
	{
		const bool textureSampling = state.textureSampling;

		Vector4f v;

		Pointer<Byte> source0 = buffer + index * stride;
		Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);
		Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
		Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);

		switch(stream.type)
		{
		case STREAMTYPE_FLOAT:
			{
				if(stream.count == 0)
				{
					// Null stream, all default components
				}
				else if(stream.count == 1)
				{
					v.x.x = *Pointer<Float>(source0);
					v.x.y = *Pointer<Float>(source1);
					v.x.z = *Pointer<Float>(source2);
					v.x.w = *Pointer<Float>(source3);
				}
				else
				{
					v.x = *Pointer<Float4>(source0);
					v.y = *Pointer<Float4>(source1);
					v.z = *Pointer<Float4>(source2);
					v.w = *Pointer<Float4>(source3);

					transpose4xN(v.x, v.y, v.z, v.w, stream.count);
				}
			}
			break;
		case STREAMTYPE_BYTE:
			{
				v.x = Float4(*Pointer<Byte4>(source0));
				v.y = Float4(*Pointer<Byte4>(source1));
				v.z = Float4(*Pointer<Byte4>(source2));
				v.w = Float4(*Pointer<Byte4>(source3));

				transpose4xN(v.x, v.y, v.z, v.w, stream.count);

				if(stream.normalized)
				{
					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
				}
			}
			break;
		case STREAMTYPE_SBYTE:
			{
				v.x = Float4(*Pointer<SByte4>(source0));
				v.y = Float4(*Pointer<SByte4>(source1));
				v.z = Float4(*Pointer<SByte4>(source2));
				v.w = Float4(*Pointer<SByte4>(source3));

				transpose4xN(v.x, v.y, v.z, v.w, stream.count);

				if(stream.normalized)
				{
					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
				}
			}
			break;
		case STREAMTYPE_COLOR:
			{
				v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
				v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
				v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
				v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));

				transpose4x4(v.x, v.y, v.z, v.w);

				// Swap red and blue
				Float4 t = v.x;
				v.x = v.z;
				v.z = t;
			}
			break;
		case STREAMTYPE_SHORT:
			{
				v.x = Float4(*Pointer<Short4>(source0));
				v.y = Float4(*Pointer<Short4>(source1));
				v.z = Float4(*Pointer<Short4>(source2));
				v.w = Float4(*Pointer<Short4>(source3));

				transpose4xN(v.x, v.y, v.z, v.w, stream.count);

				if(stream.normalized)
				{
					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
				}
			}
			break;
		case STREAMTYPE_USHORT:
			{
				v.x = Float4(*Pointer<UShort4>(source0));
				v.y = Float4(*Pointer<UShort4>(source1));
				v.z = Float4(*Pointer<UShort4>(source2));
				v.w = Float4(*Pointer<UShort4>(source3));

				transpose4xN(v.x, v.y, v.z, v.w, stream.count);

				if(stream.normalized)
				{
					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
				}
			}
			break;
		case STREAMTYPE_INT:
			{
				if(stream.normalized)
				{
					v.x = Float4(*Pointer<Int4>(source0));
					v.y = Float4(*Pointer<Int4>(source1));
					v.z = Float4(*Pointer<Int4>(source2));
					v.w = Float4(*Pointer<Int4>(source3));

					transpose4xN(v.x, v.y, v.z, v.w, stream.count);

					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
				}
				else
				{
					v.x = As<Float4>(*Pointer<Int4>(source0));
					v.y = As<Float4>(*Pointer<Int4>(source1));
					v.z = As<Float4>(*Pointer<Int4>(source2));
					v.w = As<Float4>(*Pointer<Int4>(source3));

					transpose4xN(v.x, v.y, v.z, v.w, stream.count);
				}
			}
			break;
		case STREAMTYPE_UINT:
			{
				if(stream.normalized)
				{
					v.x = Float4(*Pointer<UInt4>(source0));
					v.y = Float4(*Pointer<UInt4>(source1));
					v.z = Float4(*Pointer<UInt4>(source2));
					v.w = Float4(*Pointer<UInt4>(source3));

					transpose4xN(v.x, v.y, v.z, v.w, stream.count);

					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
				}
				else
				{
					v.x = As<Float4>(*Pointer<UInt4>(source0));
					v.y = As<Float4>(*Pointer<UInt4>(source1));
					v.z = As<Float4>(*Pointer<UInt4>(source2));
					v.w = As<Float4>(*Pointer<UInt4>(source3));

					transpose4xN(v.x, v.y, v.z, v.w, stream.count);
				}
			}
			break;
		case STREAMTYPE_UDEC3:
			{
				// FIXME: Vectorize
				{
					Int x, y, z;

					x = y = z = *Pointer<Int>(source0);

					v.x.x = Float(x & 0x000003FF);
					v.x.y = Float(y & 0x000FFC00);
					v.x.z = Float(z & 0x3FF00000);
				}

				{
					Int x, y, z;

					x = y = z = *Pointer<Int>(source1);

					v.y.x = Float(x & 0x000003FF);
					v.y.y = Float(y & 0x000FFC00);
					v.y.z = Float(z & 0x3FF00000);
				}

				{
					Int x, y, z;

					x = y = z = *Pointer<Int>(source2);

					v.z.x = Float(x & 0x000003FF);
					v.z.y = Float(y & 0x000FFC00);
					v.z.z = Float(z & 0x3FF00000);
				}

				{
					Int x, y, z;

					x = y = z = *Pointer<Int>(source3);

					v.w.x = Float(x & 0x000003FF);
					v.w.y = Float(y & 0x000FFC00);
					v.w.z = Float(z & 0x3FF00000);
				}

				transpose4x3(v.x, v.y, v.z, v.w);

				v.y *= Float4(1.0f / 0x00000400);
				v.z *= Float4(1.0f / 0x00100000);
			}
			break;
		case STREAMTYPE_DEC3N:
			{
				// FIXME: Vectorize
				{
					Int x, y, z;

					x = y = z = *Pointer<Int>(source0);

					v.x.x = Float((x << 22) & 0xFFC00000);
					v.x.y = Float((y << 12) & 0xFFC00000);
					v.x.z = Float((z << 2)  & 0xFFC00000);
				}

				{
					Int x, y, z;

					x = y = z = *Pointer<Int>(source1);

					v.y.x = Float((x << 22) & 0xFFC00000);
					v.y.y = Float((y << 12) & 0xFFC00000);
					v.y.z = Float((z << 2)  & 0xFFC00000);
				}

				{
					Int x, y, z;

					x = y = z = *Pointer<Int>(source2);

					v.z.x = Float((x << 22) & 0xFFC00000);
					v.z.y = Float((y << 12) & 0xFFC00000);
					v.z.z = Float((z << 2)  & 0xFFC00000);
				}

				{
					Int x, y, z;

					x = y = z = *Pointer<Int>(source3);

					v.w.x = Float((x << 22) & 0xFFC00000);
					v.w.y = Float((y << 12) & 0xFFC00000);
					v.w.z = Float((z << 2)  & 0xFFC00000);
				}

				transpose4x3(v.x, v.y, v.z, v.w);

				v.x *= Float4(1.0f / 0x00400000 / 511.0f);
				v.y *= Float4(1.0f / 0x00400000 / 511.0f);
				v.z *= Float4(1.0f / 0x00400000 / 511.0f);
			}
			break;
		case STREAMTYPE_FIXED:
			{
				v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
				v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
				v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
				v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));

				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
			}
			break;
		case STREAMTYPE_HALF:
			{
				if(stream.count >= 1)
				{
					UShort x0 = *Pointer<UShort>(source0 + 0);
					UShort x1 = *Pointer<UShort>(source1 + 0);
					UShort x2 = *Pointer<UShort>(source2 + 0);
					UShort x3 = *Pointer<UShort>(source3 + 0);

					v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
					v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
					v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
					v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
				}

				if(stream.count >= 2)
				{
					UShort y0 = *Pointer<UShort>(source0 + 2);
					UShort y1 = *Pointer<UShort>(source1 + 2);
					UShort y2 = *Pointer<UShort>(source2 + 2);
					UShort y3 = *Pointer<UShort>(source3 + 2);

					v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
					v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
					v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
					v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
				}

				if(stream.count >= 3)
				{
					UShort z0 = *Pointer<UShort>(source0 + 4);
					UShort z1 = *Pointer<UShort>(source1 + 4);
					UShort z2 = *Pointer<UShort>(source2 + 4);
					UShort z3 = *Pointer<UShort>(source3 + 4);

					v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
					v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
					v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
					v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
				}

				if(stream.count >= 4)
				{
					UShort w0 = *Pointer<UShort>(source0 + 6);
					UShort w1 = *Pointer<UShort>(source1 + 6);
					UShort w2 = *Pointer<UShort>(source2 + 6);
					UShort w3 = *Pointer<UShort>(source3 + 6);

					v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
					v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
					v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
					v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
				}
			}
			break;
		case STREAMTYPE_INDICES:
			{
				v.x.x = *Pointer<Float>(source0);
				v.x.y = *Pointer<Float>(source1);
				v.x.z = *Pointer<Float>(source2);
				v.x.w = *Pointer<Float>(source3);
			}
			break;
		case STREAMTYPE_2_10_10_10_INT:
			{
				Int4 src;
				src = Insert(src, *Pointer<Int>(source0), 0);
				src = Insert(src, *Pointer<Int>(source1), 1);
				src = Insert(src, *Pointer<Int>(source2), 2);
				src = Insert(src, *Pointer<Int>(source3), 3);

				v.x = Float4((src << 22) >> 22);
				v.y = Float4((src << 12) >> 22);
				v.z = Float4((src << 02) >> 22);
				v.w = Float4(src >> 30);

				if(stream.normalized)
				{
					v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
					v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
					v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
					v.w = Max(v.w, Float4(-1.0f));
				}
			}
			break;
		case STREAMTYPE_2_10_10_10_UINT:
			{
				Int4 src;
				src = Insert(src, *Pointer<Int>(source0), 0);
				src = Insert(src, *Pointer<Int>(source1), 1);
				src = Insert(src, *Pointer<Int>(source2), 2);
				src = Insert(src, *Pointer<Int>(source3), 3);

				v.x = Float4(src & Int4(0x3FF));
				v.y = Float4((src >> 10) & Int4(0x3FF));
				v.z = Float4((src >> 20) & Int4(0x3FF));
				v.w = Float4((src >> 30) & Int4(0x3));

				if(stream.normalized)
				{
					v.x *= Float4(1.0f / 0x3FF);
					v.y *= Float4(1.0f / 0x3FF);
					v.z *= Float4(1.0f / 0x3FF);
					v.w *= Float4(1.0f / 0x3);
				}
			}
			break;
		default:
			ASSERT(false);
		}

		if(stream.count < 1) v.x = Float4(0.0f);
		if(stream.count < 2) v.y = Float4(0.0f);
		if(stream.count < 3) v.z = Float4(0.0f);
		if(stream.count < 4) v.w = Float4(1.0f);

		return v;
	}

	void VertexRoutine::postTransform()
	{
		int pos = state.positionRegister;

		// Backtransform
		if(state.preTransformed)
		{
			Float4 rhw = Float4(1.0f) / o[pos].w;

			Float4 W = *Pointer<Float4>(data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
			Float4 H = *Pointer<Float4>(data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
			Float4 L = *Pointer<Float4>(data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
			Float4 T = *Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);

			o[pos].x = (o[pos].x - L) / W * rhw;
			o[pos].y = (o[pos].y - T) / H * rhw;
			o[pos].z = o[pos].z * rhw;
			o[pos].w = rhw;
		}

		if(!halfIntegerCoordinates && !state.preTransformed)
		{
			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelX)) * o[pos].w;
			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelY)) * o[pos].w;
		}

		if(state.superSampling)
		{
			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,XXXX)) * o[pos].w;
			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,YYYY)) * o[pos].w;
		}
	}

	void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
	{
		Vector4f v;

		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
		{
			if(state.output[i].write)
			{
				v.x = o[i].x;
				v.y = o[i].y;
				v.z = o[i].z;
				v.w = o[i].w;

				if(state.output[i].xClamp)
				{
					v.x = Max(v.x, Float4(0.0f));
					v.x = Min(v.x, Float4(1.0f));
				}

				if(state.output[i].yClamp)
				{
					v.y = Max(v.y, Float4(0.0f));
					v.y = Min(v.y, Float4(1.0f));
				}

				if(state.output[i].zClamp)
				{
					v.z = Max(v.z, Float4(0.0f));
					v.z = Min(v.z, Float4(1.0f));
				}

				if(state.output[i].wClamp)
				{
					v.w = Max(v.w, Float4(0.0f));
					v.w = Min(v.w, Float4(1.0f));
				}

				if(state.output[i].write == 0x01)
				{
					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
				}
				else
				{
					if(state.output[i].write == 0x02)
					{
						transpose2x4(v.x, v.y, v.z, v.w);
					}
					else
					{
						transpose4x4(v.x, v.y, v.z, v.w);
					}

					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
				}
			}
		}

		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;

		// Viewport transform
		int pos = state.positionRegister;

		v.x = o[pos].x;
		v.y = o[pos].y;
		v.z = o[pos].z;
		v.w = o[pos].w;

		if(symmetricNormalizedDepth)
		{
			v.z = (v.z + v.w) * Float4(0.5f);   // [-1, 1] -> [0, 1]
		}

		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
		Float4 rhw = Float4(1.0f) / w;

		v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
		v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
		v.z = v.z * rhw;
		v.w = rhw;

		transpose4x4(v.x, v.y, v.z, v.w);

		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
	}

	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
	{
		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
		{
			if(state.output[i].write)
			{
				*Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16);
			}
		}

		*Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));
		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
	}

	void VertexRoutine::transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive)
	{
		If(indexInPrimitive < state.verticesPerPrimitive)
		{
			UInt tOffset = primitiveNumber * state.verticesPerPrimitive + indexInPrimitive;

			for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
			{
				if(state.transformFeedbackEnabled & (1ULL << i))
				{
					UInt reg = *Pointer<UInt>(data + OFFSET(DrawData, vs.reg[i]));
					UInt row = *Pointer<UInt>(data + OFFSET(DrawData, vs.row[i]));
					UInt col = *Pointer<UInt>(data + OFFSET(DrawData, vs.col[i]));
					UInt str = *Pointer<UInt>(data + OFFSET(DrawData, vs.str[i]));

					Pointer<Byte> t = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.t[i])) + (tOffset * str * sizeof(float));
					Pointer<Byte> v = vertex + OFFSET(Vertex, v) + reg * sizeof(float);

					For(UInt r = 0, r < row, r++)
					{
						UInt rOffsetX = r * col * sizeof(float);
						UInt rOffset4 = r * sizeof(float4);

						For(UInt c = 0, c < col, c++)
						{
							UInt cOffset = c * sizeof(float);
							*Pointer<Float>(t + rOffsetX + cOffset) = *Pointer<Float>(v + rOffset4 + cOffset);
						}
					}
				}
			}
		}
	}
}