Add new GPU examples

2015-07-22 19:41:59 +02:00 · 2015-07-22 19:41:59 +02:00 · bf7d686e88
commit bf7d686e88
parent 10b81077e3
35 changed files with 2177 additions and 1230 deletions
--- a/examples/gpu/README.md
+++ b/examples/gpu/README.md
@ -1,11 +0,0 @@
-gpu
-=======
-
-example of how to use the GPU with libctru
-
-before trying to compile, make sure to download aemstro
-( https://github.com/smealum/aemstro reflog: 51bfeef9e1a0149726dca43b50919bd45917015a )
-and update AEMSTRO environment variable with the proper path
-
-You'll also need to install Python 3 and have that in your path.
-
--- a/examples/gpu/data/test.vsh
+++ b/examples/gpu/data/test.vsh
@ -1,57 +0,0 @@
-; setup constants
-	.const c20, 1.0, 0.0, 0.5, 1.0
- 
-; setup outmap
-	.out o0, result.position, 0xF
-	.out o1, result.color, 0xF
-	.out o2, result.texcoord0, 0x3
-	.out o3, result.texcoord1, 0x3
-	.out o4, result.texcoord2, 0x3
- 
-; setup uniform map (not required)
-	.uniform c0, c3, projection
-	.uniform c4, c7, modelview
-	.uniform c8, c8, lightDirection
-	.uniform c9, c9, lightAmbient
-	
-	.vsh vmain, end_vmain
- 
-;code
-	vmain:
-		mov r1, v0 (0x4)
-		mov r1, c20 (0x3)
-		; temp = modvMtx * in.pos
-		dp4 r0, c4, r1 (0x0)
-		dp4 r0, c5, r1 (0x1)
-		dp4 r0, c6, r1 (0x2)
-		mov r0, c20 (0x3)
-		; result.pos = projMtx * temp
-		dp4 o0, c0, r0 (0x0)
-		dp4 o0, c1, r0 (0x1)
-		dp4 o0, c2, r0 (0x2)
-		dp4 o0, c3, r0 (0x3)
-		; result.texcoord = in.texcoord
-		mov o2, v1 (0x5)
-		mov o3, c20 (0x7)
-		mov o4, c20 (0x7)
-		; result.color = crappy lighting
-		dp3 r0, c8, v2 (0x4)
-		max r0, c20, r0 (0x9)
-		mul r0, c9, r0 (0x4)
-		add o1, c9, r0 (0x4)
-		mov o1, c20 (0x3)
-		nop
-		end
-	end_vmain:
- 
-;operand descriptors
-	.opdesc x___, xyzw, xyzw ; 0x0
-	.opdesc _y__, xyzw, xyzw ; 0x1
-	.opdesc __z_, xyzw, xyzw ; 0x2
-	.opdesc ___w, xyzw, xyzw ; 0x3
-	.opdesc xyz_, xyzw, xyzw ; 0x4
-	.opdesc xyzw, xyzw, xyzw ; 0x5
-	.opdesc x_zw, xyzw, xyzw ; 0x6
-	.opdesc xyzw, yyyw, xyzw ; 0x7
-	.opdesc xyz_, wwww, wwww ; 0x8
-	.opdesc xyz_, yyyy, xyzw ; 0x9
--- a/examples/gpu/data/texture.bin
+++ b/examples/gpu/data/texture.bin
--- a/examples/gpu/source/_gs.s
+++ b/examples/gpu/source/_gs.s
@ -1,16 +0,0 @@
-.section ".text"
-.arm
-.align 4
-.global _vboMemcpy50
-
-# void _vboMemcpy50(u32* dst, u32* src)
-# r0 : dst
-# r1 : src
-# fixed size 0x50 (20 words), copied as two bursts of 10 words each
-# r2-r3 are scratch; r4-r11 are callee-saved and preserved via push/pop
-# note: {r2-r11} is exactly 10 registers (40 bytes); using {r2-r12} would
-# move 11 words per burst and copy 0x58 bytes, overrunning both buffers
-_vboMemcpy50:
-	push {r4-r11}
-	ldmia r1!, {r2-r11}
-	stmia r0!, {r2-r11}
-	ldmia r1!, {r2-r11}
-	stmia r0!, {r2-r11}
-	pop {r4-r11}
-	bx lr
--- a/examples/gpu/source/gs.c
+++ b/examples/gpu/source/gs.c
@ -1,432 +0,0 @@
-#include <stdlib.h>
-#include <string.h>
-#include <malloc.h>
-#include <3ds.h>
-
-#include "gs.h"
-#include "math.h"
-
-#define BUFFERMATRIXLIST_SIZE (GS_MATRIXSTACK_SIZE*4)
-
-static void gsInitMatrixStack();
-
-Handle linearAllocMutex;
-
-static u32 gsMatrixStackRegisters[GS_MATRIXTYPES];
-
-typedef struct
-{
-	u32 offset;
-	mtx44 data;
-}bufferMatrix_s;
-
-bufferMatrix_s bufferMatrixList[BUFFERMATRIXLIST_SIZE];
-int bufferMatrixListLength;
-
-//----------------------
-//   GS SYSTEM STUFF
-//----------------------
-
-void initBufferMatrixList()
-{
-	bufferMatrixListLength=0;
-}
-
-void gsInit(shaderProgram_s* shader)
-{
-	gsInitMatrixStack();
-	initBufferMatrixList();
-	svcCreateMutex(&linearAllocMutex, false);
-	if(shader)
-	{
-		gsMatrixStackRegisters[0]=shaderInstanceGetUniformLocation(shader->vertexShader, "projection");
-		gsMatrixStackRegisters[1]=shaderInstanceGetUniformLocation(shader->vertexShader, "modelview");
-		shaderProgramUse(shader);
-	}
-}
-
-void gsExit(void)
-{
-	svcCloseHandle(linearAllocMutex);
-}
-
-void gsStartFrame(void)
-{
-	GPUCMD_SetBufferOffset(0);
-	initBufferMatrixList();
-}
-
-void* gsLinearAlloc(size_t size)
-{
-	void* ret=NULL;
-
-	svcWaitSynchronization(linearAllocMutex, U64_MAX);
-	ret=linearAlloc(size);
-	svcReleaseMutex(linearAllocMutex);
-	
-	return ret;
-}
-
-void gsLinearFree(void* mem)
-{
-	svcWaitSynchronization(linearAllocMutex, U64_MAX);
-	linearFree(mem);
-	svcReleaseMutex(linearAllocMutex);
-}
-
-//----------------------
-//  MATRIX STACK STUFF
-//----------------------
-
-static mtx44 gsMatrixStacks[GS_MATRIXTYPES][GS_MATRIXSTACK_SIZE];
-static u32 gsMatrixStackRegisters[GS_MATRIXTYPES]={0x00, 0x04};
-static u8 gsMatrixStackOffsets[GS_MATRIXTYPES];
-static bool gsMatrixStackUpdated[GS_MATRIXTYPES];
-static GS_MATRIX gsCurrentMatrixType;
-
-static void gsInitMatrixStack()
-{
-	int i;
-	for(i=0; i<GS_MATRIXTYPES; i++)
-	{
-		gsMatrixStackOffsets[i]=0;
-		gsMatrixStackUpdated[i]=true;
-		loadIdentity44((float*)gsMatrixStacks[i][0]);
-	}
-	gsCurrentMatrixType=GS_PROJECTION;
-}
-
-float* gsGetMatrix(GS_MATRIX m)
-{
-	if(m<0 || m>=GS_MATRIXTYPES)return NULL;
-	
-	return (float*)gsMatrixStacks[m][gsMatrixStackOffsets[m]];
-}
-
-int gsLoadMatrix(GS_MATRIX m, float* data)
-{
-	if(m<0 || m>=GS_MATRIXTYPES || !data)return -1;
-	
-	memcpy(gsGetMatrix(m), data, sizeof(mtx44));
-
-	gsMatrixStackUpdated[m]=true;
-
-	return 0;
-}
-
-int gsPushMatrix()
-{
-	const GS_MATRIX m=gsCurrentMatrixType;
-	if(m<0 || m>=GS_MATRIXTYPES)return -1;
-	if(gsMatrixStackOffsets[m]<0 || gsMatrixStackOffsets[m]>=GS_MATRIXSTACK_SIZE-1)return -1;
-
-	float* cur=gsGetMatrix(m);
-	gsMatrixStackOffsets[m]++;
-	memcpy(gsGetMatrix(m), cur, sizeof(mtx44));
-
-	return 0;
-}
-
-int gsPopMatrix()
-{
-	const GS_MATRIX m=gsCurrentMatrixType;
-	if(m<0 || m>=GS_MATRIXTYPES)return -1;
-	if(gsMatrixStackOffsets[m]<1 || gsMatrixStackOffsets[m]>=GS_MATRIXSTACK_SIZE)return -1;
-
-	gsMatrixStackOffsets[m]--;
-
-	gsMatrixStackUpdated[m]=true;
-
-	return 0;
-}
-
-int gsMatrixMode(GS_MATRIX m)
-{
-	if(m<0 || m>=GS_MATRIXTYPES)return -1;
-
-	gsCurrentMatrixType=m;
-
-	return 0;
-}
-
-//------------------------
-// MATRIX TRANSFORM STUFF
-//------------------------
-
-int gsMultMatrix(float* data)
-{
-	if(!data)return -1;
-	
-	mtx44 tmp;
-	multMatrix44(gsGetMatrix(gsCurrentMatrixType), data, (float*)tmp);
-	memcpy(gsGetMatrix(gsCurrentMatrixType), (float*)tmp, sizeof(mtx44));
-
-	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
-
-	return 0;
-}
-
-void gsLoadIdentity()
-{
-	loadIdentity44(gsGetMatrix(gsCurrentMatrixType));
-	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
-}
-
-void gsProjectionMatrix(float fovy, float aspect, float near, float far)
-{
-	initProjectionMatrix(gsGetMatrix(gsCurrentMatrixType), fovy, aspect, near, far);
-	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
-}
-
-void gsRotateX(float x)
-{
-	rotateMatrixX(gsGetMatrix(gsCurrentMatrixType), x, false);
-	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
-}
-
-void gsRotateY(float y)
-{
-	rotateMatrixY(gsGetMatrix(gsCurrentMatrixType), y, false);
-	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
-}
-
-void gsRotateZ(float z)
-{
-	rotateMatrixZ(gsGetMatrix(gsCurrentMatrixType), z, false);
-	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
-}
-
-void gsScale(float x, float y, float z)
-{
-	scaleMatrix(gsGetMatrix(gsCurrentMatrixType), x, y, z);
-	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
-}
-
-void gsTranslate(float x, float y, float z)
-{
-	translateMatrix(gsGetMatrix(gsCurrentMatrixType), x, y, z);
-	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
-}
-
-//----------------------
-// MATRIX RENDER STUFF
-//----------------------
-
-//uploads a 4x4 float matrix to 4 consecutive vertex shader float uniform registers
-//startreg : first uniform register to write
-//m : 16 floats; each group of 4 consecutive floats is copied with its
-//    component order reversed (index 3,2,1,0 -> 0,1,2,3) before upload
-static void gsSetUniformMatrix(u32 startreg, float* m)
-{
-	float param[16];
-
-	param[0x0]=m[3]; //w
-	param[0x1]=m[2]; //z
-	param[0x2]=m[1]; //y
-	param[0x3]=m[0]; //x
-
-	param[0x4]=m[7];
-	param[0x5]=m[6];
-	param[0x6]=m[5];
-	param[0x7]=m[4];
-	
-	param[0x8]=m[11];
-	param[0x9]=m[10];
-	param[0xa]=m[9];
-	param[0xb]=m[8];
-
-	param[0xc]=m[15];
-	param[0xd]=m[14];
-	param[0xe]=m[13];
-	param[0xf]=m[12];
-
-	//last argument is the number of 4-float registers written
-	GPU_SetFloatUniform(GPU_VERTEX_SHADER, startreg, (u32*)param, 4);
-}
-
-static int gsUpdateTransformation()
-{
-	GS_MATRIX m;
-	for(m=0; m<GS_MATRIXTYPES; m++)
-	{
-		if(gsMatrixStackUpdated[m])
-		{
-			if(m==GS_PROJECTION && bufferMatrixListLength<BUFFERMATRIXLIST_SIZE)
-			{
-				GPUCMD_GetBuffer(NULL, NULL, &bufferMatrixList[bufferMatrixListLength].offset);
-				memcpy(bufferMatrixList[bufferMatrixListLength].data, gsGetMatrix(m), sizeof(mtx44));
-				bufferMatrixListLength++;
-			}
-			gsSetUniformMatrix(gsMatrixStackRegisters[m], gsGetMatrix(m));
-			gsMatrixStackUpdated[m]=false;
-		}
-	}
-	return 0;
-}
-
-void gsAdjustBufferMatrices(mtx44 transformation)
-{
-	int i;
-	u32* buffer;
-	u32 offset;
-	GPUCMD_GetBuffer(&buffer, NULL, &offset);
-	for(i=0; i<bufferMatrixListLength; i++)
-	{
-		u32 o=bufferMatrixList[i].offset;
-		if(o+2<offset) //TODO : better check, need to account for param size
-		{
-			mtx44 newMatrix;
-			GPUCMD_SetBufferOffset(o);
-			multMatrix44((float*)bufferMatrixList[i].data, (float*)transformation, (float*)newMatrix);
-			gsSetUniformMatrix(gsMatrixStackRegisters[GS_PROJECTION], (float*)newMatrix);
-		}
-	}
-	GPUCMD_SetBufferOffset(offset);
-}
-
-//----------------------
-//      VBO STUFF
-//----------------------
-
-//resets every field of a VBO to its empty state (no data, no precomputed commands)
-//does not free anything : use gsVboDestroy for a VBO that owns memory
-//returns 0 on success, -1 if vbo is NULL
-int gsVboInit(gsVbo_s* vbo)
-{
-	if(!vbo)return -1;
-
-	vbo->data=NULL;
-	vbo->currentSize=0;
-	vbo->maxSize=0;
-	//also clear the vertex count so a destroyed/reinitialized VBO does not keep a stale value
-	vbo->numVertices=0;
-	vbo->commands=NULL;
-	vbo->commandsSize=0;
-
-	return 0;
-}
-
-//allocates size bytes of linear memory as backing storage for the VBO
-//returns 0 on success, -1 if vbo is NULL or the allocation failed
-//on failure the VBO is left empty (data NULL, maxSize 0) so later
-//gsVboAddData/gsVboDraw calls fail cleanly instead of touching NULL
-int gsVboCreate(gsVbo_s* vbo, u32 size)
-{
-	if(!vbo)return -1;
-
-	vbo->numVertices=0;
-	vbo->currentSize=0;
-	vbo->maxSize=0;
-
-	vbo->data=gsLinearAlloc(size);
-	if(!vbo->data)return -1;
-
-	//only advertise capacity once the buffer actually exists
-	vbo->maxSize=size;
-
-	return 0;
-}
-
-void* gsVboGetOffset(gsVbo_s* vbo)
-{
-	if(!vbo)return NULL;
-
-	return (void*)(&((u8*)vbo->data)[vbo->currentSize]);
-}
-
-//appends size bytes from data to the VBO and adds units to its vertex count
-//returns 0 on success, -1 on bad arguments, missing backing buffer or insufficient space
-int gsVboAddData(gsVbo_s* vbo, void* data, u32 size, u32 units)
-{
-	if(!vbo || !vbo->data || !data || !size)return -1;
-	//pure unsigned capacity check : avoids the signed/unsigned mix of (s32)max-(s32)cur < (u32)size
-	if(vbo->currentSize>vbo->maxSize)return -1;
-	if(size>vbo->maxSize-vbo->currentSize)return -1;
-
-	memcpy(gsVboGetOffset(vbo), data, size);
-	vbo->currentSize+=size;
-	vbo->numVertices+=units;
-
-	return 0;
-}
-
-int gsVboFlushData(gsVbo_s* vbo)
-{
-	if(!vbo)return -1;
-
-	//unnecessary if we use flushAndRun
-	// GSPGPU_FlushDataCache(NULL, vbo->data, vbo->currentSize);
-
-	return 0;
-}
-
-int gsVboDestroy(gsVbo_s* vbo)
-{
-	if(!vbo)return -1;
-
-	if(vbo->commands)free(vbo->commands);
-	if(vbo->data)gsLinearFree(vbo->data);
-	gsVboInit(vbo);
-
-	return 0;
-}
-
-extern u32 debugValue[];
-
-void GPU_DrawArrayDirectly(GPU_Primitive_t primitive, u8* data, u32 n)
-{
-	//set attribute buffer address
-	GPUCMD_AddSingleParam(0x000F0200, (osConvertVirtToPhys((u32)data))>>3);
-	//set primitive type
-	GPUCMD_AddSingleParam(0x0002025E, primitive);
-	GPUCMD_AddSingleParam(0x0002025F, 0x00000001);
-	//index buffer not used for drawArrays but 0x000F0227 still required
-	GPUCMD_AddSingleParam(0x000F0227, 0x80000000);
-	//pass number of vertices
-	GPUCMD_AddSingleParam(0x000F0228, n);
-
-	GPUCMD_AddSingleParam(0x00010253, 0x00000001);
-
-	GPUCMD_AddSingleParam(0x00010245, 0x00000000);
-	GPUCMD_AddSingleParam(0x000F022E, 0x00000001);
-	GPUCMD_AddSingleParam(0x00010245, 0x00000001);
-	GPUCMD_AddSingleParam(0x000F0231, 0x00000001);
-
-	// GPUCMD_AddSingleParam(0x000F0111, 0x00000001); //breaks stuff
-}
-
-//records the GPU commands needed to draw the VBO into a private heap copy (vbo->commands)
-//so that later draws can append them to the command buffer with a plain copy
-//not thread safe (uses a static scratch buffer and temporarily swaps the global GPU command buffer)
-//returns 0 on success, -1 if vbo is NULL, commands were already computed, or allocation failed
-int gsVboPrecomputeCommands(gsVbo_s* vbo)
-{
-	if(!vbo || vbo->commands)return -1;
-
-	static u32 tmpBuffer[128];
-
-	//redirect command generation to the scratch buffer
-	u32* savedAdr; u32 savedSize, savedOffset;
-	GPUCMD_GetBuffer(&savedAdr, &savedSize, &savedOffset);
-	GPUCMD_SetBuffer(tmpBuffer, 128, 0);
-
-	GPU_DrawArrayDirectly(GPU_TRIANGLES, vbo->data, vbo->numVertices);
-	
-	//number of words written to the scratch buffer
-	u32 recordedSize=0;
-	GPUCMD_GetBuffer(NULL, NULL, &recordedSize);
-
-	//restore the real command buffer *before* any early return, otherwise a failed
-	//allocation would leave all subsequent GPU commands going to tmpBuffer
-	GPUCMD_SetBuffer(savedAdr, savedSize, savedOffset);
-
-	vbo->commands=memalign(0x4, recordedSize*4);
-	if(!vbo->commands)
-	{
-		vbo->commandsSize=0;
-		return -1;
-	}
-	memcpy(vbo->commands, tmpBuffer, recordedSize*4);
-	vbo->commandsSize=recordedSize;
-
-	return 0;
-}
-
-extern u32* gpuCmdBuf;
-extern u32 gpuCmdBufSize;
-extern u32 gpuCmdBufOffset;
-
-void _vboMemcpy50(u32* dst, u32* src);
-
-void _GPUCMD_AddRawCommands(u32* cmd, u32 size)
-{
-	if(!cmd || !size)return;
-
-	if(size*4==0x50)_vboMemcpy50(&gpuCmdBuf[gpuCmdBufOffset], cmd);
-	else memcpy(&gpuCmdBuf[gpuCmdBufOffset], cmd, size*4);
-	gpuCmdBufOffset+=size;
-}
-
-int gsVboDraw(gsVbo_s* vbo)
-{
-	if(!vbo || !vbo->data || !vbo->currentSize || !vbo->maxSize)return -1;
-
-	gsUpdateTransformation();
-
-	gsVboPrecomputeCommands(vbo);
-
-	// u64 val=svcGetSystemTick();
-	if(vbo->commands)
-	{
-		_GPUCMD_AddRawCommands(vbo->commands, vbo->commandsSize);
-	}else{
-		GPU_DrawArrayDirectly(GPU_TRIANGLES, vbo->data, vbo->numVertices);
-	}
-	// debugValue[5]+=(u32)(svcGetSystemTick()-val);
-	// debugValue[6]++;
-
-	return 0;
-}
--- a/examples/gpu/source/gs.h
+++ b/examples/gpu/source/gs.h
@ -1,59 +0,0 @@
-#ifndef GS_H
-#define GS_H
-
-#include <3ds.h>
-#include "math.h"
-
-#define GS_MATRIXSTACK_SIZE (8)
-
-typedef enum
-{
-	GS_PROJECTION = 0,
-	GS_MODELVIEW = 1,
-	GS_MATRIXTYPES
-}GS_MATRIX;
-
-typedef struct
-{
-	u8* data;
-	u32 currentSize; // in bytes
-	u32 maxSize; // in bytes
-	u32 numVertices;
-	u32* commands;
-	u32 commandsSize;
-}gsVbo_s;
-
-
-void gsInit(shaderProgram_s* shader);
-void gsExit(void);
-
-void gsStartFrame(void);
-void gsAdjustBufferMatrices(mtx44 transformation);
-
-void* gsLinearAlloc(size_t size);
-void gsLinearFree(void* mem);
-
-float* gsGetMatrix(GS_MATRIX m);
-int gsLoadMatrix(GS_MATRIX m, float* data);
-int gsPushMatrix();
-int gsPopMatrix();
-int gsMatrixMode(GS_MATRIX m);
-
-void gsLoadIdentity();
-void gsProjectionMatrix(float fovy, float aspect, float near, float far);
-void gsRotateX(float x);
-void gsRotateY(float y);
-void gsRotateZ(float z);
-void gsScale(float x, float y, float z);
-void gsTranslate(float x, float y, float z);
-int gsMultMatrix(float* data);
-
-int gsVboInit(gsVbo_s* vbo);
-int gsVboCreate(gsVbo_s* vbo, u32 size);
-int gsVboFlushData(gsVbo_s* vbo);
-int gsVboDestroy(gsVbo_s* vbo);
-int gsVboDraw(gsVbo_s* vbo);
-void* gsVboGetOffset(gsVbo_s* vbo);
-int gsVboAddData(gsVbo_s* vbo, void* data, u32 size, u32 units);
-
-#endif
--- a/examples/gpu/source/main.c
+++ b/examples/gpu/source/main.c
@ -1,354 +0,0 @@
-///////////////////////////////////////
-//            GPU example            //
-///////////////////////////////////////
-
-//this example is meant to show how to use the GPU to render a 3D object
-//it also shows how to do stereoscopic 3D
-//it uses GS which is a WIP GPU abstraction layer that's currently part of 3DScraft
-//keep in mind GPU reverse engineering is an ongoing effort and our understanding of it is still fairly limited.
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <3ds.h>
-
-#include "math.h"
-#include "gs.h"
-
-#include "test_vsh_shbin.h"
-#include "texture_bin.h"
-
-//will be moved into ctrulib at some point
-//read through a volatile pointer : the value at this fixed address is re-read every frame
-//and must not be cached or hoisted out of the main loop by the compiler
-#define CONFIG_3D_SLIDERSTATE (*(volatile float*)0x1FF81080)
-
-//packs four 8-bit channels into one 32-bit value, r in the top byte and a in the bottom byte
-//operands are converted to u32 before shifting so that r>=0x80 does not shift into the sign bit of an int
-#define RGBA8(r,g,b,a) ((((u32)(r)&0xFF)<<24) | (((u32)(g)&0xFF)<<16) | (((u32)(b)&0xFF)<<8) | (((u32)(a)&0xFF)<<0))
-
-//transfer from GPU output buffer to actual framebuffer flags
-#define DISPLAY_TRANSFER_FLAGS \
-	(GX_TRANSFER_FLIP_VERT(0) | GX_TRANSFER_OUT_TILED(0) | GX_TRANSFER_RAW_COPY(0) | \
-	 GX_TRANSFER_IN_FORMAT(GX_TRANSFER_FMT_RGBA8) | GX_TRANSFER_OUT_FORMAT(GX_TRANSFER_FMT_RGB8) | \
-	 GX_TRANSFER_SCALING(GX_TRANSFER_SCALE_X))
-
-//shader structure
-DVLB_s* dvlb;
-shaderProgram_s shader;
-//texture data pointer
-u32* texData;
-//vbo structure
-gsVbo_s vbo;
-
-//GPU framebuffer address
-u32* gpuOut=(u32*)0x1F119400;
-//GPU depth buffer address
-u32* gpuDOut=(u32*)0x1F370800;
-
-//angle for the vertex lighting (cf test.vsh)
-float lightAngle;
-//object position and rotation angle
-vect3Df_s position, angle;
-
-//vertex structure
-typedef struct
-{
-	vect3Df_s position;
-	float texcoord[2];
-	vect3Df_s normal;
-}vertex_s;
-
-//object data (cube)
-//obviously this doesn't have to be defined manually, but we will here for the purposes of the example
-//each line is a vertex : {position.x, position.y, position.z}, {texcoord.t, texcoord.s}, {normal.x, normal.y, normal.z}
-//we're drawing triangles so three lines = one triangle
-const vertex_s modelVboData[]=
-{
-	//first face (PZ)
-		//first triangle
-		{(vect3Df_s){-0.5f, -0.5f, +0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
-		{(vect3Df_s){+0.5f, -0.5f, +0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
-		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
-		//second triangle
-		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
-		{(vect3Df_s){-0.5f, +0.5f, +0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
-		{(vect3Df_s){-0.5f, -0.5f, +0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
-	//second face (MZ)
-		//first triangle
-		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
-		{(vect3Df_s){-0.5f, +0.5f, -0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
-		{(vect3Df_s){+0.5f, +0.5f, -0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
-		//second triangle
-		{(vect3Df_s){+0.5f, +0.5f, -0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
-		{(vect3Df_s){+0.5f, -0.5f, -0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
-		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
-	//third face (PX)
-		//first triangle
-		{(vect3Df_s){+0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
-		{(vect3Df_s){+0.5f, +0.5f, -0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
-		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
-		//second triangle
-		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
-		{(vect3Df_s){+0.5f, -0.5f, +0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
-		{(vect3Df_s){+0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
-	//fourth face (MX)
-		//first triangle
-		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
-		{(vect3Df_s){-0.5f, -0.5f, +0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
-		{(vect3Df_s){-0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
-		//second triangle
-		{(vect3Df_s){-0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
-		{(vect3Df_s){-0.5f, +0.5f, -0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
-		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
-	//fifth face (PY)
-		//first triangle
-		{(vect3Df_s){-0.5f, +0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
-		{(vect3Df_s){-0.5f, +0.5f, +0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
-		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
-		//second triangle
-		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
-		{(vect3Df_s){+0.5f, +0.5f, -0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
-		{(vect3Df_s){-0.5f, +0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
-	//sixth face (MY)
-		//first triangle
-		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
-		{(vect3Df_s){+0.5f, -0.5f, -0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
-		{(vect3Df_s){+0.5f, -0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
-		//second triangle
-		{(vect3Df_s){+0.5f, -0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
-		{(vect3Df_s){-0.5f, -0.5f, +0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
-		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
-};
-
-//stolen from staplebutt
-void GPU_SetDummyTexEnv(u8 num)
-{
-	GPU_SetTexEnv(num,
-		GPU_TEVSOURCES(GPU_PREVIOUS, 0, 0),
-		GPU_TEVSOURCES(GPU_PREVIOUS, 0, 0),
-		GPU_TEVOPERANDS(0,0,0),
-		GPU_TEVOPERANDS(0,0,0),
-		GPU_REPLACE,
-		GPU_REPLACE,
-		0xFFFFFFFF);
-}
-
-// topscreen
-void renderFrame()
-{
-	GPU_SetViewport((u32*)osConvertVirtToPhys((u32)gpuDOut),(u32*)osConvertVirtToPhys((u32)gpuOut),0,0,240*2,400);
-
-	GPU_DepthMap(-1.0f, 0.0f);
-	GPU_SetFaceCulling(GPU_CULL_BACK_CCW);
-	GPU_SetStencilTest(false, GPU_ALWAYS, 0x00, 0xFF, 0x00);
-	GPU_SetStencilOp(GPU_KEEP, GPU_KEEP, GPU_KEEP);
-	GPU_SetBlendingColor(0,0,0,0);
-	GPU_SetDepthTestAndWriteMask(true, GPU_GREATER, GPU_WRITE_ALL);
-
-	GPUCMD_AddMaskedWrite(GPUREG_0062, 0x1, 0);
-	GPUCMD_AddWrite(GPUREG_0118, 0);
-
-	GPU_SetAlphaBlending(GPU_BLEND_ADD, GPU_BLEND_ADD, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA);
-	GPU_SetAlphaTest(false, GPU_ALWAYS, 0x00);
-
-	GPU_SetTextureEnable(GPU_TEXUNIT0);
-
-	GPU_SetTexEnv(0,
-		GPU_TEVSOURCES(GPU_TEXTURE0, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR),
-		GPU_TEVSOURCES(GPU_TEXTURE0, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR),
-		GPU_TEVOPERANDS(0,0,0),
-		GPU_TEVOPERANDS(0,0,0),
-		GPU_MODULATE, GPU_MODULATE,
-		0xFFFFFFFF);
-	GPU_SetDummyTexEnv(1);
-	GPU_SetDummyTexEnv(2);
-	GPU_SetDummyTexEnv(3);
-	GPU_SetDummyTexEnv(4);
-	GPU_SetDummyTexEnv(5);
-
-	//texturing stuff
-		GPU_SetTexture(
-			GPU_TEXUNIT0, //texture unit
-			(u32*)osConvertVirtToPhys((u32)texData), //data buffer
-			128, //texture width
-			128, //texture height
-			GPU_TEXTURE_MAG_FILTER(GPU_NEAREST) | GPU_TEXTURE_MIN_FILTER(GPU_NEAREST), //texture params
-			GPU_RGBA8 //texture pixel format
-		);
-
-		GPU_SetAttributeBuffers(
-			3, //3 attributes: vertices, texcoords, and normals
-			(u32*)osConvertVirtToPhys((u32)texData), //mesh buffer
-			GPU_ATTRIBFMT(0, 3, GPU_FLOAT) | // GPU Input attribute register 0 (v0): 3 floats (position)
-			GPU_ATTRIBFMT(1, 2, GPU_FLOAT) | // GPU Input attribute register 1 (v1): 2 floats (texcoord)
-			GPU_ATTRIBFMT(2, 3, GPU_FLOAT),  // GPU Input attribute register 2 (v2): 3 floats (normal)
-			0xFFC,
-			0x210,
-			1,
-			(u32[]){0x00000000},
-			(u64[]){0x210},
-			(u8[]){3}
-		);
-
-	//setup lighting (this is specific to our shader)
-		vect3Df_s lightDir=vnormf(vect3Df(cos(lightAngle), -1.0f, sin(lightAngle)));
-		GPU_SetFloatUniform(GPU_VERTEX_SHADER, shaderInstanceGetUniformLocation(shader.vertexShader, "lightDirection"), (u32*)(float[]){0.0f, -lightDir.z, -lightDir.y, -lightDir.x}, 1);
-		GPU_SetFloatUniform(GPU_VERTEX_SHADER, shaderInstanceGetUniformLocation(shader.vertexShader, "lightAmbient"), (u32*)(float[]){0.7f, 0.4f, 0.4f, 0.4f}, 1);
-
-	//initialize projection matrix to standard perspective stuff
-	gsMatrixMode(GS_PROJECTION);
-	gsProjectionMatrix(80.0f*M_PI/180.0f, 240.0f/400.0f, 0.01f, 100.0f);
-	gsRotateZ(M_PI/2); //because framebuffer is sideways...
-
-	//draw object
-		gsMatrixMode(GS_MODELVIEW);
-		gsPushMatrix();
-			gsTranslate(position.x, position.y, position.z);
-			gsRotateX(angle.x);
-			gsRotateY(angle.y);
-			gsVboDraw(&vbo);
-		gsPopMatrix();
-	GPU_FinishDrawing();
-}
-
-int main(int argc, char** argv)
-{
-
-	gfxInitDefault();
-
-	//initialize GPU
-	GPU_Init(NULL);
-
-	//let GFX know we're ok with doing stereoscopic 3D rendering
-	gfxSet3D(true);
-
-	//allocate our GPU command buffers
-	//they *have* to be on the linear heap
-	u32 gpuCmdSize=0x40000;
-	u32* gpuCmd=(u32*)linearAlloc(gpuCmdSize*4);
-	u32* gpuCmdRight=(u32*)linearAlloc(gpuCmdSize*4);
-
-	//actually reset the GPU
-	GPU_Reset(NULL, gpuCmd, gpuCmdSize);
-
-	//load our vertex shader binary
-	dvlb=DVLB_ParseFile((u32*)test_vsh_shbin, test_vsh_shbin_size);
-	shaderProgramInit(&shader);
-	shaderProgramSetVsh(&shader, &dvlb->DVLE[0]);
-
-	//initialize GS
-	gsInit(&shader);
-
-	// Flush the command buffer so that the shader upload gets executed
-	GPUCMD_Finalize();
-	GPUCMD_FlushAndRun(NULL);
-	gspWaitForP3D();
-
-	//create texture
-	texData=(u32*)linearMemAlign(texture_bin_size, 0x80); //textures need to be 0x80-byte aligned
-	memcpy(texData, texture_bin, texture_bin_size);
-
-	//create VBO
-	gsVboInit(&vbo);
-	gsVboCreate(&vbo, sizeof(modelVboData));
-	gsVboAddData(&vbo, (void*)modelVboData, sizeof(modelVboData), sizeof(modelVboData)/sizeof(vertex_s));
-	gsVboFlushData(&vbo);
-
-	//initialize object position and angle
-	position=vect3Df(0.0f, 0.0f, -2.0f);
-	angle=vect3Df(M_PI/4, M_PI/4, 0.0f);
-
-	//background color (blue)
-	u32 backgroundColor=RGBA8(0x68, 0xB0, 0xD8, 0xFF);
-
-	while(aptMainLoop())
-	{
-		//get current 3D slider state
-		float slider=CONFIG_3D_SLIDERSTATE;
-
-		//controls
-		hidScanInput();
-		//START to exit to hbmenu
-		if(keysDown()&KEY_START)break;
-
-		//A/B to change vertex lighting angle
-		if(keysHeld()&KEY_A)lightAngle+=0.1f;
-		if(keysHeld()&KEY_B)lightAngle-=0.1f;
-
-		//D-PAD to rotate object
-		if(keysHeld()&KEY_DOWN)angle.x+=0.05f;
-		if(keysHeld()&KEY_UP)angle.x-=0.05f;
-		if(keysHeld()&KEY_LEFT)angle.y+=0.05f;
-		if(keysHeld()&KEY_RIGHT)angle.y-=0.05f;
-
-		//R/L to bring object closer to or move it further from the camera
-		if(keysHeld()&KEY_R)position.z+=0.1f;
-		if(keysHeld()&KEY_L)position.z-=0.1f;
-
-		//generate our GPU command buffer for this frame
-		gsStartFrame();
-		renderFrame();
-		GPUCMD_Finalize();
-
-		if(slider>0.0f)
-		{
-			//new and exciting 3D !
-			//make a copy of left gpu buffer
-			u32 offset; GPUCMD_GetBuffer(NULL, NULL, &offset);
-			memcpy(gpuCmdRight, gpuCmd, offset*4);
-
-			//setup interaxial
-			float interaxial=slider*0.12f;
-
-			//adjust left gpu buffer for 3D !
-			{mtx44 m; loadIdentity44((float*)m); translateMatrix((float*)m, -interaxial*0.5f, 0.0f, 0.0f); gsAdjustBufferMatrices(m);}
-
-			//draw left framebuffer
-			GPUCMD_FlushAndRun(NULL);
-
-			//while GPU starts drawing the left buffer, adjust right one for 3D !
-			GPUCMD_SetBuffer(gpuCmdRight, gpuCmdSize, offset);
-			{mtx44 m; loadIdentity44((float*)m); translateMatrix((float*)m, interaxial*0.5f, 0.0f, 0.0f); gsAdjustBufferMatrices(m);}
-
-			//we wait for the left buffer to finish drawing
-			gspWaitForP3D();
-			GX_SetDisplayTransfer(NULL, (u32*)gpuOut, GX_BUFFER_DIM(240*2, 400), (u32*)gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), GX_BUFFER_DIM(240*2, 400), DISPLAY_TRANSFER_FLAGS);
-			gspWaitForPPF();
-
-			//we draw the right buffer, wait for it to finish and then switch back to left one
-			//clear the screen
-			GX_SetMemoryFill(NULL, (u32*)gpuOut, backgroundColor, (u32*)&gpuOut[0x2EE00], GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH , (u32*)gpuDOut, 0x00000000, (u32*)&gpuDOut[0x2EE00], GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH);
-			gspWaitForPSC0();
-
-			//draw the right framebuffer
-			GPUCMD_FlushAndRun(NULL);
-			gspWaitForP3D();
-
-			//transfer from GPU output buffer to actual framebuffer
-			GX_SetDisplayTransfer(NULL, (u32*)gpuOut, GX_BUFFER_DIM(240*2, 400), (u32*)gfxGetFramebuffer(GFX_TOP, GFX_RIGHT, NULL, NULL), GX_BUFFER_DIM(240*2, 400), DISPLAY_TRANSFER_FLAGS);
-			gspWaitForPPF();
-			GPUCMD_SetBuffer(gpuCmd, gpuCmdSize, 0);
-		}else{
-			//boring old 2D !
-
-			//draw the frame
-			GPUCMD_FlushAndRun(NULL);
-			gspWaitForP3D();
-
-			//clear the screen
-			GX_SetDisplayTransfer(NULL, (u32*)gpuOut, GX_BUFFER_DIM(240*2, 400), (u32*)gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), GX_BUFFER_DIM(240*2, 400), DISPLAY_TRANSFER_FLAGS);
-			gspWaitForPPF();
-		}
-
-		//clear the screen
-		GX_SetMemoryFill(NULL, (u32*)gpuOut, backgroundColor, (u32*)&gpuOut[0x2EE00], GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH, (u32*)gpuDOut, 0x00000000, (u32*)&gpuDOut[0x2EE00], GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH);
-		gspWaitForPSC0();
-		gfxSwapBuffersGpu();
-
-		gspWaitForEvent(GSPEVENT_VBlank0, true);
-	}
-
-	gsExit();
-	shaderProgramFree(&shader);
-	DVLB_Free(dvlb);
-	gfxExit();
-	return 0;
-}
--- a/examples/gpu/source/math.c
+++ b/examples/gpu/source/math.c
@ -1,148 +0,0 @@
-#include <math.h>
-#include <string.h>
-
-#include "math.h"
-
-void loadIdentity44(float* m)
-{
-	if(!m)return;
-
-	memset(m, 0x00, 16*4);
-	m[0]=m[5]=m[10]=m[15]=1.0f;
-}
-
-//multiplies two 4x4 matrices stored as 16 consecutive floats
-//with index = column + row*4, this writes m[i+j*4] = sum over k of m1[k+j*4]*m2[i+k*4],
-//i.e. element (row j, column i) of m1*m2
-//m must not overlap m1 or m2 : elements of m are written while m1 and m2 are still being read
-void multMatrix44(float* m1, float* m2, float* m) //4x4
-{
-	int i, j;
-	for(i=0;i<4;i++)for(j=0;j<4;j++)m[i+j*4]=(m1[0+j*4]*m2[i+0*4])+(m1[1+j*4]*m2[i+1*4])+(m1[2+j*4]*m2[i+2*4])+(m1[3+j*4]*m2[i+3*4]);
-
-}
-
-void translateMatrix(float* tm, float x, float y, float z)
-{
-	float rm[16], m[16];
-
-	loadIdentity44(rm);
-	rm[3]=x;
-	rm[7]=y;
-	rm[11]=z;
-	
-	multMatrix44(tm,rm,m);
-	memcpy(tm,m,16*sizeof(float));
-}
-
-// 00 01 02 03
-// 04 05 06 07
-// 08 09 10 11
-// 12 13 14 15
-
-void rotateMatrixX(float* tm, float x, bool r)
-{
-	float rm[16], m[16];
-	memset(rm, 0x00, 16*4);
-	rm[0]=1.0f;
-	rm[5]=cos(x);
-	rm[6]=sin(x);
-	rm[9]=-sin(x);
-	rm[10]=cos(x);
-	rm[15]=1.0f;
-	if(!r)multMatrix44(tm,rm,m);
-	else multMatrix44(rm,tm,m);
-	memcpy(tm,m,16*sizeof(float));
-}
-
-void rotateMatrixY(float* tm, float x, bool r)
-{
-	float rm[16], m[16];
-	memset(rm, 0x00, 16*4);
-	rm[0]=cos(x);
-	rm[2]=sin(x);
-	rm[5]=1.0f;
-	rm[8]=-sin(x);
-	rm[10]=cos(x);
-	rm[15]=1.0f;
-	if(!r)multMatrix44(tm,rm,m);
-	else multMatrix44(rm,tm,m);
-	memcpy(tm,m,16*sizeof(float));
-}
-
-void rotateMatrixZ(float* tm, float x, bool r)
-{
-	float rm[16], m[16];
-	memset(rm, 0x00, 16*4);
-	rm[0]=cos(x);
-	rm[1]=sin(x);
-	rm[4]=-sin(x);
-	rm[5]=cos(x);
-	rm[10]=1.0f;
-	rm[15]=1.0f;
-	if(!r)multMatrix44(tm,rm,m);
-	else multMatrix44(rm,tm,m);
-	memcpy(tm,m,16*sizeof(float));
-}
-
-void scaleMatrix(float* tm, float x, float y, float z)
-{
-	tm[0]*=x; tm[4]*=x; tm[8]*=x; tm[12]*=x;
-	tm[1]*=y; tm[5]*=y; tm[9]*=y; tm[13]*=y;
-	tm[2]*=z; tm[6]*=z; tm[10]*=z; tm[14]*=z;
-}
-
-void initProjectionMatrix(float* m, float fovy, float aspect, float near, float far)
-{
-	float top = near*tan(fovy/2);
-	float right = (top*aspect);
-
-	float mp[4*4];
-	
-	mp[0x0] = near/right;
-	mp[0x1] = 0.0f;
-	mp[0x2] = 0.0f;
-	mp[0x3] = 0.0f;
-
-	mp[0x4] = 0.0f;
-	mp[0x5] = near/top;
-	mp[0x6] = 0.0f;
-	mp[0x7] = 0.0f;
-
-	mp[0x8] = 0.0f;
-	mp[0x9] = 0.0f;
-	mp[0xA] = -(far+near)/(far-near);
-	mp[0xB] = -2.0f*(far*near)/(far-near);
-
-	mp[0xC] = 0.0f;
-	mp[0xD] = 0.0f;
-	mp[0xE] = -1.0f;
-	mp[0xF] = 0.0f;
-
-	float mp2[4*4];
-	loadIdentity44(mp2);
-	mp2[0xA]=0.5;
-	mp2[0xB]=-0.5;
-
-	multMatrix44(mp2, mp, m);
-}
-
-vect3Df_s getMatrixColumn(float* m, u8 i)
-{
-	if(!m || i>=4)return vect3Df(0,0,0);
-	return vect3Df(m[0+i*4],m[1+i*4],m[2+i*4]);
-}
-
-vect3Df_s getMatrixRow(float* m, u8 i)
-{
-	if(!m || i>=4)return vect3Df(0,0,0);
-	return vect3Df(m[i+0*4],m[i+1*4],m[i+2*4]);
-}
-
-vect4Df_s getMatrixColumn4(float* m, u8 i)
-{
-	if(!m || i>=4)return vect4Df(0,0,0,0);
-	return vect4Df(m[0+i*4],m[1+i*4],m[2+i*4],m[3+i*4]);
-}
-
-vect4Df_s getMatrixRow4(float* m, u8 i)
-{
-	if(!m || i>=4)return vect4Df(0,0,0,0);
-	return vect4Df(m[i+0*4],m[i+1*4],m[i+2*4],m[i+3*4]);
-}
--- a/examples/gpu/source/math.h
+++ b/examples/gpu/source/math.h
@ -1,144 +0,0 @@
-#ifndef MATH_H
-#define MATH_H
-
-#include <3ds/types.h>
-#include <math.h>
-
-typedef float mtx44[4][4];
-typedef float mtx33[3][3];
-
-typedef struct
-{
-	s32 x, y, z;
-}vect3Di_s;
-
-static inline vect3Di_s vect3Di(s32 x, s32 y, s32 z)
-{
-	return (vect3Di_s){x,y,z};
-}
-
-static inline vect3Di_s vaddi(vect3Di_s u, vect3Di_s v)
-{
-	return (vect3Di_s){u.x+v.x,u.y+v.y,u.z+v.z};
-}
-
-static inline vect3Di_s vsubi(vect3Di_s u, vect3Di_s v)
-{
-	return (vect3Di_s){u.x-v.x,u.y-v.y,u.z-v.z};
-}
-
-static inline vect3Di_s vmuli(vect3Di_s v, s32 f)
-{
-	return (vect3Di_s){v.x*f,v.y*f,v.z*f};
-}
-
-typedef struct
-{
-	float x, y, z;
-}vect3Df_s;
-
-static inline vect3Df_s vect3Df(float x, float y, float z)
-{
-	return (vect3Df_s){x,y,z};
-}
-
-static inline vect3Df_s vaddf(vect3Df_s u, vect3Df_s v)
-{
-	return (vect3Df_s){u.x+v.x,u.y+v.y,u.z+v.z};
-}
-
-static inline vect3Df_s vsubf(vect3Df_s u, vect3Df_s v)
-{
-	return (vect3Df_s){u.x-v.x,u.y-v.y,u.z-v.z};
-}
-
-static inline vect3Df_s vmulf(vect3Df_s v, float f)
-{
-	return (vect3Df_s){v.x*f,v.y*f,v.z*f};
-}
-
-static inline vect3Df_s vscalef(vect3Df_s v1, vect3Df_s v2)
-{
-	return (vect3Df_s){v1.x*v2.x,v1.y*v2.y,v1.z*v2.z};
-}
-
-static inline float vmagf(vect3Df_s v)
-{
-	return sqrtf(v.x*v.x+v.y*v.y+v.z*v.z);
-}
-
-static inline float vdistf(vect3Df_s v1, vect3Df_s v2)
-{
-	return sqrtf((v1.x-v2.x)*(v1.x-v2.x)+(v1.y-v2.y)*(v1.y-v2.y)+(v1.z-v2.z)*(v1.z-v2.z));
-}
-
-static inline vect3Df_s vnormf(vect3Df_s v)
-{
-	const float l=sqrtf(v.x*v.x+v.y*v.y+v.z*v.z);
-	return (vect3Df_s){v.x/l,v.y/l,v.z/l};
-}
-
-typedef struct
-{
-	float x, y, z, w;
-}vect4Df_s;
-
-static inline vect4Df_s vect4Df(float x, float y, float z, float w)
-{
-	return (vect4Df_s){x,y,z,w};
-}
-
-static inline vect4Df_s vaddf4(vect4Df_s u, vect4Df_s v)
-{
-	return (vect4Df_s){u.x+v.x,u.y+v.y,u.z+v.z,u.w+v.w};
-}
-
-static inline vect4Df_s vsubf4(vect4Df_s u, vect4Df_s v)
-{
-	return (vect4Df_s){u.x-v.x,u.y-v.y,u.z-v.z,u.w-v.w};
-}
-
-static inline vect4Df_s vmulf4(vect4Df_s v, float f)
-{
-	return (vect4Df_s){v.x*f,v.y*f,v.z*f,v.w*f};
-}
-
-static inline float vdotf4(vect4Df_s v1, vect4Df_s v2)
-{
-	return v1.x*v2.x+v1.y*v2.y+v1.z*v2.z+v1.w*v2.w;
-}
-
-static inline vect4Df_s vnormf4(vect4Df_s v)
-{
-	const float l=sqrtf(v.x*v.x+v.y*v.y+v.z*v.z+v.w*v.w);
-	return (vect4Df_s){v.x/l,v.y/l,v.z/l,v.w/l};
-}
-
-//interstuff
-static inline vect3Di_s vf2i(vect3Df_s v)
-{
-	return (vect3Di_s){floorf(v.x),floorf(v.y),floorf(v.z)};
-}
-
-static inline vect3Df_s vi2f(vect3Di_s v)
-{
-	return (vect3Df_s){(float)v.x,(float)v.y,(float)v.z};
-}
-
-void loadIdentity44(float* m);
-void multMatrix44(float* m1, float* m2, float* m);
-
-void translateMatrix(float* tm, float x, float y, float z);
-void rotateMatrixX(float* tm, float x, bool r);
-void rotateMatrixY(float* tm, float x, bool r);
-void rotateMatrixZ(float* tm, float x, bool r);
-void scaleMatrix(float* tm, float x, float y, float z);
-
-void initProjectionMatrix(float* m, float fovy, float aspect, float near, float far);
-
-vect3Df_s getMatrixColumn(float* m, u8 i);
-vect3Df_s getMatrixRow(float* m, u8 i);
-vect4Df_s getMatrixColumn4(float* m, u8 i);
-vect4Df_s getMatrixRow4(float* m, u8 i);
-
-#endif
--- a/examples/graphics/gpu/geoshader/Makefile
+++ b/examples/graphics/gpu/geoshader/Makefile
@ -75,6 +75,7 @@ export DEPSDIR	:=	$(CURDIR)/$(BUILD)
 CFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.c)))
 CPPFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.cpp)))
 SFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.s)))
+PICAFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.pica)))
 BINFILES	:=	$(foreach dir,$(DATA),$(notdir $(wildcard $(dir)/*.*)))

 #---------------------------------------------------------------------------------
@ -91,7 +92,7 @@ else
 endif
 #---------------------------------------------------------------------------------

-export OFILES	:=	$(addsuffix .o,$(BINFILES)) \
+export OFILES	:=	$(addsuffix .o,$(BINFILES)) $(PICAFILES:.pica=.shbin.o) \
 			$(CPPFILES:.cpp=.o) $(CFILES:.c=.o) $(SFILES:.s=.o)

 export INCLUDE	:=	$(foreach dir,$(INCLUDES),-I$(CURDIR)/$(dir)) \
@ -156,17 +157,18 @@ $(OUTPUT).elf	:	$(OFILES)
 	@echo $(notdir $<)
 	@$(bin2o)

-# WARNING: This is not the right way to do this! TODO: Do it right!
 #---------------------------------------------------------------------------------
-%_vsh.h %.vsh.o	:	%.vsh
+# rule for assembling GPU shaders
 #---------------------------------------------------------------------------------
+%.shbin.o: %.pica
 	@echo $(notdir $<)
-	@python3 $(AEMSTRO)/aemstro_as.py $< ../$(notdir $<).shbin
-	@bin2s ../$(notdir $<).shbin | $(PREFIX)as -o $@
-	@echo "extern const u8" `(echo $(notdir $<).shbin | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"_end[];" > `(echo $(notdir $<).shbin | tr . _)`.h
-	@echo "extern const u8" `(echo $(notdir $<).shbin | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"[];" >> `(echo $(notdir $<).shbin | tr . _)`.h
-	@echo "extern const u32" `(echo $(notdir $<).shbin | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`_size";" >> `(echo $(notdir $<).shbin | tr . _)`.h
-	@rm ../$(notdir $<).shbin
+	$(eval CURBIN := $(patsubst %.pica,%.shbin,$(notdir $<)))
+	$(eval CURH := $(patsubst %.pica,%.psh.h,$(notdir $<)))
+	@picasso $(CURBIN) $< $(CURH)
+	@bin2s $(CURBIN) | $(AS) -o $@
+	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"_end[];" > `(echo $(CURBIN) | tr . _)`.h
+	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"[];" >> `(echo $(CURBIN) | tr . _)`.h
+	@echo "extern const u32" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`_size";" >> `(echo $(CURBIN) | tr . _)`.h

 -include $(DEPENDS)

--- a/examples/graphics/gpu/geoshader/README.md
+++ b/examples/graphics/gpu/geoshader/README.md
@ -0,0 +1,6 @@
+# GPU example
+
+This is a simple GPU example using the `picasso` shader assembler, which comes with devkitARM r45 and up.
+Users of earlier versions of devkitARM need to install the tool, which can be found at the address below:
+
+https://github.com/fincs/picasso/releases
--- a/examples/graphics/gpu/geoshader/source/3dmath.c
+++ b/examples/graphics/gpu/geoshader/source/3dmath.c
@ -0,0 +1,172 @@
+#include "3dmath.h"
+
+// Overwrites *out with the 4x4 identity matrix.
+void m4x4_identity(matrix_4x4* out)
+{
+	// Start from an all-zero matrix, then set the main diagonal one entry at a time
+	m4x4_zeros(out);
+	out->r[0].x = 1.0f;
+	out->r[1].y = 1.0f;
+	out->r[2].z = 1.0f;
+	out->r[3].w = 1.0f;
+}
+
+// Computes out = a * b for two 4x4 matrices stored as rows r[0]..r[3].
+// The product is accumulated in a temporary and copied to *out at the very end,
+// so out may point to the same matrix as a and/or b.
+void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b)
+{
+	matrix_4x4 tmp;
+	int i, j;
+	for (i = 0; i < 4; i ++)
+		for (j = 0; j < 4; j ++)
+			// c[i] selects the same component in b's rows and in the result row,
+			// so the storage order of the components inside vector_4f is irrelevant here
+			tmp.r[j].c[i] = a->r[j].x*b->r[0].c[i] + a->r[j].y*b->r[1].c[i] + a->r[j].z*b->r[2].c[i] + a->r[j].w*b->r[3].c[i];
+	m4x4_copy(out, &tmp);
+}
+
+// Post-multiplies *mtx by a translation matrix: mtx = mtx * T(x, y, z).
+void m4x4_translate(matrix_4x4* mtx, float x, float y, float z)
+{
+	matrix_4x4 translation, product;
+
+	// Identity matrix with the offsets stored in the w component of the first three rows
+	m4x4_identity(&translation);
+	translation.r[2].w = z;
+	translation.r[1].w = y;
+	translation.r[0].w = x;
+
+	// Multiply into a scratch matrix, then store the result back into *mtx
+	m4x4_multiply(&product, mtx, &translation);
+	*mtx = product;
+}
+
+// Scales *mtx in place: in every row the x, y and z components are multiplied by the
+// corresponding factor, while the w component is left untouched.
+void m4x4_scale(matrix_4x4* mtx, float x, float y, float z)
+{
+	vector_4f* row;
+	for (row = mtx->r; row != mtx->r + 4; row ++)
+	{
+		row->x *= x;
+		row->y *= y;
+		row->z *= z;
+	}
+}
+
+// Multiplies *mtx by a rotation about the X axis; angle is passed straight to cosf/sinf.
+// bRightSide selects the order: true -> mtx = mtx * R, false -> mtx = R * mtx.
+void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide)
+{
+	const float c = cosf(angle);
+	const float s = sinf(angle);
+	matrix_4x4 rot, result;
+
+	// The identity already provides the 1.0 entries at r[0].x and r[3].w;
+	// only the 2x2 Y/Z sub-block has to be filled in
+	m4x4_identity(&rot);
+	rot.r[1].y = c;
+	rot.r[1].z = s;
+	rot.r[2].y = -s;
+	rot.r[2].z = c;
+
+	if (bRightSide)
+		m4x4_multiply(&result, mtx, &rot);
+	else
+		m4x4_multiply(&result, &rot, mtx);
+	*mtx = result;
+}
+
+// Multiplies *mtx by a rotation about the Y axis; angle is passed straight to cosf/sinf.
+// bRightSide selects the order: true -> mtx = mtx * R, false -> mtx = R * mtx.
+void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide)
+{
+	const float c = cosf(angle);
+	const float s = sinf(angle);
+	matrix_4x4 rot, result;
+
+	// The identity already provides the 1.0 entries at r[1].y and r[3].w;
+	// only the 2x2 X/Z sub-block has to be filled in
+	m4x4_identity(&rot);
+	rot.r[0].x = c;
+	rot.r[0].z = s;
+	rot.r[2].x = -s;
+	rot.r[2].z = c;
+
+	if (bRightSide)
+		m4x4_multiply(&result, mtx, &rot);
+	else
+		m4x4_multiply(&result, &rot, mtx);
+	*mtx = result;
+}
+
+// Multiplies *mtx by a rotation about the Z axis; angle is passed straight to cosf/sinf.
+// bRightSide selects the order: true -> mtx = mtx * R, false -> mtx = R * mtx.
+void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide)
+{
+	const float c = cosf(angle);
+	const float s = sinf(angle);
+	matrix_4x4 rot, result;
+
+	// The identity already provides the 1.0 entries at r[2].z and r[3].w;
+	// only the 2x2 X/Y sub-block has to be filled in
+	m4x4_identity(&rot);
+	rot.r[0].x = c;
+	rot.r[0].y = s;
+	rot.r[1].x = -s;
+	rot.r[1].y = c;
+
+	if (bRightSide)
+		m4x4_multiply(&result, mtx, &rot);
+	else
+		m4x4_multiply(&result, &rot, mtx);
+	*mtx = result;
+}
+
+// Builds an orthographic projection for the given clipping volume and stores it in *mtx
+// (the previous contents of *mtx are overwritten, not multiplied).
+// The result is the product of three matrices: axis swap * depth fix * orthographic projection.
+// The code divides by (right - left), (top - bottom) and (near - far), so each pair must differ.
+void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far)
+{
+	matrix_4x4 mp;
+	m4x4_zeros(&mp);
+
+	// Build standard orthogonal projection matrix
+	// NOTE(review): r[2].w divides by (far - near) while r[2].z divides by (near - far);
+	// confirm that this sign combination is the intended depth convention.
+	mp.r[0].x = 2.0f / (right - left);
+	mp.r[0].w = (left + right) / (left - right);
+	mp.r[1].y = 2.0f / (top - bottom);
+	mp.r[1].w = (bottom + top) / (bottom - top);
+	mp.r[2].z = 2.0f / (near - far);
+	mp.r[2].w = (far + near) / (far - near);
+	mp.r[3].w = 1.0f;
+
+	// Fix depth range to [-1, 0]
+	// (row 2 of mp2 computes z' = 0.5*z - 0.5*w)
+	matrix_4x4 mp2, mp3;
+	m4x4_identity(&mp2);
+	mp2.r[2].z = 0.5;
+	mp2.r[2].w = -0.5;
+	m4x4_multiply(&mp3, &mp2, &mp);
+
+	// Fix the 3DS screens' orientation by swapping the X and Y axis
+	// (mp2 is reused: x' = y and y' = -x, z and w pass through unchanged)
+	m4x4_identity(&mp2);
+	mp2.r[0].x = 0.0;
+	mp2.r[0].y = 1.0;
+	mp2.r[1].x = -1.0; // flipped
+	mp2.r[1].y = 0.0;
+	m4x4_multiply(mtx, &mp2, &mp3);
+}
+
+// Builds a perspective projection and stores it in *mtx (the previous contents of *mtx
+// are overwritten). The result is depth fix * perspective projection, post-multiplied
+// by a quarter-turn rotation about Z.
+// Note that the prototype in 3dmath.h names the second and third parameters
+// "fovy" and "aspect"; see the notes below for why they are called fovx/invaspect here.
+void m4x4_persp_tilt(matrix_4x4* mtx, float fovx, float invaspect, float near, float far)
+{
+	// Notes:
+	// We are passed "fovy" and the "aspect ratio". However, the 3DS screens are sideways,
+	// and so are these parameters -- in fact, they are actually the fovx and the inverse
+	// of the aspect ratio. Therefore the formula for the perspective projection matrix
+	// had to be modified to be expressed in these terms instead.
+
+	// Notes:
+	// fovx = 2 atan(tan(fovy/2)*w/h)
+	// fovy = 2 atan(tan(fovx/2)*h/w)
+	// invaspect = h/w
+
+	// a0,0 = h / (w*tan(fovy/2)) =
+	//      = h / (w*tan(2 atan(tan(fovx/2)*h/w) / 2)) =
+	//      = h / (w*tan( atan(tan(fovx/2)*h/w) )) =
+	//      = h / (w * tan(fovx/2)*h/w) =
+	//      = 1 / tan(fovx/2)
+
+	// a1,1 = 1 / tan(fovy/2) = (...) = w / (h*tan(fovx/2))
+
+	float fovx_tan = tanf(fovx / 2);
+	matrix_4x4 mp;
+	m4x4_zeros(&mp);
+
+	// Build standard perspective projection matrix
+	// (divides by fovx_tan, invaspect and (near - far), so none of them may be zero)
+	mp.r[0].x = 1.0f / fovx_tan;
+	mp.r[1].y = 1.0f / (fovx_tan*invaspect);
+	mp.r[2].z = (near + far) / (near - far);
+	mp.r[2].w = (2 * near * far) / (near - far);
+	mp.r[3].z = -1.0f;
+
+	// Fix depth range to [-1, 0]
+	// (row 2 of mp2 computes z' = 0.5*z - 0.5*w)
+	matrix_4x4 mp2;
+	m4x4_identity(&mp2);
+	mp2.r[2].z = 0.5;
+	mp2.r[2].w = -0.5;
+	m4x4_multiply(mtx, &mp2, &mp);
+
+	// Rotate the matrix one quarter of a turn CCW in order to fix the 3DS screens' orientation
+	// (bRightSide = true, i.e. mtx = mtx * R)
+	m4x4_rotate_z(mtx, M_PI / 2, true);
+}
--- a/examples/graphics/gpu/geoshader/source/3dmath.h
+++ b/examples/graphics/gpu/geoshader/source/3dmath.h
@ -0,0 +1,56 @@
+/*
+ * Bare-bones simplistic 3D math library
+ * This library is common to all libctru GPU examples
+ */
+
+#pragma once
+#include <string.h>
+#include <stdbool.h>
+#include <math.h>
+
+// 4-component float vector. The named members are declared in w, z, y, x order, so
+// c[0] aliases w, c[1] aliases z, c[2] aliases y and c[3] aliases x.
+// NOTE(review): presumably this reversed order matches the component order the GPU
+// expects for float uniform data -- confirm.
+typedef union { struct { float w, z, y, x; }; float c[4]; } vector_4f;
+// 4x4 matrix stored as four vectors r[0]..r[3]; m4x4_multiply treats each r[j] as row j.
+typedef struct { vector_4f r[4]; } matrix_4x4;
+
+// Returns the 4-component dot product of *a and *b (x, y, z and w all take part).
+static inline float v4f_dp4(const vector_4f* a, const vector_4f* b)
+{
+	return a->x*b->x + a->y*b->y + a->z*b->z + a->w*b->w;
+}
+
+// Returns the Euclidean length of *a, taken over all four components.
+static inline float v4f_mod4(const vector_4f* a)
+{
+	const float lengthSquared = v4f_dp4(a, a);
+	return sqrtf(lengthSquared);
+}
+
+// Normalizes *vec in place to unit length; a zero-length vector is left unchanged.
+static inline void v4f_norm4(vector_4f* vec)
+{
+	const float len = v4f_mod4(vec);
+	int i;
+	if (len == 0.0) return;
+	// Divide every component by the length
+	for (i = 0; i < 4; i ++)
+		vec->c[i] /= len;
+}
+
+// Sets every byte of *out to zero, which makes all 16 entries 0.0f.
+static inline void m4x4_zeros(matrix_4x4* out)
+{
+	memset(out, 0, sizeof(matrix_4x4));
+}
+
+// Copies the whole matrix *in into *out.
+static inline void m4x4_copy(matrix_4x4* out, const matrix_4x4* in)
+{
+	*out = *in;
+}
+
+// out = identity matrix / out = a * b
+void m4x4_identity(matrix_4x4* out);
+void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b);
+
+// In-place transforms of mtx (translate post-multiplies by a translation matrix,
+// scale multiplies the x/y/z components of every row)
+void m4x4_translate(matrix_4x4* mtx, float x, float y, float z);
+void m4x4_scale(matrix_4x4* mtx, float x, float y, float z);
+
+// In-place rotations about one axis; bRightSide selects mtx * R (true) or R * mtx (false)
+void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide);
+void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide);
+void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide);
+
+// Special versions of the projection matrices that take the 3DS' screen orientation into account
+// (both overwrite mtx; the definition of m4x4_persp_tilt in 3dmath.c names its
+// parameters fovx/invaspect and explains why)
+void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far);
+void m4x4_persp_tilt(matrix_4x4* mtx, float fovy, float aspect, float near, float far);
--- a/examples/graphics/gpu/geoshader/source/gpu.c
+++ b/examples/graphics/gpu/geoshader/source/gpu.c
@ -0,0 +1,93 @@
+#include "gpu.h"
+
+// Flags passed to GX_SetDisplayTransfer in gpuFrameEnd: no vertical flip, untiled output,
+// no raw copy, RGBA8 input converted to RGB8 output, no scaling
+#define DISPLAY_TRANSFER_FLAGS \
+	(GX_TRANSFER_FLIP_VERT(0) | GX_TRANSFER_OUT_TILED(0) | GX_TRANSFER_RAW_COPY(0) | \
+	GX_TRANSFER_IN_FORMAT(GX_TRANSFER_FMT_RGBA8) | GX_TRANSFER_OUT_FORMAT(GX_TRANSFER_FMT_RGB8) | \
+	GX_TRANSFER_SCALING(GX_TRANSFER_SCALE_NO))
+
+// Render targets (allocated with vramAlloc) and the GPU command buffer (allocated with
+// linearAlloc); all three are set up in gpuInit and released in gpuExit
+static u32 *colorBuf, *depthBuf;
+static u32 *cmdBuf;
+
+// Allocates the colour/depth render targets and the command buffer, then initializes
+// the GPU and hands it the command buffer. Allocation results are not checked.
+void gpuInit(void)
+{
+	const u32 targetBytes = 400*240*4; // 400x240 entries of 4 bytes each
+	const u32 cmdBufWords = 0x40000;   // command buffer size in 32-bit words
+
+	colorBuf = vramAlloc(targetBytes);
+	depthBuf = vramAlloc(targetBytes);
+	cmdBuf = linearAlloc(cmdBufWords*4);
+
+	GPU_Init(NULL);
+	GPU_Reset(NULL, cmdBuf, cmdBufWords);
+}
+
+// Releases the buffers allocated by gpuInit, in reverse order of allocation.
+void gpuExit(void)
+{
+	linearFree(cmdBuf);
+	vramFree(depthBuf);
+	vramFree(colorBuf);
+}
+
+// Fills the colour buffer with clearColor and the depth buffer with 0, then blocks
+// until the fill has completed.
+void gpuClearBuffers(u32 clearColor)
+{
+	// End pointers: one past the last of the 240*400 entries of each buffer
+	u32* const colorEnd = colorBuf + 240*400;
+	u32* const depthEnd = depthBuf + 240*400;
+
+	GX_SetMemoryFill(NULL,
+		colorBuf, clearColor, colorEnd, GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH,
+		depthBuf, 0,          depthEnd, GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH);
+	gspWaitForPSC0(); // Wait for the fill to complete
+}
+
+// Queues the per-frame GPU state: render targets/viewport, depth mapping, culling,
+// stencil, blending, depth test, alpha test and six no-op texture environment stages.
+// Callers are expected to queue their draw commands afterwards and finish with gpuFrameEnd.
+void gpuFrameBegin(void)
+{
+	// Configure the viewport and the depth linear conversion function
+	// (the buffers are passed as physical addresses)
+	GPU_SetViewport(
+		(u32*)osConvertVirtToPhys((u32)depthBuf),
+		(u32*)osConvertVirtToPhys((u32)colorBuf),
+		0, 0, 240, 400); // The top screen is physically 240x400 pixels
+	GPU_DepthMap(-1.0f, 0.0f); // calculate the depth value from the Z coordinate in the following way: -1.0*z + 0.0
+
+	// Configure some boilerplate
+	GPU_SetFaceCulling(GPU_CULL_BACK_CCW);
+	GPU_SetStencilTest(false, GPU_ALWAYS, 0x00, 0xFF, 0x00);
+	GPU_SetStencilOp(GPU_KEEP, GPU_KEEP, GPU_KEEP);
+	GPU_SetBlendingColor(0,0,0,0);
+	GPU_SetDepthTestAndWriteMask(true, GPU_GREATER, GPU_WRITE_ALL);
+
+	// This is unknown
+	GPUCMD_AddMaskedWrite(GPUREG_0062, 0x1, 0);
+	GPUCMD_AddWrite(GPUREG_0118, 0);
+
+	// Configure alpha blending and test
+	GPU_SetAlphaBlending(GPU_BLEND_ADD, GPU_BLEND_ADD, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA);
+	GPU_SetAlphaTest(false, GPU_ALWAYS, 0x00);
+
+	// Reset all six texture environment stages to pass-through
+	int i;
+	for (i = 0; i < 6; i ++)
+		GPU_SetDummyTexEnv(i);
+}
+
+// Submits the queued GPU commands, waits for rendering to finish, copies the colour
+// buffer to the top-left framebuffer and rewinds the command buffer for the next frame.
+void gpuFrameEnd(void)
+{
+	// Finish rendering
+	GPU_FinishDrawing();
+	GPUCMD_Finalize();
+	GPUCMD_FlushAndRun(NULL);
+	gspWaitForP3D(); // Wait for the rendering to complete
+
+	// Transfer the GPU output to the framebuffer
+	// (DISPLAY_TRANSFER_FLAGS converts the RGBA8 render target to the RGB8 framebuffer format)
+	GX_SetDisplayTransfer(NULL, colorBuf, GX_BUFFER_DIM(240, 400),
+		(u32*)gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), GX_BUFFER_DIM(240, 400),
+		DISPLAY_TRANSFER_FLAGS);
+	gspWaitForPPF(); // Wait for the transfer to complete
+
+	// Reset the command buffer
+	GPUCMD_SetBufferOffset(0);
+}
+
+// Configures texture environment stage `id` as a pass-through: both the RGB and the
+// alpha combiner replace their output with the GPU_PREVIOUS source.
+void GPU_SetDummyTexEnv(int id)
+{
+	// The same source/operand selection is used for the RGB and the alpha half
+	const u32 sources  = GPU_TEVSOURCES(GPU_PREVIOUS, 0, 0);
+	const u32 operands = GPU_TEVOPERANDS(0, 0, 0);
+
+	GPU_SetTexEnv(id,
+		sources, sources,
+		operands, operands,
+		GPU_REPLACE, GPU_REPLACE,
+		0xFFFFFFFF);
+}
--- a/examples/graphics/gpu/geoshader/source/gpu.h
+++ b/examples/graphics/gpu/geoshader/source/gpu.h
@ -0,0 +1,26 @@
+/*
+ * Bare-bones simplistic GPU wrapper
+ * This library is common to all libctru GPU examples
+ */
+
+#pragma once
+#include <string.h>
+#include <3ds.h>
+#include "3dmath.h"
+
+// Allocates the render targets and command buffer and initializes the GPU / frees them again
+void gpuInit(void);
+void gpuExit(void);
+
+// Fills the colour buffer with clearColor and the depth buffer with 0, waiting for completion
+void gpuClearBuffers(u32 clearColor);
+
+// Bracket the rendering of one frame: gpuFrameBegin queues the common GPU state,
+// gpuFrameEnd runs the command buffer and transfers the result to the top screen framebuffer
+void gpuFrameBegin(void);
+void gpuFrameEnd(void);
+
+// Configures the specified fixed-function fragment shading substage to be a no-operation
+void GPU_SetDummyTexEnv(int id);
+
+// Uploads a 4x4 matrix as a float uniform: the matrix memory is passed unchanged to
+// GPU_SetFloatUniform with a count of 4 (one per row vector)
+static inline void GPU_SetFloatUniformMatrix(GPU_SHADER_TYPE type, int location, matrix_4x4* matrix)
+{
+	u32* rawRows = (u32*)matrix;
+	GPU_SetFloatUniform(type, location, rawRows, 4);
+}
--- a/examples/graphics/gpu/geoshader/source/gshader.pica
+++ b/examples/graphics/gpu/geoshader/source/gshader.pica
@ -0,0 +1,91 @@
+; Example PICA200 geometry shader
+
+; Uniforms
+.fvec projection[4]
+
+; Constants
+; myconst = (x: 0.0, y: 1.0, z: -1.0, w: 0.5); the aliases below broadcast one component
+.constf myconst(0.0, 1.0, -1.0, 0.5)
+.alias  zeros myconst.xxxx ; Vector full of zeros
+.alias  ones  myconst.yyyy ; Vector full of ones
+.alias  half  myconst.wwww ; Vector full of 0.5 (used to average two positions)
+
+; Outputs - this time the type *is* used
+.out outpos position
+.out outclr color
+
+; Inputs: we will receive the following inputs:
+; v0-v1: position/color of the first vertex
+; v2-v3: position/color of the second vertex
+; v4-v5: position/color of the third vertex
+
+; Entry point: splits the incoming triangle (positions in v0, v2, v4) into three
+; smaller triangles, one per corner, using the midpoints of its edges.
+; Register usage: r4-r6 hold the midpoints, r0-r2 carry the three positions
+; passed to emit_triangle.
+.proc main
+	; Calculate the midpoints of the vertices
+	; r4 = (v0 + v2) * 0.5
+	mov r4, v0
+	add r4, v2,   r4
+	mul r4, half, r4
+	; r5 = (v2 + v4) * 0.5
+	mov r5, v2
+	add r5, v4,   r5
+	mul r5, half, r5
+	; r6 = (v4 + v0) * 0.5
+	mov r6, v4
+	add r6, v0,   r6
+	mul r6, half, r6
+
+	; Emit the first triangle (corner v0 and its two adjacent midpoints)
+	mov r0, v0
+	mov r1, r4
+	mov r2, r6
+	call emit_triangle
+
+	; Emit the second triangle (corner v2 and its two adjacent midpoints)
+	mov r0, r4
+	mov r1, v2
+	mov r2, r5
+	call emit_triangle
+
+	; Emit the third triangle (corner v4 and its two adjacent midpoints)
+	mov r0, r6
+	mov r1, r5
+	mov r2, v4
+	call emit_triangle
+
+	; We're finished
+	end
+.end
+
+; Subroutine: emits one triangle
+; Inputs:
+;   r0, r1, r2: positions of the first, second and third vertex
+; The colors are always taken from v1, v3 and v5 (the colors of the original
+; triangle's vertices), regardless of the positions passed in.
+; Uses r8/r9 as arguments for process_vertex.
+.proc emit_triangle
+	; Emit the first vertex
+	setemit 0
+	mov r8, r0
+	mov r9, v1
+	call process_vertex
+	emit
+
+	; Emit the second vertex
+	setemit 1
+	mov r8, r1
+	mov r9, v3
+	call process_vertex
+	emit
+
+	; Emit the third vertex and finish the primitive
+	setemit 2, prim
+	mov r8, r2
+	mov r9, v5
+	call process_vertex
+	emit
+.end
+
+; Subroutine: writes the outputs for one vertex
+; Inputs:
+;   r8: vertex position
+;   r9: vertex color
+; Outputs:
+;   outpos: r8 transformed by the projection uniform
+;   outclr: r9, unchanged
+.proc process_vertex
+	; outpos = projectionMatrix * r8
+	; (one 4-component dot product per matrix row)
+	dp4 outpos.x, projection[0], r8
+	dp4 outpos.y, projection[1], r8
+	dp4 outpos.z, projection[2], r8
+	dp4 outpos.w, projection[3], r8
+
+	; outclr = r9
+	mov outclr, r9
+.end
--- a/examples/graphics/gpu/geoshader/source/main.c
+++ b/examples/graphics/gpu/geoshader/source/main.c
@ -0,0 +1,139 @@
+/*
+ * ~~ Simple libctru GPU geometry shader example ~~
+ * This example demonstrates the basics of using the PICA200 in a 3DS homebrew
+ * application in order to render a basic scene using a geoshader.
+ * The example geoshader receives the vertices of a triangle and emits three
+ * smaller triangles, thus forming a 'triforce' shape.
+ */
+
+#include "gpu.h"
+#include "vshader_shbin.h"
+#include "gshader_shbin.h"
+
+#define CLEAR_COLOR 0x68B0D8FF
+
+// One vertex as stored in the VBO: 3 position floats followed by 4 color floats
+typedef struct { float position[3]; float color[4]; } vertex;
+
+// The single triangle fed to the geoshader; with the projection set up in sceneInit
+// (0..400 horizontally, 0..240 vertically) the positions are top-screen pixel coordinates
+static const vertex vertex_list[] =
+{
+	{ {200.0f, 200.0f, 0.5f}, {1.0f, 0.0f, 0.0f, 1.0f} },
+	{ {100.0f,  40.0f, 0.5f}, {0.0f, 1.0f, 0.0f, 1.0f} },
+	{ {300.0f,  40.0f, 0.5f}, {0.0f, 0.0f, 1.0f, 1.0f} },
+};
+
+// Number of vertices in vertex_list
+#define vertex_list_count (sizeof(vertex_list)/sizeof(vertex_list[0]))
+
+// Parsed shader binaries (vertex and geometry shader), the shader program that combines
+// them, the location of the geoshader's "projection" uniform and the matrix uploaded to it
+static DVLB_s *vshader_dvlb, *gshader_dvlb;
+static shaderProgram_s program;
+static int uLoc_projection;
+static matrix_4x4 projection;
+
+// Copy of vertex_list in memory obtained from linearAlloc (see sceneInit)
+static void* vbo_data;
+
+// Loads both shaders, builds the shader program, computes the projection matrix and
+// copies the vertex data into a linearAlloc'd buffer.
+// NOTE(review): the results of DVLB_ParseFile and linearAlloc are used without being checked.
+static void sceneInit(void)
+{
+	// Load the shaders and create a shader program
+	// The geoshader stride is set to 6 so that it processes a triangle at a time
+	// (3 vertices x 2 attributes each: position and color)
+	vshader_dvlb = DVLB_ParseFile((u32*)vshader_shbin, vshader_shbin_size);
+	gshader_dvlb = DVLB_ParseFile((u32*)gshader_shbin, gshader_shbin_size);
+	shaderProgramInit(&program);
+	shaderProgramSetVsh(&program, &vshader_dvlb->DVLE[0]);
+	shaderProgramSetGsh(&program, &gshader_dvlb->DVLE[0], 6);
+
+	// Get the location of the projection matrix uniform
+	// (it is declared in the geometry shader, not in the vertex shader)
+	uLoc_projection = shaderInstanceGetUniformLocation(program.geometryShader, "projection");
+
+	// Compute the projection matrix
+	m4x4_ortho_tilt(&projection, 0.0, 400.0, 0.0, 240.0, 0.0, 1.0);
+
+	// Create the VBO (vertex buffer object)
+	vbo_data = linearAlloc(sizeof(vertex_list));
+	memcpy(vbo_data, vertex_list, sizeof(vertex_list));
+}
+
+// Queues the GPU commands that draw the scene: binds the shader program, sets up the
+// first texture environment stage and the vertex inputs, uploads the projection matrix
+// and issues the draw call.
+static void sceneRender(void)
+{
+	// Bind the shader program
+	shaderProgramUse(&program);
+
+	// Configure the first fragment shading substage to just pass through the vertex color
+	// See https://www.opengl.org/sdk/docs/man2/xhtml/glTexEnv.xml for more insight
+	GPU_SetTexEnv(0,
+		GPU_TEVSOURCES(GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // RGB channels
+		GPU_TEVSOURCES(GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // Alpha
+		GPU_TEVOPERANDS(0, 0, 0), // RGB
+		GPU_TEVOPERANDS(0, 0, 0), // Alpha
+		GPU_REPLACE, GPU_REPLACE, // RGB, Alpha
+		0xFFFFFFFF);
+
+	// Configure the "attribute buffers" (that is, the vertex input buffers)
+	GPU_SetAttributeBuffers(
+		2, // Number of inputs per vertex
+		(u32*)osConvertVirtToPhys((u32)vbo_data), // Location of the VBO
+		GPU_ATTRIBFMT(0, 3, GPU_FLOAT) |
+		GPU_ATTRIBFMT(1, 4, GPU_FLOAT), // Format of the inputs (input 0 is a 3-element float vector, input 1 a 4-element float vector)
+		0xFFC, // Unused attribute mask, in our case bits 0 and 1 are cleared since both inputs are used
+		0x10, // Attribute permutations (here it is the identity)
+		1, // Number of buffers
+		(u32[]) { 0x0 }, // Buffer offsets (placeholders)
+		(u64[]) { 0x10 }, // Attribute permutations for each buffer (identity again)
+		(u8[])  { 2 }); // Number of attributes for each buffer
+
+	// Upload the projection matrix
+	GPU_SetFloatUniformMatrix(GPU_GEOMETRY_SHADER, uLoc_projection, &projection);
+
+	// Draw the VBO - GPU_UNKPRIM allows the geoshader to control primitive emission
+	GPU_DrawArray(GPU_UNKPRIM, vertex_list_count);
+}
+
+// Releases everything created by sceneInit: the VBO, the shader program and both
+// parsed shader binaries.
+static void sceneExit(void)
+{
+	// Free the VBO
+	linearFree(vbo_data);
+
+	// Free the shader program
+	// (the program is freed before the DVLBs whose DVLE entries it was given)
+	shaderProgramFree(&program);
+	DVLB_Free(vshader_dvlb);
+	DVLB_Free(gshader_dvlb);
+}
+
+// Program entry point: initializes graphics and the scene, renders one frame per
+// VBlank until START is pressed or aptMainLoop() returns false, then cleans up.
+int main()
+{
+	// Initialize graphics
+	gfxInitDefault();
+	gpuInit();
+
+	// Initialize the scene
+	sceneInit();
+	gpuClearBuffers(CLEAR_COLOR);
+
+	// Main loop
+	while (aptMainLoop())
+	{
+		gspWaitForVBlank();  // Synchronize with the start of VBlank
+		gfxSwapBuffersGpu(); // Swap the framebuffers so that the frame that we rendered last frame is now visible
+		hidScanInput();      // Read the user input
+
+		// Respond to user input
+		u32 kDown = hidKeysDown();
+		if (kDown & KEY_START)
+			break; // break in order to return to hbmenu
+
+		// Render the scene
+		gpuFrameBegin();
+		sceneRender();
+		gpuFrameEnd();
+		// Clear the render targets now so that they are ready for the next frame
+		gpuClearBuffers(CLEAR_COLOR);
+
+		// Flush the framebuffers out of the data cache (not necessary with pure GPU rendering)
+		//gfxFlushBuffers();
+	}
+
+	// Deinitialize the scene
+	sceneExit();
+
+	// Deinitialize graphics
+	gpuExit();
+	gfxExit();
+	return 0;
+}
--- a/examples/graphics/gpu/geoshader/source/vshader.pica
+++ b/examples/graphics/gpu/geoshader/source/vshader.pica
@ -0,0 +1,24 @@
+; Example PICA200 vertex shader
+; It does no transformation of its own: position and color are handed on unchanged
+; so that the geometry shader can do the actual work.
+
+; Constants
+; NOTE(review): myconst.w is -0.5 here but 0.5 in gshader.pica, and neither that
+; component nor the 'zeros' alias is used in this shader -- confirm whether intended.
+.constf myconst(0.0, 1.0, -1.0, -0.5)
+.alias  zeros myconst.xxxx ; Vector full of zeros
+.alias  ones  myconst.yyyy ; Vector full of ones
+
+; Outputs - since we are also using a geoshader the output type isn't really used
+.out outpos position
+.out outclr color
+
+; Inputs (defined as aliases for convenience)
+.alias inpos v0
+.alias inclr v1
+
+.proc main
+	; Pass through both inputs to the geoshader
+	; (the position is extended to 4 components by setting w to 1.0)
+	mov outpos.xyz, inpos
+	mov outpos.w,   ones
+	mov outclr,     inclr
+
+	; We're finished
+	end
+.end
--- a/examples/graphics/gpu/simple_tri/Makefile
+++ b/examples/graphics/gpu/simple_tri/Makefile
@ -0,0 +1,177 @@
+#---------------------------------------------------------------------------------
+.SUFFIXES:
+#---------------------------------------------------------------------------------
+
+ifeq ($(strip $(DEVKITARM)),)
+$(error "Please set DEVKITARM in your environment. export DEVKITARM=<path to>devkitARM")
+endif
+
+TOPDIR ?= $(CURDIR)
+include $(DEVKITARM)/3ds_rules
+
+#---------------------------------------------------------------------------------
+# TARGET is the name of the output
+# BUILD is the directory where object files & intermediate files will be placed
+# SOURCES is a list of directories containing source code
+# DATA is a list of directories containing data files
+# INCLUDES is a list of directories containing header files
+#
+# NO_SMDH: if set to anything, no SMDH file is generated.
+# APP_TITLE is the name of the app stored in the SMDH file (Optional)
+# APP_DESCRIPTION is the description of the app stored in the SMDH file (Optional)
+# APP_AUTHOR is the author of the app stored in the SMDH file (Optional)
+# ICON is the filename of the icon (.png), relative to the project folder.
+#   If not set, it attempts to use one of the following (in this order):
+#     - <Project name>.png
+#     - icon.png
+#     - <libctru folder>/default_icon.png
+#---------------------------------------------------------------------------------
+TARGET		:=	$(notdir $(CURDIR))
+BUILD		:=	build
+SOURCES		:=	source
+DATA		:=	data
+INCLUDES	:=	include
+
+#---------------------------------------------------------------------------------
+# options for code generation
+#---------------------------------------------------------------------------------
+ARCH	:=	-march=armv6k -mtune=mpcore -mfloat-abi=hard
+
+CFLAGS	:=	-g -Wall -O2 -mword-relocations \
+			-fomit-frame-pointer -ffast-math \
+			$(ARCH)
+
+CFLAGS	+=	$(INCLUDE) -DARM11 -D_3DS
+
+CXXFLAGS	:= $(CFLAGS) -fno-rtti -fno-exceptions -std=gnu++11
+
+ASFLAGS	:=	-g $(ARCH)
+LDFLAGS	=	-specs=3dsx.specs -g $(ARCH) -Wl,-Map,$(notdir $*.map)
+
+LIBS	:= -lctru -lm
+
+#---------------------------------------------------------------------------------
+# list of directories containing libraries, this must be the top level containing
+# include and lib
+#---------------------------------------------------------------------------------
+LIBDIRS	:= $(CTRULIB)
+
+
+#---------------------------------------------------------------------------------
+# no real need to edit anything past this point unless you need to add additional
+# rules for different file extensions
+#---------------------------------------------------------------------------------
+ifneq ($(BUILD),$(notdir $(CURDIR)))
+#---------------------------------------------------------------------------------
+
+export OUTPUT	:=	$(CURDIR)/$(TARGET)
+export TOPDIR	:=	$(CURDIR)
+
+export VPATH	:=	$(foreach dir,$(SOURCES),$(CURDIR)/$(dir)) \
+			$(foreach dir,$(DATA),$(CURDIR)/$(dir))
+
+export DEPSDIR	:=	$(CURDIR)/$(BUILD)
+
+CFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.c)))
+CPPFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.cpp)))
+SFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.s)))
+PICAFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.pica)))
+BINFILES	:=	$(foreach dir,$(DATA),$(notdir $(wildcard $(dir)/*.*)))
+
+#---------------------------------------------------------------------------------
+# use CXX for linking C++ projects, CC for standard C
+#---------------------------------------------------------------------------------
+ifeq ($(strip $(CPPFILES)),)
+#---------------------------------------------------------------------------------
+	export LD	:=	$(CC)
+#---------------------------------------------------------------------------------
+else
+#---------------------------------------------------------------------------------
+	export LD	:=	$(CXX)
+#---------------------------------------------------------------------------------
+endif
+#---------------------------------------------------------------------------------
+
+export OFILES	:=	$(addsuffix .o,$(BINFILES)) $(PICAFILES:.pica=.shbin.o) \
+			$(CPPFILES:.cpp=.o) $(CFILES:.c=.o) $(SFILES:.s=.o)
+
+export INCLUDE	:=	$(foreach dir,$(INCLUDES),-I$(CURDIR)/$(dir)) \
+			$(foreach dir,$(LIBDIRS),-I$(dir)/include) \
+			-I$(CURDIR)/$(BUILD)
+
+export LIBPATHS	:=	$(foreach dir,$(LIBDIRS),-L$(dir)/lib)
+
+ifeq ($(strip $(ICON)),)
+	icons := $(wildcard *.png)
+	ifneq (,$(findstring $(TARGET).png,$(icons)))
+		export APP_ICON := $(TOPDIR)/$(TARGET).png
+	else
+		ifneq (,$(findstring icon.png,$(icons)))
+			export APP_ICON := $(TOPDIR)/icon.png
+		endif
+	endif
+else
+	export APP_ICON := $(TOPDIR)/$(ICON)
+endif
+
+ifeq ($(strip $(NO_SMDH)),)
+	export _3DSXFLAGS += --smdh=$(CURDIR)/$(TARGET).smdh
+endif
+
+.PHONY: $(BUILD) clean all
+
+#---------------------------------------------------------------------------------
+all: $(BUILD)
+
+$(BUILD):
+	@[ -d $@ ] || mkdir -p $@
+	@$(MAKE) --no-print-directory -C $(BUILD) -f $(CURDIR)/Makefile
+
+#---------------------------------------------------------------------------------
+clean:
+	@echo clean ...
+	@rm -fr $(BUILD) $(TARGET).3dsx $(OUTPUT).smdh $(TARGET).elf
+
+
+#---------------------------------------------------------------------------------
+else
+
+DEPENDS	:=	$(OFILES:.o=.d)
+
+#---------------------------------------------------------------------------------
+# main targets
+#---------------------------------------------------------------------------------
+ifeq ($(strip $(NO_SMDH)),)
+$(OUTPUT).3dsx	:	$(OUTPUT).elf $(OUTPUT).smdh
+else
+$(OUTPUT).3dsx	:	$(OUTPUT).elf
+endif
+
+$(OUTPUT).elf	:	$(OFILES)
+
+#---------------------------------------------------------------------------------
+# you need a rule like this for each extension you use as binary data
+#---------------------------------------------------------------------------------
+%.bin.o	:	%.bin
+#---------------------------------------------------------------------------------
+	@echo $(notdir $<)
+	@$(bin2o)
+
+#---------------------------------------------------------------------------------
+# rule for assembling GPU shaders
+#---------------------------------------------------------------------------------
+# For each source/<name>.pica this rule:
+#   1. runs picasso to produce <name>.shbin (and <name>.psh.h),
+#   2. converts the binary to an object file with bin2s piped into $(AS),
+#   3. writes <name>_shbin.h declaring <name>_shbin[], <name>_shbin_end[] and
+#      <name>_shbin_size (the sed expression prefixes an underscore when the name
+#      starts with a digit).
+# CURBIN and CURH are assigned with $(eval ...) from inside the recipe, so they are
+# ordinary make variables that get reassigned for every shader being built.
+%.shbin.o: %.pica
+	@echo $(notdir $<)
+	$(eval CURBIN := $(patsubst %.pica,%.shbin,$(notdir $<)))
+	$(eval CURH := $(patsubst %.pica,%.psh.h,$(notdir $<)))
+	@picasso $(CURBIN) $< $(CURH)
+	@bin2s $(CURBIN) | $(AS) -o $@
+	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"_end[];" > `(echo $(CURBIN) | tr . _)`.h
+	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"[];" >> `(echo $(CURBIN) | tr . _)`.h
+	@echo "extern const u32" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`_size";" >> `(echo $(CURBIN) | tr . _)`.h
+
+-include $(DEPENDS)
+
+#---------------------------------------------------------------------------------------
+endif
+#---------------------------------------------------------------------------------------
--- a/examples/graphics/gpu/simple_tri/README.md
+++ b/examples/graphics/gpu/simple_tri/README.md
@ -0,0 +1,6 @@
+# GPU example
+
+This is a simple GPU example using the `picasso` shader assembler, which comes with devkitARM r45 and up.
+Users of earlier versions of devkitARM need to install the tool, which can be found at the address below:
+
+https://github.com/fincs/picasso/releases
--- a/examples/graphics/gpu/simple_tri/source/3dmath.c
+++ b/examples/graphics/gpu/simple_tri/source/3dmath.c
@ -0,0 +1,172 @@
+#include "3dmath.h"
+
+// Overwrites *out with the 4x4 identity matrix.
+void m4x4_identity(matrix_4x4* out)
+{
+	// Start from an all-zero matrix, then set the main diagonal one entry at a time
+	m4x4_zeros(out);
+	out->r[0].x = 1.0f;
+	out->r[1].y = 1.0f;
+	out->r[2].z = 1.0f;
+	out->r[3].w = 1.0f;
+}
+
+// Computes out = a * b for two 4x4 matrices stored as rows r[0]..r[3].
+// The product is accumulated in a temporary and copied to *out at the very end,
+// so out may point to the same matrix as a and/or b.
+void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b)
+{
+	matrix_4x4 tmp;
+	int i, j;
+	for (i = 0; i < 4; i ++)
+		for (j = 0; j < 4; j ++)
+			// c[i] selects the same component in b's rows and in the result row,
+			// so the storage order of the components inside vector_4f is irrelevant here
+			tmp.r[j].c[i] = a->r[j].x*b->r[0].c[i] + a->r[j].y*b->r[1].c[i] + a->r[j].z*b->r[2].c[i] + a->r[j].w*b->r[3].c[i];
+	m4x4_copy(out, &tmp);
+}
+
+// Post-multiplies *mtx by a translation matrix: mtx = mtx * T(x, y, z).
+void m4x4_translate(matrix_4x4* mtx, float x, float y, float z)
+{
+	matrix_4x4 translation, product;
+
+	// Identity matrix with the offsets stored in the w component of the first three rows
+	m4x4_identity(&translation);
+	translation.r[2].w = z;
+	translation.r[1].w = y;
+	translation.r[0].w = x;
+
+	// Multiply into a scratch matrix, then store the result back into *mtx
+	m4x4_multiply(&product, mtx, &translation);
+	*mtx = product;
+}
+
+// Scales *mtx in place: in every row the x, y and z components are multiplied by the
+// corresponding factor, while the w component is left untouched.
+void m4x4_scale(matrix_4x4* mtx, float x, float y, float z)
+{
+	vector_4f* row;
+	for (row = mtx->r; row != mtx->r + 4; row ++)
+	{
+		row->x *= x;
+		row->y *= y;
+		row->z *= z;
+	}
+}
+
+// Multiplies *mtx by a rotation about the X axis; angle is passed straight to cosf/sinf.
+// bRightSide selects the order: true -> mtx = mtx * R, false -> mtx = R * mtx.
+void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide)
+{
+	const float c = cosf(angle);
+	const float s = sinf(angle);
+	matrix_4x4 rot, result;
+
+	// The identity already provides the 1.0 entries at r[0].x and r[3].w;
+	// only the 2x2 Y/Z sub-block has to be filled in
+	m4x4_identity(&rot);
+	rot.r[1].y = c;
+	rot.r[1].z = s;
+	rot.r[2].y = -s;
+	rot.r[2].z = c;
+
+	if (bRightSide)
+		m4x4_multiply(&result, mtx, &rot);
+	else
+		m4x4_multiply(&result, &rot, mtx);
+	*mtx = result;
+}
+
+// Multiplies *mtx by a rotation about the Y axis; angle is passed straight to cosf/sinf.
+// bRightSide selects the order: true -> mtx = mtx * R, false -> mtx = R * mtx.
+void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide)
+{
+	const float c = cosf(angle);
+	const float s = sinf(angle);
+	matrix_4x4 rot, result;
+
+	// The identity already provides the 1.0 entries at r[1].y and r[3].w;
+	// only the 2x2 X/Z sub-block has to be filled in
+	m4x4_identity(&rot);
+	rot.r[0].x = c;
+	rot.r[0].z = s;
+	rot.r[2].x = -s;
+	rot.r[2].z = c;
+
+	if (bRightSide)
+		m4x4_multiply(&result, mtx, &rot);
+	else
+		m4x4_multiply(&result, &rot, mtx);
+	*mtx = result;
+}
+
+// Multiplies *mtx by a rotation about the Z axis; angle is passed straight to cosf/sinf.
+// bRightSide selects the order: true -> mtx = mtx * R, false -> mtx = R * mtx.
+void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide)
+{
+	const float c = cosf(angle);
+	const float s = sinf(angle);
+	matrix_4x4 rot, result;
+
+	// The identity already provides the 1.0 entries at r[2].z and r[3].w;
+	// only the 2x2 X/Y sub-block has to be filled in
+	m4x4_identity(&rot);
+	rot.r[0].x = c;
+	rot.r[0].y = s;
+	rot.r[1].x = -s;
+	rot.r[1].y = c;
+
+	if (bRightSide)
+		m4x4_multiply(&result, mtx, &rot);
+	else
+		m4x4_multiply(&result, &rot, mtx);
+	*mtx = result;
+}
+
+// Builds an orthographic projection for the given clipping volume and stores it in *mtx
+// (the previous contents of *mtx are overwritten, not multiplied).
+// The result is the product of three matrices: axis swap * depth fix * orthographic projection.
+// The code divides by (right - left), (top - bottom) and (near - far), so each pair must differ.
+void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far)
+{
+	matrix_4x4 mp;
+	m4x4_zeros(&mp);
+
+	// Build standard orthogonal projection matrix
+	// NOTE(review): r[2].w divides by (far - near) while r[2].z divides by (near - far);
+	// confirm that this sign combination is the intended depth convention.
+	mp.r[0].x = 2.0f / (right - left);
+	mp.r[0].w = (left + right) / (left - right);
+	mp.r[1].y = 2.0f / (top - bottom);
+	mp.r[1].w = (bottom + top) / (bottom - top);
+	mp.r[2].z = 2.0f / (near - far);
+	mp.r[2].w = (far + near) / (far - near);
+	mp.r[3].w = 1.0f;
+
+	// Fix depth range to [-1, 0]
+	// (row 2 of mp2 computes z' = 0.5*z - 0.5*w)
+	matrix_4x4 mp2, mp3;
+	m4x4_identity(&mp2);
+	mp2.r[2].z = 0.5;
+	mp2.r[2].w = -0.5;
+	m4x4_multiply(&mp3, &mp2, &mp);
+
+	// Fix the 3DS screens' orientation by swapping the X and Y axis
+	// (mp2 is reused: x' = y and y' = -x, z and w pass through unchanged)
+	m4x4_identity(&mp2);
+	mp2.r[0].x = 0.0;
+	mp2.r[0].y = 1.0;
+	mp2.r[1].x = -1.0; // flipped
+	mp2.r[1].y = 0.0;
+	m4x4_multiply(mtx, &mp2, &mp3);
+}
+
+// Builds a perspective projection and stores it in *mtx (the previous contents of *mtx
+// are overwritten). The result is depth fix * perspective projection, post-multiplied
+// by a quarter-turn rotation about Z.
+// Note that the prototype in 3dmath.h names the second and third parameters
+// "fovy" and "aspect"; see the notes below for why they are called fovx/invaspect here.
+void m4x4_persp_tilt(matrix_4x4* mtx, float fovx, float invaspect, float near, float far)
+{
+	// Notes:
+	// We are passed "fovy" and the "aspect ratio". However, the 3DS screens are sideways,
+	// and so are these parameters -- in fact, they are actually the fovx and the inverse
+	// of the aspect ratio. Therefore the formula for the perspective projection matrix
+	// had to be modified to be expressed in these terms instead.
+
+	// Notes:
+	// fovx = 2 atan(tan(fovy/2)*w/h)
+	// fovy = 2 atan(tan(fovx/2)*h/w)
+	// invaspect = h/w
+
+	// a0,0 = h / (w*tan(fovy/2)) =
+	//      = h / (w*tan(2 atan(tan(fovx/2)*h/w) / 2)) =
+	//      = h / (w*tan( atan(tan(fovx/2)*h/w) )) =
+	//      = h / (w * tan(fovx/2)*h/w) =
+	//      = 1 / tan(fovx/2)
+
+	// a1,1 = 1 / tan(fovy/2) = (...) = w / (h*tan(fovx/2))
+
+	float fovx_tan = tanf(fovx / 2);
+	matrix_4x4 mp;
+	m4x4_zeros(&mp);
+
+	// Build standard perspective projection matrix
+	// (divides by fovx_tan, invaspect and (near - far), so none of them may be zero)
+	mp.r[0].x = 1.0f / fovx_tan;
+	mp.r[1].y = 1.0f / (fovx_tan*invaspect);
+	mp.r[2].z = (near + far) / (near - far);
+	mp.r[2].w = (2 * near * far) / (near - far);
+	mp.r[3].z = -1.0f;
+
+	// Fix depth range to [-1, 0]
+	// (row 2 of mp2 computes z' = 0.5*z - 0.5*w)
+	matrix_4x4 mp2;
+	m4x4_identity(&mp2);
+	mp2.r[2].z = 0.5;
+	mp2.r[2].w = -0.5;
+	m4x4_multiply(mtx, &mp2, &mp);
+
+	// Rotate the matrix one quarter of a turn CCW in order to fix the 3DS screens' orientation
+	// (bRightSide = true, i.e. mtx = mtx * R)
+	m4x4_rotate_z(mtx, M_PI / 2, true);
+}
--- a/examples/graphics/gpu/simple_tri/source/3dmath.h
+++ b/examples/graphics/gpu/simple_tri/source/3dmath.h
@ -0,0 +1,56 @@
+/*
+ * Bare-bones simplistic 3D math library
+ * This library is common to all libctru GPU examples
+ */
+
+#pragma once
+#include <string.h>
+#include <stdbool.h>
+#include <math.h>
+
+// 4-component float vector. The named members are declared in w, z, y, x order, so in the
+// overlaid array c[0] is w, c[1] is z, c[2] is y and c[3] is x.
+// NOTE(review): the reversed order presumably mirrors the layout the GPU expects for
+// uniform registers -- confirm against the libctru documentation.
+typedef union { struct { float w, z, y, x; }; float c[4]; } vector_4f;
+// 4x4 float matrix stored as four row vectors r[0]..r[3]
+typedef struct { vector_4f r[4]; } matrix_4x4;
+
+static inline float v4f_dp4(const vector_4f* a, const vector_4f* b)
+{
+	// 4-component dot product, summed in x, y, z, w order
+	const float xx = a->x * b->x;
+	const float yy = a->y * b->y;
+	const float zz = a->z * b->z;
+	const float ww = a->w * b->w;
+	return xx + yy + zz + ww;
+}
+
+static inline float v4f_mod4(const vector_4f* a)
+{
+	// Euclidean length of the 4-component vector: sqrt(a . a)
+	const float squared = v4f_dp4(a, a);
+	return sqrtf(squared);
+}
+
+static inline void v4f_norm4(vector_4f* vec)
+{
+	// Scales vec in place to unit length; a zero-length vector is left untouched
+	const float len = v4f_mod4(vec);
+	int k;
+
+	if (len != 0.0)
+	{
+		for (k = 0; k < 4; k ++)
+			vec->c[k] /= len;
+	}
+}
+
+static inline void m4x4_zeros(matrix_4x4* out)
+{
+	// Clear all sixteen components of the matrix to 0
+	memset(out, 0, sizeof(matrix_4x4));
+}
+
+static inline void m4x4_copy(matrix_4x4* out, const matrix_4x4* in)
+{
+	// Copy the whole matrix from *in to *out
+	*out = *in;
+}
+
+// Sets *out to the identity matrix
+void m4x4_identity(matrix_4x4* out);
+// Computes out = a * b
+void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b);
+
+// Multiplies mtx on the right by a translation matrix (mtx = mtx * T), in place
+void m4x4_translate(matrix_4x4* mtx, float x, float y, float z);
+// Scales the x, y and z columns of mtx by the given factors, in place
+void m4x4_scale(matrix_4x4* mtx, float x, float y, float z);
+
+// Rotate mtx in place around the given axis; angle is passed to cosf/sinf (radians).
+// bRightSide selects the multiplication order: true gives mtx = mtx * R, false gives mtx = R * mtx
+void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide);
+void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide);
+void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide);
+
+// Special versions of the projection matrices that take the 3DS' screen orientation into account
+// Both also remap the depth range to [-1, 0]
+void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far);
+// Note: the implementation names these two parameters fovx and invaspect, because with the
+// sideways screens that is what the caller's "fovy" and "aspect" actually are
+void m4x4_persp_tilt(matrix_4x4* mtx, float fovy, float aspect, float near, float far);
--- a/examples/graphics/gpu/simple_tri/source/gpu.c
+++ b/examples/graphics/gpu/simple_tri/source/gpu.c
@ -0,0 +1,93 @@
+#include "gpu.h"
+
+// Flags for the display transfer issued in gpuFrameEnd(): no vertical flip, non-tiled
+// output, no raw copy, RGBA8 input converted to RGB8 output, no scaling
+#define DISPLAY_TRANSFER_FLAGS \
+	(GX_TRANSFER_FLIP_VERT(0) | GX_TRANSFER_OUT_TILED(0) | GX_TRANSFER_RAW_COPY(0) | \
+	GX_TRANSFER_IN_FORMAT(GX_TRANSFER_FMT_RGBA8) | GX_TRANSFER_OUT_FORMAT(GX_TRANSFER_FMT_RGB8) | \
+	GX_TRANSFER_SCALING(GX_TRANSFER_SCALE_NO))
+
+// Render targets, allocated with vramAlloc() in gpuInit()
+static u32 *colorBuf, *depthBuf;
+// GPU command buffer, allocated with linearAlloc() in gpuInit()
+static u32 *cmdBuf;
+
+void gpuInit(void)
+{
+	// Allocates the render targets and the command buffer, then brings up the GPU
+	enum
+	{
+		FB_BYTES     = 400*240*4, // one 400x240 buffer at 4 bytes per pixel
+		CMDBUF_WORDS = 0x40000,   // command buffer size in 32-bit words
+	};
+
+	colorBuf = vramAlloc(FB_BYTES);
+	depthBuf = vramAlloc(FB_BYTES);
+	cmdBuf = linearAlloc(CMDBUF_WORDS*4);
+
+	GPU_Init(NULL);
+	GPU_Reset(NULL, cmdBuf, CMDBUF_WORDS);
+}
+
+// Releases the buffers allocated by gpuInit(), in reverse order of allocation
+void gpuExit(void)
+{
+	linearFree(cmdBuf);
+	vramFree(depthBuf);
+	vramFree(colorBuf);
+}
+
+void gpuClearBuffers(u32 clearColor)
+{
+	// Fills the color buffer with clearColor and the depth buffer with 0, then blocks
+	// until the fill has finished
+	u32* const colorEnd = colorBuf + 240*400;
+	u32* const depthEnd = depthBuf + 240*400;
+
+	GX_SetMemoryFill(NULL,
+		colorBuf, clearColor, colorEnd, GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH,
+		depthBuf, 0,          depthEnd, GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH);
+
+	// Wait for the fill to complete
+	gspWaitForPSC0();
+}
+
+// Records the per-frame GPU state setup: render targets and viewport, depth mapping,
+// culling, stencil, depth test, blending, alpha test, and a pass-through configuration
+// for all six texture environment stages. Scene code is expected to override whatever
+// stages it needs afterwards.
+void gpuFrameBegin(void)
+{
+	// Configure the viewport and the depth linear conversion function
+	GPU_SetViewport(
+		(u32*)osConvertVirtToPhys((u32)depthBuf),
+		(u32*)osConvertVirtToPhys((u32)colorBuf),
+		0, 0, 240, 400); // The top screen is physically 240x400 pixels
+	GPU_DepthMap(-1.0f, 0.0f); // calculate the depth value from the Z coordinate in the following way: -1.0*z + 0.0
+
+	// Configure some boilerplate
+	GPU_SetFaceCulling(GPU_CULL_BACK_CCW);
+	GPU_SetStencilTest(false, GPU_ALWAYS, 0x00, 0xFF, 0x00);
+	GPU_SetStencilOp(GPU_KEEP, GPU_KEEP, GPU_KEEP);
+	GPU_SetBlendingColor(0,0,0,0);
+	// Depth test passes on GPU_GREATER; gpuClearBuffers() fills the depth buffer with 0
+	GPU_SetDepthTestAndWriteMask(true, GPU_GREATER, GPU_WRITE_ALL);
+
+	// This is unknown
+	GPUCMD_AddMaskedWrite(GPUREG_0062, 0x1, 0);
+	GPUCMD_AddWrite(GPUREG_0118, 0);
+
+	// Configure alpha blending and test
+	GPU_SetAlphaBlending(GPU_BLEND_ADD, GPU_BLEND_ADD, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA);
+	GPU_SetAlphaTest(false, GPU_ALWAYS, 0x00);
+
+	// Set every texture environment stage (0..5) to pass its input through
+	int i;
+	for (i = 0; i < 6; i ++)
+		GPU_SetDummyTexEnv(i);
+}
+
+// Submits the commands recorded for this frame, waits for the GPU to finish, copies the
+// rendered image into the top screen's (left-eye) framebuffer and rewinds the command
+// buffer so the next frame starts recording from the beginning.
+void gpuFrameEnd(void)
+{
+	// Finish rendering
+	GPU_FinishDrawing();
+	GPUCMD_Finalize();
+	GPUCMD_FlushAndRun(NULL);
+	gspWaitForP3D(); // Wait for the rendering to complete
+
+	// Transfer the GPU output to the framebuffer
+	GX_SetDisplayTransfer(NULL, colorBuf, GX_BUFFER_DIM(240, 400),
+		(u32*)gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), GX_BUFFER_DIM(240, 400),
+		DISPLAY_TRANSFER_FLAGS);
+	gspWaitForPPF(); // Wait for the transfer to complete
+
+	// Reset the command buffer
+	GPUCMD_SetBufferOffset(0);
+}
+
+void GPU_SetDummyTexEnv(int id)
+{
+	// Makes stage `id` forward the previous stage's output unchanged; the same source and
+	// operand selection is used for both the RGB and the alpha channel
+	const u32 sources  = GPU_TEVSOURCES(GPU_PREVIOUS, 0, 0);
+	const u32 operands = GPU_TEVOPERANDS(0, 0, 0);
+
+	GPU_SetTexEnv(id, sources, sources, operands, operands, GPU_REPLACE, GPU_REPLACE, 0xFFFFFFFF);
+}
--- a/examples/graphics/gpu/simple_tri/source/gpu.h
+++ b/examples/graphics/gpu/simple_tri/source/gpu.h
@ -0,0 +1,26 @@
+/*
+ * Bare-bones simplistic GPU wrapper
+ * This library is common to all libctru GPU examples
+ */
+
+#pragma once
+#include <string.h>
+#include <3ds.h>
+#include "3dmath.h"
+
+// Allocates the color/depth/command buffers and initializes the GPU
+void gpuInit(void);
+// Frees the buffers allocated by gpuInit()
+void gpuExit(void);
+
+// Fills the color buffer with clearColor and the depth buffer with 0 (blocks until done)
+void gpuClearBuffers(u32 clearColor);
+
+// Bracket the rendering of one frame: gpuFrameBegin() records the default GPU state,
+// gpuFrameEnd() runs the recorded commands and copies the result to the top screen
+void gpuFrameBegin(void);
+void gpuFrameEnd(void);
+
+// Configures the specified fixed-function fragment shading substage to be a no-operation
+void GPU_SetDummyTexEnv(int id);
+
+// Uploads a 4x4 matrix as four consecutive float vector uniforms starting at `location`
+static inline void GPU_SetFloatUniformMatrix(GPU_SHADER_TYPE type, int location, matrix_4x4* matrix)
+{
+	u32* rawWords = (u32*)matrix;
+	GPU_SetFloatUniform(type, location, rawWords, 4);
+}
--- a/examples/graphics/gpu/simple_tri/source/main.c
+++ b/examples/graphics/gpu/simple_tri/source/main.c
@ -0,0 +1,131 @@
+/*
+ * ~~ Simple libctru GPU triangle example ~~
+ * This example demonstrates the basics of using the PICA200 in a 3DS homebrew
+ * application in order to render a basic scene consisting of a white solid triangle.
+ */
+
+#include "gpu.h"
+#include "vshader_shbin.h"
+
+// Color passed to gpuClearBuffers() to clear the color buffer
+// NOTE(review): presumably packed as 0xRRGGBBAA to match the RGBA8 color buffer -- confirm
+#define CLEAR_COLOR 0x68B0D8FF
+
+// A vertex consists of just a position
+typedef struct { float x, y, z; } vertex;
+
+// The triangle, in the coordinate space set up by the orthographic projection in
+// sceneInit() (x in 0..400, y in 0..240, z in 0..1)
+static const vertex vertex_list[] =
+{
+	{ 200.0f, 200.0f, 0.5f },
+	{ 100.0f, 40.0f, 0.5f },
+	{ 300.0f, 40.0f, 0.5f },
+};
+
+// Number of vertices in vertex_list
+#define vertex_list_count (sizeof(vertex_list)/sizeof(vertex_list[0]))
+
+// Shader state: parsed shader binary, the program built from it, and the location of
+// the "projection" uniform together with the matrix uploaded to it
+static DVLB_s* vshader_dvlb;
+static shaderProgram_s program;
+static int uLoc_projection;
+static matrix_4x4 projection;
+
+// Copy of vertex_list in linear memory (see sceneInit)
+static void* vbo_data;
+
+static void sceneInit(void)
+{
+	// Prepares everything sceneRender() needs: shader program, projection matrix and VBO
+	const size_t vboSize = sizeof(vertex_list);
+
+	// Parse the embedded shader binary and use its first DVLE as the vertex shader
+	vshader_dvlb = DVLB_ParseFile((u32*)vshader_shbin, vshader_shbin_size);
+	shaderProgramInit(&program);
+	shaderProgramSetVsh(&program, &vshader_dvlb->DVLE[0]);
+
+	// Look up where the "projection" uniform lives in the vertex shader
+	uLoc_projection = shaderInstanceGetUniformLocation(program.vertexShader, "projection");
+
+	// Orthographic projection covering 400x240 units, with depth running from 0 to 1
+	m4x4_ortho_tilt(&projection, 0.0f, 400.0f, 0.0f, 240.0f, 0.0f, 1.0f);
+
+	// The VBO (vertex buffer object) is a copy of the vertex list in linear memory
+	vbo_data = linearAlloc(vboSize);
+	memcpy(vbo_data, vertex_list, vboSize);
+}
+
+// Records the GPU commands that draw the triangle: binds the shader program, configures
+// texture environment stage 0, describes the vertex buffer layout, uploads the
+// projection matrix and issues the draw call.
+static void sceneRender(void)
+{
+	// Bind the shader program
+	shaderProgramUse(&program);
+
+	// Configure the first fragment shading substage to just pass through the vertex color
+	// See https://www.opengl.org/sdk/docs/man2/xhtml/glTexEnv.xml for more insight
+	GPU_SetTexEnv(0,
+		GPU_TEVSOURCES(GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // RGB channels
+		GPU_TEVSOURCES(GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // Alpha
+		GPU_TEVOPERANDS(0, 0, 0), // RGB
+		GPU_TEVOPERANDS(0, 0, 0), // Alpha
+		GPU_REPLACE, GPU_REPLACE, // RGB, Alpha
+		0xFFFFFFFF);
+
+	// Configure the "attribute buffers" (that is, the vertex input buffers)
+	GPU_SetAttributeBuffers(
+		1, // Number of inputs per vertex
+		(u32*)osConvertVirtToPhys((u32)vbo_data), // Location of the VBO
+		GPU_ATTRIBFMT(0, 3, GPU_FLOAT), // Format of the inputs (in this case the only input is a 3-element float vector)
+		0xFFE, // Unused attribute mask, in our case bit 0 is cleared since it is used
+		0x0, // Attribute permutations (here it is the identity)
+		1, // Number of buffers
+		(u32[]) { 0x0 }, // Buffer offsets (placeholders)
+		(u64[]) { 0x0 }, // Attribute permutations for each buffer (identity again)
+		(u8[])  { 1 }); // Number of attributes for each buffer
+
+	// Upload the projection matrix
+	GPU_SetFloatUniformMatrix(GPU_VERTEX_SHADER, uLoc_projection, &projection);
+
+	// Draw the VBO
+	GPU_DrawArray(GPU_TRIANGLES, vertex_list_count);
+}
+
+// Releases everything created by sceneInit()
+static void sceneExit(void)
+{
+	// Free the VBO
+	linearFree(vbo_data);
+
+	// Free the shader program
+	shaderProgramFree(&program);
+	DVLB_Free(vshader_dvlb);
+}
+
+// Entry point: initializes graphics and the scene, renders one frame per VBlank until
+// START is pressed (or aptMainLoop() returns false), then tears everything down in
+// reverse order of initialization.
+int main()
+{
+	// Initialize graphics
+	gfxInitDefault();
+	gpuInit();
+
+	// Initialize the scene
+	sceneInit();
+	gpuClearBuffers(CLEAR_COLOR);
+
+	// Main loop
+	while (aptMainLoop())
+	{
+		gspWaitForVBlank();  // Synchronize with the start of VBlank
+		gfxSwapBuffersGpu(); // Swap the framebuffers so that the frame that we rendered last frame is now visible
+		hidScanInput();      // Read the user input
+
+		// Respond to user input
+		u32 kDown = hidKeysDown();
+		if (kDown & KEY_START)
+			break; // break in order to return to hbmenu
+
+		// Render the scene
+		gpuFrameBegin();
+		sceneRender();
+		gpuFrameEnd();
+		// Clear the render targets now so that they are ready for the next frame
+		gpuClearBuffers(CLEAR_COLOR);
+
+		// Flush the framebuffers out of the data cache (not necessary with pure GPU rendering)
+		//gfxFlushBuffers();
+	}
+
+	// Deinitialize the scene
+	sceneExit();
+
+	// Deinitialize graphics
+	gpuExit();
+	gfxExit();
+	return 0;
+}
--- a/examples/graphics/gpu/simple_tri/source/vshader.pica
+++ b/examples/graphics/gpu/simple_tri/source/vshader.pica
@ -0,0 +1,34 @@
+; Example PICA200 vertex shader
+; Transforms each input position by the "projection" uniform matrix and outputs a
+; constant white color.
+
+; Uniforms
+; Four float vectors holding the rows of the projection matrix; the C code looks this
+; uniform up by the name "projection"
+.fvec projection[4]
+
+; Constants
+; Only the 0.0 and 1.0 components are referenced, through the aliases below
+.constf myconst(0.0, 1.0, -1.0, -0.5)
+.alias  zeros myconst.xxxx ; Vector full of zeros (not referenced by this shader)
+.alias  ones  myconst.yyyy ; Vector full of ones
+
+; Outputs
+.out outpos position
+.out outclr color
+
+; Inputs (defined as aliases for convenience)
+.alias inpos v0
+
+.proc main
+	; Force the w component of inpos to be 1.0
+	; (r0 = (inpos.x, inpos.y, inpos.z, 1.0))
+	mov r0.xyz, inpos
+	mov r0.w,   ones
+
+	; outpos = projectionMatrix * inpos
+	; One dot product per matrix row yields one component of the result
+	dp4 outpos.x, projection[0], r0
+	dp4 outpos.y, projection[1], r0
+	dp4 outpos.z, projection[2], r0
+	dp4 outpos.w, projection[3], r0
+
+	; outclr = solid white color
+	mov outclr, ones
+
+	; We're finished
+	end
+.end
--- a/examples/graphics/gpu/textured_cube/Makefile
+++ b/examples/graphics/gpu/textured_cube/Makefile
@ -0,0 +1,177 @@
+#---------------------------------------------------------------------------------
+.SUFFIXES:
+#---------------------------------------------------------------------------------
+
+ifeq ($(strip $(DEVKITARM)),)
+$(error "Please set DEVKITARM in your environment. export DEVKITARM=<path to>devkitARM")
+endif
+
+TOPDIR ?= $(CURDIR)
+include $(DEVKITARM)/3ds_rules
+
+#---------------------------------------------------------------------------------
+# TARGET is the name of the output
+# BUILD is the directory where object files & intermediate files will be placed
+# SOURCES is a list of directories containing source code
+# DATA is a list of directories containing data files
+# INCLUDES is a list of directories containing header files
+#
+# NO_SMDH: if set to anything, no SMDH file is generated.
+# APP_TITLE is the name of the app stored in the SMDH file (Optional)
+# APP_DESCRIPTION is the description of the app stored in the SMDH file (Optional)
+# APP_AUTHOR is the author of the app stored in the SMDH file (Optional)
+# ICON is the filename of the icon (.png), relative to the project folder.
+#   If not set, it attempts to use one of the following (in this order):
+#     - <Project name>.png
+#     - icon.png
+#     - <libctru folder>/default_icon.png
+#---------------------------------------------------------------------------------
+TARGET		:=	$(notdir $(CURDIR))
+BUILD		:=	build
+SOURCES		:=	source
+DATA		:=	data
+INCLUDES	:=	include
+
+#---------------------------------------------------------------------------------
+# options for code generation
+#---------------------------------------------------------------------------------
+ARCH	:=	-march=armv6k -mtune=mpcore -mfloat-abi=hard
+
+CFLAGS	:=	-g -Wall -O2 -mword-relocations \
+			-fomit-frame-pointer -ffast-math \
+			$(ARCH)
+
+CFLAGS	+=	$(INCLUDE) -DARM11 -D_3DS
+
+CXXFLAGS	:= $(CFLAGS) -fno-rtti -fno-exceptions -std=gnu++11
+
+ASFLAGS	:=	-g $(ARCH)
+LDFLAGS	=	-specs=3dsx.specs -g $(ARCH) -Wl,-Map,$(notdir $*.map)
+
+LIBS	:= -lctru -lm
+
+#---------------------------------------------------------------------------------
+# list of directories containing libraries, this must be the top level containing
+# include and lib
+#---------------------------------------------------------------------------------
+LIBDIRS	:= $(CTRULIB)
+
+
+#---------------------------------------------------------------------------------
+# no real need to edit anything past this point unless you need to add additional
+# rules for different file extensions
+#---------------------------------------------------------------------------------
+ifneq ($(BUILD),$(notdir $(CURDIR)))
+#---------------------------------------------------------------------------------
+
+export OUTPUT	:=	$(CURDIR)/$(TARGET)
+export TOPDIR	:=	$(CURDIR)
+
+export VPATH	:=	$(foreach dir,$(SOURCES),$(CURDIR)/$(dir)) \
+			$(foreach dir,$(DATA),$(CURDIR)/$(dir))
+
+export DEPSDIR	:=	$(CURDIR)/$(BUILD)
+
+CFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.c)))
+CPPFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.cpp)))
+SFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.s)))
+PICAFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.pica)))
+BINFILES	:=	$(foreach dir,$(DATA),$(notdir $(wildcard $(dir)/*.*)))
+
+#---------------------------------------------------------------------------------
+# use CXX for linking C++ projects, CC for standard C
+#---------------------------------------------------------------------------------
+ifeq ($(strip $(CPPFILES)),)
+#---------------------------------------------------------------------------------
+	export LD	:=	$(CC)
+#---------------------------------------------------------------------------------
+else
+#---------------------------------------------------------------------------------
+	export LD	:=	$(CXX)
+#---------------------------------------------------------------------------------
+endif
+#---------------------------------------------------------------------------------
+
+export OFILES	:=	$(addsuffix .o,$(BINFILES)) $(PICAFILES:.pica=.shbin.o) \
+			$(CPPFILES:.cpp=.o) $(CFILES:.c=.o) $(SFILES:.s=.o)
+
+export INCLUDE	:=	$(foreach dir,$(INCLUDES),-I$(CURDIR)/$(dir)) \
+			$(foreach dir,$(LIBDIRS),-I$(dir)/include) \
+			-I$(CURDIR)/$(BUILD)
+
+export LIBPATHS	:=	$(foreach dir,$(LIBDIRS),-L$(dir)/lib)
+
+# Pick the SMDH icon: an explicit ICON wins, otherwise look for <target>.png and then
+# icon.png in the project folder. $(filter) is used instead of $(findstring) so that only
+# an exact file name matches; a substring hit such as "myicon.png" would otherwise select
+# an icon.png that does not exist.
+ifeq ($(strip $(ICON)),)
+	icons := $(wildcard *.png)
+	ifneq (,$(filter $(TARGET).png,$(icons)))
+		export APP_ICON := $(TOPDIR)/$(TARGET).png
+	else
+		ifneq (,$(filter icon.png,$(icons)))
+			export APP_ICON := $(TOPDIR)/icon.png
+		endif
+	endif
+else
+	export APP_ICON := $(TOPDIR)/$(ICON)
+endif
+
+ifeq ($(strip $(NO_SMDH)),)
+	export _3DSXFLAGS += --smdh=$(CURDIR)/$(TARGET).smdh
+endif
+
+# $(BUILD) is declared phony so that the recursive make below runs on every invocation
+.PHONY: $(BUILD) clean all
+
+#---------------------------------------------------------------------------------
+all: $(BUILD)
+
+# Create the build directory if needed, then re-run this Makefile from inside it;
+# in that second pass the current directory is $(BUILD), which selects the "else"
+# branch of the surrounding conditional
+$(BUILD):
+	@[ -d $@ ] || mkdir -p $@
+	@$(MAKE) --no-print-directory -C $(BUILD) -f $(CURDIR)/Makefile
+
+#---------------------------------------------------------------------------------
+# Remove the build directory and the generated .3dsx, .smdh and .elf files
+clean:
+	@echo clean ...
+	@rm -fr $(BUILD) $(TARGET).3dsx $(OUTPUT).smdh $(TARGET).elf
+
+
+#---------------------------------------------------------------------------------
+else
+
+DEPENDS	:=	$(OFILES:.o=.d)
+
+#---------------------------------------------------------------------------------
+# main targets
+#---------------------------------------------------------------------------------
+ifeq ($(strip $(NO_SMDH)),)
+$(OUTPUT).3dsx	:	$(OUTPUT).elf $(OUTPUT).smdh
+else
+$(OUTPUT).3dsx	:	$(OUTPUT).elf
+endif
+
+$(OUTPUT).elf	:	$(OFILES)
+
+#---------------------------------------------------------------------------------
+# you need a rule like this for each extension you use as binary data
+#---------------------------------------------------------------------------------
+%.bin.o	:	%.bin
+#---------------------------------------------------------------------------------
+	@echo $(notdir $<)
+	@$(bin2o)
+
+#---------------------------------------------------------------------------------
+# rule for assembling GPU shaders
+#---------------------------------------------------------------------------------
+# For foo.pica this rule:
+#   1. assembles the shader with picasso into foo.shbin (CURBIN), also passing foo.psh.h (CURH)
+#   2. converts foo.shbin to assembly with bin2s and assembles that into the object file
+#   3. writes foo_shbin.h declaring the foo_shbin_end[], foo_shbin[] and foo_shbin_size symbols
+#      ('.' is replaced by '_', and in the symbol names a leading digit is prefixed with '_')
+%.shbin.o: %.pica
+	@echo $(notdir $<)
+	$(eval CURBIN := $(patsubst %.pica,%.shbin,$(notdir $<)))
+	$(eval CURH := $(patsubst %.pica,%.psh.h,$(notdir $<)))
+	@picasso $(CURBIN) $< $(CURH)
+	@bin2s $(CURBIN) | $(AS) -o $@
+	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"_end[];" > `(echo $(CURBIN) | tr . _)`.h
+	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"[];" >> `(echo $(CURBIN) | tr . _)`.h
+	@echo "extern const u32" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`_size";" >> `(echo $(CURBIN) | tr . _)`.h
+
+-include $(DEPENDS)
+
+#---------------------------------------------------------------------------------------
+endif
+#---------------------------------------------------------------------------------------
--- a/examples/graphics/gpu/textured_cube/README.md
+++ b/examples/graphics/gpu/textured_cube/README.md
@ -0,0 +1,6 @@
+# GPU example
+
+This is a simple GPU example using the `picasso` shader assembler which comes with devkitARM r45 and up.
+Users of earlier versions of devkitARM need to install the tool, which can be found at the address below:
+
+https://github.com/fincs/picasso/releases
--- a/examples/graphics/gpu/textured_cube/data/kitten.bin
+++ b/examples/graphics/gpu/textured_cube/data/kitten.bin
--- a/examples/graphics/gpu/textured_cube/source/3dmath.c
+++ b/examples/graphics/gpu/textured_cube/source/3dmath.c
@ -0,0 +1,172 @@
+#include "3dmath.h"
+
+void m4x4_identity(matrix_4x4* out)
+{
+	// Start from an all-zero matrix and put ones on the main diagonal
+	m4x4_zeros(out);
+	out->r[0].x = 1.0f;
+	out->r[1].y = 1.0f;
+	out->r[2].z = 1.0f;
+	out->r[3].w = 1.0f;
+}
+
+void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b)
+{
+	// Computes out = a * b.
+	// The product is accumulated in a local matrix and only copied to *out at the end, so
+	// out may safely point to the same matrix as a and/or b. Writing straight into *out
+	// would overwrite components of an aliased operand that are still needed.
+	matrix_4x4 prod;
+	int i, j;
+	for (i = 0; i < 4; i ++)
+		for (j = 0; j < 4; j ++)
+			prod.r[j].c[i] = a->r[j].x*b->r[0].c[i] + a->r[j].y*b->r[1].c[i] + a->r[j].z*b->r[2].c[i] + a->r[j].w*b->r[3].c[i];
+	m4x4_copy(out, &prod);
+}
+
+void m4x4_translate(matrix_4x4* mtx, float x, float y, float z)
+{
+	// Appends a translation by (x, y, z): mtx = mtx * T
+	matrix_4x4 shift, result;
+
+	// T is the identity with the offsets in the w column
+	m4x4_identity(&shift);
+	shift.r[0].w = x;
+	shift.r[1].w = y;
+	shift.r[2].w = z;
+
+	m4x4_multiply(&result, mtx, &shift);
+	*mtx = result;
+}
+
+void m4x4_scale(matrix_4x4* mtx, float x, float y, float z)
+{
+	// Scales the x, y and z columns of every row in place; the w column is left alone
+	vector_4f* row = mtx->r;
+	vector_4f* const end = row + 4;
+
+	for (; row != end; row ++)
+	{
+		row->x *= x;
+		row->y *= y;
+		row->z *= z;
+	}
+}
+
+void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide)
+{
+	// Rotates mtx in place by `angle` radians around the X axis
+	const float cosA = cosf(angle);
+	const float sinA = sinf(angle);
+	matrix_4x4 rot, result;
+
+	// Start from the identity so that only the Y/Z block has to be filled in
+	m4x4_identity(&rot);
+	rot.r[1].y = cosA;
+	rot.r[1].z = sinA;
+	rot.r[2].y = -sinA;
+	rot.r[2].z = cosA;
+
+	// bRightSide selects mtx * rot, otherwise rot * mtx
+	m4x4_multiply(&result, bRightSide ? mtx : &rot, bRightSide ? &rot : mtx);
+	m4x4_copy(mtx, &result);
+}
+
+void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide)
+{
+	// Rotates mtx in place by `angle` radians around the Y axis
+	const float cosA = cosf(angle);
+	const float sinA = sinf(angle);
+	matrix_4x4 rot, result;
+
+	// Start from the identity so that only the X/Z block has to be filled in
+	m4x4_identity(&rot);
+	rot.r[0].x = cosA;
+	rot.r[0].z = sinA;
+	rot.r[2].x = -sinA;
+	rot.r[2].z = cosA;
+
+	// bRightSide selects mtx * rot, otherwise rot * mtx
+	m4x4_multiply(&result, bRightSide ? mtx : &rot, bRightSide ? &rot : mtx);
+	m4x4_copy(mtx, &result);
+}
+
+void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide)
+{
+	// Rotates mtx in place by `angle` radians around the Z axis
+	const float cosA = cosf(angle);
+	const float sinA = sinf(angle);
+	matrix_4x4 rot, result;
+
+	// Start from the identity so that only the X/Y block has to be filled in
+	m4x4_identity(&rot);
+	rot.r[0].x = cosA;
+	rot.r[0].y = sinA;
+	rot.r[1].x = -sinA;
+	rot.r[1].y = cosA;
+
+	// bRightSide selects mtx * rot, otherwise rot * mtx
+	m4x4_multiply(&result, bRightSide ? mtx : &rot, bRightSide ? &rot : mtx);
+	m4x4_copy(mtx, &result);
+}
+
+void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far)
+{
+	// Builds an orthographic projection for the sideways-mounted 3DS screens into *mtx.
+	// The result is swap_xy * depth_fix * ortho.
+	matrix_4x4 ortho, depth_fix, swap_xy, fixed;
+
+	// Orthographic projection of the box given by the six planes
+	m4x4_zeros(&ortho);
+	ortho.r[0].x = 2.0f / (right - left);
+	ortho.r[0].w = (left + right) / (left - right);
+	ortho.r[1].y = 2.0f / (top - bottom);
+	ortho.r[1].w = (bottom + top) / (bottom - top);
+	ortho.r[2].z = 2.0f / (near - far);
+	ortho.r[2].w = (far + near) / (far - near);
+	ortho.r[3].w = 1.0f;
+
+	// Remap depth from [-1, 1] to [-1, 0]: z' = 0.5*z - 0.5*w
+	m4x4_identity(&depth_fix);
+	depth_fix.r[2].z = 0.5f;
+	depth_fix.r[2].w = -0.5f;
+	m4x4_multiply(&fixed, &depth_fix, &ortho);
+
+	// Fix the 3DS screens' orientation by swapping the X and Y axis
+	m4x4_identity(&swap_xy);
+	swap_xy.r[0].x = 0.0f;
+	swap_xy.r[0].y = 1.0f;
+	swap_xy.r[1].x = -1.0f; // flipped
+	swap_xy.r[1].y = 0.0f;
+	m4x4_multiply(mtx, &swap_xy, &fixed);
+}
+
+void m4x4_persp_tilt(matrix_4x4* mtx, float fovx, float invaspect, float near, float far)
+{
+	// Builds a perspective projection for the sideways-mounted 3DS screens into *mtx.
+	//
+	// Callers think of the first two parameters as "fovy" and "aspect ratio", but because
+	// the screens are rotated they are really the horizontal field of view (fovx, in
+	// radians) and the inverse aspect ratio (invaspect = h/w). The usual matrix terms
+	// are therefore rewritten in terms of fovx:
+	//
+	//   fovx = 2 atan(tan(fovy/2)*w/h)
+	//   fovy = 2 atan(tan(fovx/2)*h/w)
+	//
+	//   a0,0 = h / (w*tan(fovy/2))
+	//        = h / (w*tan(2 atan(tan(fovx/2)*h/w) / 2))
+	//        = h / (w*tan( atan(tan(fovx/2)*h/w) ))
+	//        = h / (w * tan(fovx/2)*h/w)
+	//        = 1 / tan(fovx/2)
+	//
+	//   a1,1 = 1 / tan(fovy/2) = (...) = w / (h*tan(fovx/2))
+
+	const float half_tan = tanf(fovx / 2.0f);
+	const float depth_span = near - far;
+	matrix_4x4 proj, depth_fix;
+
+	// Perspective projection expressed with fovx / invaspect
+	m4x4_zeros(&proj);
+	proj.r[0].x = 1.0f / half_tan;
+	proj.r[1].y = 1.0f / (half_tan*invaspect);
+	proj.r[2].z = (near + far) / depth_span;
+	proj.r[2].w = (2.0f * near * far) / depth_span;
+	proj.r[3].z = -1.0f;
+
+	// Remap depth from [-1, 1] to [-1, 0]: z' = 0.5*z - 0.5*w
+	m4x4_identity(&depth_fix);
+	depth_fix.r[2].z = 0.5f;
+	depth_fix.r[2].w = -0.5f;
+	m4x4_multiply(mtx, &depth_fix, &proj);
+
+	// Quarter turn CCW around Z (applied on the right) to compensate for the screen orientation
+	m4x4_rotate_z(mtx, M_PI / 2, true);
+}
--- a/examples/graphics/gpu/textured_cube/source/3dmath.h
+++ b/examples/graphics/gpu/textured_cube/source/3dmath.h
@ -0,0 +1,56 @@
+/*
+ * Bare-bones simplistic 3D math library
+ * This library is common to all libctru GPU examples
+ */
+
+#pragma once
+#include <string.h>
+#include <stdbool.h>
+#include <math.h>
+
+// 4-component float vector. The named members are declared in w, z, y, x order, so in the
+// overlaid array c[0] is w, c[1] is z, c[2] is y and c[3] is x.
+// NOTE(review): the reversed order presumably mirrors the layout the GPU expects for
+// uniform registers -- confirm against the libctru documentation.
+typedef union { struct { float w, z, y, x; }; float c[4]; } vector_4f;
+// 4x4 float matrix stored as four row vectors r[0]..r[3]
+typedef struct { vector_4f r[4]; } matrix_4x4;
+
+static inline float v4f_dp4(const vector_4f* a, const vector_4f* b)
+{
+	// 4-component dot product, summed in x, y, z, w order
+	const float xx = a->x * b->x;
+	const float yy = a->y * b->y;
+	const float zz = a->z * b->z;
+	const float ww = a->w * b->w;
+	return xx + yy + zz + ww;
+}
+
+static inline float v4f_mod4(const vector_4f* a)
+{
+	// Euclidean length of the 4-component vector: sqrt(a . a)
+	const float squared = v4f_dp4(a, a);
+	return sqrtf(squared);
+}
+
+static inline void v4f_norm4(vector_4f* vec)
+{
+	// Scales vec in place to unit length; a zero-length vector is left untouched
+	const float len = v4f_mod4(vec);
+	int k;
+
+	if (len != 0.0)
+	{
+		for (k = 0; k < 4; k ++)
+			vec->c[k] /= len;
+	}
+}
+
+static inline void m4x4_zeros(matrix_4x4* out)
+{
+	// Clear all sixteen components of the matrix to 0
+	memset(out, 0, sizeof(matrix_4x4));
+}
+
+static inline void m4x4_copy(matrix_4x4* out, const matrix_4x4* in)
+{
+	// Copy the whole matrix from *in to *out
+	*out = *in;
+}
+
+// Sets *out to the identity matrix
+void m4x4_identity(matrix_4x4* out);
+// Computes out = a * b
+void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b);
+
+// Multiplies mtx on the right by a translation matrix (mtx = mtx * T), in place
+void m4x4_translate(matrix_4x4* mtx, float x, float y, float z);
+// Scales the x, y and z columns of mtx by the given factors, in place
+void m4x4_scale(matrix_4x4* mtx, float x, float y, float z);
+
+// Rotate mtx in place around the given axis; angle is passed to cosf/sinf (radians).
+// bRightSide selects the multiplication order: true gives mtx = mtx * R, false gives mtx = R * mtx
+void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide);
+void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide);
+void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide);
+
+// Special versions of the projection matrices that take the 3DS' screen orientation into account
+// Both also remap the depth range to [-1, 0]
+void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far);
+// Note: the implementation names these two parameters fovx and invaspect, because with the
+// sideways screens that is what the caller's "fovy" and "aspect" actually are
+void m4x4_persp_tilt(matrix_4x4* mtx, float fovy, float aspect, float near, float far);
--- a/examples/graphics/gpu/textured_cube/source/gpu.c
+++ b/examples/graphics/gpu/textured_cube/source/gpu.c
@ -0,0 +1,93 @@
+#include "gpu.h"
+
+// Flags for the display transfer issued in gpuFrameEnd(): no vertical flip, non-tiled
+// output, no raw copy, RGBA8 input converted to RGB8 output, no scaling
+#define DISPLAY_TRANSFER_FLAGS \
+	(GX_TRANSFER_FLIP_VERT(0) | GX_TRANSFER_OUT_TILED(0) | GX_TRANSFER_RAW_COPY(0) | \
+	GX_TRANSFER_IN_FORMAT(GX_TRANSFER_FMT_RGBA8) | GX_TRANSFER_OUT_FORMAT(GX_TRANSFER_FMT_RGB8) | \
+	GX_TRANSFER_SCALING(GX_TRANSFER_SCALE_NO))
+
+// Render targets, allocated with vramAlloc() in gpuInit()
+static u32 *colorBuf, *depthBuf;
+// GPU command buffer, allocated with linearAlloc() in gpuInit()
+static u32 *cmdBuf;
+
+void gpuInit(void)
+{
+	// Allocates the render targets and the command buffer, then brings up the GPU
+	enum
+	{
+		FB_BYTES     = 400*240*4, // one 400x240 buffer at 4 bytes per pixel
+		CMDBUF_WORDS = 0x40000,   // command buffer size in 32-bit words
+	};
+
+	colorBuf = vramAlloc(FB_BYTES);
+	depthBuf = vramAlloc(FB_BYTES);
+	cmdBuf = linearAlloc(CMDBUF_WORDS*4);
+
+	GPU_Init(NULL);
+	GPU_Reset(NULL, cmdBuf, CMDBUF_WORDS);
+}
+
+// Releases the buffers allocated by gpuInit(), in reverse order of allocation
+void gpuExit(void)
+{
+	linearFree(cmdBuf);
+	vramFree(depthBuf);
+	vramFree(colorBuf);
+}
+
+void gpuClearBuffers(u32 clearColor)
+{
+	// Fills the color buffer with clearColor and the depth buffer with 0, then blocks
+	// until the fill has finished
+	u32* const colorEnd = colorBuf + 240*400;
+	u32* const depthEnd = depthBuf + 240*400;
+
+	GX_SetMemoryFill(NULL,
+		colorBuf, clearColor, colorEnd, GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH,
+		depthBuf, 0,          depthEnd, GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH);
+
+	// Wait for the fill to complete
+	gspWaitForPSC0();
+}
+
+// Records the per-frame GPU state setup: render targets and viewport, depth mapping,
+// culling, stencil, depth test, blending, alpha test, and a pass-through configuration
+// for all six texture environment stages. Scene code is expected to override whatever
+// stages it needs afterwards.
+void gpuFrameBegin(void)
+{
+	// Configure the viewport and the depth linear conversion function
+	GPU_SetViewport(
+		(u32*)osConvertVirtToPhys((u32)depthBuf),
+		(u32*)osConvertVirtToPhys((u32)colorBuf),
+		0, 0, 240, 400); // The top screen is physically 240x400 pixels
+	GPU_DepthMap(-1.0f, 0.0f); // calculate the depth value from the Z coordinate in the following way: -1.0*z + 0.0
+
+	// Configure some boilerplate
+	GPU_SetFaceCulling(GPU_CULL_BACK_CCW);
+	GPU_SetStencilTest(false, GPU_ALWAYS, 0x00, 0xFF, 0x00);
+	GPU_SetStencilOp(GPU_KEEP, GPU_KEEP, GPU_KEEP);
+	GPU_SetBlendingColor(0,0,0,0);
+	// Depth test passes on GPU_GREATER; gpuClearBuffers() fills the depth buffer with 0
+	GPU_SetDepthTestAndWriteMask(true, GPU_GREATER, GPU_WRITE_ALL);
+
+	// This is unknown
+	GPUCMD_AddMaskedWrite(GPUREG_0062, 0x1, 0);
+	GPUCMD_AddWrite(GPUREG_0118, 0);
+
+	// Configure alpha blending and test
+	GPU_SetAlphaBlending(GPU_BLEND_ADD, GPU_BLEND_ADD, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA);
+	GPU_SetAlphaTest(false, GPU_ALWAYS, 0x00);
+
+	// Set every texture environment stage (0..5) to pass its input through
+	int i;
+	for (i = 0; i < 6; i ++)
+		GPU_SetDummyTexEnv(i);
+}
+
+// Submits the commands recorded for this frame, waits for the GPU to finish, copies the
+// rendered image into the top screen's (left-eye) framebuffer and rewinds the command
+// buffer so the next frame starts recording from the beginning.
+void gpuFrameEnd(void)
+{
+	// Finish rendering
+	GPU_FinishDrawing();
+	GPUCMD_Finalize();
+	GPUCMD_FlushAndRun(NULL);
+	gspWaitForP3D(); // Wait for the rendering to complete
+
+	// Transfer the GPU output to the framebuffer
+	GX_SetDisplayTransfer(NULL, colorBuf, GX_BUFFER_DIM(240, 400),
+		(u32*)gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), GX_BUFFER_DIM(240, 400),
+		DISPLAY_TRANSFER_FLAGS);
+	gspWaitForPPF(); // Wait for the transfer to complete
+
+	// Reset the command buffer
+	GPUCMD_SetBufferOffset(0);
+}
+
+void GPU_SetDummyTexEnv(int id)
+{
+	// Makes stage `id` forward the previous stage's output unchanged; the same source and
+	// operand selection is used for both the RGB and the alpha channel
+	const u32 sources  = GPU_TEVSOURCES(GPU_PREVIOUS, 0, 0);
+	const u32 operands = GPU_TEVOPERANDS(0, 0, 0);
+
+	GPU_SetTexEnv(id, sources, sources, operands, operands, GPU_REPLACE, GPU_REPLACE, 0xFFFFFFFF);
+}
--- a/examples/graphics/gpu/textured_cube/source/gpu.h
+++ b/examples/graphics/gpu/textured_cube/source/gpu.h
@ -0,0 +1,26 @@
+/*
+ * Bare-bones simplistic GPU wrapper
+ * This library is common to all libctru GPU examples
+ */
+
+#pragma once
+#include <string.h>
+#include <3ds.h>
+#include "3dmath.h"
+
+// Allocates the color/depth/command buffers and initializes the GPU
+void gpuInit(void);
+// Frees the buffers allocated by gpuInit()
+void gpuExit(void);
+
+// Fills the color buffer with clearColor and the depth buffer with 0 (blocks until done)
+void gpuClearBuffers(u32 clearColor);
+
+// Bracket the rendering of one frame: gpuFrameBegin() records the default GPU state,
+// gpuFrameEnd() runs the recorded commands and copies the result to the top screen
+void gpuFrameBegin(void);
+void gpuFrameEnd(void);
+
+// Configures the specified fixed-function fragment shading substage to be a no-operation
+void GPU_SetDummyTexEnv(int id);
+
+// Uploads a 4x4 matrix as four consecutive float vector uniforms starting at `location`
+static inline void GPU_SetFloatUniformMatrix(GPU_SHADER_TYPE type, int location, matrix_4x4* matrix)
+{
+	u32* rawWords = (u32*)matrix;
+	GPU_SetFloatUniform(type, location, rawWords, 4);
+}
--- a/examples/graphics/gpu/textured_cube/source/main.c
+++ b/examples/graphics/gpu/textured_cube/source/main.c
@ -0,0 +1,244 @@
+/*
+ * ~~ Simple libctru GPU textured cube example ~~
+ * This example demonstrates the basics of using the PICA200 in a 3DS homebrew
+ * application in order to render a basic scene consisting of a rotating
+ * textured cube which is also shaded using a simple shading algorithm.
+ * The shading algorithm is explained in the vertex shader source code.
+ */
+
+#include "gpu.h"
+#include "vshader_shbin.h"
+#include "kitten_bin.h"
+
+#define CLEAR_COLOR 0x68B0D8FF
+
+// One entry of the vertex buffer: position, texture coordinate and normal, in that order
+typedef struct
+{
+	float position[3];
+	float texcoord[2];
+	float normal[3];
+} vertex;
+
+// Cube geometry: 6 faces x 2 triangles x 3 vertices, listed without an index
+// buffer, so the two corners shared by a face's triangles appear twice.
+// Each entry is { position, texcoord, normal }. The cube spans -0.5..+0.5 on
+// every axis, and all vertices of a face carry that face's axis-aligned unit normal.
+// Face names: P/M = plus/minus side of the X, Y or Z axis.
+static const vertex vertex_list[] =
+{
+	// First face (PZ)
+	// First triangle
+	{ {-0.5f, -0.5f, +0.5f}, {0.0f, 0.0f}, {0.0f, 0.0f, +1.0f} },
+	{ {+0.5f, -0.5f, +0.5f}, {1.0f, 0.0f}, {0.0f, 0.0f, +1.0f} },
+	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, 0.0f, +1.0f} },
+	// Second triangle
+	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, 0.0f, +1.0f} },
+	{ {-0.5f, +0.5f, +0.5f}, {0.0f, 1.0f}, {0.0f, 0.0f, +1.0f} },
+	{ {-0.5f, -0.5f, +0.5f}, {0.0f, 0.0f}, {0.0f, 0.0f, +1.0f} },
+
+	// Second face (MZ)
+	// First triangle
+	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, 0.0f, -1.0f} },
+	{ {-0.5f, +0.5f, -0.5f}, {1.0f, 0.0f}, {0.0f, 0.0f, -1.0f} },
+	{ {+0.5f, +0.5f, -0.5f}, {1.0f, 1.0f}, {0.0f, 0.0f, -1.0f} },
+	// Second triangle
+	{ {+0.5f, +0.5f, -0.5f}, {1.0f, 1.0f}, {0.0f, 0.0f, -1.0f} },
+	{ {+0.5f, -0.5f, -0.5f}, {0.0f, 1.0f}, {0.0f, 0.0f, -1.0f} },
+	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, 0.0f, -1.0f} },
+
+	// Third face (PX)
+	// First triangle
+	{ {+0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {+1.0f, 0.0f, 0.0f} },
+	{ {+0.5f, +0.5f, -0.5f}, {1.0f, 0.0f}, {+1.0f, 0.0f, 0.0f} },
+	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {+1.0f, 0.0f, 0.0f} },
+	// Second triangle
+	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {+1.0f, 0.0f, 0.0f} },
+	{ {+0.5f, -0.5f, +0.5f}, {0.0f, 1.0f}, {+1.0f, 0.0f, 0.0f} },
+	{ {+0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {+1.0f, 0.0f, 0.0f} },
+
+	// Fourth face (MX)
+	// First triangle
+	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {-1.0f, 0.0f, 0.0f} },
+	{ {-0.5f, -0.5f, +0.5f}, {1.0f, 0.0f}, {-1.0f, 0.0f, 0.0f} },
+	{ {-0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {-1.0f, 0.0f, 0.0f} },
+	// Second triangle
+	{ {-0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {-1.0f, 0.0f, 0.0f} },
+	{ {-0.5f, +0.5f, -0.5f}, {0.0f, 1.0f}, {-1.0f, 0.0f, 0.0f} },
+	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {-1.0f, 0.0f, 0.0f} },
+
+	// Fifth face (PY)
+	// First triangle
+	{ {-0.5f, +0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, +1.0f, 0.0f} },
+	{ {-0.5f, +0.5f, +0.5f}, {1.0f, 0.0f}, {0.0f, +1.0f, 0.0f} },
+	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, +1.0f, 0.0f} },
+	// Second triangle
+	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, +1.0f, 0.0f} },
+	{ {+0.5f, +0.5f, -0.5f}, {0.0f, 1.0f}, {0.0f, +1.0f, 0.0f} },
+	{ {-0.5f, +0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, +1.0f, 0.0f} },
+
+	// Sixth face (MY)
+	// First triangle
+	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, -1.0f, 0.0f} },
+	{ {+0.5f, -0.5f, -0.5f}, {1.0f, 0.0f}, {0.0f, -1.0f, 0.0f} },
+	{ {+0.5f, -0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, -1.0f, 0.0f} },
+	// Second triangle
+	{ {+0.5f, -0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, -1.0f, 0.0f} },
+	{ {-0.5f, -0.5f, +0.5f}, {0.0f, 1.0f}, {0.0f, -1.0f, 0.0f} },
+	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, -1.0f, 0.0f} },
+};
+
+// Number of vertices in vertex_list (an element count, not a byte size)
+#define vertex_list_count (sizeof(vertex_list)/sizeof(vertex_list[0]))
+
+// Parsed shader binary and the shader program built from it (see sceneInit)
+static DVLB_s* vshader_dvlb;
+static shaderProgram_s program;
+
+// Uniform locations, looked up by name in sceneInit
+static int uLoc_projection;
+static int uLoc_modelView;
+static int uLoc_lightVec;
+static int uLoc_lightHalfVec;
+static int uLoc_lightClr;
+static int uLoc_material;
+
+// Projection matrix, computed once in sceneInit
+static matrix_4x4 projection;
+
+// Material description, one row per lighting term; uploaded as the "material" uniform.
+// NOTE(review): the odd-one-out component comes first in every row, which suggests
+// that rows are stored in reverse (w, z, y, x) order -- confirm against 3dmath.h
+static matrix_4x4 material =
+{
+	{
+	{ { 0.0f, 0.2f, 0.2f, 0.2f } }, // Ambient
+	{ { 0.0f, 0.4f, 0.4f, 0.4f } }, // Diffuse
+	{ { 0.0f, 0.8f, 0.8f, 0.8f } }, // Specular
+	{ { 1.0f, 0.0f, 0.0f, 0.0f } }, // Emission
+	}
+};
+
+// Linear-memory copies of the vertex list and of the texture (allocated in sceneInit)
+static void* vbo_data;
+static void* tex_data;
+
+// Current rotation angles of the cube, in radians
+static float angleX = 0.0f;
+static float angleY = 0.0f;
+
+// One-time scene setup: shader program, uniform locations, projection matrix,
+// and linear-memory copies of the vertex list and the texture.
+static void sceneInit(void)
+{
+	// Parse the embedded shader binary and use its first DVLE as the vertex shader
+	vshader_dvlb = DVLB_ParseFile((u32*)vshader_shbin, vshader_shbin_size);
+	shaderProgramInit(&program);
+	shaderProgramSetVsh(&program, &vshader_dvlb->DVLE[0]);
+
+	// Look up every uniform by name and store the result in its uLoc_* variable
+	const struct { int* slot; const char* name; } uniforms[] =
+	{
+		{ &uLoc_projection,   "projection"   },
+		{ &uLoc_modelView,    "modelView"    },
+		{ &uLoc_lightVec,     "lightVec"     },
+		{ &uLoc_lightHalfVec, "lightHalfVec" },
+		{ &uLoc_lightClr,     "lightClr"     },
+		{ &uLoc_material,     "material"     },
+	};
+	unsigned int idx;
+	for (idx = 0; idx < sizeof(uniforms)/sizeof(uniforms[0]); idx ++)
+		*uniforms[idx].slot = shaderInstanceGetUniformLocation(program.vertexShader, uniforms[idx].name);
+
+	// Compute the projection matrix (80 degrees expressed in radians, 400/240 ratio)
+	m4x4_persp_tilt(&projection, 80.0f*M_PI/180.0f, 400.0f/240.0f, 0.01f, 1000.0f);
+
+	// Create the VBO (vertex buffer object): a linear-memory copy of vertex_list
+	vbo_data = linearAlloc(sizeof(vertex_list));
+	memcpy(vbo_data, vertex_list, sizeof(vertex_list));
+
+	// Copy the embedded texture into linear memory as well
+	tex_data = linearAlloc(kitten_bin_size);
+	memcpy(tex_data, kitten_bin, kitten_bin_size);
+}
+
+// Queues everything needed to draw the cube for one frame: shader binding,
+// texture environment, texture unit, vertex attributes, uniforms and the draw
+// call itself. Also advances the rotation angles for the next frame.
+static void sceneRender(void)
+{
+	// Description of the single vertex buffer used by this scene
+	u32 bufferOffsets[]      = { 0x0 };   // Buffer offsets (placeholders)
+	u64 bufferPermutations[] = { 0x210 }; // Attribute permutations for each buffer (identity)
+	u8  bufferAttribCounts[] = { 3 };     // Number of attributes for each buffer
+
+	// Light parameters uploaded to the vertex shader
+	float lightVec[]     = { 0.0f, -1.0f, 0.0f, 0.0f };
+	float lightHalfVec[] = { 0.0f, -1.0f, 0.0f, 0.0f };
+	float lightClr[]     = { 1.0f,  1.0f, 1.0f, 1.0f };
+
+	matrix_4x4 modelView;
+
+	// Bind the shader program
+	shaderProgramUse(&program);
+
+	// First fragment shading substage: modulate the texture color with the vertex
+	// color (which the vertex shader computes using a lighting algorithm)
+	// See https://www.opengl.org/sdk/docs/man2/xhtml/glTexEnv.xml for more insight
+	GPU_SetTexEnv(0,
+		GPU_TEVSOURCES(GPU_TEXTURE0, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // RGB channels
+		GPU_TEVSOURCES(GPU_TEXTURE0, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // Alpha
+		GPU_TEVOPERANDS(0, 0, 0), // RGB
+		GPU_TEVOPERANDS(0, 0, 0), // Alpha
+		GPU_MODULATE, GPU_MODULATE, // RGB, Alpha
+		0xFFFFFFFF);
+
+	// Enable and configure the first texture unit (64x64 RGBA8 texture)
+	GPU_SetTextureEnable(GPU_TEXUNIT0);
+	u32* texturePhys = (u32*)osConvertVirtToPhys((u32)tex_data);
+	GPU_SetTexture(
+		GPU_TEXUNIT0,
+		texturePhys,
+		64, // Width
+		64, // Height
+		GPU_TEXTURE_MAG_FILTER(GPU_LINEAR) | GPU_TEXTURE_WRAP_S(GPU_REPEAT) | GPU_TEXTURE_WRAP_T(GPU_REPEAT), // Flags
+		GPU_RGBA8 // Pixel format
+	);
+
+	// Configure the "attribute buffers" (that is, the vertex input buffers)
+	u32* vboPhys = (u32*)osConvertVirtToPhys((u32)vbo_data);
+	GPU_SetAttributeBuffers(
+		3, // Number of inputs per vertex
+		vboPhys, // Location of the VBO
+		GPU_ATTRIBFMT(0, 3, GPU_FLOAT) | // Input 0: three floats
+		GPU_ATTRIBFMT(1, 2, GPU_FLOAT) | // Input 1: two floats
+		GPU_ATTRIBFMT(2, 3, GPU_FLOAT),  // Input 2: three floats
+		0xFFC, // Unused attribute mask, in our case bits 0~2 are cleared since they are used
+		0x210, // Attribute permutations (here it is the identity, passing each attribute in order)
+		1, // Number of buffers
+		bufferOffsets,
+		bufferPermutations,
+		bufferAttribCounts);
+
+	// Build the modelView matrix from the current angles
+	m4x4_identity(&modelView);
+	m4x4_translate(&modelView, 0.0, 0.0, -2.0 + 0.5*sinf(angleX));
+	m4x4_rotate_x(&modelView, angleX, true);
+	m4x4_rotate_y(&modelView, angleY, true);
+
+	// Advance the rotation for the next frame
+	angleX += M_PI / 180;
+	angleY += M_PI / 360;
+
+	// Upload the uniforms: matrices first, then the light vectors
+	GPU_SetFloatUniformMatrix(GPU_VERTEX_SHADER, uLoc_projection, &projection);
+	GPU_SetFloatUniformMatrix(GPU_VERTEX_SHADER, uLoc_modelView,  &modelView);
+	GPU_SetFloatUniformMatrix(GPU_VERTEX_SHADER, uLoc_material,   &material);
+	GPU_SetFloatUniform(GPU_VERTEX_SHADER, uLoc_lightVec,     (u32*)lightVec,     1);
+	GPU_SetFloatUniform(GPU_VERTEX_SHADER, uLoc_lightHalfVec, (u32*)lightHalfVec, 1);
+	GPU_SetFloatUniform(GPU_VERTEX_SHADER, uLoc_lightClr,     (u32*)lightClr,     1);
+
+	// Draw the VBO
+	GPU_DrawArray(GPU_TRIANGLES, vertex_list_count);
+}
+
+// Releases everything sceneInit created, in the reverse order of creation:
+// texture copy, vertex buffer copy, then the shader program and its binary.
+static void sceneExit(void)
+{
+	// Free the texture
+	linearFree(tex_data);
+
+	// Free the VBO
+	linearFree(vbo_data);
+
+	// Free the shader program
+	shaderProgramFree(&program);
+	DVLB_Free(vshader_dvlb); // The parsed shader binary is released after the program that uses it
+}
+
+// Entry point: initializes graphics and the scene, renders one frame per
+// VBlank until the applet loop ends or START is pressed, then tears down.
+int main()
+{
+	// Bring up graphics first, then the GPU wrapper
+	gfxInitDefault();
+	gpuInit();
+
+	// Set up the scene and clear the buffers ahead of the first frame
+	sceneInit();
+	gpuClearBuffers(CLEAR_COLOR);
+
+	// Main loop
+	while (aptMainLoop())
+	{
+		// Synchronize with the start of VBlank, then swap the framebuffers so
+		// that the frame rendered during the previous iteration becomes visible
+		gspWaitForVBlank();
+		gfxSwapBuffersGpu();
+
+		// Read the user input; START breaks out in order to return to hbmenu
+		hidScanInput();
+		if (hidKeysDown() & KEY_START)
+			break;
+
+		// Render the scene, then clear the buffers for the next frame
+		gpuFrameBegin();
+		sceneRender();
+		gpuFrameEnd();
+		gpuClearBuffers(CLEAR_COLOR);
+
+		// Flushing the framebuffers out of the data cache is not necessary
+		// with pure GPU rendering, hence the call is left disabled:
+		//gfxFlushBuffers();
+	}
+
+	// Tear down in the reverse order of initialization
+	sceneExit();
+	gpuExit();
+	gfxExit();
+	return 0;
+}
--- a/examples/graphics/gpu/textured_cube/source/vshader.pica
+++ b/examples/graphics/gpu/textured_cube/source/vshader.pica
@ -0,0 +1,90 @@
+; Example PICA200 vertex shader
+
+; Uniforms
+; projection and modelView are 4-vector arrays used as matrices (one dp4 per element in main)
+.fvec projection[4], modelView[4]
+; Light direction, light half vector, light color and the 4-entry material table
+.fvec lightVec, lightHalfVec, lightClr, material[4]
+.alias mat_amb material[0] ; Ambient color
+.alias mat_dif material[1] ; Diffuse color
+.alias mat_spe material[2] ; Specular color
+.alias mat_emi material[3] ; Emission color
+
+; Constants
+; Only the x (0.0) and y (1.0) components are given aliases below
+.constf myconst(0.0, 1.0, -1.0, -0.5)
+.alias  zeros myconst.xxxx ; Vector full of zeros
+.alias  ones  myconst.yyyy ; Vector full of ones
+
+; Outputs
+.out outpos position
+.out outtc0 texcoord0
+.out outclr color
+
+; Inputs (defined as aliases for convenience)
+.alias inpos v0 ; Vertex position
+.alias intex v1 ; Texture coordinate
+.alias innrm v2 ; Normal vector
+
+; Vertex shader entry point.
+; Transforms the position by modelView and then projection, passes the texture
+; coordinate through, and computes a per-vertex color from the emission,
+; specular, diffuse and ambient material terms.
+; r0, r1 and r2 are scratch registers that are reused between the sections below.
+.proc main
+	; Force the w component of inpos to be 1.0
+	mov r0.xyz, inpos
+	mov r0.w,   ones
+
+	; r1 = modelView * inpos
+	dp4 r1.x, modelView[0], r0
+	dp4 r1.y, modelView[1], r0
+	dp4 r1.z, modelView[2], r0
+	dp4 r1.w, modelView[3], r0
+
+	; outpos = projection * r1
+	dp4 outpos.x, projection[0], r1
+	dp4 outpos.y, projection[1], r1
+	dp4 outpos.z, projection[2], r1
+	dp4 outpos.w, projection[3], r1
+
+	; outtc0 = intex
+	mov outtc0, intex
+
+	; Transform the normal vector with the modelView matrix
+	; r1 = normalize(modelView * innrm)
+	; w is forced to 0.0 so that modelView[n].w does not contribute to the result
+	mov r0.xyz, innrm
+	mov r0.w,   zeros
+	dp4 r1.x,   modelView[0], r0
+	dp4 r1.y,   modelView[1], r0
+	dp4 r1.z,   modelView[2], r0
+	mov r1.w,   zeros
+	dp3 r2,     r1, r1 ; r2 = x^2+y^2+z^2 for each component
+	rsq r2,     r2     ; r2 = 1/sqrt(r2)  ''
+	mul r1,     r2, r1 ; r1 = r1*r2
+
+	; Calculate the diffuse level (r0.x) and the shininess level (r0.y)
+	; r0.x = max(0, -dot(lightVec, r1))
+	; r0.y = max(0, dot(-lightHalfVec, r1)) ^ 2
+	dp3 r0.x, lightVec,      r1
+	add r0.x, zeros,         -r0 ; r0.x = 0 - r0.x (only x is written)
+	dp3 r0.y, -lightHalfVec, r1
+	max r0,   zeros,         r0  ; clamp the negative levels to zero
+	mul r0.y, r0,            r0  ; r0.y = r0.y * r0.y (only y is written)
+
+	; Accumulate the vertex color in r1, initializing it to the emission color
+	mov r1, mat_emi
+
+	; r1 += specularColor * lightClr * shininessLevel
+	mul r2, lightClr, r0.yyyy
+	mul r2, mat_spe,  r2
+	add r1, r2,       r1
+
+	; r1 += diffuseColor * lightClr * diffuseLevel
+	mul r2, lightClr, r0.xxxx
+	mul r2, mat_dif,  r2
+	add r1, r2,       r1
+
+	; r1 += ambientColor * lightClr
+	mov r2, lightClr
+	mul r2, mat_amb, r2
+	add r1, r2,      r1
+	
+	; outclr = min(1, r1): only the upper bound is enforced by this instruction
+	min outclr, ones, r1
+
+	; We're finished
+	end
+.end