Merge branch 'master' of github.com:smealum/ctrulib

2015-08-17 21:31:04 -07:00 · 2015-08-17 21:31:04 -07:00 · c49d5f49c2
commit c49d5f49c2
parent 91c98308e2 7d63ab5050
41 changed files with 2276 additions and 1258 deletions
--- a/examples/gpu/README.md
+++ b/examples/gpu/README.md
@ -1,11 +0,0 @@
 gpu
 =======
 Example of how to use the GPU with libctru.
 Before trying to compile, make sure to download aemstro
 ( https://github.com/smealum/aemstro commit: 51bfeef9e1a0149726dca43b50919bd45917015a )
 and set the AEMSTRO environment variable to the path where you put it.
 You'll also need to install Python 3 and have it in your PATH.
--- a/examples/gpu/data/test.vsh
+++ b/examples/gpu/data/test.vsh
@ -1,57 +0,0 @@
 ; setup constants
 	.const c20, 1.0, 0.0, 0.5, 1.0
 ; setup outmap
 	.out o0, result.position, 0xF
 	.out o1, result.color, 0xF
 	.out o2, result.texcoord0, 0x3
 	.out o3, result.texcoord1, 0x3
 	.out o4, result.texcoord2, 0x3
 ; setup uniform map (not required)
 	.uniform c0, c3, projection
 	.uniform c4, c7, modelview
 	.uniform c8, c8, lightDirection
 	.uniform c9, c9, lightAmbient
 	.vsh vmain, end_vmain
 ;code
 	; vertex shader entry point (declared by ".vsh vmain, end_vmain")
 	; the (0xN) suffix on every instruction picks entry N of the .opdesc table
 	; at the bottom of this file (first field = destination write mask,
 	; remaining fields = source swizzles)
 	vmain:
 		; r1.xyz = v0 (descriptor 0x4 only writes xyz)
 		mov r1, v0 (0x4)
 		; r1.w = c20.w (descriptor 0x3 only writes w)
 		mov r1, c20 (0x3)
 		; temp = modvMtx * in.pos
 		; one dp4 per output component, against uniform rows c4..c6 (modelview)
 		dp4 r0, c4, r1 (0x0)
 		dp4 r0, c5, r1 (0x1)
 		dp4 r0, c6, r1 (0x2)
 		; r0.w is taken from the constant instead of from row c7
 		mov r0, c20 (0x3)
 		; result.pos = projMtx * temp
 		; one dp4 per output component, against uniform rows c0..c3 (projection)
 		dp4 o0, c0, r0 (0x0)
 		dp4 o0, c1, r0 (0x1)
 		dp4 o0, c2, r0 (0x2)
 		dp4 o0, c3, r0 (0x3)
 		; result.texcoord = in.texcoord
 		mov o2, v1 (0x5)
 		; texcoord1/texcoord2 are filled from the constant (swizzle yyyw)
 		mov o3, c20 (0x7)
 		mov o4, c20 (0x7)
 		; result.color = crappy lighting
 		; r0.xyz = dot3(lightDirection (c8), v2)
 		dp3 r0, c8, v2 (0x4)
 		; clamp from below with c20.yyyy
 		; NOTE(review): presumably c20.y is 0.0, depends on the .const component order - confirm
 		max r0, c20, r0 (0x9)
 		; o1.xyz = lightAmbient (c9) + lightAmbient * r0
 		mul r0, c9, r0 (0x4)
 		add o1, c9, r0 (0x4)
 		; o1.w = c20.w
 		mov o1, c20 (0x3)
 		nop
 		end
 	end_vmain:
 ;operand descriptors
 	.opdesc x___, xyzw, xyzw ; 0x0
 	.opdesc _y__, xyzw, xyzw ; 0x1
 	.opdesc __z_, xyzw, xyzw ; 0x2
 	.opdesc ___w, xyzw, xyzw ; 0x3
 	.opdesc xyz_, xyzw, xyzw ; 0x4
 	.opdesc xyzw, xyzw, xyzw ; 0x5
 	.opdesc x_zw, xyzw, xyzw ; 0x6
 	.opdesc xyzw, yyyw, xyzw ; 0x7
 	.opdesc xyz_, wwww, wwww ; 0x8
 	.opdesc xyz_, yyyy, xyzw ; 0x9
--- a/examples/gpu/data/texture.bin
+++ b/examples/gpu/data/texture.bin
--- a/examples/gpu/source/_gs.s
+++ b/examples/gpu/source/_gs.s
@ -1,16 +0,0 @@
 .section ".text"
 .arm
 .align 4
 .global _vboMemcpy50
 # void _vboMemcpy50(u32* dst, u32* src)
 # r0 : dst
 # r1 : src
 # fixed size 0x50 (80 bytes)
 #
 # The copy is done as two bursts of 10 words each (2 * 10 * 4 = 0x50).
 # Using {r2-r12} here would move 11 words per burst, i.e. 0x58 bytes,
 # reading and writing 8 bytes past the end of both buffers.
 # r4-r11 are saved and restored; r0 and r1 are left advanced by 0x50.
 _vboMemcpy50:
 	push {r4-r11}
 	ldmia r1!, {r2-r11}
 	stmia r0!, {r2-r11}
 	ldmia r1!, {r2-r11}
 	stmia r0!, {r2-r11}
 	pop {r4-r11}
 	bx lr
--- a/examples/gpu/source/gs.c
+++ b/examples/gpu/source/gs.c
@ -1,432 +0,0 @@
 #include <stdlib.h>
 #include <string.h>
 #include <malloc.h>
 #include <3ds.h>
 #include "gs.h"
 #include "math.h"
 #define BUFFERMATRIXLIST_SIZE (GS_MATRIXSTACK_SIZE*4)
 static void gsInitMatrixStack();
 Handle linearAllocMutex;
 static u32 gsMatrixStackRegisters[GS_MATRIXTYPES];
 typedef struct
 {
 	u32 offset;
 	mtx44 data;
 }bufferMatrix_s;
 bufferMatrix_s bufferMatrixList[BUFFERMATRIXLIST_SIZE];
 int bufferMatrixListLength;
 //----------------------
 //   GS SYSTEM STUFF
 //----------------------
 // Empties bufferMatrixList by resetting its element count to zero
 // (the stored entries themselves are left untouched).
 void initBufferMatrixList()
 {
 	bufferMatrixListLength=0;
 }
 // Sets up the GS layer: matrix stacks, the recorded-matrix list and the
 // mutex guarding the linear allocator. When a shader program is supplied,
 // the uniform locations of "projection" and "modelview" are looked up in its
 // vertex shader and the program is made current.
 void gsInit(shaderProgram_s* shader)
 {
 	gsInitMatrixStack();
 	initBufferMatrixList();
 	svcCreateMutex(&linearAllocMutex, false);

 	//without a shader the default register assignments are kept
 	if(!shader)return;

 	gsMatrixStackRegisters[0]=shaderInstanceGetUniformLocation(shader->vertexShader, "projection");
 	gsMatrixStackRegisters[1]=shaderInstanceGetUniformLocation(shader->vertexShader, "modelview");
 	shaderProgramUse(shader);
 }
 // Shuts the GS layer down by closing the mutex handle created in gsInit.
 // gsLinearAlloc/gsLinearFree wait on that handle, so they must not be
 // called afterwards.
 void gsExit(void)
 {
 	svcCloseHandle(linearAllocMutex);
 }
 // Begins a new frame: rewinds the GPU command buffer offset to 0 and
 // forgets the projection-matrix uploads recorded during the previous frame.
 void gsStartFrame(void)
 {
 	GPUCMD_SetBufferOffset(0);
 	initBufferMatrixList();
 }
 // linearAlloc wrapper that holds linearAllocMutex for the duration of the call.
 // Returns whatever linearAlloc returned.
 void* gsLinearAlloc(size_t size)
 {
 	svcWaitSynchronization(linearAllocMutex, U64_MAX);
 	void* const block=linearAlloc(size);
 	svcReleaseMutex(linearAllocMutex);
 	return block;
 }
 // linearFree wrapper that holds linearAllocMutex for the duration of the call.
 void gsLinearFree(void* mem)
 {
 	svcWaitSynchronization(linearAllocMutex, U64_MAX);
 	linearFree(mem);
 	svcReleaseMutex(linearAllocMutex);
 }
 //----------------------
 //  MATRIX STACK STUFF
 //----------------------
 static mtx44 gsMatrixStacks[GS_MATRIXTYPES][GS_MATRIXSTACK_SIZE];
 static u32 gsMatrixStackRegisters[GS_MATRIXTYPES]={0x00, 0x04};
 static u8 gsMatrixStackOffsets[GS_MATRIXTYPES];
 static bool gsMatrixStackUpdated[GS_MATRIXTYPES];
 static GS_MATRIX gsCurrentMatrixType;
 // Resets every matrix stack to depth 0 with an identity matrix at the bottom,
 // flags each one as needing an upload, and selects GS_PROJECTION as the
 // current matrix type.
 static void gsInitMatrixStack()
 {
 	int type=0;
 	while(type<GS_MATRIXTYPES)
 	{
 		loadIdentity44((float*)gsMatrixStacks[type][0]);
 		gsMatrixStackUpdated[type]=true;
 		gsMatrixStackOffsets[type]=0;
 		type++;
 	}
 	gsCurrentMatrixType=GS_PROJECTION;
 }
 // Returns a pointer to the 16 floats of the matrix currently on top of
 // stack m, or NULL when m is not a valid matrix type.
 float* gsGetMatrix(GS_MATRIX m)
 {
 	if(m>=0 && m<GS_MATRIXTYPES)
 	{
 		return (float*)gsMatrixStacks[m][gsMatrixStackOffsets[m]];
 	}
 	return NULL;
 }
 // Overwrites the top of stack m with the 4x4 matrix at data and marks the
 // stack as needing an upload. Returns 0 on success, -1 when m is not a valid
 // matrix type or data is NULL.
 int gsLoadMatrix(GS_MATRIX m, float* data)
 {
 	const bool validType=(m>=0 && m<GS_MATRIXTYPES);
 	if(!validType || data==NULL)return -1;

 	float* const top=gsGetMatrix(m);
 	memcpy(top, data, sizeof(mtx44));
 	gsMatrixStackUpdated[m]=true;
 	return 0;
 }
 int gsPushMatrix()
 {
 	const GS_MATRIX m=gsCurrentMatrixType;
 	if(m<0 || m>=GS_MATRIXTYPES)return -1;
 	if(gsMatrixStackOffsets[m]<0 || gsMatrixStackOffsets[m]>=GS_MATRIXSTACK_SIZE-1)return -1;
 	float* cur=gsGetMatrix(m);
 	gsMatrixStackOffsets[m]++;
 	memcpy(gsGetMatrix(m), cur, sizeof(mtx44));
 	return 0;
 }
 int gsPopMatrix()
 {
 	const GS_MATRIX m=gsCurrentMatrixType;
 	if(m<0 || m>=GS_MATRIXTYPES)return -1;
 	if(gsMatrixStackOffsets[m]<1 || gsMatrixStackOffsets[m]>=GS_MATRIXSTACK_SIZE)return -1;
 	gsMatrixStackOffsets[m]--;
 	gsMatrixStackUpdated[m]=true;
 	return 0;
 }
 // Selects which matrix stack the transform functions operate on.
 // Returns 0 on success, -1 (leaving the selection unchanged) for an invalid m.
 int gsMatrixMode(GS_MATRIX m)
 {
 	if(m>=0 && m<GS_MATRIXTYPES)
 	{
 		gsCurrentMatrixType=m;
 		return 0;
 	}
 	return -1;
 }
 //------------------------
 // MATRIX TRANSFORM STUFF
 //------------------------
 // Replaces the current matrix with (current * data) and marks it as needing
 // an upload. Returns 0 on success, -1 when data is NULL.
 int gsMultMatrix(float* data)
 {
 	if(data==NULL)return -1;

 	float* const current=gsGetMatrix(gsCurrentMatrixType);
 	mtx44 product;
 	//multiply into a scratch matrix first, then copy the result back
 	multMatrix44(current, data, (float*)product);
 	memcpy(current, (float*)product, sizeof(mtx44));

 	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
 	return 0;
 }
 // Resets the current matrix to identity and marks it as needing an upload.
 void gsLoadIdentity()
 {
 	float* const current=gsGetMatrix(gsCurrentMatrixType);
 	loadIdentity44(current);
 	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
 }
 // Overwrites the current matrix with the projection built by
 // initProjectionMatrix from the given parameters and marks it as needing
 // an upload.
 void gsProjectionMatrix(float fovy, float aspect, float near, float far)
 {
 	float* const current=gsGetMatrix(gsCurrentMatrixType);
 	initProjectionMatrix(current, fovy, aspect, near, far);
 	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
 }
 // Applies rotateMatrixX (angle x, post-multiplied) to the current matrix
 // and marks it as needing an upload.
 void gsRotateX(float x)
 {
 	float* const current=gsGetMatrix(gsCurrentMatrixType);
 	rotateMatrixX(current, x, false);
 	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
 }
 // Applies rotateMatrixY (angle y, post-multiplied) to the current matrix
 // and marks it as needing an upload.
 void gsRotateY(float y)
 {
 	float* const current=gsGetMatrix(gsCurrentMatrixType);
 	rotateMatrixY(current, y, false);
 	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
 }
 // Applies rotateMatrixZ (angle z, post-multiplied) to the current matrix
 // and marks it as needing an upload.
 void gsRotateZ(float z)
 {
 	float* const current=gsGetMatrix(gsCurrentMatrixType);
 	rotateMatrixZ(current, z, false);
 	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
 }
 // Applies scaleMatrix with factors (x, y, z) to the current matrix and
 // marks it as needing an upload.
 void gsScale(float x, float y, float z)
 {
 	float* const current=gsGetMatrix(gsCurrentMatrixType);
 	scaleMatrix(current, x, y, z);
 	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
 }
 // Applies translateMatrix with offset (x, y, z) to the current matrix and
 // marks it as needing an upload.
 void gsTranslate(float x, float y, float z)
 {
 	float* const current=gsGetMatrix(gsCurrentMatrixType);
 	translateMatrix(current, x, y, z);
 	gsMatrixStackUpdated[gsCurrentMatrixType]=true;
 }
 //----------------------
 // MATRIX RENDER STUFF
 //----------------------
 // Uploads the 4x4 matrix m to four consecutive vertex-shader float uniforms
 // starting at startreg. Each row of four floats is sent in reversed
 // component order (w, z, y, x).
 static void gsSetUniformMatrix(u32 startreg, float* m)
 {
 	float param[16];
 	int row, col;
 	for(row=0; row<4; row++)
 	{
 		for(col=0; col<4; col++)
 		{
 			//mirror the row: slot 0 gets component 3, slot 3 gets component 0
 			param[row*4+col]=m[row*4+(3-col)];
 		}
 	}
 	GPU_SetFloatUniform(GPU_VERTEX_SHADER, startreg, (u32*)param, 4);
 }
 static int gsUpdateTransformation()
 {
 	GS_MATRIX m;
 	for(m=0; m<GS_MATRIXTYPES; m++)
 	{
 		if(gsMatrixStackUpdated[m])
 		{
 			if(m==GS_PROJECTION && bufferMatrixListLength<BUFFERMATRIXLIST_SIZE)
 			{
 				GPUCMD_GetBuffer(NULL, NULL, &bufferMatrixList[bufferMatrixListLength].offset);
 				memcpy(bufferMatrixList[bufferMatrixListLength].data, gsGetMatrix(m), sizeof(mtx44));
 				bufferMatrixListLength++;
 			}
 			gsSetUniformMatrix(gsMatrixStackRegisters[m], gsGetMatrix(m));
 			gsMatrixStackUpdated[m]=false;
 		}
 	}
 	return 0;
 }
 // Rewrites, inside the current GPU command buffer, every projection-matrix
 // upload recorded in bufferMatrixList so that it uploads
 // (recorded matrix * transformation) instead.
 // The write offset of the command buffer is saved first and restored at the
 // end, so commands added afterwards continue where the buffer left off.
 void gsAdjustBufferMatrices(mtx44 transformation)
 {
 	int i;
 	u32* buffer;
 	u32 offset;
 	// remember where the buffer currently ends
 	GPUCMD_GetBuffer(&buffer, NULL, &offset);
 	for(i=0; i<bufferMatrixListLength; i++)
 	{
 		u32 o=bufferMatrixList[i].offset;
 		if(o+2<offset) //TODO : better check, need to account for param size
 		{
 			mtx44 newMatrix;
 			// move the write position back to the recorded upload and emit it again
 			GPUCMD_SetBufferOffset(o);
 			multMatrix44((float*)bufferMatrixList[i].data, (float*)transformation, (float*)newMatrix);
 			gsSetUniformMatrix(gsMatrixStackRegisters[GS_PROJECTION], (float*)newMatrix);
 		}
 	}
 	// restore the original end of the buffer
 	GPUCMD_SetBufferOffset(offset);
 }
 //----------------------
 //      VBO STUFF
 //----------------------
 // Puts vbo into a well-defined empty state: no vertex storage, no
 // precomputed commands and all counters at zero. Nothing is freed here.
 // Returns 0 on success, -1 when vbo is NULL.
 int gsVboInit(gsVbo_s* vbo)
 {
 	if(!vbo)return -1;
 	vbo->data=NULL;
 	vbo->currentSize=0;
 	vbo->maxSize=0;
 	//also reset the vertex count so it is never left stale or uninitialized
 	vbo->numVertices=0;
 	vbo->commands=NULL;
 	vbo->commandsSize=0;
 	return 0;
 }
 // Allocates size bytes of vertex storage for vbo through gsLinearAlloc and
 // resets its fill counters.
 // Returns 0 on success, -1 when vbo is NULL or the allocation failed; in the
 // failure case the vbo is left empty (data NULL, maxSize 0) so that later
 // calls such as gsVboAddData/gsVboDraw reject it instead of using it.
 int gsVboCreate(gsVbo_s* vbo, u32 size)
 {
 	if(!vbo)return -1;
 	vbo->numVertices=0;
 	vbo->currentSize=0;
 	vbo->data=gsLinearAlloc(size);
 	if(!vbo->data)
 	{
 		vbo->maxSize=0;
 		return -1;
 	}
 	vbo->maxSize=size;
 	return 0;
 }
 // Returns the address currentSize bytes into the vbo's data, i.e. where the
 // next gsVboAddData call will write. Returns NULL when vbo is NULL.
 void* gsVboGetOffset(gsVbo_s* vbo)
 {
 	if(vbo==NULL)return NULL;
 	u8* const base=(u8*)vbo->data;
 	return (void*)&base[vbo->currentSize];
 }
 // Appends size bytes from data to the vbo and adds units to its vertex count.
 // Returns 0 on success, -1 when an argument is NULL/zero, the vbo has no
 // storage, or the remaining capacity is smaller than size.
 int gsVboAddData(gsVbo_s* vbo, void* data, u32 size, u32 units)
 {
 	if(!vbo || !data || !size)return -1;
 	//a vbo whose allocation failed (or that was never created) has no storage
 	if(!vbo->data)return -1;
 	//capacity check done entirely in unsigned arithmetic; the first test
 	//keeps the subtraction from wrapping around
 	if(vbo->currentSize>vbo->maxSize)return -1;
 	if(size>vbo->maxSize-vbo->currentSize)return -1;
 	memcpy(gsVboGetOffset(vbo), data, size);
 	vbo->currentSize+=size;
 	vbo->numVertices+=units;
 	return 0;
 }
 // Currently a no-op apart from the NULL check: the data-cache flush it used
 // to perform is commented out.
 // Returns 0, or -1 when vbo is NULL.
 int gsVboFlushData(gsVbo_s* vbo)
 {
 	if(!vbo)return -1;
 	//unnecessary if we use flushAndRun
 	// GSPGPU_FlushDataCache(NULL, vbo->data, vbo->currentSize);
 	return 0;
 }
 // Releases the vbo's precomputed command list (heap) and vertex storage
 // (linear heap), then resets the structure through gsVboInit.
 // Returns 0 on success, -1 when vbo is NULL.
 int gsVboDestroy(gsVbo_s* vbo)
 {
 	if(vbo==NULL)return -1;

 	if(vbo->commands!=NULL)
 	{
 		free(vbo->commands);
 	}
 	if(vbo->data!=NULL)
 	{
 		gsLinearFree(vbo->data);
 	}

 	gsVboInit(vbo);
 	return 0;
 }
 extern u32 debugValue[];
 // Appends to the current GPU command buffer the raw register writes that
 // draw n vertices of the given primitive type from the vertex data at data.
 // data is a virtual address; it is converted to a physical address and
 // shifted right by 3 before being written.
 // NOTE(review): the registers without a comment below are undocumented here;
 // their meaning is not established by this file.
 void GPU_DrawArrayDirectly(GPU_Primitive_t primitive, u8* data, u32 n)
 {
 	//set attribute buffer address
 	GPUCMD_AddSingleParam(0x000F0200, (osConvertVirtToPhys((u32)data))>>3);
 	//set primitive type
 	GPUCMD_AddSingleParam(0x0002025E, primitive);
 	GPUCMD_AddSingleParam(0x0002025F, 0x00000001);
 	//index buffer not used for drawArrays but 0x000F0227 still required
 	GPUCMD_AddSingleParam(0x000F0227, 0x80000000);
 	//pass number of vertices
 	GPUCMD_AddSingleParam(0x000F0228, n);
 	GPUCMD_AddSingleParam(0x00010253, 0x00000001);
 	GPUCMD_AddSingleParam(0x00010245, 0x00000000);
 	GPUCMD_AddSingleParam(0x000F022E, 0x00000001);
 	GPUCMD_AddSingleParam(0x00010245, 0x00000001);
 	GPUCMD_AddSingleParam(0x000F0231, 0x00000001);
 	// GPUCMD_AddSingleParam(0x000F0111, 0x00000001); //breaks stuff
 }
 // Builds the draw command list for vbo once and caches it in vbo->commands
 // (vbo->commandsSize is its length in 32-bit words).
 // The commands are generated by temporarily pointing the GPU command buffer
 // at a static scratch array; the caller's buffer is restored before any
 // return past that point, including the allocation-failure path.
 // Returns 0 on success, -1 when vbo is NULL, the commands already exist, or
 // the allocation failed (commands stays NULL and commandsSize is reset to 0).
 //not thread safe
 int gsVboPrecomputeCommands(gsVbo_s* vbo)
 {
 	if(!vbo || vbo->commands)return -1;
 	static u32 tmpBuffer[128];
 	u32* savedAdr; u32 savedSize, savedOffset;
 	GPUCMD_GetBuffer(&savedAdr, &savedSize, &savedOffset);
 	GPUCMD_SetBuffer(tmpBuffer, 128, 0);
 	GPU_DrawArrayDirectly(GPU_TRIANGLES, vbo->data, vbo->numVertices);
 	GPUCMD_GetBuffer(NULL, NULL, &vbo->commandsSize);
 	//switch back to the real command buffer before anything can fail
 	GPUCMD_SetBuffer(savedAdr, savedSize, savedOffset);
 	vbo->commands=memalign(0x4, vbo->commandsSize*4);
 	if(!vbo->commands)
 	{
 		vbo->commandsSize=0;
 		return -1;
 	}
 	memcpy(vbo->commands, tmpBuffer, vbo->commandsSize*4);
 	return 0;
 }
 extern u32* gpuCmdBuf;
 extern u32 gpuCmdBufSize;
 extern u32 gpuCmdBufOffset;
 void _vboMemcpy50(u32* dst, u32* src);
 // Appends size 32-bit words from cmd to the current GPU command buffer and
 // advances the buffer offset. An 0x50-byte list takes the fixed-size copy
 // routine, everything else goes through memcpy.
 // Nothing is written when cmd is NULL, size is 0, or the words would not
 // fit in the remaining space of the buffer (gpuCmdBufSize is in words).
 void _GPUCMD_AddRawCommands(u32* cmd, u32 size)
 {
 	if(!cmd || !size)return;
 	//refuse to write past the end of the command buffer
 	if(gpuCmdBufOffset>gpuCmdBufSize || size>gpuCmdBufSize-gpuCmdBufOffset)return;
 	if(size*4==0x50)_vboMemcpy50(&gpuCmdBuf[gpuCmdBufOffset], cmd);
 	else memcpy(&gpuCmdBuf[gpuCmdBufOffset], cmd, size*4);
 	gpuCmdBufOffset+=size;
 }
 // Draws the vbo as triangles: uploads any modified matrices, makes sure the
 // cached draw commands exist, then appends them to the command buffer. If
 // no cached commands are available the draw commands are emitted directly.
 // Returns 0 on success, -1 when vbo is NULL, has no data, or is empty.
 int gsVboDraw(gsVbo_s* vbo)
 {
 	if(!vbo)return -1;
 	if(!vbo->data || !vbo->currentSize || !vbo->maxSize)return -1;

 	gsUpdateTransformation();
 	//returns early on its own when the commands were already computed
 	gsVboPrecomputeCommands(vbo);

 	if(!vbo->commands)
 	{
 		GPU_DrawArrayDirectly(GPU_TRIANGLES, vbo->data, vbo->numVertices);
 		return 0;
 	}

 	_GPUCMD_AddRawCommands(vbo->commands, vbo->commandsSize);
 	return 0;
 }
--- a/examples/gpu/source/gs.h
+++ b/examples/gpu/source/gs.h
@ -1,59 +0,0 @@
 #ifndef GS_H
 #define GS_H
 #include <3ds.h>
 #include "math.h"
 #define GS_MATRIXSTACK_SIZE (8)
 typedef enum
 {
 	GS_PROJECTION = 0,
 	GS_MODELVIEW = 1,
 	GS_MATRIXTYPES
 }GS_MATRIX;
 // Vertex buffer object handled by the gsVbo* functions.
 typedef struct
 {
 	u8* data; // vertex storage
 	u32 currentSize; // in bytes
 	u32 maxSize; // in bytes
 	u32 numVertices; // number of vertices added so far
 	u32* commands; // cached GPU draw commands, NULL until computed
 	u32 commandsSize; // length of commands, in 32-bit words
 }gsVbo_s;
 void gsInit(shaderProgram_s* shader);
 void gsExit(void);
 void gsStartFrame(void);
 void gsAdjustBufferMatrices(mtx44 transformation);
 void* gsLinearAlloc(size_t size);
 void gsLinearFree(void* mem);
 float* gsGetMatrix(GS_MATRIX m);
 int gsLoadMatrix(GS_MATRIX m, float* data);
 int gsPushMatrix();
 int gsPopMatrix();
 int gsMatrixMode(GS_MATRIX m);
 void gsLoadIdentity();
 void gsProjectionMatrix(float fovy, float aspect, float near, float far);
 void gsRotateX(float x);
 void gsRotateY(float y);
 void gsRotateZ(float z);
 void gsScale(float x, float y, float z);
 void gsTranslate(float x, float y, float z);
 int gsMultMatrix(float* data);
 int gsVboInit(gsVbo_s* vbo);
 int gsVboCreate(gsVbo_s* vbo, u32 size);
 int gsVboFlushData(gsVbo_s* vbo);
 int gsVboDestroy(gsVbo_s* vbo);
 int gsVboDraw(gsVbo_s* vbo);
 void* gsVboGetOffset(gsVbo_s* vbo);
 int gsVboAddData(gsVbo_s* vbo, void* data, u32 size, u32 units);
 #endif
--- a/examples/gpu/source/main.c
+++ b/examples/gpu/source/main.c
@ -1,354 +0,0 @@
 ///////////////////////////////////////
 //            GPU example            //
 ///////////////////////////////////////
 //this example is meant to show how to use the GPU to render a 3D object
 //it also shows how to do stereoscopic 3D
 //it uses GS which is a WIP GPU abstraction layer that's currently part of 3DScraft
 //keep in mind GPU reverse engineering is an ongoing effort and our understanding of it is still fairly limited.
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
 #include <3ds.h>
 #include "math.h"
 #include "gs.h"
 #include "test_vsh_shbin.h"
 #include "texture_bin.h"
 //will be moved into ctrulib at some point
 #define CONFIG_3D_SLIDERSTATE (*(float*)0x1FF81080)
 #define RGBA8(r,g,b,a) ((((r)&0xFF)<<24) | (((g)&0xFF)<<16) | (((b)&0xFF)<<8) | (((a)&0xFF)<<0))
 //transfer from GPU output buffer to actual framebuffer flags
 #define DISPLAY_TRANSFER_FLAGS \
 	(GX_TRANSFER_FLIP_VERT(0) | GX_TRANSFER_OUT_TILED(0) | GX_TRANSFER_RAW_COPY(0) | \
 	 GX_TRANSFER_IN_FORMAT(GX_TRANSFER_FMT_RGBA8) | GX_TRANSFER_OUT_FORMAT(GX_TRANSFER_FMT_RGB8) | \
 	 GX_TRANSFER_SCALING(GX_TRANSFER_SCALE_X))
 //shader structure
 DVLB_s* dvlb;
 shaderProgram_s shader;
 //texture data pointer
 u32* texData;
 //vbo structure
 gsVbo_s vbo;
 //GPU framebuffer address
 u32* gpuOut=(u32*)0x1F119400;
 //GPU depth buffer address
 u32* gpuDOut=(u32*)0x1F370800;
 //angle for the vertex lighting (cf test.vsh)
 float lightAngle;
 //object position and rotation angle
 vect3Df_s position, angle;
 //vertex structure
 typedef struct
 {
 	vect3Df_s position;
 	float texcoord[2];
 	vect3Df_s normal;
 }vertex_s;
 //object data (cube)
 //obviously this doesn't have to be defined manually, but we will here for the purposes of the example
 //each line is a vertex : {position.x, position.y, position.z}, {texcoord.t, texcoord.s}, {normal.x, normal.y, normal.z}
 //we're drawing triangles so three lines = one triangle
 const vertex_s modelVboData[]=
 {
 	//first face (PZ)
 		//first triangle
 		{(vect3Df_s){-0.5f, -0.5f, +0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
 		{(vect3Df_s){+0.5f, -0.5f, +0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
 		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
 		//second triangle
 		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
 		{(vect3Df_s){-0.5f, +0.5f, +0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
 		{(vect3Df_s){-0.5f, -0.5f, +0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, +1.0f}},
 	//second face (MZ)
 		//first triangle
 		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
 		{(vect3Df_s){-0.5f, +0.5f, -0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
 		{(vect3Df_s){+0.5f, +0.5f, -0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
 		//second triangle
 		{(vect3Df_s){+0.5f, +0.5f, -0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
 		{(vect3Df_s){+0.5f, -0.5f, -0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
 		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, 0.0f, -1.0f}},
 	//third face (PX)
 		//first triangle
 		{(vect3Df_s){+0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
 		{(vect3Df_s){+0.5f, +0.5f, -0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
 		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
 		//second triangle
 		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
 		{(vect3Df_s){+0.5f, -0.5f, +0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
 		{(vect3Df_s){+0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){+1.0f, 0.0f, 0.0f}},
 	//fourth face (MX)
 		//first triangle
 		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
 		{(vect3Df_s){-0.5f, -0.5f, +0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
 		{(vect3Df_s){-0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
 		//second triangle
 		{(vect3Df_s){-0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
 		{(vect3Df_s){-0.5f, +0.5f, -0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
 		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){-1.0f, 0.0f, 0.0f}},
 	//fifth face (PY)
 		//first triangle
 		{(vect3Df_s){-0.5f, +0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
 		{(vect3Df_s){-0.5f, +0.5f, +0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
 		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
 		//second triangle
 		{(vect3Df_s){+0.5f, +0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
 		{(vect3Df_s){+0.5f, +0.5f, -0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
 		{(vect3Df_s){-0.5f, +0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, +1.0f, 0.0f}},
 	//sixth face (MY)
 		//first triangle
 		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
 		{(vect3Df_s){+0.5f, -0.5f, -0.5f}, (float[]){1.0f, 1.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
 		{(vect3Df_s){+0.5f, -0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
 		//second triangle
 		{(vect3Df_s){+0.5f, -0.5f, +0.5f}, (float[]){1.0f, 0.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
 		{(vect3Df_s){-0.5f, -0.5f, +0.5f}, (float[]){0.0f, 0.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
 		{(vect3Df_s){-0.5f, -0.5f, -0.5f}, (float[]){0.0f, 1.0f}, (vect3Df_s){0.0f, -1.0f, 0.0f}},
 };
 //stolen from staplebutt
 // Configures texture-environment stage num so that both of its combiners
 // use GPU_REPLACE with GPU_PREVIOUS as the first source.
 // NOTE(review): presumably this makes the stage a pass-through of the
 // previous stage's output - confirm against the GPU_SetTexEnv documentation.
 void GPU_SetDummyTexEnv(u8 num)
 {
 	GPU_SetTexEnv(num,
 		GPU_TEVSOURCES(GPU_PREVIOUS, 0, 0),
 		GPU_TEVSOURCES(GPU_PREVIOUS, 0, 0),
 		GPU_TEVOPERANDS(0,0,0),
 		GPU_TEVOPERANDS(0,0,0),
 		GPU_REPLACE,
 		GPU_REPLACE,
 		0xFFFFFFFF);
 }
 // topscreen
 // Records into the current GPU command buffer everything needed to render
 // one frame of the textured, lit cube: fixed-function state, texture
 // environment, texture and vertex attribute layout, lighting uniforms,
 // projection/modelview matrices and finally the VBO draw.
 // Nothing is executed here; the caller finalizes and runs the buffer.
 void renderFrame()
 {
 	GPU_SetViewport((u32*)osConvertVirtToPhys((u32)gpuDOut),(u32*)osConvertVirtToPhys((u32)gpuOut),0,0,240*2,400);
 	GPU_DepthMap(-1.0f, 0.0f);
 	GPU_SetFaceCulling(GPU_CULL_BACK_CCW);
 	GPU_SetStencilTest(false, GPU_ALWAYS, 0x00, 0xFF, 0x00);
 	GPU_SetStencilOp(GPU_KEEP, GPU_KEEP, GPU_KEEP);
 	GPU_SetBlendingColor(0,0,0,0);
 	GPU_SetDepthTestAndWriteMask(true, GPU_GREATER, GPU_WRITE_ALL);
 	GPUCMD_AddMaskedWrite(GPUREG_0062, 0x1, 0);
 	GPUCMD_AddWrite(GPUREG_0118, 0);
 	GPU_SetAlphaBlending(GPU_BLEND_ADD, GPU_BLEND_ADD, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA);
 	GPU_SetAlphaTest(false, GPU_ALWAYS, 0x00);
 	GPU_SetTextureEnable(GPU_TEXUNIT0);
 	// stage 0 modulates texture 0 with the primary (vertex) color;
 	// stages 1-5 are set up by GPU_SetDummyTexEnv
 	GPU_SetTexEnv(0,
 		GPU_TEVSOURCES(GPU_TEXTURE0, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR),
 		GPU_TEVSOURCES(GPU_TEXTURE0, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR),
 		GPU_TEVOPERANDS(0,0,0),
 		GPU_TEVOPERANDS(0,0,0),
 		GPU_MODULATE, GPU_MODULATE,
 		0xFFFFFFFF);
 	GPU_SetDummyTexEnv(1);
 	GPU_SetDummyTexEnv(2);
 	GPU_SetDummyTexEnv(3);
 	GPU_SetDummyTexEnv(4);
 	GPU_SetDummyTexEnv(5);
 	//texturing stuff
 		GPU_SetTexture(
 			GPU_TEXUNIT0, //texture unit
 			(u32*)osConvertVirtToPhys((u32)texData), //data buffer
 			128, //texture width
 			128, //texture height
 			GPU_TEXTURE_MAG_FILTER(GPU_NEAREST) | GPU_TEXTURE_MIN_FILTER(GPU_NEAREST), //texture params
 			GPU_RGBA8 //texture pixel format
 		);
 		// NOTE(review): the "mesh buffer" argument below is the texture address, not
 		// the VBO. GPU_DrawArrayDirectly (gs.c) writes the attribute buffer address
 		// itself when the VBO is drawn, which presumably overrides it - confirm.
 		GPU_SetAttributeBuffers(
 			3, //3 attributes: vertices, texcoords, and normals
 			(u32*)osConvertVirtToPhys((u32)texData), //mesh buffer
 			GPU_ATTRIBFMT(0, 3, GPU_FLOAT) | // GPU Input attribute register 0 (v0): 3 floats (position)
 			GPU_ATTRIBFMT(1, 2, GPU_FLOAT) | // GPU Input attribute register 1 (v1): 2 floats (texcoord)
 			GPU_ATTRIBFMT(2, 3, GPU_FLOAT),  // GPU Input attribute register 2 (v2): 3 floats (normal)
 			0xFFC,
 			0x210,
 			1,
 			(u32[]){0x00000000},
 			(u64[]){0x210},
 			(u8[]){3}
 		);
 	//setup lighting (this is specific to our shader)
 		// uniform components are passed in reversed order (w, z, y, x), and the
 		// light direction is negated
 		vect3Df_s lightDir=vnormf(vect3Df(cos(lightAngle), -1.0f, sin(lightAngle)));
 		GPU_SetFloatUniform(GPU_VERTEX_SHADER, shaderInstanceGetUniformLocation(shader.vertexShader, "lightDirection"), (u32*)(float[]){0.0f, -lightDir.z, -lightDir.y, -lightDir.x}, 1);
 		GPU_SetFloatUniform(GPU_VERTEX_SHADER, shaderInstanceGetUniformLocation(shader.vertexShader, "lightAmbient"), (u32*)(float[]){0.7f, 0.4f, 0.4f, 0.4f}, 1);
 	//initialize projection matrix to standard perspective stuff
 	gsMatrixMode(GS_PROJECTION);
 	gsProjectionMatrix(80.0f*M_PI/180.0f, 240.0f/400.0f, 0.01f, 100.0f);
 	gsRotateZ(M_PI/2); //because framebuffer is sideways...
 	//draw object
 		gsMatrixMode(GS_MODELVIEW);
 		gsPushMatrix();
 			gsTranslate(position.x, position.y, position.z);
 			gsRotateX(angle.x);
 			gsRotateY(angle.y);
 			gsVboDraw(&vbo);
 		gsPopMatrix();
 	GPU_FinishDrawing();
 }
 // Program entry point: initializes graphics, the GPU, the shader, the
 // texture and the cube VBO, then runs the input/render loop until START is
 // pressed. Each frame is rendered once (2D) or twice with shifted projection
 // matrices (stereoscopic 3D, when the 3D slider is above zero).
 // Everything allocated here is released again before returning 0.
 int main(int argc, char** argv)
 {
 	gfxInitDefault();
 	//initialize GPU
 	GPU_Init(NULL);
 	//let GFX know we're ok with doing stereoscopic 3D rendering
 	gfxSet3D(true);
 	//allocate our GPU command buffers
 	//they *have* to be on the linear heap
 	u32 gpuCmdSize=0x40000;
 	u32* gpuCmd=(u32*)linearAlloc(gpuCmdSize*4);
 	u32* gpuCmdRight=(u32*)linearAlloc(gpuCmdSize*4);
 	//actually reset the GPU
 	GPU_Reset(NULL, gpuCmd, gpuCmdSize);
 	//load our vertex shader binary
 	dvlb=DVLB_ParseFile((u32*)test_vsh_shbin, test_vsh_shbin_size);
 	shaderProgramInit(&shader);
 	shaderProgramSetVsh(&shader, &dvlb->DVLE[0]);
 	//initialize GS
 	gsInit(&shader);
 	// Flush the command buffer so that the shader upload gets executed
 	GPUCMD_Finalize();
 	GPUCMD_FlushAndRun(NULL);
 	gspWaitForP3D();
 	//create texture
 	texData=(u32*)linearMemAlign(texture_bin_size, 0x80); //textures need to be 0x80-byte aligned
 	memcpy(texData, texture_bin, texture_bin_size);
 	//create VBO
 	gsVboInit(&vbo);
 	gsVboCreate(&vbo, sizeof(modelVboData));
 	gsVboAddData(&vbo, (void*)modelVboData, sizeof(modelVboData), sizeof(modelVboData)/sizeof(vertex_s));
 	gsVboFlushData(&vbo);
 	//initialize object position and angle
 	position=vect3Df(0.0f, 0.0f, -2.0f);
 	angle=vect3Df(M_PI/4, M_PI/4, 0.0f);
 	//background color (blue)
 	u32 backgroundColor=RGBA8(0x68, 0xB0, 0xD8, 0xFF);
 	while(aptMainLoop())
 	{
 		//get current 3D slider state
 		float slider=CONFIG_3D_SLIDERSTATE;
 		//controls
 		hidScanInput();
 		//START to exit to hbmenu
 		if(keysDown()&KEY_START)break;
 		//A/B to change vertex lighting angle
 		if(keysHeld()&KEY_A)lightAngle+=0.1f;
 		if(keysHeld()&KEY_B)lightAngle-=0.1f;
 		//D-PAD to rotate object
 		if(keysHeld()&KEY_DOWN)angle.x+=0.05f;
 		if(keysHeld()&KEY_UP)angle.x-=0.05f;
 		if(keysHeld()&KEY_LEFT)angle.y+=0.05f;
 		if(keysHeld()&KEY_RIGHT)angle.y-=0.05f;
 		//R/L to bring object closer to or move it further from the camera
 		if(keysHeld()&KEY_R)position.z+=0.1f;
 		if(keysHeld()&KEY_L)position.z-=0.1f;
 		//generate our GPU command buffer for this frame
 		gsStartFrame();
 		renderFrame();
 		GPUCMD_Finalize();
 		if(slider>0.0f)
 		{
 			//new and exciting 3D !
 			//make a copy of left gpu buffer
 			u32 offset; GPUCMD_GetBuffer(NULL, NULL, &offset);
 			memcpy(gpuCmdRight, gpuCmd, offset*4);
 			//setup interaxial
 			float interaxial=slider*0.12f;
 			//adjust left gpu buffer for 3D !
 			{mtx44 m; loadIdentity44((float*)m); translateMatrix((float*)m, -interaxial*0.5f, 0.0f, 0.0f); gsAdjustBufferMatrices(m);}
 			//draw left framebuffer
 			GPUCMD_FlushAndRun(NULL);
 			//while GPU starts drawing the left buffer, adjust right one for 3D !
 			GPUCMD_SetBuffer(gpuCmdRight, gpuCmdSize, offset);
 			{mtx44 m; loadIdentity44((float*)m); translateMatrix((float*)m, interaxial*0.5f, 0.0f, 0.0f); gsAdjustBufferMatrices(m);}
 			//we wait for the left buffer to finish drawing
 			gspWaitForP3D();
 			//transfer from GPU output buffer to the left framebuffer
 			GX_SetDisplayTransfer(NULL, (u32*)gpuOut, GX_BUFFER_DIM(240*2, 400), (u32*)gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), GX_BUFFER_DIM(240*2, 400), DISPLAY_TRANSFER_FLAGS);
 			gspWaitForPPF();
 			//we draw the right buffer, wait for it to finish and then switch back to left one
 			//clear the screen
 			GX_SetMemoryFill(NULL, (u32*)gpuOut, backgroundColor, (u32*)&gpuOut[0x2EE00], GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH , (u32*)gpuDOut, 0x00000000, (u32*)&gpuDOut[0x2EE00], GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH);
 			gspWaitForPSC0();
 			//draw the right framebuffer
 			GPUCMD_FlushAndRun(NULL);
 			gspWaitForP3D();
 			//transfer from GPU output buffer to actual framebuffer
 			GX_SetDisplayTransfer(NULL, (u32*)gpuOut, GX_BUFFER_DIM(240*2, 400), (u32*)gfxGetFramebuffer(GFX_TOP, GFX_RIGHT, NULL, NULL), GX_BUFFER_DIM(240*2, 400), DISPLAY_TRANSFER_FLAGS);
 			gspWaitForPPF();
 			GPUCMD_SetBuffer(gpuCmd, gpuCmdSize, 0);
 		}else{
 			//boring old 2D !
 			//draw the frame
 			GPUCMD_FlushAndRun(NULL);
 			gspWaitForP3D();
 			//transfer from GPU output buffer to actual framebuffer
 			GX_SetDisplayTransfer(NULL, (u32*)gpuOut, GX_BUFFER_DIM(240*2, 400), (u32*)gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), GX_BUFFER_DIM(240*2, 400), DISPLAY_TRANSFER_FLAGS);
 			gspWaitForPPF();
 		}
 		//clear the screen
 		GX_SetMemoryFill(NULL, (u32*)gpuOut, backgroundColor, (u32*)&gpuOut[0x2EE00], GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH, (u32*)gpuDOut, 0x00000000, (u32*)&gpuDOut[0x2EE00], GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH);
 		gspWaitForPSC0();
 		gfxSwapBuffersGpu();
 		gspWaitForEvent(GSPEVENT_VBlank0, true);
 	}
 	//release the VBO while the GS mutex still exists (gsVboDestroy goes through gsLinearFree)
 	gsVboDestroy(&vbo);
 	gsExit();
 	shaderProgramFree(&shader);
 	DVLB_Free(dvlb);
 	//release the linear-heap buffers we allocated above
 	linearFree(texData);
 	linearFree(gpuCmdRight);
 	linearFree(gpuCmd);
 	gfxExit();
 	return 0;
 }
--- a/examples/gpu/source/math.c
+++ b/examples/gpu/source/math.c
@ -1,148 +0,0 @@
 #include <math.h>
 #include <string.h>
 #include "math.h"
 // Writes the 4x4 identity matrix into the 16 floats at m.
 // Does nothing when m is NULL.
 void loadIdentity44(float* m)
 {
 	int i;
 	if(!m)return;
 	//in a flat 4x4 layout the diagonal sits at indices 0, 5, 10 and 15
 	for(i=0;i<16;i++)m[i]=(i%5==0)?1.0f:0.0f;
 }
 // Computes the 4x4 product m = m1 * m2, with all three matrices stored as
 // 16 consecutive floats where element (row j, column i) is at index i+j*4.
 // The product is built in a local scratch matrix and copied out at the end,
 // so m may safely be the same array as m1 and/or m2.
 void multMatrix44(float* m1, float* m2, float* m) //4x4
 {
 	float out[16];
 	int i, j;
 	for(j=0;j<4;j++)
 	{
 		for(i=0;i<4;i++)
 		{
 			out[i+j*4]=(m1[0+j*4]*m2[i+0*4])+(m1[1+j*4]*m2[i+1*4])+(m1[2+j*4]*m2[i+2*4])+(m1[3+j*4]*m2[i+3*4]);
 		}
 	}
 	memcpy(m, out, sizeof(out));
 }
 // Replaces tm with tm * T, where T is the identity matrix with x, y and z
 // stored at flat indices 3, 7 and 11.
 void translateMatrix(float* tm, float x, float y, float z)
 {
 	float shift[16]={
 		1.0f, 0.0f, 0.0f, x,
 		0.0f, 1.0f, 0.0f, y,
 		0.0f, 0.0f, 1.0f, z,
 		0.0f, 0.0f, 0.0f, 1.0f,
 	};
 	float product[16];
 	multMatrix44(tm,shift,product);
 	memcpy(tm,product,sizeof(product));
 }
 // 00 01 02 03
 // 04 05 06 07
 // 08 09 10 11
 // 12 13 14 15
 // Combines tm with a rotation of angle x built in the rows/columns 1 and 2.
 // r selects the multiplication order: false gives tm * R, true gives R * tm.
 // The result is written back into tm.
 void rotateMatrixX(float* tm, float x, bool r)
 {
 	const float c=cos(x);
 	const float s=sin(x);
 	float rot[16]={
 		1.0f, 0.0f, 0.0f, 0.0f,
 		0.0f,    c,    s, 0.0f,
 		0.0f,   -s,    c, 0.0f,
 		0.0f, 0.0f, 0.0f, 1.0f,
 	};
 	float out[16];
 	if(r)multMatrix44(rot,tm,out);
 	else multMatrix44(tm,rot,out);
 	memcpy(tm,out,sizeof(out));
 }
 // Combines tm with a rotation of angle x built in the rows/columns 0 and 2.
 // r selects the multiplication order: false gives tm * R, true gives R * tm.
 // The result is written back into tm.
 void rotateMatrixY(float* tm, float x, bool r)
 {
 	const float c=cos(x);
 	const float s=sin(x);
 	float rot[16]={
 		   c, 0.0f,    s, 0.0f,
 		0.0f, 1.0f, 0.0f, 0.0f,
 		  -s, 0.0f,    c, 0.0f,
 		0.0f, 0.0f, 0.0f, 1.0f,
 	};
 	float out[16];
 	if(r)multMatrix44(rot,tm,out);
 	else multMatrix44(tm,rot,out);
 	memcpy(tm,out,sizeof(out));
 }
 // Combines tm with a rotation of angle x built in the rows/columns 0 and 1.
 // r selects the multiplication order: false gives tm * R, true gives R * tm.
 // The result is written back into tm.
 void rotateMatrixZ(float* tm, float x, bool r)
 {
 	const float c=cos(x);
 	const float s=sin(x);
 	float rot[16]={
 		   c,    s, 0.0f, 0.0f,
 		  -s,    c, 0.0f, 0.0f,
 		0.0f, 0.0f, 1.0f, 0.0f,
 		0.0f, 0.0f, 0.0f, 1.0f,
 	};
 	float out[16];
 	if(r)multMatrix44(rot,tm,out);
 	else multMatrix44(tm,rot,out);
 	memcpy(tm,out,sizeof(out));
 }
 // Scales tm in place: in every group of four floats the first is multiplied
 // by x, the second by y and the third by z; the fourth is left unchanged.
 void scaleMatrix(float* tm, float x, float y, float z)
 {
 	int row;
 	for(row=0;row<4;row++)
 	{
 		float* const r=&tm[row*4];
 		r[0]*=x;
 		r[1]*=y;
 		r[2]*=z;
 	}
 }
 // Writes a perspective projection into m, built from the vertical field of
 // view fovy (passed to tan, so in radians), the aspect ratio and the
 // near/far plane distances. The perspective matrix is pre-multiplied by a
 // second matrix that is identity except for 0.5 / -0.5 at flat indices
 // 0xA / 0xB.
 // NOTE(review): that second matrix presumably remaps the depth range for
 // the GPU - confirm.
 void initProjectionMatrix(float* m, float fovy, float aspect, float near, float far)
 {
 	const float top = near*tan(fovy/2);
 	const float right = (top*aspect);

 	float persp[4*4]={
 		near/right, 0.0f,     0.0f,                      0.0f,
 		0.0f,       near/top, 0.0f,                      0.0f,
 		0.0f,       0.0f,     -(far+near)/(far-near),    -2.0f*(far*near)/(far-near),
 		0.0f,       0.0f,     -1.0f,                     0.0f,
 	};

 	float depthFix[4*4]={
 		1.0f, 0.0f, 0.0f, 0.0f,
 		0.0f, 1.0f, 0.0f, 0.0f,
 		0.0f, 0.0f, 0.5,  -0.5,
 		0.0f, 0.0f, 0.0f, 1.0f,
 	};

 	multMatrix44(depthFix, persp, m);
 }
 // Returns the first three of the four consecutive floats starting at
 // m[i*4]. A NULL m or i >= 4 yields the zero vector.
 vect3Df_s getMatrixColumn(float* m, u8 i)
 {
 	if(m && i<4)
 	{
 		const float* const base=&m[i*4];
 		return vect3Df(base[0],base[1],base[2]);
 	}
 	return vect3Df(0,0,0);
 }
 // Returns the floats at m[i], m[i+4] and m[i+8] (stride of four).
 // A NULL m or i >= 4 yields the zero vector.
 vect3Df_s getMatrixRow(float* m, u8 i)
 {
 	if(m && i<4)
 	{
 		const float* const base=&m[i];
 		return vect3Df(base[0],base[4],base[8]);
 	}
 	return vect3Df(0,0,0);
 }
 // Returns the four consecutive floats starting at m[i*4].
 // A NULL m or i >= 4 yields the zero vector.
 vect4Df_s getMatrixColumn4(float* m, u8 i)
 {
 	if(m && i<4)
 	{
 		const float* const base=&m[i*4];
 		return vect4Df(base[0],base[1],base[2],base[3]);
 	}
 	return vect4Df(0,0,0,0);
 }
 // Returns the floats at m[i], m[i+4], m[i+8] and m[i+12] (stride of four).
 // A NULL m or i >= 4 yields the zero vector.
 vect4Df_s getMatrixRow4(float* m, u8 i)
 {
 	if(m && i<4)
 	{
 		const float* const base=&m[i];
 		return vect4Df(base[0],base[4],base[8],base[12]);
 	}
 	return vect4Df(0,0,0,0);
 }
--- a/examples/gpu/source/math.h
+++ b/examples/gpu/source/math.h
@ -1,144 +0,0 @@
 #ifndef MATH_H
 #define MATH_H
 #include <3ds/types.h>
 #include <math.h>
 typedef float mtx44[4][4];
 typedef float mtx33[3][3];
 typedef struct
 {
 	s32 x, y, z;
 }vect3Di_s;
 static inline vect3Di_s vect3Di(s32 x, s32 y, s32 z)
 {
 	return (vect3Di_s){x,y,z};
 }
 static inline vect3Di_s vaddi(vect3Di_s u, vect3Di_s v)
 {
 	return (vect3Di_s){u.x+v.x,u.y+v.y,u.z+v.z};
 }
 static inline vect3Di_s vsubi(vect3Di_s u, vect3Di_s v)
 {
 	return (vect3Di_s){u.x-v.x,u.y-v.y,u.z-v.z};
 }
 static inline vect3Di_s vmuli(vect3Di_s v, s32 f)
 {
 	return (vect3Di_s){v.x*f,v.y*f,v.z*f};
 }
 typedef struct
 {
 	float x, y, z;
 }vect3Df_s;
 static inline vect3Df_s vect3Df(float x, float y, float z)
 {
 	return (vect3Df_s){x,y,z};
 }
 static inline vect3Df_s vaddf(vect3Df_s u, vect3Df_s v)
 {
 	return (vect3Df_s){u.x+v.x,u.y+v.y,u.z+v.z};
 }
 static inline vect3Df_s vsubf(vect3Df_s u, vect3Df_s v)
 {
 	return (vect3Df_s){u.x-v.x,u.y-v.y,u.z-v.z};
 }
 static inline vect3Df_s vmulf(vect3Df_s v, float f)
 {
 	return (vect3Df_s){v.x*f,v.y*f,v.z*f};
 }
 static inline vect3Df_s vscalef(vect3Df_s v1, vect3Df_s v2)
 {
 	return (vect3Df_s){v1.x*v2.x,v1.y*v2.y,v1.z*v2.z};
 }
 static inline float vmagf(vect3Df_s v)
 {
 	return sqrtf(v.x*v.x+v.y*v.y+v.z*v.z);
 }
 // Euclidean distance between v1 and v2.
 static inline float vdistf(vect3Df_s v1, vect3Df_s v2)
 {
 	const float dx=v1.x-v2.x;
 	const float dy=v1.y-v2.y;
 	const float dz=v1.z-v2.z;
 	return sqrtf(dx*dx+dy*dy+dz*dz);
 }
 // Returns v divided by its length. No check is made for a zero-length
 // input, in which case every component is divided by zero.
 static inline vect3Df_s vnormf(vect3Df_s v)
 {
 	const float len=sqrtf(v.x*v.x+v.y*v.y+v.z*v.z);
 	vect3Df_s unit;
 	unit.x=v.x/len;
 	unit.y=v.y/len;
 	unit.z=v.z/len;
 	return unit;
 }
 typedef struct
 {
 	float x, y, z, w;
 }vect4Df_s;
 static inline vect4Df_s vect4Df(float x, float y, float z, float w)
 {
 	return (vect4Df_s){x,y,z,w};
 }
 static inline vect4Df_s vaddf4(vect4Df_s u, vect4Df_s v)
 {
 	return (vect4Df_s){u.x+v.x,u.y+v.y,u.z+v.z,u.w+v.w};
 }
 static inline vect4Df_s vsubf4(vect4Df_s u, vect4Df_s v)
 {
 	return (vect4Df_s){u.x-v.x,u.y-v.y,u.z-v.z,u.w-v.w};
 }
 static inline vect4Df_s vmulf4(vect4Df_s v, float f)
 {
 	return (vect4Df_s){v.x*f,v.y*f,v.z*f,v.w*f};
 }
 static inline float vdotf4(vect4Df_s v1, vect4Df_s v2)
 {
 	return v1.x*v2.x+v1.y*v2.y+v1.z*v2.z+v1.w*v2.w;
 }
 // Returns v divided by its 4-component length. No check is made for a
 // zero-length input, in which case every component is divided by zero.
 static inline vect4Df_s vnormf4(vect4Df_s v)
 {
 	const float len=sqrtf(v.x*v.x+v.y*v.y+v.z*v.z+v.w*v.w);
 	vect4Df_s unit;
 	unit.x=v.x/len;
 	unit.y=v.y/len;
 	unit.z=v.z/len;
 	unit.w=v.w/len;
 	return unit;
 }
 //interstuff
 static inline vect3Di_s vf2i(vect3Df_s v)
 {
 	return (vect3Di_s){floorf(v.x),floorf(v.y),floorf(v.z)};
 }
 static inline vect3Df_s vi2f(vect3Di_s v)
 {
 	return (vect3Df_s){(float)v.x,(float)v.y,(float)v.z};
 }
 void loadIdentity44(float* m);
 void multMatrix44(float* m1, float* m2, float* m);
 void translateMatrix(float* tm, float x, float y, float z);
 void rotateMatrixX(float* tm, float x, bool r);
 void rotateMatrixY(float* tm, float x, bool r);
 void rotateMatrixZ(float* tm, float x, bool r);
 void scaleMatrix(float* tm, float x, float y, float z);
 void initProjectionMatrix(float* m, float fovy, float aspect, float near, float far);
 vect3Df_s getMatrixColumn(float* m, u8 i);
 vect3Df_s getMatrixRow(float* m, u8 i);
 vect4Df_s getMatrixColumn4(float* m, u8 i);
 vect4Df_s getMatrixRow4(float* m, u8 i);
 #endif
--- a/examples/graphics/gpu/geoshader/Makefile
+++ b/examples/graphics/gpu/geoshader/Makefile
@ -75,6 +75,7 @@ export DEPSDIR	:=	$(CURDIR)/$(BUILD)
 CFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.c)))
 CPPFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.cpp)))
 SFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.s)))
 PICAFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.pica)))
 BINFILES	:=	$(foreach dir,$(DATA),$(notdir $(wildcard $(dir)/*.*)))
 #---------------------------------------------------------------------------------
@ -91,7 +92,7 @@ else
 endif
 #---------------------------------------------------------------------------------
-export OFILES	:=	$(addsuffix .o,$(BINFILES)) \
+export OFILES	:=	$(addsuffix .o,$(BINFILES)) $(PICAFILES:.pica=.shbin.o) \
 			$(CPPFILES:.cpp=.o) $(CFILES:.c=.o) $(SFILES:.s=.o)
 export INCLUDE	:=	$(foreach dir,$(INCLUDES),-I$(CURDIR)/$(dir)) \
@ -156,17 +157,18 @@ $(OUTPUT).elf	:	$(OFILES)
 	@echo $(notdir $<)
 	@$(bin2o)
 # WARNING: This is not the right way to do this! TODO: Do it right!
 #---------------------------------------------------------------------------------
-%_vsh.h %.vsh.o	:	%.vsh
+# rule for assembling GPU shaders
 #---------------------------------------------------------------------------------
 %.shbin.o: %.pica
 	@echo $(notdir $<)
-	@python3 $(AEMSTRO)/aemstro_as.py $< ../$(notdir $<).shbin
+	$(eval CURBIN := $(patsubst %.pica,%.shbin,$(notdir $<)))
-	@bin2s ../$(notdir $<).shbin | $(PREFIX)as -o $@
+	$(eval CURH := $(patsubst %.pica,%.psh.h,$(notdir $<)))
-	@echo "extern const u8" `(echo $(notdir $<).shbin | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"_end[];" > `(echo $(notdir $<).shbin | tr . _)`.h
+	@picasso $(CURBIN) $< $(CURH)
-	@echo "extern const u8" `(echo $(notdir $<).shbin | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"[];" >> `(echo $(notdir $<).shbin | tr . _)`.h
+	@bin2s $(CURBIN) | $(AS) -o $@
-	@echo "extern const u32" `(echo $(notdir $<).shbin | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`_size";" >> `(echo $(notdir $<).shbin | tr . _)`.h
+	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"_end[];" > `(echo $(CURBIN) | tr . _)`.h
-	@rm ../$(notdir $<).shbin
+	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"[];" >> `(echo $(CURBIN) | tr . _)`.h
 	@echo "extern const u32" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`_size";" >> `(echo $(CURBIN) | tr . _)`.h
 -include $(DEPENDS)
--- a/examples/graphics/gpu/geoshader/README.md
+++ b/examples/graphics/gpu/geoshader/README.md
@ -0,0 +1,6 @@
 # GPU example
 This is a simple GPU example using the `picasso` shader assembler which comes with devkitARM r45 and up.
 Users of earlier versions of devkitARM need to install the tool separately; it can be downloaded from the address below:
 https://github.com/fincs/picasso/releases
--- a/examples/graphics/gpu/geoshader/source/3dmath.c
+++ b/examples/graphics/gpu/geoshader/source/3dmath.c
@ -0,0 +1,172 @@
 #include "3dmath.h"
 // Sets *out to the identity matrix: all zeros except the diagonal
 // (r[0].x, r[1].y, r[2].z, r[3].w), which is set to 1.
 void m4x4_identity(matrix_4x4* out)
 {
 	m4x4_zeros(out);
 	out->r[3].w = 1.0f;
 	out->r[2].z = 1.0f;
 	out->r[1].y = 1.0f;
 	out->r[0].x = 1.0f;
 }
 // Computes the matrix product out = a * b.
 // Elements of *out are written while *a and *b are still being read, so *out
 // must be a different matrix from both inputs.
 void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b)
 {
 	int col, row;
 	for (col = 0; col < 4; col ++)
 	{
 		for (row = 0; row < 4; row ++)
 		{
 			const vector_4f* lhs = &a->r[row];
 			out->r[row].c[col] =
 				lhs->x*b->r[0].c[col] +
 				lhs->y*b->r[1].c[col] +
 				lhs->z*b->r[2].c[col] +
 				lhs->w*b->r[3].c[col];
 		}
 	}
 }
 // Post-multiplies *mtx by a translation matrix: mtx = mtx * T(x, y, z).
 void m4x4_translate(matrix_4x4* mtx, float x, float y, float z)
 {
 	matrix_4x4 translation, product;
 	// Identity with the offsets stored in the w component of rows 0..2
 	m4x4_identity(&translation);
 	translation.r[0].w = x;
 	translation.r[1].w = y;
 	translation.r[2].w = z;
 	// Multiply into a temporary, then copy back over the input
 	m4x4_multiply(&product, mtx, &translation);
 	m4x4_copy(mtx, &product);
 }
 // Multiplies the x, y and z components of every row of *mtx by the given
 // factors; the w components are left untouched.
 void m4x4_scale(matrix_4x4* mtx, float x, float y, float z)
 {
 	vector_4f* row = mtx->r;
 	vector_4f* const end = row + 4;
 	for (; row != end; row ++)
 	{
 		row->x *= x;
 		row->y *= y;
 		row->z *= z;
 	}
 }
 // Multiplies *mtx by a rotation of 'angle' radians about the X axis.
 // bRightSide selects the operand order: true -> mtx * R, false -> R * mtx.
 void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide)
 {
 	const float c = cosf(angle);
 	const float s = sinf(angle);
 	matrix_4x4 rot, result;
 	m4x4_zeros(&rot);
 	rot.r[0].x = 1.0f;
 	rot.r[1].y = c;
 	rot.r[1].z = s;
 	rot.r[2].y = -s;
 	rot.r[2].z = c;
 	rot.r[3].w = 1.0f;
 	const matrix_4x4* lhs = bRightSide ? mtx : &rot;
 	const matrix_4x4* rhs = bRightSide ? &rot : mtx;
 	m4x4_multiply(&result, lhs, rhs);
 	m4x4_copy(mtx, &result);
 }
 // Multiplies *mtx by a rotation of 'angle' radians about the Y axis.
 // bRightSide selects the operand order: true -> mtx * R, false -> R * mtx.
 void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide)
 {
 	const float c = cosf(angle);
 	const float s = sinf(angle);
 	matrix_4x4 rot, result;
 	m4x4_zeros(&rot);
 	rot.r[0].x = c;
 	rot.r[0].z = s;
 	rot.r[1].y = 1.0f;
 	rot.r[2].x = -s;
 	rot.r[2].z = c;
 	rot.r[3].w = 1.0f;
 	const matrix_4x4* lhs = bRightSide ? mtx : &rot;
 	const matrix_4x4* rhs = bRightSide ? &rot : mtx;
 	m4x4_multiply(&result, lhs, rhs);
 	m4x4_copy(mtx, &result);
 }
 // Multiplies *mtx by a rotation of 'angle' radians about the Z axis.
 // bRightSide selects the operand order: true -> mtx * R, false -> R * mtx.
 void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide)
 {
 	const float c = cosf(angle);
 	const float s = sinf(angle);
 	matrix_4x4 rot, result;
 	m4x4_zeros(&rot);
 	rot.r[0].x = c;
 	rot.r[0].y = s;
 	rot.r[1].x = -s;
 	rot.r[1].y = c;
 	rot.r[2].z = 1.0f;
 	rot.r[3].w = 1.0f;
 	const matrix_4x4* lhs = bRightSide ? mtx : &rot;
 	const matrix_4x4* rhs = bRightSide ? &rot : mtx;
 	m4x4_multiply(&result, lhs, rhs);
 	m4x4_copy(mtx, &result);
 }
 // Writes an orthographic projection adapted to the 3DS' sideways screens into *mtx.
 // The result is tilt * depthFix * ortho, built from the three matrices below.
 void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far)
 {
 	matrix_4x4 ortho, depthFix, fixedOrtho, tilt;
 	// Build standard orthogonal projection matrix
 	m4x4_zeros(&ortho);
 	ortho.r[0].x = 2.0f / (right - left);
 	ortho.r[0].w = (left + right) / (left - right);
 	ortho.r[1].y = 2.0f / (top - bottom);
 	ortho.r[1].w = (bottom + top) / (bottom - top);
 	ortho.r[2].z = 2.0f / (near - far);
 	ortho.r[2].w = (far + near) / (far - near);
 	ortho.r[3].w = 1.0f;
 	// Fix depth range to [-1, 0]: z' = 0.5*z - 0.5*w
 	m4x4_identity(&depthFix);
 	depthFix.r[2].z = 0.5f;
 	depthFix.r[2].w = -0.5f;
 	m4x4_multiply(&fixedOrtho, &depthFix, &ortho);
 	// Fix the 3DS screens' orientation by swapping the X and Y axis
 	m4x4_identity(&tilt);
 	tilt.r[0].x = 0.0f;
 	tilt.r[0].y = 1.0f;
 	tilt.r[1].x = -1.0f; // flipped
 	tilt.r[1].y = 0.0f;
 	m4x4_multiply(mtx, &tilt, &fixedOrtho);
 }
 void m4x4_persp_tilt(matrix_4x4* mtx, float fovx, float invaspect, float near, float far)
 {
 	// Notes:
 	// Callers think of the arguments as "fovy" and the "aspect ratio". Because the 3DS
 	// screens are sideways, what actually arrives here is the fovx and the inverse of
 	// the aspect ratio, so the perspective projection formula is rewritten in those terms.
 	// Derivation:
 	// fovx = 2 atan(tan(fovy/2)*w/h)
 	// fovy = 2 atan(tan(fovx/2)*h/w)
 	// invaspect = h/w
 	// a0,0 = h / (w*tan(fovy/2)) =
 	//      = h / (w*tan(2 atan(tan(fovx/2)*h/w) / 2)) =
 	//      = h / (w*tan( atan(tan(fovx/2)*h/w) )) =
 	//      = h / (w * tan(fovx/2)*h/w) =
 	//      = 1 / tan(fovx/2)
 	// a1,1 = 1 / tan(fovy/2) = (...) = w / (h*tan(fovx/2))
 	const float halfFovTan = tanf(fovx / 2);
 	const float depthSpan = near - far;
 	matrix_4x4 persp, depthFix;
 	// Build standard perspective projection matrix
 	m4x4_zeros(&persp);
 	persp.r[0].x = 1.0f / halfFovTan;
 	persp.r[1].y = 1.0f / (halfFovTan*invaspect);
 	persp.r[2].z = (near + far) / depthSpan;
 	persp.r[2].w = (2 * near * far) / depthSpan;
 	persp.r[3].z = -1.0f;
 	// Fix depth range to [-1, 0]: z' = 0.5*z - 0.5*w
 	m4x4_identity(&depthFix);
 	depthFix.r[2].z = 0.5f;
 	depthFix.r[2].w = -0.5f;
 	m4x4_multiply(mtx, &depthFix, &persp);
 	// Rotate the matrix one quarter of a turn CCW in order to fix the 3DS screens' orientation
 	m4x4_rotate_z(mtx, M_PI / 2, true);
 }
--- a/examples/graphics/gpu/geoshader/source/3dmath.h
+++ b/examples/graphics/gpu/geoshader/source/3dmath.h
@ -0,0 +1,56 @@
 /*
 * Bare-bones simplistic 3D math library
 * This library is common to all libctru GPU examples
 */
 #pragma once
 #include <string.h>
 #include <stdbool.h>
 #include <math.h>
 // 4-component float vector. The named members are declared in w, z, y, x order,
 // so c[0] aliases w and c[3] aliases x.
 typedef union
 {
 	struct { float w, z, y, x; };
 	float c[4];
 } vector_4f;
 // 4x4 float matrix stored as four vector_4f rows.
 typedef struct
 {
 	vector_4f r[4];
 } matrix_4x4;
 // Four-component dot product of *a and *b (terms summed in x, y, z, w order).
 static inline float v4f_dp4(const vector_4f* a, const vector_4f* b)
 {
 	float acc = a->x*b->x;
 	acc += a->y*b->y;
 	acc += a->z*b->z;
 	acc += a->w*b->w;
 	return acc;
 }
 // Length (Euclidean norm) of *a over all four components.
 static inline float v4f_mod4(const vector_4f* a)
 {
 	const float lengthSquared = v4f_dp4(a, a);
 	return sqrtf(lengthSquared);
 }
 // Normalizes *vec in place to unit length; a zero-length vector is left unchanged.
 static inline void v4f_norm4(vector_4f* vec)
 {
 	const float len = v4f_mod4(vec);
 	if (len != 0.0)
 	{
 		vec->x /= len;
 		vec->y /= len;
 		vec->z /= len;
 		vec->w /= len;
 	}
 }
 // Fills every byte of *out with zero.
 static inline void m4x4_zeros(matrix_4x4* out)
 {
 	memset(out, 0, sizeof(matrix_4x4));
 }
 // Copies the whole matrix *in into *out.
 static inline void m4x4_copy(matrix_4x4* out, const matrix_4x4* in)
 {
 	memcpy(out, in, sizeof(matrix_4x4));
 }
 // Sets *out to the identity matrix
 void m4x4_identity(matrix_4x4* out);
 // out = a * b; out must not be the same matrix as a or b
 void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b);
 // mtx = mtx * translation(x, y, z)
 void m4x4_translate(matrix_4x4* mtx, float x, float y, float z);
 // Scales the x/y/z components of every row of *mtx in place
 void m4x4_scale(matrix_4x4* mtx, float x, float y, float z);
 // Rotations about one axis; bRightSide = true computes mtx * R, false computes R * mtx
 void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide);
 void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide);
 void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide);
 // Special versions of the projection matrices that take the 3DS' screen orientation into account
 void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far);
 // NOTE: the definition in 3dmath.c names the second and third parameters fovx and invaspect (h/w);
 // because the screens are sideways, the "fovy"/"aspect" passed here are treated as those quantities.
 void m4x4_persp_tilt(matrix_4x4* mtx, float fovy, float aspect, float near, float far);
--- a/examples/graphics/gpu/geoshader/source/gpu.c
+++ b/examples/graphics/gpu/geoshader/source/gpu.c
@ -0,0 +1,93 @@
 #include "gpu.h"
 // Flags passed to GX_SetDisplayTransfer in gpuFrameEnd: RGBA8 input, RGB8 output,
 // no scaling, and the flip / tiled-output / raw-copy options all given as 0.
 #define DISPLAY_TRANSFER_FLAGS \
 	(GX_TRANSFER_FLIP_VERT(0) | GX_TRANSFER_OUT_TILED(0) | GX_TRANSFER_RAW_COPY(0) | \
 	GX_TRANSFER_IN_FORMAT(GX_TRANSFER_FMT_RGBA8) | GX_TRANSFER_OUT_FORMAT(GX_TRANSFER_FMT_RGB8) | \
 	GX_TRANSFER_SCALING(GX_TRANSFER_SCALE_NO))
 // Render targets, allocated with vramAlloc in gpuInit and released in gpuExit
 static u32 *colorBuf, *depthBuf;
 // GPU command buffer, allocated with linearAlloc in gpuInit and released in gpuExit
 static u32 *cmdBuf;
 void gpuInit(void)
 {
 	colorBuf = vramAlloc(400*240*4);
 	depthBuf = vramAlloc(400*240*4);
 	cmdBuf = linearAlloc(0x40000*4);
 	GPU_Init(NULL);
 	GPU_Reset(NULL, cmdBuf, 0x40000);
 }
 // Releases the buffers allocated by gpuInit, in reverse order of allocation.
 void gpuExit(void)
 {
 	linearFree(cmdBuf);   // command buffer (linear memory)
 	vramFree(depthBuf);   // depth render target (VRAM)
 	vramFree(colorBuf);   // color render target (VRAM)
 }
 // Fills the color buffer with clearColor and the depth buffer with 0, then
 // blocks until the fill has completed.
 void gpuClearBuffers(u32 clearColor)
 {
 	u32* const colorEnd = colorBuf + 240*400;
 	u32* const depthEnd = depthBuf + 240*400;
 	const u32 fillFlags = GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH;
 	GX_SetMemoryFill(NULL,
 		colorBuf, clearColor, colorEnd, fillFlags,
 		depthBuf, 0,          depthEnd, fillFlags);
 	gspWaitForPSC0(); // Wait for the fill to complete
 }
 // Queues the per-frame render state: viewport and depth mapping, culling, stencil,
 // depth test, alpha blending/test, and a pass-through configuration for all six
 // texture-environment stages. Call before issuing any draw commands for the frame.
 void gpuFrameBegin(void)
 {
 	// Configure the viewport and the depth linear conversion function
 	// (the render targets are passed as physical addresses)
 	GPU_SetViewport(
 		(u32*)osConvertVirtToPhys((u32)depthBuf),
 		(u32*)osConvertVirtToPhys((u32)colorBuf),
 		0, 0, 240, 400); // The top screen is physically 240x400 pixels
 	GPU_DepthMap(-1.0f, 0.0f); // calculate the depth value from the Z coordinate in the following way: -1.0*z + 0.0
 	// Configure some boilerplate
 	GPU_SetFaceCulling(GPU_CULL_BACK_CCW);
 	GPU_SetStencilTest(false, GPU_ALWAYS, 0x00, 0xFF, 0x00);
 	GPU_SetStencilOp(GPU_KEEP, GPU_KEEP, GPU_KEEP);
 	GPU_SetBlendingColor(0,0,0,0);
 	GPU_SetDepthTestAndWriteMask(true, GPU_GREATER, GPU_WRITE_ALL);
 	// The purpose of these two register writes is unknown
 	GPUCMD_AddMaskedWrite(GPUREG_0062, 0x1, 0);
 	GPUCMD_AddWrite(GPUREG_0118, 0);
 	// Configure alpha blending and test
 	GPU_SetAlphaBlending(GPU_BLEND_ADD, GPU_BLEND_ADD, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA);
 	GPU_SetAlphaTest(false, GPU_ALWAYS, 0x00);
 	// Reset texture-environment stages 0..5 to a no-op; callers override the ones they need afterwards
 	int i;
 	for (i = 0; i < 6; i ++)
 		GPU_SetDummyTexEnv(i);
 }
 // Submits the frame's command buffer, waits for rendering to finish, copies the
 // color buffer into the top screen's left framebuffer and rewinds the command buffer.
 void gpuFrameEnd(void)
 {
 	// Finish rendering
 	GPU_FinishDrawing();
 	GPUCMD_Finalize();
 	GPUCMD_FlushAndRun(NULL);
 	gspWaitForP3D(); // Wait for the rendering to complete
 	// Transfer the GPU output to the framebuffer
 	GX_SetDisplayTransfer(NULL, colorBuf, GX_BUFFER_DIM(240, 400),
 		(u32*)gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), GX_BUFFER_DIM(240, 400),
 		DISPLAY_TRANSFER_FLAGS);
 	gspWaitForPPF(); // Wait for the transfer to complete
 	// Reset the command buffer so the next frame starts writing at offset 0
 	GPUCMD_SetBufferOffset(0);
 }
 void GPU_SetDummyTexEnv(int id)
 {
 	GPU_SetTexEnv(id,
 		GPU_TEVSOURCES(GPU_PREVIOUS, 0, 0),
 		GPU_TEVSOURCES(GPU_PREVIOUS, 0, 0),
 		GPU_TEVOPERANDS(0, 0, 0),
 		GPU_TEVOPERANDS(0, 0, 0),
 		GPU_REPLACE,
 		GPU_REPLACE,
 		0xFFFFFFFF);
 }
--- a/examples/graphics/gpu/geoshader/source/gpu.h
+++ b/examples/graphics/gpu/geoshader/source/gpu.h
@ -0,0 +1,26 @@
 /*
 * Bare-bones simplistic GPU wrapper
 * This library is common to all libctru GPU examples
 */
 #pragma once
 #include <string.h>
 #include <3ds.h>
 #include "3dmath.h"
 // Allocates the render targets and command buffer and initializes the GPU
 void gpuInit(void);
 // Frees the buffers allocated by gpuInit
 void gpuExit(void);
 // Fills the color buffer with clearColor and the depth buffer with 0, waiting for completion
 void gpuClearBuffers(u32 clearColor);
 // Queues the per-frame render state; call before drawing
 void gpuFrameBegin(void);
 // Runs the queued commands, transfers the result to the top screen framebuffer and resets the command buffer
 void gpuFrameEnd(void);
 // Configures the specified fixed-function fragment shading substage to be a no-operation
 void GPU_SetDummyTexEnv(int id);
 // Uploads a 4x4 matrix as four consecutive float-vector uniforms starting at 'location'
 static inline void GPU_SetFloatUniformMatrix(GPU_SHADER_TYPE type, int location, matrix_4x4* matrix)
 {
 	u32* rawRows = (u32*)matrix;
 	GPU_SetFloatUniform(type, location, rawRows, 4);
 }
--- a/examples/graphics/gpu/geoshader/source/gshader.pica
+++ b/examples/graphics/gpu/geoshader/source/gshader.pica
@ -0,0 +1,91 @@
 ; Example PICA200 geometry shader
 ; Splits each incoming triangle into three smaller triangles (see main below)
 ; Uniforms
 ; projection: four float vectors, used as the rows of the projection matrix in process_vertex
 .fvec projection[4]
 ; Constants
 .constf myconst(0.0, 1.0, -1.0, 0.5)
 .alias  zeros myconst.xxxx ; Vector full of zeros
 .alias  ones  myconst.yyyy ; Vector full of ones
 .alias  half  myconst.wwww ; Vector full of 0.5 (used to average two vertices)
 ; Outputs - this time the type *is* used
 .out outpos position
 .out outclr color
 ; Inputs: we will receive the following inputs:
 ; v0-v1: position/color of the first vertex
 ; v2-v3: position/color of the second vertex
 ; v4-v5: position/color of the third vertex
 ; Entry point: computes the three edge midpoints of the input triangle and emits
 ; three corner triangles through emit_triangle (which reads positions from r0-r2).
 ; Register use: r4 = (v0+v2)/2, r5 = (v2+v4)/2, r6 = (v4+v0)/2
 .proc main
 	; Calculate the midpoints of the vertices
 	; r4 = midpoint of the first and second vertex
 	mov r4, v0
 	add r4, v2,   r4
 	mul r4, half, r4
 	; r5 = midpoint of the second and third vertex
 	mov r5, v2
 	add r5, v4,   r5
 	mul r5, half, r5
 	; r6 = midpoint of the third and first vertex
 	mov r6, v4
 	add r6, v0,   r6
 	mul r6, half, r6
 	; Emit the first triangle (first vertex and its two adjacent midpoints)
 	mov r0, v0
 	mov r1, r4
 	mov r2, r6
 	call emit_triangle
 	; Emit the second triangle (second vertex and its two adjacent midpoints)
 	mov r0, r4
 	mov r1, v2
 	mov r2, r5
 	call emit_triangle
 	; Emit the third triangle (third vertex and its two adjacent midpoints)
 	mov r0, r6
 	mov r1, r5
 	mov r2, v4
 	call emit_triangle
 	; We're finished
 	end
 .end
 ; Subroutine: emits one triangle
 ; Inputs:
 ;   r0, r1, r2: positions of the three vertices to emit
 ; The colors are always taken from v1, v3 and v5 (the colors of the original
 ; triangle's vertices), whatever positions r0-r2 hold.
 ; Overwrites r8 and r9 (the arguments of process_vertex).
 .proc emit_triangle
 	; Emit the first vertex
 	setemit 0
 	mov r8, r0
 	mov r9, v1
 	call process_vertex
 	emit
 	; Emit the second vertex
 	setemit 1
 	mov r8, r1
 	mov r9, v3
 	call process_vertex
 	emit
 	; Emit the third vertex and finish the primitive
 	setemit 2, prim
 	mov r8, r2
 	mov r9, v5
 	call process_vertex
 	emit
 .end
 ; Subroutine: writes one vertex to the shader outputs
 ; Inputs:
 ;   r8: vertex position
 ;   r9: vertex color
 ; Outputs:
 ;   outpos: r8 transformed by the projection matrix
 ;   outclr: copy of r9
 .proc process_vertex
 	; outpos = projectionMatrix * r8 (one dot product per matrix row)
 	dp4 outpos.x, projection[0], r8
 	dp4 outpos.y, projection[1], r8
 	dp4 outpos.z, projection[2], r8
 	dp4 outpos.w, projection[3], r8
 	; outclr = r9
 	mov outclr, r9
 .end
--- a/examples/graphics/gpu/geoshader/source/main.c
+++ b/examples/graphics/gpu/geoshader/source/main.c
@ -0,0 +1,139 @@
 /*
 * ~~ Simple libctru GPU geometry shader example ~~
 * This example demonstrates the basics of using the PICA200 in a 3DS homebrew
 * application in order to render a basic scene using a geoshader.
 * The example geoshader receives the vertices of a triangle and emits three
 * smaller triangles, thus forming a 'triforce' shape.
 */
 #include "gpu.h"
 #include "vshader_shbin.h"
 #include "gshader_shbin.h"
 // Value passed to gpuClearBuffers to fill the color buffer
 #define CLEAR_COLOR 0x68B0D8FF
 // One vertex as stored in the VBO: 3 position floats followed by 4 color floats
 typedef struct { float position[3]; float color[4]; } vertex;
 // The single input triangle (one red, one green and one blue vertex)
 static const vertex vertex_list[] =
 {
 	{ {200.0f, 200.0f, 0.5f}, {1.0f, 0.0f, 0.0f, 1.0f} },
 	{ {100.0f,  40.0f, 0.5f}, {0.0f, 1.0f, 0.0f, 1.0f} },
 	{ {300.0f,  40.0f, 0.5f}, {0.0f, 0.0f, 1.0f, 1.0f} },
 };
 // Number of vertices in vertex_list
 #define vertex_list_count (sizeof(vertex_list)/sizeof(vertex_list[0]))
 // Parsed shader binaries and the program built from them (set up in sceneInit)
 static DVLB_s *vshader_dvlb, *gshader_dvlb;
 static shaderProgram_s program;
 // Location of the "projection" uniform in the geometry shader
 static int uLoc_projection;
 static matrix_4x4 projection;
 // Copy of vertex_list in linear memory, used as the vertex buffer
 static void* vbo_data;
 // Builds everything sceneRender needs: the shader program (vertex + geometry shader),
 // the projection matrix and the vertex buffer.
 static void sceneInit(void)
 {
 	// Load the shaders and create a shader program
 	// The geoshader stride is set to 6 so that it processes a triangle at a time
 	// (3 vertices x 2 vectors each: position and color)
 	vshader_dvlb = DVLB_ParseFile((u32*)vshader_shbin, vshader_shbin_size);
 	gshader_dvlb = DVLB_ParseFile((u32*)gshader_shbin, gshader_shbin_size);
 	shaderProgramInit(&program);
 	shaderProgramSetVsh(&program, &vshader_dvlb->DVLE[0]);
 	shaderProgramSetGsh(&program, &gshader_dvlb->DVLE[0], 6);
 	// Get the location of the projection matrix uniform
 	// (it lives in the geometry shader, not the vertex shader)
 	uLoc_projection = shaderInstanceGetUniformLocation(program.geometryShader, "projection");
 	// Compute the projection matrix (400x240 area, depth range 0..1)
 	m4x4_ortho_tilt(&projection, 0.0, 400.0, 0.0, 240.0, 0.0, 1.0);
 	// Create the VBO (vertex buffer object)
 	// NOTE(review): the linearAlloc result is not checked before the memcpy below
 	vbo_data = linearAlloc(sizeof(vertex_list));
 	memcpy(vbo_data, vertex_list, sizeof(vertex_list));
 }
 // Queues the GPU commands that draw the scene: binds the shader program, sets up
 // fragment stage 0 and the vertex input layout, uploads the projection matrix and
 // issues the draw call.
 static void sceneRender(void)
 {
 	// Bind the shader program
 	shaderProgramUse(&program);
 	// Configure the first fragment shading substage to just pass through the vertex color
 	// See https://www.opengl.org/sdk/docs/man2/xhtml/glTexEnv.xml for more insight
 	GPU_SetTexEnv(0,
 		GPU_TEVSOURCES(GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // RGB channels
 		GPU_TEVSOURCES(GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // Alpha
 		GPU_TEVOPERANDS(0, 0, 0), // RGB
 		GPU_TEVOPERANDS(0, 0, 0), // Alpha
 		GPU_REPLACE, GPU_REPLACE, // RGB, Alpha
 		0xFFFFFFFF);
 	// Configure the "attribute buffers" (that is, the vertex input buffers)
 	GPU_SetAttributeBuffers(
 		2, // Number of inputs per vertex
 		(u32*)osConvertVirtToPhys((u32)vbo_data), // Location of the VBO
 		GPU_ATTRIBFMT(0, 3, GPU_FLOAT) |
 		GPU_ATTRIBFMT(1, 4, GPU_FLOAT), // Format of the inputs (input 0 is a 3-element float vector, input 1 a 4-element float vector)
 		0xFFC, // Unused attribute mask, in our case bits 0 and 1 are cleared since both inputs are used
 		0x10, // Attribute permutations (here it is the identity)
 		1, // Number of buffers
 		(u32[]) { 0x0 }, // Buffer offsets (placeholders)
 		(u64[]) { 0x10 }, // Attribute permutations for each buffer (identity again)
 		(u8[])  { 2 }); // Number of attributes for each buffer
 	// Upload the projection matrix (the uniform belongs to the geometry shader)
 	GPU_SetFloatUniformMatrix(GPU_GEOMETRY_SHADER, uLoc_projection, &projection);
 	// Draw the VBO - GPU_UNKPRIM allows the geoshader to control primitive emission
 	GPU_DrawArray(GPU_UNKPRIM, vertex_list_count);
 }
 // Releases everything created by sceneInit.
 static void sceneExit(void)
 {
 	// Free the VBO
 	linearFree(vbo_data);
 	// Free the shader program
 	shaderProgramFree(&program);
 	// Free the two parsed shader binaries
 	DVLB_Free(vshader_dvlb);
 	DVLB_Free(gshader_dvlb);
 }
 // Program entry point: sets up graphics and the scene, renders one frame per
 // VBlank until START is pressed or aptMainLoop() returns false, then tears down.
 int main()
 {
 	// Initialize graphics, then the scene
 	gfxInitDefault();
 	gpuInit();
 	sceneInit();
 	gpuClearBuffers(CLEAR_COLOR);
 	// Main loop
 	for (;;)
 	{
 		if (!aptMainLoop())
 			break;
 		gspWaitForVBlank();  // Synchronize with the start of VBlank
 		gfxSwapBuffersGpu(); // Swap the framebuffers so that the frame that we rendered last frame is now visible
 		hidScanInput();      // Read the user input
 		// Respond to user input
 		if (hidKeysDown() & KEY_START)
 			break; // break in order to return to hbmenu
 		// Render the scene, then clear the buffers ready for the next frame
 		gpuFrameBegin();
 		sceneRender();
 		gpuFrameEnd();
 		gpuClearBuffers(CLEAR_COLOR);
 		// Flush the framebuffers out of the data cache (not necessary with pure GPU rendering)
 		//gfxFlushBuffers();
 	}
 	// Deinitialize the scene, then graphics
 	sceneExit();
 	gpuExit();
 	gfxExit();
 	return 0;
 }
--- a/examples/graphics/gpu/geoshader/source/vshader.pica
+++ b/examples/graphics/gpu/geoshader/source/vshader.pica
@ -0,0 +1,24 @@
 ; Example PICA200 vertex shader
 ; Pass-through stage: forwards position and color unchanged (with w forced to 1)
 ; Constants
 .constf myconst(0.0, 1.0, -1.0, -0.5)
 .alias  zeros myconst.xxxx ; Vector full of zeros
 .alias  ones  myconst.yyyy ; Vector full of ones
 ; Outputs - since we are also using a geoshader the output type isn't really used
 .out outpos position
 .out outclr color
 ; Inputs (defined as aliases for convenience)
 .alias inpos v0
 .alias inclr v1
 .proc main
 	; Pass through both inputs to the geoshader
 	; outpos = (inpos.x, inpos.y, inpos.z, 1.0)
 	mov outpos.xyz, inpos
 	mov outpos.w,   ones
 	mov outclr,     inclr
 	; We're finished
 	end
 .end
--- a/examples/graphics/gpu/simple_tri/Makefile
+++ b/examples/graphics/gpu/simple_tri/Makefile
@ -0,0 +1,177 @@
 #---------------------------------------------------------------------------------
 .SUFFIXES:
 #---------------------------------------------------------------------------------
 ifeq ($(strip $(DEVKITARM)),)
 $(error "Please set DEVKITARM in your environment. export DEVKITARM=<path to>devkitARM")
 endif
 TOPDIR ?= $(CURDIR)
 include $(DEVKITARM)/3ds_rules
 #---------------------------------------------------------------------------------
 # TARGET is the name of the output
 # BUILD is the directory where object files & intermediate files will be placed
 # SOURCES is a list of directories containing source code
 # DATA is a list of directories containing data files
 # INCLUDES is a list of directories containing header files
 #
 # NO_SMDH: if set to anything, no SMDH file is generated.
 # APP_TITLE is the name of the app stored in the SMDH file (Optional)
 # APP_DESCRIPTION is the description of the app stored in the SMDH file (Optional)
 # APP_AUTHOR is the author of the app stored in the SMDH file (Optional)
 # ICON is the filename of the icon (.png), relative to the project folder.
 #   If not set, it attempts to use one of the following (in this order):
 #     - <Project name>.png
 #     - icon.png
 #     - <libctru folder>/default_icon.png
 #---------------------------------------------------------------------------------
 TARGET		:=	$(notdir $(CURDIR))
 BUILD		:=	build
 SOURCES		:=	source
 DATA		:=	data
 INCLUDES	:=	include
 #---------------------------------------------------------------------------------
 # options for code generation
 #---------------------------------------------------------------------------------
 ARCH	:=	-march=armv6k -mtune=mpcore -mfloat-abi=hard
 CFLAGS	:=	-g -Wall -O2 -mword-relocations \
 			-fomit-frame-pointer -ffast-math \
 			$(ARCH)
 CFLAGS	+=	$(INCLUDE) -DARM11 -D_3DS
 CXXFLAGS	:= $(CFLAGS) -fno-rtti -fno-exceptions -std=gnu++11
 ASFLAGS	:=	-g $(ARCH)
 LDFLAGS	=	-specs=3dsx.specs -g $(ARCH) -Wl,-Map,$(notdir $*.map)
 LIBS	:= -lctru -lm
 #---------------------------------------------------------------------------------
 # list of directories containing libraries, this must be the top level containing
 # include and lib
 #---------------------------------------------------------------------------------
 LIBDIRS	:= $(CTRULIB)
 #---------------------------------------------------------------------------------
 # no real need to edit anything past this point unless you need to add additional
 # rules for different file extensions
 #---------------------------------------------------------------------------------
 ifneq ($(BUILD),$(notdir $(CURDIR)))
 #---------------------------------------------------------------------------------
 export OUTPUT	:=	$(CURDIR)/$(TARGET)
 export TOPDIR	:=	$(CURDIR)
 export VPATH	:=	$(foreach dir,$(SOURCES),$(CURDIR)/$(dir)) \
 			$(foreach dir,$(DATA),$(CURDIR)/$(dir))
 export DEPSDIR	:=	$(CURDIR)/$(BUILD)
 CFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.c)))
 CPPFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.cpp)))
 SFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.s)))
 PICAFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.pica)))
 BINFILES	:=	$(foreach dir,$(DATA),$(notdir $(wildcard $(dir)/*.*)))
 #---------------------------------------------------------------------------------
 # use CXX for linking C++ projects, CC for standard C
 #---------------------------------------------------------------------------------
 ifeq ($(strip $(CPPFILES)),)
 #---------------------------------------------------------------------------------
 	export LD	:=	$(CC)
 #---------------------------------------------------------------------------------
 else
 #---------------------------------------------------------------------------------
 	export LD	:=	$(CXX)
 #---------------------------------------------------------------------------------
 endif
 #---------------------------------------------------------------------------------
 export OFILES	:=	$(addsuffix .o,$(BINFILES)) $(PICAFILES:.pica=.shbin.o) \
 			$(CPPFILES:.cpp=.o) $(CFILES:.c=.o) $(SFILES:.s=.o)
 export INCLUDE	:=	$(foreach dir,$(INCLUDES),-I$(CURDIR)/$(dir)) \
 			$(foreach dir,$(LIBDIRS),-I$(dir)/include) \
 			-I$(CURDIR)/$(BUILD)
 export LIBPATHS	:=	$(foreach dir,$(LIBDIRS),-L$(dir)/lib)
 ifeq ($(strip $(ICON)),)
 	icons := $(wildcard *.png)
 	ifneq (,$(findstring $(TARGET).png,$(icons)))
 		export APP_ICON := $(TOPDIR)/$(TARGET).png
 	else
 		ifneq (,$(findstring icon.png,$(icons)))
 			export APP_ICON := $(TOPDIR)/icon.png
 		endif
 	endif
 else
 	export APP_ICON := $(TOPDIR)/$(ICON)
 endif
 ifeq ($(strip $(NO_SMDH)),)
 	export _3DSXFLAGS += --smdh=$(CURDIR)/$(TARGET).smdh
 endif
 .PHONY: $(BUILD) clean all
 #---------------------------------------------------------------------------------
 all: $(BUILD)
 $(BUILD):
 	@[ -d $@ ] || mkdir -p $@
 	@$(MAKE) --no-print-directory -C $(BUILD) -f $(CURDIR)/Makefile
 #---------------------------------------------------------------------------------
 clean:
 	@echo clean ...
 	@rm -fr $(BUILD) $(TARGET).3dsx $(OUTPUT).smdh $(TARGET).elf
 #---------------------------------------------------------------------------------
 else
 DEPENDS	:=	$(OFILES:.o=.d)
 #---------------------------------------------------------------------------------
 # main targets
 #---------------------------------------------------------------------------------
 ifeq ($(strip $(NO_SMDH)),)
 $(OUTPUT).3dsx	:	$(OUTPUT).elf $(OUTPUT).smdh
 else
 $(OUTPUT).3dsx	:	$(OUTPUT).elf
 endif
 $(OUTPUT).elf	:	$(OFILES)
 #---------------------------------------------------------------------------------
 # you need a rule like this for each extension you use as binary data
 #---------------------------------------------------------------------------------
 %.bin.o	:	%.bin
 #---------------------------------------------------------------------------------
 	@echo $(notdir $<)
 	@$(bin2o)
 #---------------------------------------------------------------------------------
 # rule for assembling GPU shaders
 #
 # For foo.pica this rule:
 #   1. runs picasso to produce foo.shbin and foo.psh.h
 #   2. converts foo.shbin to an object file with bin2s piped into $(AS)
 #   3. writes foo_shbin.h declaring foo_shbin_end[], foo_shbin[] and foo_shbin_size
 #      (a leading digit in the symbol name is prefixed with '_', dots become '_')
 #---------------------------------------------------------------------------------
 %.shbin.o: %.pica
 	@echo $(notdir $<)
 	$(eval CURBIN := $(patsubst %.pica,%.shbin,$(notdir $<)))
 	$(eval CURH := $(patsubst %.pica,%.psh.h,$(notdir $<)))
 	@picasso $(CURBIN) $< $(CURH)
 	@bin2s $(CURBIN) | $(AS) -o $@
 	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"_end[];" > `(echo $(CURBIN) | tr . _)`.h
 	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"[];" >> `(echo $(CURBIN) | tr . _)`.h
 	@echo "extern const u32" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`_size";" >> `(echo $(CURBIN) | tr . _)`.h
 -include $(DEPENDS)
 #---------------------------------------------------------------------------------------
 endif
 #---------------------------------------------------------------------------------------
--- a/examples/graphics/gpu/simple_tri/README.md
+++ b/examples/graphics/gpu/simple_tri/README.md
@ -0,0 +1,6 @@
 # GPU example
 This is a simple GPU example using the `picasso` shader assembler which comes with devkitARM r45 and up.
 Users of earlier versions of devkitARM need to install the tool separately; it can be downloaded from the address below:
 https://github.com/fincs/picasso/releases
--- a/examples/graphics/gpu/simple_tri/source/3dmath.c
+++ b/examples/graphics/gpu/simple_tri/source/3dmath.c
@ -0,0 +1,172 @@
 #include "3dmath.h"
 // Sets *out to the identity matrix: all zeros except the diagonal
 // (r[0].x, r[1].y, r[2].z, r[3].w), which is set to 1.
 void m4x4_identity(matrix_4x4* out)
 {
 	m4x4_zeros(out);
 	out->r[3].w = 1.0f;
 	out->r[2].z = 1.0f;
 	out->r[1].y = 1.0f;
 	out->r[0].x = 1.0f;
 }
 // Computes the matrix product out = a * b.
 // Elements of *out are written while *a and *b are still being read, so *out
 // must be a different matrix from both inputs.
 void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b)
 {
 	int col, row;
 	for (col = 0; col < 4; col ++)
 	{
 		for (row = 0; row < 4; row ++)
 		{
 			const vector_4f* lhs = &a->r[row];
 			out->r[row].c[col] =
 				lhs->x*b->r[0].c[col] +
 				lhs->y*b->r[1].c[col] +
 				lhs->z*b->r[2].c[col] +
 				lhs->w*b->r[3].c[col];
 		}
 	}
 }
 // Post-multiplies *mtx by a translation matrix: mtx = mtx * T(x, y, z).
 void m4x4_translate(matrix_4x4* mtx, float x, float y, float z)
 {
 	matrix_4x4 translation, product;
 	// Identity with the offsets stored in the w component of rows 0..2
 	m4x4_identity(&translation);
 	translation.r[0].w = x;
 	translation.r[1].w = y;
 	translation.r[2].w = z;
 	// Multiply into a temporary, then copy back over the input
 	m4x4_multiply(&product, mtx, &translation);
 	m4x4_copy(mtx, &product);
 }
 // Multiplies the x, y and z components of every row of *mtx by the given
 // factors; the w components are left untouched.
 void m4x4_scale(matrix_4x4* mtx, float x, float y, float z)
 {
 	vector_4f* row = mtx->r;
 	vector_4f* const end = row + 4;
 	for (; row != end; row ++)
 	{
 		row->x *= x;
 		row->y *= y;
 		row->z *= z;
 	}
 }
 // Multiplies *mtx by a rotation of 'angle' radians about the X axis.
 // bRightSide selects the operand order: true -> mtx * R, false -> R * mtx.
 void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide)
 {
 	const float c = cosf(angle);
 	const float s = sinf(angle);
 	matrix_4x4 rot, result;
 	m4x4_zeros(&rot);
 	rot.r[0].x = 1.0f;
 	rot.r[1].y = c;
 	rot.r[1].z = s;
 	rot.r[2].y = -s;
 	rot.r[2].z = c;
 	rot.r[3].w = 1.0f;
 	const matrix_4x4* lhs = bRightSide ? mtx : &rot;
 	const matrix_4x4* rhs = bRightSide ? &rot : mtx;
 	m4x4_multiply(&result, lhs, rhs);
 	m4x4_copy(mtx, &result);
 }
 // Multiplies *mtx by a rotation of 'angle' radians about the Y axis.
 // bRightSide selects the operand order: true -> mtx * R, false -> R * mtx.
 void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide)
 {
 	const float c = cosf(angle);
 	const float s = sinf(angle);
 	matrix_4x4 rot, result;
 	m4x4_zeros(&rot);
 	rot.r[0].x = c;
 	rot.r[0].z = s;
 	rot.r[1].y = 1.0f;
 	rot.r[2].x = -s;
 	rot.r[2].z = c;
 	rot.r[3].w = 1.0f;
 	const matrix_4x4* lhs = bRightSide ? mtx : &rot;
 	const matrix_4x4* rhs = bRightSide ? &rot : mtx;
 	m4x4_multiply(&result, lhs, rhs);
 	m4x4_copy(mtx, &result);
 }
 // Multiplies *mtx by a rotation of 'angle' radians about the Z axis.
 // bRightSide selects the operand order: true -> mtx * R, false -> R * mtx.
 void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide)
 {
 	const float c = cosf(angle);
 	const float s = sinf(angle);
 	matrix_4x4 rot, result;
 	m4x4_zeros(&rot);
 	rot.r[0].x = c;
 	rot.r[0].y = s;
 	rot.r[1].x = -s;
 	rot.r[1].y = c;
 	rot.r[2].z = 1.0f;
 	rot.r[3].w = 1.0f;
 	const matrix_4x4* lhs = bRightSide ? mtx : &rot;
 	const matrix_4x4* rhs = bRightSide ? &rot : mtx;
 	m4x4_multiply(&result, lhs, rhs);
 	m4x4_copy(mtx, &result);
 }
 // Writes an orthographic projection adapted to the 3DS' sideways screens into *mtx.
 // The result is tilt * depthFix * ortho, built from the three matrices below.
 void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far)
 {
 	matrix_4x4 ortho, depthFix, fixedOrtho, tilt;
 	// Build standard orthogonal projection matrix
 	m4x4_zeros(&ortho);
 	ortho.r[0].x = 2.0f / (right - left);
 	ortho.r[0].w = (left + right) / (left - right);
 	ortho.r[1].y = 2.0f / (top - bottom);
 	ortho.r[1].w = (bottom + top) / (bottom - top);
 	ortho.r[2].z = 2.0f / (near - far);
 	ortho.r[2].w = (far + near) / (far - near);
 	ortho.r[3].w = 1.0f;
 	// Fix depth range to [-1, 0]: z' = 0.5*z - 0.5*w
 	m4x4_identity(&depthFix);
 	depthFix.r[2].z = 0.5f;
 	depthFix.r[2].w = -0.5f;
 	m4x4_multiply(&fixedOrtho, &depthFix, &ortho);
 	// Fix the 3DS screens' orientation by swapping the X and Y axis
 	m4x4_identity(&tilt);
 	tilt.r[0].x = 0.0f;
 	tilt.r[0].y = 1.0f;
 	tilt.r[1].x = -1.0f; // flipped
 	tilt.r[1].y = 0.0f;
 	m4x4_multiply(mtx, &tilt, &fixedOrtho);
 }
 void m4x4_persp_tilt(matrix_4x4* mtx, float fovx, float invaspect, float near, float far)
 {
 	// Notes:
 	// Callers think of the arguments as "fovy" and the "aspect ratio". Because the 3DS
 	// screens are sideways, what actually arrives here is the fovx and the inverse of
 	// the aspect ratio, so the perspective projection formula is rewritten in those terms.
 	// Derivation:
 	// fovx = 2 atan(tan(fovy/2)*w/h)
 	// fovy = 2 atan(tan(fovx/2)*h/w)
 	// invaspect = h/w
 	// a0,0 = h / (w*tan(fovy/2)) =
 	//      = h / (w*tan(2 atan(tan(fovx/2)*h/w) / 2)) =
 	//      = h / (w*tan( atan(tan(fovx/2)*h/w) )) =
 	//      = h / (w * tan(fovx/2)*h/w) =
 	//      = 1 / tan(fovx/2)
 	// a1,1 = 1 / tan(fovy/2) = (...) = w / (h*tan(fovx/2))
 	const float halfFovTan = tanf(fovx / 2);
 	const float depthSpan = near - far;
 	matrix_4x4 persp, depthFix;
 	// Build standard perspective projection matrix
 	m4x4_zeros(&persp);
 	persp.r[0].x = 1.0f / halfFovTan;
 	persp.r[1].y = 1.0f / (halfFovTan*invaspect);
 	persp.r[2].z = (near + far) / depthSpan;
 	persp.r[2].w = (2 * near * far) / depthSpan;
 	persp.r[3].z = -1.0f;
 	// Fix depth range to [-1, 0]: z' = 0.5*z - 0.5*w
 	m4x4_identity(&depthFix);
 	depthFix.r[2].z = 0.5f;
 	depthFix.r[2].w = -0.5f;
 	m4x4_multiply(mtx, &depthFix, &persp);
 	// Rotate the matrix one quarter of a turn CCW in order to fix the 3DS screens' orientation
 	m4x4_rotate_z(mtx, M_PI / 2, true);
 }
--- a/examples/graphics/gpu/simple_tri/source/3dmath.h
+++ b/examples/graphics/gpu/simple_tri/source/3dmath.h
@ -0,0 +1,56 @@
 /*
 * Bare-bones simplistic 3D math library
 * This library is common to all libctru GPU examples
 */
 #pragma once
 #include <string.h>
 #include <stdbool.h>
 #include <math.h>
 // 4-component float vector. The named members are declared in w, z, y, x order, so the
 // raw array view is reversed: c[0] aliases w, c[1] z, c[2] y and c[3] x.
 // NOTE(review): presumably this order mirrors the layout the GPU expects when matrices are
 // uploaded as uniforms -- confirm against the PICA200 documentation.
 typedef union { struct { float w, z, y, x; }; float c[4]; } vector_4f;
 // 4x4 float matrix stored as four row vectors r[0]..r[3]
 typedef struct { vector_4f r[4]; } matrix_4x4;
 // Four-component dot product of *a and *b
 static inline float v4f_dp4(const vector_4f* a, const vector_4f* b)
 {
 	// Accumulate the products in x, y, z, w order
 	float sum = a->x * b->x;
 	sum += a->y * b->y;
 	sum += a->z * b->z;
 	sum += a->w * b->w;
 	return sum;
 }
 // Length (modulus) of the 4-component vector *a
 static inline float v4f_mod4(const vector_4f* a)
 {
 	const float lengthSquared = v4f_dp4(a, a);
 	return sqrtf(lengthSquared);
 }
 // Scales *vec in place to unit length; a zero-length vector is left unchanged
 static inline void v4f_norm4(vector_4f* vec)
 {
 	const float length = v4f_mod4(vec);
 	int k;
 	if (length != 0.0f)
 	{
 		// Divide every component (w, z, y, x) by the length
 		for (k = 0; k < 4; k ++)
 			vec->c[k] /= length;
 	}
 }
 // Sets every element of *out to zero
 static inline void m4x4_zeros(matrix_4x4* out)
 {
 	memset(out, 0, sizeof(matrix_4x4));
 }
 // Copies the whole matrix *in into *out (plain byte copy)
 static inline void m4x4_copy(matrix_4x4* out, const matrix_4x4* in)
 {
 	memcpy(out, in, sizeof(matrix_4x4));
 }
 // Overwrites *out with the identity matrix
 void m4x4_identity(matrix_4x4* out);
 // Computes out = a * b
 void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b);
 // Multiplies *mtx on the right by a translation of (x, y, z), in place
 void m4x4_translate(matrix_4x4* mtx, float x, float y, float z);
 // Scales the x, y and z components of every row of *mtx by the given factors, in place
 void m4x4_scale(matrix_4x4* mtx, float x, float y, float z);
 // Rotations around the X, Y and Z axis, applied in place. 'angle' is in radians.
 // bRightSide selects the multiplication side: true gives mtx = mtx * R, false gives mtx = R * mtx.
 void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide);
 void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide);
 void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide);
 // Special versions of the projection matrices that take the 3DS' screen orientation into account.
 // Both write the finished projection to *mtx and remap depth to the [-1, 0] range.
 void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far);
 // Note: the definition in 3dmath.c names the second and third parameters fovx and invaspect
 // and explains why (the screens are sideways).
 void m4x4_persp_tilt(matrix_4x4* mtx, float fovy, float aspect, float near, float far);
--- a/examples/graphics/gpu/simple_tri/source/gpu.c
+++ b/examples/graphics/gpu/simple_tri/source/gpu.c
@ -0,0 +1,93 @@
 #include "gpu.h"
 // Flags for the display transfer issued in gpuFrameEnd(): no vertical flip, non-tiled
 // output, no raw copy, RGBA8 input, RGB8 output and no scaling.
 #define DISPLAY_TRANSFER_FLAGS \
 	(GX_TRANSFER_FLIP_VERT(0) | GX_TRANSFER_OUT_TILED(0) | GX_TRANSFER_RAW_COPY(0) | \
 	GX_TRANSFER_IN_FORMAT(GX_TRANSFER_FMT_RGBA8) | GX_TRANSFER_OUT_FORMAT(GX_TRANSFER_FMT_RGB8) | \
 	GX_TRANSFER_SCALING(GX_TRANSFER_SCALE_NO))
 // Render targets, allocated with vramAlloc in gpuInit()
 static u32 *colorBuf, *depthBuf;
 // GPU command buffer, allocated with linearAlloc in gpuInit()
 static u32 *cmdBuf;
 // Allocates the render targets and the command buffer, then initializes and resets the GPU
 void gpuInit(void)
 {
 	// Each render target is 400x240 entries of 4 bytes; the command buffer has
 	// 0x40000 u32 entries (hence the *4 when allocating it in bytes)
 	enum { TARGET_BYTES = 400*240*4, CMDBUF_WORDS = 0x40000 };
 	colorBuf = vramAlloc(TARGET_BYTES);
 	depthBuf = vramAlloc(TARGET_BYTES);
 	cmdBuf = linearAlloc(CMDBUF_WORDS*4);
 	GPU_Init(NULL);
 	GPU_Reset(NULL, cmdBuf, CMDBUF_WORDS);
 }
 // Frees the buffers allocated by gpuInit(), in the reverse order of their allocation
 void gpuExit(void)
 {
 	linearFree(cmdBuf);
 	vramFree(depthBuf);
 	vramFree(colorBuf);
 }
 // Fills the color buffer with clearColor and the depth buffer with 0, then waits for the fill
 void gpuClearBuffers(u32 clearColor)
 {
 	const u32 entryCount = 240*400;
 	const u32 fillMode = GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH;
 	// One-past-the-end pointers of both buffers
 	u32* const colorEnd = colorBuf + entryCount;
 	u32* const depthEnd = depthBuf + entryCount;
 	GX_SetMemoryFill(NULL,
 		colorBuf, clearColor, colorEnd, fillMode,
 		depthBuf, 0,          depthEnd, fillMode);
 	gspWaitForPSC0(); // Block until the fill has completed
 }
 void gpuFrameBegin(void)
 {
 	// Configure the viewport and the depth linear conversion function
 	GPU_SetViewport(
 		(u32*)osConvertVirtToPhys((u32)depthBuf),
 		(u32*)osConvertVirtToPhys((u32)colorBuf),
 		0, 0, 240, 400); // The top screen is physically 240x400 pixels
 	GPU_DepthMap(-1.0f, 0.0f); // calculate the depth value from the Z coordinate in the following way: -1.0*z + 0.0
 	// Configure some boilerplate
 	GPU_SetFaceCulling(GPU_CULL_BACK_CCW);
 	GPU_SetStencilTest(false, GPU_ALWAYS, 0x00, 0xFF, 0x00);
 	GPU_SetStencilOp(GPU_KEEP, GPU_KEEP, GPU_KEEP);
 	GPU_SetBlendingColor(0,0,0,0);
 	GPU_SetDepthTestAndWriteMask(true, GPU_GREATER, GPU_WRITE_ALL);
 	// This is unknown
 	GPUCMD_AddMaskedWrite(GPUREG_0062, 0x1, 0);
 	GPUCMD_AddWrite(GPUREG_0118, 0);
 	// Configure alpha blending and test
 	GPU_SetAlphaBlending(GPU_BLEND_ADD, GPU_BLEND_ADD, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA);
 	GPU_SetAlphaTest(false, GPU_ALWAYS, 0x00);
 	int i;
 	for (i = 0; i < 6; i ++)
 		GPU_SetDummyTexEnv(i);
 }
 // Ends the frame: finalizes and runs the queued GPU commands, waits for rendering to
 // complete, transfers the color buffer to the top screen's GFX_LEFT framebuffer and
 // rewinds the command buffer so that the next frame starts from offset 0.
 void gpuFrameEnd(void)
 {
 	// Finish rendering
 	GPU_FinishDrawing();
 	GPUCMD_Finalize();
 	GPUCMD_FlushAndRun(NULL);
 	gspWaitForP3D(); // Wait for the rendering to complete
 	// Transfer the GPU output to the framebuffer
 	GX_SetDisplayTransfer(NULL, colorBuf, GX_BUFFER_DIM(240, 400),
 		(u32*)gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), GX_BUFFER_DIM(240, 400),
 		DISPLAY_TRANSFER_FLAGS);
 	gspWaitForPPF(); // Wait for the transfer to complete
 	// Reset the command buffer
 	GPUCMD_SetBufferOffset(0);
 }
 // Configures texture environment stage 'id' to pass the previous stage's output through
 void GPU_SetDummyTexEnv(int id)
 {
 	// The same source and operand selection is used for the RGB and the alpha part
 	const int previousOnly = GPU_TEVSOURCES(GPU_PREVIOUS, 0, 0);
 	const int defaultOperands = GPU_TEVOPERANDS(0, 0, 0);
 	GPU_SetTexEnv(id,
 		previousOnly, previousOnly,
 		defaultOperands, defaultOperands,
 		GPU_REPLACE, GPU_REPLACE,
 		0xFFFFFFFF);
 }
--- a/examples/graphics/gpu/simple_tri/source/gpu.h
+++ b/examples/graphics/gpu/simple_tri/source/gpu.h
@ -0,0 +1,26 @@
 /*
 * Bare-bones simplistic GPU wrapper
 * This library is common to all libctru GPU examples
 */
 #pragma once
 #include <string.h>
 #include <3ds.h>
 #include "3dmath.h"
 // Allocates the render targets and command buffer and initializes the GPU
 void gpuInit(void);
 // Frees the buffers allocated by gpuInit
 void gpuExit(void);
 // Fills the color buffer with clearColor and the depth buffer with 0, waiting for completion
 void gpuClearBuffers(u32 clearColor);
 // Queues the per-frame GPU state setup; call before issuing draw commands
 void gpuFrameBegin(void);
 // Runs the queued commands, waits for rendering, transfers the result to the top screen
 // framebuffer and resets the command buffer
 void gpuFrameEnd(void);
 // Configures the specified fixed-function fragment shading substage to be a no-operation
 void GPU_SetDummyTexEnv(int id);
 // Uploads a uniform matrix: the four vectors of *matrix are written starting at 'location'
 static inline void GPU_SetFloatUniformMatrix(GPU_SHADER_TYPE type, int location, matrix_4x4* matrix)
 {
 	const int vectorCount = 4; // one vector per matrix row
 	u32* rawData = (u32*)matrix;
 	GPU_SetFloatUniform(type, location, rawData, vectorCount);
 }
--- a/examples/graphics/gpu/simple_tri/source/main.c
+++ b/examples/graphics/gpu/simple_tri/source/main.c
@ -0,0 +1,131 @@
 /*
 * ~~ Simple libctru GPU triangle example ~~
 * This example demonstrates the basics of using the PICA200 in a 3DS homebrew
 * application in order to render a basic scene consisting of a white solid triangle.
 */
 #include "gpu.h"
 #include "vshader_shbin.h"
 // Value the color buffer is filled with between frames
 // NOTE(review): presumably packed as 0xRRGGBBAA to match the RGBA8 render target -- confirm
 #define CLEAR_COLOR 0x68B0D8FF
 // A vertex of this example carries only a position
 typedef struct { float x, y, z; } vertex;
 // The three corners of the triangle, expressed in the 400x240 coordinate space defined by
 // the orthographic projection computed in sceneInit()
 static const vertex vertex_list[] =
 {
 	{ 200.0f, 200.0f, 0.5f },
 	{ 100.0f, 40.0f, 0.5f },
 	{ 300.0f, 40.0f, 0.5f },
 };
 // Number of vertices in vertex_list
 #define vertex_list_count (sizeof(vertex_list)/sizeof(vertex_list[0]))
 // Parsed shader binary and the shader program built from it
 static DVLB_s* vshader_dvlb;
 static shaderProgram_s program;
 // Location of the "projection" uniform in the vertex shader, and the matrix uploaded to it
 static int uLoc_projection;
 static matrix_4x4 projection;
 // Linear-memory copy of vertex_list that the GPU reads from
 static void* vbo_data;
 // Prepares everything the scene needs: shader program, projection matrix and vertex buffer
 static void sceneInit(void)
 {
 	const size_t vboSize = sizeof(vertex_list);
 	// Parse the embedded shader binary and use its first DVLE as the vertex shader
 	vshader_dvlb = DVLB_ParseFile((u32*)vshader_shbin, vshader_shbin_size);
 	shaderProgramInit(&program);
 	shaderProgramSetVsh(&program, vshader_dvlb->DVLE);
 	// Look up where the "projection" uniform lives
 	uLoc_projection = shaderInstanceGetUniformLocation(program.vertexShader, "projection");
 	// Orthographic projection covering x in [0, 400], y in [0, 240] and z in [0, 1]
 	m4x4_ortho_tilt(&projection, 0.0f, 400.0f, 0.0f, 240.0f, 0.0f, 1.0f);
 	// The VBO (vertex buffer object) is a linear-memory copy of the vertex list
 	vbo_data = linearAlloc(vboSize);
 	memcpy(vbo_data, vertex_list, vboSize);
 }
 static void sceneRender(void)
 {
 	// Bind the shader program
 	shaderProgramUse(&program);
 	// Configure the first fragment shading substage to just pass through the vertex color
 	// See https://www.opengl.org/sdk/docs/man2/xhtml/glTexEnv.xml for more insight
 	GPU_SetTexEnv(0,
 		GPU_TEVSOURCES(GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // RGB channels
 		GPU_TEVSOURCES(GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // Alpha
 		GPU_TEVOPERANDS(0, 0, 0), // RGB
 		GPU_TEVOPERANDS(0, 0, 0), // Alpha
 		GPU_REPLACE, GPU_REPLACE, // RGB, Alpha
 		0xFFFFFFFF);
 	// Configure the "attribute buffers" (that is, the vertex input buffers)
 	GPU_SetAttributeBuffers(
 		1, // Number of inputs per vertex
 		(u32*)osConvertVirtToPhys((u32)vbo_data), // Location of the VBO
 		GPU_ATTRIBFMT(0, 3, GPU_FLOAT), // Format of the inputs (in this case the only input is a 3-element float vector)
 		0xFFE, // Unused attribute mask, in our case bit 0 is cleared since it is used
 		0x0, // Attribute permutations (here it is the identity)
 		1, // Number of buffers
 		(u32[]) { 0x0 }, // Buffer offsets (placeholders)
 		(u64[]) { 0x0 }, // Attribute permutations for each buffer (identity again)
 		(u8[])  { 1 }); // Number of attributes for each buffer
 	// Upload the projection matrix
 	GPU_SetFloatUniformMatrix(GPU_VERTEX_SHADER, uLoc_projection, &projection);
 	// Draw the VBO
 	GPU_DrawArray(GPU_TRIANGLES, vertex_list_count);
 }
 // Releases everything that sceneInit() created
 static void sceneExit(void)
 {
 	// Free the VBO
 	linearFree(vbo_data);
 	// Free the shader program
 	shaderProgramFree(&program);
 	// Free the parsed shader binary
 	DVLB_Free(vshader_dvlb);
 }
 int main()
 {
 	// Bring up graphics and the GPU wrapper
 	gfxInitDefault();
 	gpuInit();
 	// Set up the scene and start with cleared buffers
 	sceneInit();
 	gpuClearBuffers(CLEAR_COLOR);
 	// Main loop: runs until aptMainLoop() reports false or START is pressed
 	for (;;)
 	{
 		if (!aptMainLoop())
 			break;
 		gspWaitForVBlank();  // Synchronize with the start of VBlank
 		gfxSwapBuffersGpu(); // Show the frame that was rendered during the previous iteration
 		hidScanInput();      // Read the user input
 		// START returns to hbmenu
 		if (hidKeysDown() & KEY_START)
 			break;
 		// Render one frame, then clear the buffers for the next one
 		gpuFrameBegin();
 		sceneRender();
 		gpuFrameEnd();
 		gpuClearBuffers(CLEAR_COLOR);
 		// Flushing the framebuffers out of the data cache is not necessary with pure GPU rendering
 		//gfxFlushBuffers();
 	}
 	// Tear down in the reverse order of initialization
 	sceneExit();
 	gpuExit();
 	gfxExit();
 	return 0;
 }
--- a/examples/graphics/gpu/simple_tri/source/vshader.pica
+++ b/examples/graphics/gpu/simple_tri/source/vshader.pica
@ -0,0 +1,34 @@
 ; Example PICA200 vertex shader
 ; Transforms each input position by the projection matrix and outputs a constant color.
 ; Uniforms
 ; projection: four float vectors, read by main as the four rows of the projection matrix
 .fvec projection[4]
 ; Constants
 ; myconst = (x: 0.0, y: 1.0, z: -1.0, w: -0.5); the aliases below only use x and y
 .constf myconst(0.0, 1.0, -1.0, -0.5)
 .alias  zeros myconst.xxxx ; Vector full of zeros
 .alias  ones  myconst.yyyy ; Vector full of ones
 ; Outputs
 ; outpos is bound to the position output, outclr to the color output
 .out outpos position
 .out outclr color
 ; Inputs (defined as aliases for convenience)
 ; inpos is input register v0
 .alias inpos v0
 ; Shader entry point.
 ; Uses r0 as a temporary holding the input position extended with w = 1.0.
 .proc main
 	; Force the w component of inpos to be 1.0
 	mov r0.xyz, inpos
 	mov r0.w,   ones
 	; outpos = projectionMatrix * inpos
 	; Each dp4 is the dot product of one matrix row with r0 and yields one component of outpos
 	dp4 outpos.x, projection[0], r0
 	dp4 outpos.y, projection[1], r0
 	dp4 outpos.z, projection[2], r0
 	dp4 outpos.w, projection[3], r0
 	; outclr = solid white color
 	; (all four components are set to 1.0)
 	mov outclr, ones
 	; We're finished
 	end
 .end
--- a/examples/graphics/gpu/textured_cube/Makefile
+++ b/examples/graphics/gpu/textured_cube/Makefile
@ -0,0 +1,177 @@
 #---------------------------------------------------------------------------------
 .SUFFIXES:
 #---------------------------------------------------------------------------------
 # Stop right away when the devkitARM location has not been configured
 ifeq ($(strip $(DEVKITARM)),)
 $(error "Please set DEVKITARM in your environment. export DEVKITARM=<path to>devkitARM")
 endif
 # TOPDIR defaults to the current directory unless it has already been set
 TOPDIR ?= $(CURDIR)
 # Pull in the 3DS build rules found under the devkitARM directory
 include $(DEVKITARM)/3ds_rules
 #---------------------------------------------------------------------------------
 # TARGET is the name of the output
 # BUILD is the directory where object files & intermediate files will be placed
 # SOURCES is a list of directories containing source code
 # DATA is a list of directories containing data files
 # INCLUDES is a list of directories containing header files
 #
 # NO_SMDH: if set to anything, no SMDH file is generated.
 # APP_TITLE is the name of the app stored in the SMDH file (Optional)
 # APP_DESCRIPTION is the description of the app stored in the SMDH file (Optional)
 # APP_AUTHOR is the author of the app stored in the SMDH file (Optional)
 # ICON is the filename of the icon (.png), relative to the project folder.
 #   If not set, it attempts to use one of the following (in this order):
 #     - <Project name>.png
 #     - icon.png
 #     - <libctru folder>/default_icon.png
 #---------------------------------------------------------------------------------
 # The output is named after the project directory by default
 TARGET		:=	$(notdir $(CURDIR))
 BUILD		:=	build
 # SOURCES is also where the .pica shader sources are looked up (see PICAFILES)
 SOURCES		:=	source
 DATA		:=	data
 INCLUDES	:=	include
 #---------------------------------------------------------------------------------
 # options for code generation
 #---------------------------------------------------------------------------------
 # ARCH is shared by the compiler, assembler and linker flags below
 ARCH	:=	-march=armv6k -mtune=mpcore -mfloat-abi=hard
 CFLAGS	:=	-g -Wall -O2 -mword-relocations \
 			-fomit-frame-pointer -ffast-math \
 			$(ARCH)
 # INCLUDE is exported further down, once the include directories are known
 CFLAGS	+=	$(INCLUDE) -DARM11 -D_3DS
 # C++ builds reuse the C flags and additionally disable RTTI and exceptions
 CXXFLAGS	:= $(CFLAGS) -fno-rtti -fno-exceptions -std=gnu++11
 ASFLAGS	:=	-g $(ARCH)
 # LDFLAGS is recursively expanded (=) because it refers to the automatic variable $*
 LDFLAGS	=	-specs=3dsx.specs -g $(ARCH) -Wl,-Map,$(notdir $*.map)
 LIBS	:= -lctru -lm
 #---------------------------------------------------------------------------------
 # list of directories containing libraries, this must be the top level containing
 # include and lib
 #---------------------------------------------------------------------------------
 LIBDIRS	:= $(CTRULIB)
 #---------------------------------------------------------------------------------
 # no real need to edit anything past this point unless you need to add additional
 # rules for different file extensions
 #---------------------------------------------------------------------------------
 # This Makefile runs in two passes. When make is started in the project directory
 # (the current directory is not named $(BUILD)), the branch below collects the file
 # lists, exports them and re-invokes this same Makefile from inside $(BUILD).
 # The else branch further down holds the rules used by that second invocation.
 ifneq ($(BUILD),$(notdir $(CURDIR)))
 #---------------------------------------------------------------------------------
 export OUTPUT	:=	$(CURDIR)/$(TARGET)
 export TOPDIR	:=	$(CURDIR)
 export VPATH	:=	$(foreach dir,$(SOURCES),$(CURDIR)/$(dir)) \
 			$(foreach dir,$(DATA),$(CURDIR)/$(dir))
 export DEPSDIR	:=	$(CURDIR)/$(BUILD)
 # Collect the input files by extension. Only the file names are kept; VPATH above
 # lets make find them again from inside the build directory.
 CFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.c)))
 CPPFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.cpp)))
 SFILES		:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.s)))
 PICAFILES	:=	$(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.pica)))
 BINFILES	:=	$(foreach dir,$(DATA),$(notdir $(wildcard $(dir)/*.*)))
 #---------------------------------------------------------------------------------
 # use CXX for linking C++ projects, CC for standard C
 #---------------------------------------------------------------------------------
 ifeq ($(strip $(CPPFILES)),)
 #---------------------------------------------------------------------------------
 	export LD	:=	$(CC)
 #---------------------------------------------------------------------------------
 else
 #---------------------------------------------------------------------------------
 	export LD	:=	$(CXX)
 #---------------------------------------------------------------------------------
 endif
 #---------------------------------------------------------------------------------
 # Objects to link: binary data files, assembled shaders, then C++, C and assembly sources
 export OFILES	:=	$(addsuffix .o,$(BINFILES)) $(PICAFILES:.pica=.shbin.o) \
 			$(CPPFILES:.cpp=.o) $(CFILES:.c=.o) $(SFILES:.s=.o)
 # Header search path: project include directories, library include directories and the
 # build directory (where the generated shader headers are written)
 export INCLUDE	:=	$(foreach dir,$(INCLUDES),-I$(CURDIR)/$(dir)) \
 			$(foreach dir,$(LIBDIRS),-I$(dir)/include) \
 			-I$(CURDIR)/$(BUILD)
 export LIBPATHS	:=	$(foreach dir,$(LIBDIRS),-L$(dir)/lib)
 # Icon selection: an explicit ICON wins, otherwise <TARGET>.png, otherwise icon.png
 ifeq ($(strip $(ICON)),)
 	icons := $(wildcard *.png)
 	ifneq (,$(findstring $(TARGET).png,$(icons)))
 		export APP_ICON := $(TOPDIR)/$(TARGET).png
 	else
 		ifneq (,$(findstring icon.png,$(icons)))
 			export APP_ICON := $(TOPDIR)/icon.png
 		endif
 	endif
 else
 	export APP_ICON := $(TOPDIR)/$(ICON)
 endif
 # Unless NO_SMDH is set, ask for the SMDH file to be passed along via _3DSXFLAGS
 ifeq ($(strip $(NO_SMDH)),)
 	export _3DSXFLAGS += --smdh=$(CURDIR)/$(TARGET).smdh
 endif
 .PHONY: $(BUILD) clean all
 #---------------------------------------------------------------------------------
 all: $(BUILD)
 # Create the build directory when missing, then run this Makefile from inside it
 $(BUILD):
 	@[ -d $@ ] || mkdir -p $@
 	@$(MAKE) --no-print-directory -C $(BUILD) -f $(CURDIR)/Makefile
 #---------------------------------------------------------------------------------
 # Remove the build directory and the generated output files
 clean:
 	@echo clean ...
 	@rm -fr $(BUILD) $(TARGET).3dsx $(OUTPUT).smdh $(TARGET).elf
 #---------------------------------------------------------------------------------
 # Second pass: make is now running inside the build directory
 else
 DEPENDS	:=	$(OFILES:.o=.d)
 #---------------------------------------------------------------------------------
 # main targets
 #---------------------------------------------------------------------------------
 # The .3dsx additionally depends on the .smdh unless NO_SMDH is set
 ifeq ($(strip $(NO_SMDH)),)
 $(OUTPUT).3dsx	:	$(OUTPUT).elf $(OUTPUT).smdh
 else
 $(OUTPUT).3dsx	:	$(OUTPUT).elf
 endif
 $(OUTPUT).elf	:	$(OFILES)
 #---------------------------------------------------------------------------------
 # you need a rule like this for each extension you use as binary data
 #---------------------------------------------------------------------------------
 %.bin.o	:	%.bin
 #---------------------------------------------------------------------------------
 	@echo $(notdir $<)
 	@$(bin2o)
 #---------------------------------------------------------------------------------
 # rule for assembling GPU shaders
 #---------------------------------------------------------------------------------
 # Steps performed for <name>.pica:
 #   1. picasso assembles the source into <name>.shbin (and is also given <name>.psh.h)
 #   2. bin2s turns the .shbin into assembly, which $(AS) assembles into the object file
 #   3. the echo lines write <name>_shbin.h, declaring the <name>_shbin_end, <name>_shbin
 #      and <name>_shbin_size symbols (a leading digit in the name gets a '_' prefix)
 %.shbin.o: %.pica
 	@echo $(notdir $<)
 	$(eval CURBIN := $(patsubst %.pica,%.shbin,$(notdir $<)))
 	$(eval CURH := $(patsubst %.pica,%.psh.h,$(notdir $<)))
 	@picasso $(CURBIN) $< $(CURH)
 	@bin2s $(CURBIN) | $(AS) -o $@
 	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"_end[];" > `(echo $(CURBIN) | tr . _)`.h
 	@echo "extern const u8" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`"[];" >> `(echo $(CURBIN) | tr . _)`.h
 	@echo "extern const u32" `(echo $(CURBIN) | sed -e 's/^\([0-9]\)/_\1/' | tr . _)`_size";" >> `(echo $(CURBIN) | tr . _)`.h
 # Include the dependency files if they exist (the leading '-' ignores missing ones)
 -include $(DEPENDS)
 #---------------------------------------------------------------------------------------
 endif
 #---------------------------------------------------------------------------------------
--- a/examples/graphics/gpu/textured_cube/README.md
+++ b/examples/graphics/gpu/textured_cube/README.md
@ -0,0 +1,6 @@
 # GPU example
 This is a simple GPU example using the `picasso` shader assembler which comes with devkitARM r45 and up.
 Users of earlier versions of devkitARM need to install the tool separately; it can be downloaded from the address below:
 https://github.com/fincs/picasso/releases
--- a/examples/graphics/gpu/textured_cube/data/kitten.bin
+++ b/examples/graphics/gpu/textured_cube/data/kitten.bin
--- a/examples/graphics/gpu/textured_cube/source/3dmath.c
+++ b/examples/graphics/gpu/textured_cube/source/3dmath.c
@ -0,0 +1,172 @@
 #include "3dmath.h"
 // Overwrites *out with the 4x4 identity matrix
 void m4x4_identity(matrix_4x4* out)
 {
 	m4x4_zeros(out);
 	// Ones along the main diagonal
 	out->r[0].x = 1.0f;
 	out->r[1].y = 1.0f;
 	out->r[2].z = 1.0f;
 	out->r[3].w = 1.0f;
 }
 // Computes the matrix product out = a * b.
 // Every element is the dot product of a row of a with a column of b.
 // The product is assembled in a local temporary and stored into *out only at the end,
 // so out may point to the same matrix as a and/or b without corrupting the result.
 void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b)
 {
 	matrix_4x4 product;
 	int row, col;
 	for (row = 0; row < 4; row ++)
 	{
 		const vector_4f* lhs = &a->r[row];
 		// c[col] addresses the same component in every row of b, i.e. one column of b
 		for (col = 0; col < 4; col ++)
 			product.r[row].c[col] = lhs->x*b->r[0].c[col] + lhs->y*b->r[1].c[col] + lhs->z*b->r[2].c[col] + lhs->w*b->r[3].c[col];
 	}
 	*out = product;
 }
 // Multiplies *mtx on the right by a translation of (x, y, z), in place
 void m4x4_translate(matrix_4x4* mtx, float x, float y, float z)
 {
 	matrix_4x4 translation, result;
 	// A translation matrix is the identity with the offsets in the w column
 	m4x4_identity(&translation);
 	translation.r[0].w = x;
 	translation.r[1].w = y;
 	translation.r[2].w = z;
 	// result = mtx * translation, then store it back
 	m4x4_multiply(&result, mtx, &translation);
 	m4x4_copy(mtx, &result);
 }
 // Scales the x, y and z components of every row of *mtx by the given factors, in place.
 // The w components are left untouched.
 void m4x4_scale(matrix_4x4* mtx, float x, float y, float z)
 {
 	vector_4f* row = mtx->r;
 	vector_4f* const end = mtx->r + 4;
 	for (; row != end; row ++)
 	{
 		row->x *= x;
 		row->y *= y;
 		row->z *= z;
 	}
 }
 // Applies a rotation of 'angle' radians around the X axis to *mtx, in place.
 // bRightSide selects the side: true gives mtx = mtx * R, false gives mtx = R * mtx.
 void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide)
 {
 	const float c = cosf(angle);
 	const float s = sinf(angle);
 	matrix_4x4 rotation, product;
 	// Identity with the 2x2 block acting on y and z replaced by the rotation
 	m4x4_identity(&rotation);
 	rotation.r[1].y = c;
 	rotation.r[1].z = s;
 	rotation.r[2].y = -s;
 	rotation.r[2].z = c;
 	if (!bRightSide)
 		m4x4_multiply(&product, &rotation, mtx);
 	else
 		m4x4_multiply(&product, mtx, &rotation);
 	m4x4_copy(mtx, &product);
 }
 // Applies a rotation of 'angle' radians around the Y axis to *mtx, in place.
 // bRightSide selects the side: true gives mtx = mtx * R, false gives mtx = R * mtx.
 void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide)
 {
 	const float c = cosf(angle);
 	const float s = sinf(angle);
 	matrix_4x4 rotation, product;
 	// Identity with the 2x2 block acting on x and z replaced by the rotation
 	m4x4_identity(&rotation);
 	rotation.r[0].x = c;
 	rotation.r[0].z = s;
 	rotation.r[2].x = -s;
 	rotation.r[2].z = c;
 	if (!bRightSide)
 		m4x4_multiply(&product, &rotation, mtx);
 	else
 		m4x4_multiply(&product, mtx, &rotation);
 	m4x4_copy(mtx, &product);
 }
 // Applies a rotation of 'angle' radians around the Z axis to *mtx, in place.
 // bRightSide selects the side: true gives mtx = mtx * R, false gives mtx = R * mtx.
 void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide)
 {
 	const float c = cosf(angle);
 	const float s = sinf(angle);
 	matrix_4x4 rotation, product;
 	// Identity with the 2x2 block acting on x and y replaced by the rotation
 	m4x4_identity(&rotation);
 	rotation.r[0].x = c;
 	rotation.r[0].y = s;
 	rotation.r[1].x = -s;
 	rotation.r[1].y = c;
 	if (!bRightSide)
 		m4x4_multiply(&product, &rotation, mtx);
 	else
 		m4x4_multiply(&product, mtx, &rotation);
 	m4x4_copy(mtx, &product);
 }
 // Builds an orthographic projection for the given volume into *mtx, with depth remapped
 // to [-1, 0] and the X and Y axes swapped to account for the 3DS screens' orientation
 void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far)
 {
 	matrix_4x4 ortho, depthFix, depthFixed, axisSwap;
 	// Standard orthogonal projection matrix
 	m4x4_zeros(&ortho);
 	ortho.r[0].x = 2.0f / (right - left);
 	ortho.r[0].w = (left + right) / (left - right);
 	ortho.r[1].y = 2.0f / (top - bottom);
 	ortho.r[1].w = (bottom + top) / (bottom - top);
 	ortho.r[2].z = 2.0f / (near - far);
 	ortho.r[2].w = (far + near) / (far - near);
 	ortho.r[3].w = 1.0f;
 	// Depth correction (z' = 0.5*z - 0.5*w) so that depth ends up in [-1, 0]
 	m4x4_identity(&depthFix);
 	depthFix.r[2].z = 0.5f;
 	depthFix.r[2].w = -0.5f;
 	m4x4_multiply(&depthFixed, &depthFix, &ortho);
 	// Axis swap for the screens' orientation: x' = y and y' = -x (flipped), z and w unchanged
 	m4x4_zeros(&axisSwap);
 	axisSwap.r[0].y = 1.0f;
 	axisSwap.r[1].x = -1.0f;
 	axisSwap.r[2].z = 1.0f;
 	axisSwap.r[3].w = 1.0f;
 	m4x4_multiply(mtx, &axisSwap, &depthFixed);
 }
 void m4x4_persp_tilt(matrix_4x4* mtx, float fovx, float invaspect, float near, float far)
 {
 	// About the parameter names:
 	// We are handed what is nominally the "fovy" and the "aspect ratio". Because the 3DS
 	// screens are sideways, so are these values: they are really the fovx and the inverse
 	// of the aspect ratio. The perspective matrix is therefore written in those terms.
 	//
 	// Derivation:
 	//   fovx = 2 atan(tan(fovy/2)*w/h)
 	//   fovy = 2 atan(tan(fovx/2)*h/w)
 	//   invaspect = h/w
 	//   a0,0 = h / (w*tan(fovy/2))
 	//        = h / (w*tan(2 atan(tan(fovx/2)*h/w) / 2))
 	//        = h / (w*tan( atan(tan(fovx/2)*h/w) ))
 	//        = h / (w * tan(fovx/2)*h/w)
 	//        = 1 / tan(fovx/2)
 	//   a1,1 = 1 / tan(fovy/2) = (...) = w / (h*tan(fovx/2))
 	const float tanHalfFov = tanf(fovx / 2);
 	const float depthSpan = near - far;
 	matrix_4x4 persp, depthFix;
 	// Standard perspective projection, expressed with fovx and h/w
 	m4x4_zeros(&persp);
 	persp.r[0].x = 1.0f / tanHalfFov;
 	persp.r[1].y = 1.0f / (tanHalfFov*invaspect);
 	persp.r[2].z = (near + far) / depthSpan;
 	persp.r[2].w = (2 * near * far) / depthSpan;
 	persp.r[3].z = -1.0f;
 	// Depth correction (z' = 0.5*z - 0.5*w) so that depth ends up in [-1, 0]
 	m4x4_identity(&depthFix);
 	depthFix.r[2].z = 0.5f;
 	depthFix.r[2].w = -0.5f;
 	m4x4_multiply(mtx, &depthFix, &persp);
 	// Quarter turn CCW around Z to compensate for the 3DS screens' orientation
 	m4x4_rotate_z(mtx, M_PI / 2, true);
 }
--- a/examples/graphics/gpu/textured_cube/source/3dmath.h
+++ b/examples/graphics/gpu/textured_cube/source/3dmath.h
@ -0,0 +1,56 @@
 /*
 * Bare-bones simplistic 3D math library
 * This library is common to all libctru GPU examples
 */
 #pragma once
 #include <string.h>
 #include <stdbool.h>
 #include <math.h>
 // 4-component float vector. The named members are declared in w, z, y, x order, so the
 // raw array view is reversed: c[0] aliases w, c[1] z, c[2] y and c[3] x.
 // NOTE(review): presumably this order mirrors the layout the GPU expects when matrices are
 // uploaded as uniforms -- confirm against the PICA200 documentation.
 typedef union { struct { float w, z, y, x; }; float c[4]; } vector_4f;
 // 4x4 float matrix stored as four row vectors r[0]..r[3]
 typedef struct { vector_4f r[4]; } matrix_4x4;
 // Four-component dot product of *a and *b
 static inline float v4f_dp4(const vector_4f* a, const vector_4f* b)
 {
 	// Accumulate the products in x, y, z, w order
 	float sum = a->x * b->x;
 	sum += a->y * b->y;
 	sum += a->z * b->z;
 	sum += a->w * b->w;
 	return sum;
 }
 // Length (modulus) of the 4-component vector *a
 static inline float v4f_mod4(const vector_4f* a)
 {
 	const float lengthSquared = v4f_dp4(a, a);
 	return sqrtf(lengthSquared);
 }
 // Scales *vec in place to unit length; a zero-length vector is left unchanged
 static inline void v4f_norm4(vector_4f* vec)
 {
 	const float length = v4f_mod4(vec);
 	int k;
 	if (length != 0.0f)
 	{
 		// Divide every component (w, z, y, x) by the length
 		for (k = 0; k < 4; k ++)
 			vec->c[k] /= length;
 	}
 }
 // Sets every element of *out to zero
 static inline void m4x4_zeros(matrix_4x4* out)
 {
 	memset(out, 0, sizeof(matrix_4x4));
 }
 // Copies the whole matrix *in into *out (plain byte copy)
 static inline void m4x4_copy(matrix_4x4* out, const matrix_4x4* in)
 {
 	memcpy(out, in, sizeof(matrix_4x4));
 }
 // Overwrites *out with the identity matrix
 void m4x4_identity(matrix_4x4* out);
 // Computes out = a * b
 void m4x4_multiply(matrix_4x4* out, const matrix_4x4* a, const matrix_4x4* b);
 // Multiplies *mtx on the right by a translation of (x, y, z), in place
 void m4x4_translate(matrix_4x4* mtx, float x, float y, float z);
 // Scales the x, y and z components of every row of *mtx by the given factors, in place
 void m4x4_scale(matrix_4x4* mtx, float x, float y, float z);
 // Rotations around the X, Y and Z axis, applied in place. 'angle' is in radians.
 // bRightSide selects the multiplication side: true gives mtx = mtx * R, false gives mtx = R * mtx.
 void m4x4_rotate_x(matrix_4x4* mtx, float angle, bool bRightSide);
 void m4x4_rotate_y(matrix_4x4* mtx, float angle, bool bRightSide);
 void m4x4_rotate_z(matrix_4x4* mtx, float angle, bool bRightSide);
 // Special versions of the projection matrices that take the 3DS' screen orientation into account.
 // Both write the finished projection to *mtx and remap depth to the [-1, 0] range.
 void m4x4_ortho_tilt(matrix_4x4* mtx, float left, float right, float bottom, float top, float near, float far);
 // Note: the definition in 3dmath.c names the second and third parameters fovx and invaspect
 // and explains why (the screens are sideways).
 void m4x4_persp_tilt(matrix_4x4* mtx, float fovy, float aspect, float near, float far);
--- a/examples/graphics/gpu/textured_cube/source/gpu.c
+++ b/examples/graphics/gpu/textured_cube/source/gpu.c
@ -0,0 +1,93 @@
 #include "gpu.h"
 // Flags for the display transfer issued in gpuFrameEnd(): no vertical flip, non-tiled
 // output, no raw copy, RGBA8 input, RGB8 output and no scaling.
 #define DISPLAY_TRANSFER_FLAGS \
 	(GX_TRANSFER_FLIP_VERT(0) | GX_TRANSFER_OUT_TILED(0) | GX_TRANSFER_RAW_COPY(0) | \
 	GX_TRANSFER_IN_FORMAT(GX_TRANSFER_FMT_RGBA8) | GX_TRANSFER_OUT_FORMAT(GX_TRANSFER_FMT_RGB8) | \
 	GX_TRANSFER_SCALING(GX_TRANSFER_SCALE_NO))
 // Render targets, allocated with vramAlloc in gpuInit()
 static u32 *colorBuf, *depthBuf;
 // GPU command buffer, allocated with linearAlloc in gpuInit()
 static u32 *cmdBuf;
 // Allocates the render targets and the command buffer, then initializes and resets the GPU
 void gpuInit(void)
 {
 	// Each render target is 400x240 entries of 4 bytes; the command buffer has
 	// 0x40000 u32 entries (hence the *4 when allocating it in bytes)
 	enum { TARGET_BYTES = 400*240*4, CMDBUF_WORDS = 0x40000 };
 	colorBuf = vramAlloc(TARGET_BYTES);
 	depthBuf = vramAlloc(TARGET_BYTES);
 	cmdBuf = linearAlloc(CMDBUF_WORDS*4);
 	GPU_Init(NULL);
 	GPU_Reset(NULL, cmdBuf, CMDBUF_WORDS);
 }
 // Frees the buffers allocated by gpuInit(), in the reverse order of their allocation
 void gpuExit(void)
 {
 	linearFree(cmdBuf);
 	vramFree(depthBuf);
 	vramFree(colorBuf);
 }
 // Fills the color buffer with clearColor and the depth buffer with 0, then waits for the fill
 void gpuClearBuffers(u32 clearColor)
 {
 	const u32 entryCount = 240*400;
 	const u32 fillMode = GX_FILL_TRIGGER | GX_FILL_32BIT_DEPTH;
 	// One-past-the-end pointers of both buffers
 	u32* const colorEnd = colorBuf + entryCount;
 	u32* const depthEnd = depthBuf + entryCount;
 	GX_SetMemoryFill(NULL,
 		colorBuf, clearColor, colorEnd, fillMode,
 		depthBuf, 0,          depthEnd, fillMode);
 	gspWaitForPSC0(); // Block until the fill has completed
 }
 void gpuFrameBegin(void)
 {
 	// Configure the viewport and the depth linear conversion function
 	GPU_SetViewport(
 		(u32*)osConvertVirtToPhys((u32)depthBuf),
 		(u32*)osConvertVirtToPhys((u32)colorBuf),
 		0, 0, 240, 400); // The top screen is physically 240x400 pixels
 	GPU_DepthMap(-1.0f, 0.0f); // calculate the depth value from the Z coordinate in the following way: -1.0*z + 0.0
 	// Configure some boilerplate
 	GPU_SetFaceCulling(GPU_CULL_BACK_CCW);
 	GPU_SetStencilTest(false, GPU_ALWAYS, 0x00, 0xFF, 0x00);
 	GPU_SetStencilOp(GPU_KEEP, GPU_KEEP, GPU_KEEP);
 	GPU_SetBlendingColor(0,0,0,0);
 	GPU_SetDepthTestAndWriteMask(true, GPU_GREATER, GPU_WRITE_ALL);
 	// This is unknown
 	GPUCMD_AddMaskedWrite(GPUREG_0062, 0x1, 0);
 	GPUCMD_AddWrite(GPUREG_0118, 0);
 	// Configure alpha blending and test
 	GPU_SetAlphaBlending(GPU_BLEND_ADD, GPU_BLEND_ADD, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA, GPU_SRC_ALPHA, GPU_ONE_MINUS_SRC_ALPHA);
 	GPU_SetAlphaTest(false, GPU_ALWAYS, 0x00);
 	int i;
 	for (i = 0; i < 6; i ++)
 		GPU_SetDummyTexEnv(i);
 }
 // Ends the frame: finalizes and runs the queued GPU commands, waits for rendering to
 // complete, transfers the color buffer to the top screen's GFX_LEFT framebuffer and
 // rewinds the command buffer so that the next frame starts from offset 0.
 void gpuFrameEnd(void)
 {
 	// Finish rendering
 	GPU_FinishDrawing();
 	GPUCMD_Finalize();
 	GPUCMD_FlushAndRun(NULL);
 	gspWaitForP3D(); // Wait for the rendering to complete
 	// Transfer the GPU output to the framebuffer
 	GX_SetDisplayTransfer(NULL, colorBuf, GX_BUFFER_DIM(240, 400),
 		(u32*)gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), GX_BUFFER_DIM(240, 400),
 		DISPLAY_TRANSFER_FLAGS);
 	gspWaitForPPF(); // Wait for the transfer to complete
 	// Reset the command buffer
 	GPUCMD_SetBufferOffset(0);
 }
 // Configures texture environment stage 'id' to pass the previous stage's output through
 void GPU_SetDummyTexEnv(int id)
 {
 	// The same source and operand selection is used for the RGB and the alpha part
 	const int previousOnly = GPU_TEVSOURCES(GPU_PREVIOUS, 0, 0);
 	const int defaultOperands = GPU_TEVOPERANDS(0, 0, 0);
 	GPU_SetTexEnv(id,
 		previousOnly, previousOnly,
 		defaultOperands, defaultOperands,
 		GPU_REPLACE, GPU_REPLACE,
 		0xFFFFFFFF);
 }
--- a/examples/graphics/gpu/textured_cube/source/gpu.h
+++ b/examples/graphics/gpu/textured_cube/source/gpu.h
@ -0,0 +1,26 @@
 /*
 * Bare-bones simplistic GPU wrapper
 * This library is common to all libctru GPU examples
 */
 #pragma once
 #include <string.h>
 #include <3ds.h>
 #include "3dmath.h"
 // Allocates the render targets and command buffer and initializes the GPU
 void gpuInit(void);
 // Frees the buffers allocated by gpuInit
 void gpuExit(void);
 // Fills the color buffer with clearColor and the depth buffer with 0, waiting for completion
 void gpuClearBuffers(u32 clearColor);
 // Queues the per-frame GPU state setup; call before issuing draw commands
 void gpuFrameBegin(void);
 // Runs the queued commands, waits for rendering, transfers the result to the top screen
 // framebuffer and resets the command buffer
 void gpuFrameEnd(void);
 // Configures the specified fixed-function fragment shading substage to be a no-operation
 void GPU_SetDummyTexEnv(int id);
 // Uploads a uniform matrix: the four vectors of *matrix are written starting at 'location'
 static inline void GPU_SetFloatUniformMatrix(GPU_SHADER_TYPE type, int location, matrix_4x4* matrix)
 {
 	const int vectorCount = 4; // one vector per matrix row
 	u32* rawData = (u32*)matrix;
 	GPU_SetFloatUniform(type, location, rawData, vectorCount);
 }
--- a/examples/graphics/gpu/textured_cube/source/main.c
+++ b/examples/graphics/gpu/textured_cube/source/main.c
@ -0,0 +1,244 @@
 /*
 * ~~ Simple libctru GPU textured cube example ~~
 * This example demonstrates the basics of using the PICA200 in a 3DS homebrew
 * application in order to render a basic scene consisting of a rotating
 * textured cube which is also shaded using a simple shading algorithm.
 * The shading algorithm is explained in the vertex shader source code.
 */
 #include "gpu.h"
 #include "vshader_shbin.h"
 #include "kitten_bin.h"
 #define CLEAR_COLOR 0x68B0D8FF
 // Layout of one vertex in the VBO; the field order matches the attribute
 // order declared in sceneRender (3 floats, 2 floats, 3 floats).
 typedef struct
 {
 	float position[3];
 	float texcoord[2];
 	float normal[3];
 } vertex;
 // Unit cube centred on the origin (every coordinate is +-0.5), stored as a
 // plain triangle list: 6 faces x 2 triangles x 3 vertices = 36 entries.
 // Each entry is { position, texcoord, normal }; all vertices of a face share
 // the same normal, which points along the axis named in the face comment
 // (P = positive direction, M = negative direction).
 static const vertex vertex_list[] =
 {
 	// First face (PZ: z = +0.5, normal +Z)
 	// First triangle
 	{ {-0.5f, -0.5f, +0.5f}, {0.0f, 0.0f}, {0.0f, 0.0f, +1.0f} },
 	{ {+0.5f, -0.5f, +0.5f}, {1.0f, 0.0f}, {0.0f, 0.0f, +1.0f} },
 	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, 0.0f, +1.0f} },
 	// Second triangle
 	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, 0.0f, +1.0f} },
 	{ {-0.5f, +0.5f, +0.5f}, {0.0f, 1.0f}, {0.0f, 0.0f, +1.0f} },
 	{ {-0.5f, -0.5f, +0.5f}, {0.0f, 0.0f}, {0.0f, 0.0f, +1.0f} },
 	// Second face (MZ: z = -0.5, normal -Z)
 	// First triangle
 	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, 0.0f, -1.0f} },
 	{ {-0.5f, +0.5f, -0.5f}, {1.0f, 0.0f}, {0.0f, 0.0f, -1.0f} },
 	{ {+0.5f, +0.5f, -0.5f}, {1.0f, 1.0f}, {0.0f, 0.0f, -1.0f} },
 	// Second triangle
 	{ {+0.5f, +0.5f, -0.5f}, {1.0f, 1.0f}, {0.0f, 0.0f, -1.0f} },
 	{ {+0.5f, -0.5f, -0.5f}, {0.0f, 1.0f}, {0.0f, 0.0f, -1.0f} },
 	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, 0.0f, -1.0f} },
 	// Third face (PX: x = +0.5, normal +X)
 	// First triangle
 	{ {+0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {+1.0f, 0.0f, 0.0f} },
 	{ {+0.5f, +0.5f, -0.5f}, {1.0f, 0.0f}, {+1.0f, 0.0f, 0.0f} },
 	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {+1.0f, 0.0f, 0.0f} },
 	// Second triangle
 	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {+1.0f, 0.0f, 0.0f} },
 	{ {+0.5f, -0.5f, +0.5f}, {0.0f, 1.0f}, {+1.0f, 0.0f, 0.0f} },
 	{ {+0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {+1.0f, 0.0f, 0.0f} },
 	// Fourth face (MX: x = -0.5, normal -X)
 	// First triangle
 	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {-1.0f, 0.0f, 0.0f} },
 	{ {-0.5f, -0.5f, +0.5f}, {1.0f, 0.0f}, {-1.0f, 0.0f, 0.0f} },
 	{ {-0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {-1.0f, 0.0f, 0.0f} },
 	// Second triangle
 	{ {-0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {-1.0f, 0.0f, 0.0f} },
 	{ {-0.5f, +0.5f, -0.5f}, {0.0f, 1.0f}, {-1.0f, 0.0f, 0.0f} },
 	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {-1.0f, 0.0f, 0.0f} },
 	// Fifth face (PY: y = +0.5, normal +Y)
 	// First triangle
 	{ {-0.5f, +0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, +1.0f, 0.0f} },
 	{ {-0.5f, +0.5f, +0.5f}, {1.0f, 0.0f}, {0.0f, +1.0f, 0.0f} },
 	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, +1.0f, 0.0f} },
 	// Second triangle
 	{ {+0.5f, +0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, +1.0f, 0.0f} },
 	{ {+0.5f, +0.5f, -0.5f}, {0.0f, 1.0f}, {0.0f, +1.0f, 0.0f} },
 	{ {-0.5f, +0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, +1.0f, 0.0f} },
 	// Sixth face (MY: y = -0.5, normal -Y)
 	// First triangle
 	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, -1.0f, 0.0f} },
 	{ {+0.5f, -0.5f, -0.5f}, {1.0f, 0.0f}, {0.0f, -1.0f, 0.0f} },
 	{ {+0.5f, -0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, -1.0f, 0.0f} },
 	// Second triangle
 	{ {+0.5f, -0.5f, +0.5f}, {1.0f, 1.0f}, {0.0f, -1.0f, 0.0f} },
 	{ {-0.5f, -0.5f, +0.5f}, {0.0f, 1.0f}, {0.0f, -1.0f, 0.0f} },
 	{ {-0.5f, -0.5f, -0.5f}, {0.0f, 0.0f}, {0.0f, -1.0f, 0.0f} },
 };
 // Number of vertices in vertex_list (passed to GPU_DrawArray in sceneRender)
 #define vertex_list_count (sizeof(vertex_list)/sizeof(vertex_list[0]))
 // Parsed shader binary and the shader program built from it (set up in sceneInit)
 static DVLB_s* vshader_dvlb;
 static shaderProgram_s program;
 // Uniform locations looked up by name in sceneInit
 static int uLoc_projection, uLoc_modelView;
 static int uLoc_lightVec, uLoc_lightHalfVec, uLoc_lightClr, uLoc_material;
 // Projection matrix, computed once in sceneInit and uploaded every frame
 static matrix_4x4 projection;
 // Material colors, one row per component; the row order matches the
 // mat_amb/mat_dif/mat_spe/mat_emi aliases in vshader.pica
 // NOTE(review): the values look like they are listed in w,z,y,x order -- confirm against matrix_4x4 in 3dmath.h
 static matrix_4x4 material =
 {
 	{
 	{ { 0.0f, 0.2f, 0.2f, 0.2f } }, // Ambient
 	{ { 0.0f, 0.4f, 0.4f, 0.4f } }, // Diffuse
 	{ { 0.0f, 0.8f, 0.8f, 0.8f } }, // Specular
 	{ { 1.0f, 0.0f, 0.0f, 0.0f } }, // Emission
 	}
 };
 // linearAlloc'd copies of the vertex list and of the texture (allocated in sceneInit, freed in sceneExit)
 static void* vbo_data;
 static void* tex_data;
 // Current rotation angles in radians, advanced once per rendered frame in sceneRender
 static float angleX = 0.0, angleY = 0.0;
 // One-time scene setup: builds the shader program, resolves the uniform
 // locations, computes the projection matrix and copies the vertex and
 // texture data into linearAlloc'd buffers.
 static void sceneInit(void)
 {
 	// Parse the embedded shader binary and use its first DVLE as the vertex shader
 	vshader_dvlb = DVLB_ParseFile((u32*)vshader_shbin, vshader_shbin_size);
 	shaderProgramInit(&program);
 	shaderProgramSetVsh(&program, &vshader_dvlb->DVLE[0]);
 	// Resolve each uniform by the name it has in the shader source
 	uLoc_projection   = shaderInstanceGetUniformLocation(program.vertexShader, "projection");
 	uLoc_modelView    = shaderInstanceGetUniformLocation(program.vertexShader, "modelView");
 	uLoc_lightVec     = shaderInstanceGetUniformLocation(program.vertexShader, "lightVec");
 	uLoc_lightHalfVec = shaderInstanceGetUniformLocation(program.vertexShader, "lightHalfVec");
 	uLoc_lightClr     = shaderInstanceGetUniformLocation(program.vertexShader, "lightClr");
 	uLoc_material     = shaderInstanceGetUniformLocation(program.vertexShader, "material");
 	// Build the projection matrix: 80 degrees expressed in radians, 400/240 ratio,
 	// clip planes at 0.01 and 1000
 	const double fovRadians  = 80.0f*M_PI/180.0f;
 	const float  screenRatio = 400.0f/240.0f;
 	m4x4_persp_tilt(&projection, fovRadians, screenRatio, 0.01f, 1000.0f);
 	// Copy the vertex list into a linearAlloc'd buffer (this is the VBO)
 	const size_t vboSize = sizeof(vertex_list);
 	vbo_data = linearAlloc(vboSize);
 	memcpy(vbo_data, vertex_list, vboSize);
 	// Copy the embedded texture into a linearAlloc'd buffer as well
 	tex_data = linearAlloc(kitten_bin_size);
 	memcpy(tex_data, kitten_bin, kitten_bin_size);
 }
 // Queues the GPU commands for one frame: binds the shader, configures
 // texturing and the vertex inputs, uploads the uniforms and draws the cube.
 // Also advances angleX/angleY, so the cube rotates a bit on every call.
 static void sceneRender(void)
 {
 	// Bind the shader program
 	shaderProgramUse(&program);
 	// Configure the first fragment shading substage to blend the texture color with
 	// the vertex color (calculated by the vertex shader using a lighting algorithm)
 	// See https://www.opengl.org/sdk/docs/man2/xhtml/glTexEnv.xml for more insight
 	GPU_SetTexEnv(0,
 		GPU_TEVSOURCES(GPU_TEXTURE0, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // RGB channels
 		GPU_TEVSOURCES(GPU_TEXTURE0, GPU_PRIMARY_COLOR, GPU_PRIMARY_COLOR), // Alpha
 		GPU_TEVOPERANDS(0, 0, 0), // RGB
 		GPU_TEVOPERANDS(0, 0, 0), // Alpha
 		GPU_MODULATE, GPU_MODULATE, // RGB, Alpha
 		0xFFFFFFFF);
 	// Configure the first texture unit
 	// NOTE(review): the 64x64 size and GPU_RGBA8 format are hard-coded and must match kitten_bin -- confirm
 	GPU_SetTextureEnable(GPU_TEXUNIT0);
 	GPU_SetTexture(
 		GPU_TEXUNIT0,
 		(u32*)osConvertVirtToPhys((u32)tex_data),
 		64, // Width
 		64, // Height
 		GPU_TEXTURE_MAG_FILTER(GPU_LINEAR) | GPU_TEXTURE_WRAP_S(GPU_REPEAT) | GPU_TEXTURE_WRAP_T(GPU_REPEAT), // Flags
 		GPU_RGBA8 // Pixel format
 	);
 	// Configure the "attribute buffers" (that is, the vertex input buffers)
 	// The three attributes follow the field order of the vertex struct:
 	// position (3 floats), texcoord (2 floats), normal (3 floats)
 	GPU_SetAttributeBuffers(
 		3, // Number of inputs per vertex
 		(u32*)osConvertVirtToPhys((u32)vbo_data), // Location of the VBO
 		GPU_ATTRIBFMT(0, 3, GPU_FLOAT) | // Format of the inputs
 		GPU_ATTRIBFMT(1, 2, GPU_FLOAT) |
 		GPU_ATTRIBFMT(2, 3, GPU_FLOAT),
 		0xFFC, // Unused attribute mask, in our case bits 0~2 are cleared since they are used
 		0x210, // Attribute permutations (here it is the identity, passing each attribute in order)
 		1, // Number of buffers
 		(u32[]) { 0x0 }, // Buffer offsets (placeholders)
 		(u64[]) { 0x210 }, // Attribute permutations for each buffer (identity again)
 		(u8[])  { 3 }); // Number of attributes for each buffer
 	// Calculate the modelView matrix
 	// The z translation oscillates between -2.5 and -1.5 as angleX advances
 	matrix_4x4 modelView;
 	m4x4_identity(&modelView);
 	m4x4_translate(&modelView, 0.0, 0.0, -2.0 + 0.5*sinf(angleX));
 	m4x4_rotate_x(&modelView, angleX, true);
 	m4x4_rotate_y(&modelView, angleY, true);
 	// Rotate the cube each frame (1 degree around X, half a degree around Y)
 	angleX += M_PI / 180;
 	angleY += M_PI / 360;
 	// Upload the uniforms
 	GPU_SetFloatUniformMatrix(GPU_VERTEX_SHADER, uLoc_projection, &projection);
 	GPU_SetFloatUniformMatrix(GPU_VERTEX_SHADER, uLoc_modelView,  &modelView);
 	GPU_SetFloatUniformMatrix(GPU_VERTEX_SHADER, uLoc_material,   &material);
 	// lightVec and lightHalfVec are given the same constant value here
 	GPU_SetFloatUniform(GPU_VERTEX_SHADER, uLoc_lightVec,     (u32*)(float[]){0.0f, -1.0f, 0.0f, 0.0f}, 1);
 	GPU_SetFloatUniform(GPU_VERTEX_SHADER, uLoc_lightHalfVec, (u32*)(float[]){0.0f, -1.0f, 0.0f, 0.0f}, 1);
 	GPU_SetFloatUniform(GPU_VERTEX_SHADER, uLoc_lightClr,     (u32*)(float[]){1.0f,  1.0f, 1.0f, 1.0f}, 1);
 	// Draw the VBO
 	GPU_DrawArray(GPU_TRIANGLES, vertex_list_count);
 }
 // Releases everything sceneInit created: the two linearAlloc'd buffers,
 // the shader program and the parsed shader binary.
 static void sceneExit(void)
 {
 	// Free the texture
 	linearFree(tex_data);
 	// Free the VBO
 	linearFree(vbo_data);
 	// Free the shader program
 	shaderProgramFree(&program);
 	// Free the parsed shader binary only after the program that was built from it
 	DVLB_Free(vshader_dvlb);
 }
 // Entry point: brings up graphics and the scene, renders one frame per
 // iteration until START is pressed or aptMainLoop() returns false, then
 // tears everything down in reverse order.
 int main()
 {
 	// Bring up the graphics subsystems
 	gfxInitDefault();
 	gpuInit();
 	// Set up the scene and clear the buffers ahead of the first frame
 	sceneInit();
 	gpuClearBuffers(CLEAR_COLOR);
 	// Main loop
 	while (aptMainLoop())
 	{
 		gspWaitForVBlank();  // Line up with the start of VBlank
 		gfxSwapBuffersGpu(); // Show the frame that was rendered during the previous iteration
 		hidScanInput();      // Sample the user input
 		// Pressing START leaves the loop so that we return to hbmenu
 		if (hidKeysDown() & KEY_START)
 			break;
 		// Render the scene, then clear the buffers for the next frame
 		gpuFrameBegin();
 		sceneRender();
 		gpuFrameEnd();
 		gpuClearBuffers(CLEAR_COLOR);
 		// Flush the framebuffers out of the data cache (not necessary with pure GPU rendering)
 		//gfxFlushBuffers();
 	}
 	// Tear down the scene first, then the graphics subsystems
 	sceneExit();
 	gpuExit();
 	gfxExit();
 	return 0;
 }
--- a/examples/graphics/gpu/textured_cube/source/vshader.pica
+++ b/examples/graphics/gpu/textured_cube/source/vshader.pica
@ -0,0 +1,90 @@
 ; Example PICA200 vertex shader
 ; Uniforms
 ; These names are the ones main.c looks up with shaderInstanceGetUniformLocation
 .fvec projection[4], modelView[4]
 .fvec lightVec, lightHalfVec, lightClr, material[4]
 ; One alias per material row, in the same order as the rows of the
 ; material matrix in main.c (ambient, diffuse, specular, emission)
 .alias mat_amb material[0]
 .alias mat_dif material[1]
 .alias mat_spe material[2]
 .alias mat_emi material[3]
 ; Constants
 ; Only the x (0.0) and y (1.0) components are referenced in this shader, through the aliases below
 .constf myconst(0.0, 1.0, -1.0, -0.5)
 .alias  zeros myconst.xxxx ; Vector full of zeros
 .alias  ones  myconst.yyyy ; Vector full of ones
 ; Outputs
 .out outpos position
 .out outtc0 texcoord0
 .out outclr color
 ; Inputs (defined as aliases for convenience)
 ; v0/v1/v2 follow the attribute order set up in main.c: position, texcoord, normal
 .alias inpos v0
 .alias intex v1
 .alias innrm v2
 ; Vertex shader entry point.
 ; Transforms the position by modelView and projection, passes the texture
 ; coordinate through, and computes a per-vertex color from the emission,
 ; specular, diffuse and ambient material terms.
 ; Register roles: r0 = homogeneous input / lighting levels, r1 = transformed
 ; vector / color accumulator, r2 = scratch
 .proc main
 	; Force the w component of inpos to be 1.0
 	mov r0.xyz, inpos
 	mov r0.w,   ones
 	; r1 = modelView * inpos
 	dp4 r1.x, modelView[0], r0
 	dp4 r1.y, modelView[1], r0
 	dp4 r1.z, modelView[2], r0
 	dp4 r1.w, modelView[3], r0
 	; outpos = projection * r1
 	dp4 outpos.x, projection[0], r1
 	dp4 outpos.y, projection[1], r1
 	dp4 outpos.z, projection[2], r1
 	dp4 outpos.w, projection[3], r1
 	; outtc0 = intex
 	mov outtc0, intex
 	; Transform the normal vector with the modelView matrix
 	; r1 = normalize(modelView * innrm)
 	; w is forced to 0.0 so the w component of each modelView row does not contribute
 	mov r0.xyz, innrm
 	mov r0.w,   zeros
 	dp4 r1.x,   modelView[0], r0
 	dp4 r1.y,   modelView[1], r0
 	dp4 r1.z,   modelView[2], r0
 	mov r1.w,   zeros
 	; NOTE(review): there is no guard against a zero-length normal before the rsq -- confirm inputs are never zero
 	dp3 r2,     r1, r1 ; r2 = x^2+y^2+z^2 for each component
 	rsq r2,     r2     ; r2 = 1/sqrt(r2), likewise for each component
 	mul r1,     r2, r1 ; r1 = r1*r2
 	; Calculate the diffuse level (r0.x) and the shininess level (r0.y)
 	; r0.x = max(0, -dot(lightVec, r1))
 	; r0.y = max(0, dot(-lightHalfVec, r1)) ^ 2
 	; (max is applied to all of r0, but only r0.x and r0.y are read afterwards)
 	dp3 r0.x, lightVec,      r1
 	add r0.x, zeros,         -r0
 	dp3 r0.y, -lightHalfVec, r1
 	max r0,   zeros,         r0
 	mul r0.y, r0,            r0
 	; Accumulate the vertex color in r1, initializing it to the emission color
 	; (the normal held in r1 is no longer needed from here on)
 	mov r1, mat_emi
 	; r1 += specularColor * lightClr * shininessLevel
 	mul r2, lightClr, r0.yyyy
 	mul r2, mat_spe,  r2
 	add r1, r2,       r1
 	; r1 += diffuseColor * lightClr * diffuseLevel
 	mul r2, lightClr, r0.xxxx
 	mul r2, mat_dif,  r2
 	add r1, r2,       r1
 	; r1 += ambientColor * lightClr
 	mov r2, lightClr
 	mul r2, mat_amb, r2
 	add r1, r2,      r1
 	; outclr = min(1, r1): only the upper bound is enforced here
 	min outclr, ones, r1
 	; We're finished
 	end
 .end
--- a/libctru/include/3ds.h
+++ b/libctru/include/3ds.h
@ -53,12 +53,52 @@ extern "C" {
 * @example app_launch/source/main.c
 * @example audio/mic/source/main.c
 * @example get_system_language/source/main.c
 * @example gpu/source/main.c
 * @example graphics/bitmap/24bit-color/source/main.c
 * @example graphics/printing/hello-world/source/main.c
 * @example graphics/printing/both-screen-text/source/main.c
 * @example graphics/printing/colored-text/source/main.c
 * @example graphics/printing/multiple-windows-text/source/main.c
 * @example graphics/gpu/geoshader/source/main.c
   graphics/gpu/geoshader/source/gpu.h
   @include graphics/gpu/geoshader/source/gpu.h
   graphics/gpu/geoshader/source/gpu.c
   @include graphics/gpu/geoshader/source/gpu.c
   graphics/gpu/geoshader/source/3dmath.h
   @include graphics/gpu/geoshader/source/3dmath.h
   graphics/gpu/geoshader/source/3dmath.c
   @include graphics/gpu/geoshader/source/3dmath.c
   graphics/gpu/geoshader/source/vshader.pica
   @include graphics/gpu/geoshader/source/vshader.pica
   graphics/gpu/geoshader/source/gshader.pica
   @include graphics/gpu/geoshader/source/gshader.pica
 * @example graphics/gpu/simple_tri/source/main.c
   graphics/gpu/simple_tri/source/gpu.h
   @include graphics/gpu/simple_tri/source/gpu.h
   graphics/gpu/simple_tri/source/gpu.c
   @include graphics/gpu/simple_tri/source/gpu.c
   graphics/gpu/simple_tri/source/3dmath.h
   @include graphics/gpu/simple_tri/source/3dmath.h
   graphics/gpu/simple_tri/source/3dmath.c
   @include graphics/gpu/simple_tri/source/3dmath.c
   graphics/gpu/simple_tri/source/vshader.pica
   @include graphics/gpu/simple_tri/source/vshader.pica
 * @example graphics/gpu/textured_cube/source/main.c
   graphics/gpu/textured_cube/source/gpu.h
   @include graphics/gpu/textured_cube/source/gpu.h
   graphics/gpu/textured_cube/source/gpu.c
   @include graphics/gpu/textured_cube/source/gpu.c
   graphics/gpu/textured_cube/source/3dmath.h
   @include graphics/gpu/textured_cube/source/3dmath.h
   graphics/gpu/textured_cube/source/3dmath.c
   @include graphics/gpu/textured_cube/source/3dmath.c
   graphics/gpu/textured_cube/source/vshader.pica
   @include graphics/gpu/textured_cube/source/vshader.pica
 * @example http/source/main.c
 * @example input/read-controls/source/main.c
 * @example input/touch-screen/source/main.c
--- a/libctru/include/3ds/gpu/gpu.h
+++ b/libctru/include/3ds/gpu/gpu.h
@ -30,8 +30,8 @@ void GPUCMD_Finalize();
 //tex param
 #define GPU_TEXTURE_MAG_FILTER(v) (((v)&0x1)<<1) //takes a GPU_TEXTURE_FILTER_PARAM
 #define GPU_TEXTURE_MIN_FILTER(v) (((v)&0x1)<<2) //takes a GPU_TEXTURE_FILTER_PARAM
-#define GPU_TEXTURE_WRAP_S(v) (((v)&0x3)<<8) //takes a GPU_TEXTURE_WRAP_PARAM
+#define GPU_TEXTURE_WRAP_S(v) (((v)&0x3)<<12) //takes a GPU_TEXTURE_WRAP_PARAM
-#define GPU_TEXTURE_WRAP_T(v) (((v)&0x3)<<12) //takes a GPU_TEXTURE_WRAP_PARAM
+#define GPU_TEXTURE_WRAP_T(v) (((v)&0x3)<<8) //takes a GPU_TEXTURE_WRAP_PARAM
 typedef enum
 {
--- a/libctru/include/3ds/gpu/gx.h
+++ b/libctru/include/3ds/gpu/gx.h
@ -1,7 +1,15 @@
 /**
 * @file gx.h
 */
 #pragma once
 #define GX_BUFFER_DIM(w, h) (((h)<<16)|((w)&0xFFFF))
 /**
 * @brief Pixel formats
 * @sa GSP_FramebufferFormats
 */
 typedef enum
 {
 	GX_TRANSFER_FMT_RGBA8  = 0,
@ -11,20 +19,29 @@ typedef enum
 	GX_TRANSFER_FMT_RGBA4  = 4
 } GX_TRANSFER_FORMAT;
 /**
 * @brief Anti-aliasing modes
 *
 * Please remember that the framebuffer is sideways.
 * Hence if you activate 2x1 anti-aliasing the destination dimensions are w = 240*2 and h = 400
 */
 typedef enum
 {
-	GX_TRANSFER_SCALE_NO = 0,
+	GX_TRANSFER_SCALE_NO = 0, ///< No anti-aliasing
-	GX_TRANSFER_SCALE_X  = 1,
+	GX_TRANSFER_SCALE_X  = 1, ///< 2x1 anti-aliasing
-	GX_TRANSFER_SCALE_Y  = 2
+	GX_TRANSFER_SCALE_XY = 2, ///< 2x2 anti-aliasing
 } GX_TRANSFER_SCALE;
 /**
 * @brief GX transfer control flags
 */
 typedef enum
 {
-	GX_FILL_TRIGGER     = 0x001,
+	GX_FILL_TRIGGER     = 0x001, ///< Trigger the PPF event
-	GX_FILL_FINISHED    = 0x002,
+	GX_FILL_FINISHED    = 0x002, ///< Indicates if the memory fill is complete. You should not use it when requesting a transfer.
-	GX_FILL_16BIT_DEPTH = 0x000,
+	GX_FILL_16BIT_DEPTH = 0x000, ///< The buffer has a 16 bit per pixel depth
-	GX_FILL_24BIT_DEPTH = 0x100,
+	GX_FILL_24BIT_DEPTH = 0x100, ///< The buffer has a 24 bit per pixel depth
-	GX_FILL_32BIT_DEPTH = 0x200,
+	GX_FILL_32BIT_DEPTH = 0x200, ///< The buffer has a 32 bit per pixel depth
 } GX_FILL_CONTROL;
 #define GX_TRANSFER_FLIP_VERT(x)  ((x)<<0)
--- a/libctru/include/3ds/services/csnd.h
+++ b/libctru/include/3ds/services/csnd.h
@ -147,6 +147,10 @@ void CSND_SetCapRegs(u32 capUnit, u32 flags, u32 addr, u32 size);
 Result CSND_SetDspFlags(bool waitDone);
 Result CSND_UpdateInfo(bool waitDone);
 /**
 * @param vol The volume, ranges from 0.0 to 1.0 included
 * @param pan The pan, ranges from -1.0 to 1.0 included
 */
 Result csndPlaySound(int chn, u32 flags, u32 sampleRate, float vol, float pan, void* data0, void* data1, u32 size);
 void csndGetDspFlags(u32* outSemFlags, u32* outIrqFlags); // Requires previous CSND_UpdateInfo()
--- a/libctru/include/3ds/services/y2r.h
+++ b/libctru/include/3ds/services/y2r.h
@ -105,8 +105,8 @@ typedef struct
 	Y2R_OutputFormat output_format     : 8; ///< Value passed to @ref Y2RU_SetOutputFormat
 	Y2R_Rotation rotation              : 8; ///< Value passed to @ref Y2RU_SetRotation
 	Y2R_BlockAlignment block_alignment : 8; ///< Value passed to @ref Y2RU_SetBlockAlignment
-	u16 input_line_width;                   ///< Value passed to @ref Y2RU_SetInputLineWidth
+	s16 input_line_width;                   ///< Value passed to @ref Y2RU_SetInputLineWidth
-	u16 input_lines;                        ///< Value passed to @ref Y2RU_SetInputLines
+	s16 input_lines;                        ///< Value passed to @ref Y2RU_SetInputLines
 	Y2R_StandardCoefficient standard_coefficient : 8; ///< Value passed to @ref Y2RU_SetStandardCoefficient
 	u8 unused;
 	u16 alpha;                              ///< Value passed to @ref Y2RU_SetAlpha
@ -233,60 +233,70 @@ Result Y2RU_GetTransferEndEvent(Handle* end_event);
 * @param src_buf A pointer to the beginning of your Y data buffer.
 * @param image_size The total size of the data buffer.
 * @param transfer_unit Specifies the size of 1 DMA transfer. Usually set to 1 line. This has to be a divisor of image_size.
- * @param transfer_unit Specifies the gap (offset) to be added after each transfer. Can be used to convert images with stride or only a part of it.
+ * @param transfer_gap Specifies the gap (offset) to be added after each transfer. Can be used to convert images with stride or only a part of it.
 *
 * @warning transfer_unit+transfer_gap must be less than 32768 (0x8000)
 *
 * This specifies the Y data buffer for the planar input formats (INPUT_YUV42*_INDIV_*).
 * The actual transfer will only happen after calling @ref Y2RU_StartConversion.
 */
-Result Y2RU_SetSendingY(const void* src_buf, u32 image_size, u16 transfer_unit, u16 transfer_gap);
+Result Y2RU_SetSendingY(const void* src_buf, u32 image_size, s16 transfer_unit, s16 transfer_gap);
 /**
 * @brief Configures the U plane buffer.
 * @param src_buf A pointer to the beginning of your U data buffer.
 * @param image_size The total size of the data buffer.
 * @param transfer_unit Specifies the size of 1 DMA transfer. Usually set to 1 line. This has to be a divisor of image_size.
- * @param transfer_unit Specifies the gap (offset) to be added after each transfer. Can be used to convert images with stride or only a part of it.
+ * @param transfer_gap Specifies the gap (offset) to be added after each transfer. Can be used to convert images with stride or only a part of it.
 *
 * @warning transfer_unit+transfer_gap must be less than 32768 (0x8000)
 *
 * This specifies the U data buffer for the planar input formats (INPUT_YUV42*_INDIV_*).
 * The actual transfer will only happen after calling @ref Y2RU_StartConversion.
 */
-Result Y2RU_SetSendingU(const void* src_buf, u32 image_size, u16 transfer_unit, u16 transfer_gap);
+Result Y2RU_SetSendingU(const void* src_buf, u32 image_size, s16 transfer_unit, s16 transfer_gap);
 /**
 * @brief Configures the V plane buffer.
 * @param src_buf A pointer to the beginning of your V data buffer.
 * @param image_size The total size of the data buffer.
 * @param transfer_unit Specifies the size of 1 DMA transfer. Usually set to 1 line. This has to be a divisor of image_size.
- * @param transfer_unit Specifies the gap (offset) to be added after each transfer. Can be used to convert images with stride or only a part of it.
+ * @param transfer_gap Specifies the gap (offset) to be added after each transfer. Can be used to convert images with stride or only a part of it.
 *
 * @warning transfer_unit+transfer_gap must be less than 32768 (0x8000)
 *
 * This specifies the V data buffer for the planar input formats (INPUT_YUV42*_INDIV_*).
 * The actual transfer will only happen after calling @ref Y2RU_StartConversion.
 */
-Result Y2RU_SetSendingV(const void* src_buf, u32 image_size, u16 transfer_unit, u16 transfer_gap);
+Result Y2RU_SetSendingV(const void* src_buf, u32 image_size, s16 transfer_unit, s16 transfer_gap);
 /**
 * @brief Configures the YUYV source buffer.
 * @param src_buf A pointer to the beginning of your YUYV data buffer.
 * @param image_size The total size of the data buffer.
 * @param transfer_unit Specifies the size of 1 DMA transfer. Usually set to 1 line. This has to be a divisor of image_size.
- * @param transfer_unit Specifies the gap (offset) to be added after each transfer. Can be used to convert images with stride or only a part of it.
+ * @param transfer_gap Specifies the gap (offset) to be added after each transfer. Can be used to convert images with stride or only a part of it.
 *
 * @warning transfer_unit+transfer_gap must be less than 32768 (0x8000)
 *
 * This specifies the YUYV data buffer for the packed input format @ref INPUT_YUV422_BATCH.
 * The actual transfer will only happen after calling @ref Y2RU_StartConversion.
 */
-Result Y2RU_SetSendingYUYV(const void* src_buf, u32 image_size, u16 transfer_unit, u16 transfer_gap);
+Result Y2RU_SetSendingYUYV(const void* src_buf, u32 image_size, s16 transfer_unit, s16 transfer_gap);
 /**
 * @brief Configures the destination buffer.
 * @param dst_buf A pointer to the beginning of your destination buffer in FCRAM
 * @param image_size The total size of the data buffer.
 * @param transfer_unit Specifies the size of 1 DMA transfer. Usually set to 1 line. This has to be a divisor of image_size.
- * @param transfer_unit Specifies the gap (offset) to be added after each transfer. Can be used to convert images with stride or only a part of it.
+ * @param transfer_gap Specifies the gap (offset) to be added after each transfer. Can be used to convert images with stride or only a part of it.
 *
 * This specifies the destination buffer of the conversion.
 * The actual transfer will only happen after calling @ref Y2RU_StartConversion.
 * The buffer does NOT need to be allocated in the linear heap.
 *
 * @warning transfer_unit+transfer_gap must be less than 32768 (0x8000)
 *
 * @note
 *      It seems that depending on the size of the image and of the transfer unit,\n
 *      it is possible for the end of conversion interrupt to be triggered right after the conversion began.\n
@ -294,7 +304,7 @@ Result Y2RU_SetSendingYUYV(const void* src_buf, u32 image_size, u16 transfer_uni
 *
 * @note Setting a transfer_unit of 4 or 8 lines seems to bring the best results in terms of speed for a 400x240 image.
 */
-Result Y2RU_SetReceiving(void* dst_buf, u32 image_size, u16 transfer_unit, u16 transfer_gap);
+Result Y2RU_SetReceiving(void* dst_buf, u32 image_size, s16 transfer_unit, s16 transfer_gap);
 /**
 * @brief Checks if the DMA has finished sending the Y buffer.
--- a/libctru/source/services/y2r.c
+++ b/libctru/source/services/y2r.c
@ -120,7 +120,7 @@ Result Y2RU_GetTransferEndEvent(Handle* end_event)
 	return cmdbuf[1];
 }
-Result Y2RU_SetSendingY(const void* src_buf, u32 image_size, u16 transfer_unit, u16 transfer_gap)
+Result Y2RU_SetSendingY(const void* src_buf, u32 image_size, s16 transfer_unit, s16 transfer_gap)
 {
 	Result ret = 0;
 	u32* cmdbuf = getThreadCommandBuffer();
@ -136,7 +136,7 @@ Result Y2RU_SetSendingY(const void* src_buf, u32 image_size, u16 transfer_unit,
 	return cmdbuf[1];
 }
-Result Y2RU_SetSendingU(const void* src_buf, u32 image_size, u16 transfer_unit, u16 transfer_gap)
+Result Y2RU_SetSendingU(const void* src_buf, u32 image_size, s16 transfer_unit, s16 transfer_gap)
 {
 	Result ret = 0;
 	u32* cmdbuf = getThreadCommandBuffer();
@ -152,7 +152,7 @@ Result Y2RU_SetSendingU(const void* src_buf, u32 image_size, u16 transfer_unit,
 	return cmdbuf[1];
 }
-Result Y2RU_SetSendingV(const void* src_buf, u32 image_size, u16 transfer_unit, u16 transfer_gap)
+Result Y2RU_SetSendingV(const void* src_buf, u32 image_size, s16 transfer_unit, s16 transfer_gap)
 {
 	Result ret = 0;
 	u32* cmdbuf = getThreadCommandBuffer();
@ -168,7 +168,7 @@ Result Y2RU_SetSendingV(const void* src_buf, u32 image_size, u16 transfer_unit,
 	return cmdbuf[1];
 }
-Result Y2RU_SetSendingYUYV(const void* src_buf, u32 image_size, u16 transfer_unit, u16 transfer_gap)
+Result Y2RU_SetSendingYUYV(const void* src_buf, u32 image_size, s16 transfer_unit, s16 transfer_gap)
 {
 	Result ret = 0;
 	u32* cmdbuf = getThreadCommandBuffer();
@ -228,7 +228,7 @@ Result Y2RU_IsDoneSendingV(bool* is_done)
 	return cmdbuf[1];
 }
-Result Y2RU_SetReceiving(void* dst_buf, u32 image_size, u16 transfer_unit, u16 transfer_gap)
+Result Y2RU_SetReceiving(void* dst_buf, u32 image_size, s16 transfer_unit, s16 transfer_gap)
 {
 	Result ret = 0;
 	u32* cmdbuf = getThreadCommandBuffer();