From 7aad0b4968b3f569b9d55a7993dae34b79c1508b Mon Sep 17 00:00:00 2001 From: fincs Date: Wed, 20 Jul 2016 17:35:26 +0200 Subject: [PATCH] Enhanced and corrected shader code to fully support geometry shaders --- libctru/include/3ds/gpu/shaderProgram.h | 19 +-- libctru/include/3ds/gpu/shbin.h | 16 +- libctru/source/gpu/shaderProgram.c | 190 ++++++++++++++++-------- libctru/source/gpu/shbin.c | 116 ++++++--------- 4 files changed, 193 insertions(+), 148 deletions(-) diff --git a/libctru/include/3ds/gpu/shaderProgram.h b/libctru/include/3ds/gpu/shaderProgram.h index 90f1692..e0d5302 100644 --- a/libctru/include/3ds/gpu/shaderProgram.h +++ b/libctru/include/3ds/gpu/shaderProgram.h @@ -33,18 +33,8 @@ typedef struct shaderInstance_s* geometryShader; ///< Geometry shader. u32 geoShaderInputPermutation[2]; ///< Geometry shader input permutation. u8 geoShaderInputStride; ///< Geometry shader input stride. - u8 geoShaderMode; ///< Geometry shader operation mode. }shaderProgram_s; -/// Geometry shader operation modes. -typedef enum -{ - GSH_NORMAL = 0, ///< Normal operation. - GSH_PARTICLE = 1, ///< Particle system. - GSH_SUBDIVISION_LOOP = 2, ///< Loop subdivision surface. - GSH_SUBDIVISION_CATMULL_CLARK = 3, ///< Catmull-Clark subdivision surface. -} geoShaderMode; - /** * @brief Initializes a shader instance. * @param si Shader instance to initialize. @@ -104,7 +94,7 @@ Result shaderProgramSetVsh(shaderProgram_s* sp, DVLE_s* dvle); * @brief Sets the geometry shader of a shader program. * @param sp Shader program to use. * @param dvle Geometry shader to set. - * @param stride Stride of the geometry shader. + * @param stride Input stride of the shader (pass 0 to match the number of outputs of the vertex shader). 
*/ Result shaderProgramSetGsh(shaderProgram_s* sp, DVLE_s* dvle, u8 stride); @@ -115,13 +105,6 @@ Result shaderProgramSetGsh(shaderProgram_s* sp, DVLE_s* dvle, u8 stride); */ Result shaderProgramSetGshInputPermutation(shaderProgram_s* sp, u64 permutation); -/** - * @brief Configures the operation mode of the geometry shader of a shader program. - * @param sp Shader program to use. - * @param mode Operation mode to use. - */ -Result shaderProgramSetGshMode(shaderProgram_s* sp, geoShaderMode mode); - /** * @brief Configures the shader units to use the specified shader program. * @param sp Shader program to use. diff --git a/libctru/include/3ds/gpu/shbin.h b/libctru/include/3ds/gpu/shbin.h index c53b272..e331beb 100644 --- a/libctru/include/3ds/gpu/shbin.h +++ b/libctru/include/3ds/gpu/shbin.h @@ -28,9 +28,18 @@ typedef enum{ RESULT_TEXCOORD0W = 0x4, ///< Texture coordinate 0 W. RESULT_TEXCOORD1 = 0x5, ///< Texture coordinate 1. RESULT_TEXCOORD2 = 0x6, ///< Texture coordinate 2. - RESULT_VIEW = 0x8 ///< View. + RESULT_VIEW = 0x8, ///< View. + RESULT_DUMMY = 0x9, ///< Dummy attribute (used as passthrough for geometry shader input). }DVLE_outputAttribute_t; +/// Geometry shader operation modes. +typedef enum +{ + GSH_POINT = 0, ///< Point processing mode. + GSH_VARIABLE_PRIM = 1, ///< Variable-size primitive processing mode. + GSH_FIXED_PRIM = 2, ///< Fixed-size primitive processing mode. +} DVLE_geoShaderMode; + /// DVLP data. typedef struct{ u32 codeSize; ///< Code size. @@ -64,6 +73,11 @@ typedef struct{ /// DVLE data. typedef struct{ DVLE_type type; ///< DVLE type. + bool mergeOutmaps; ///< true = merge vertex/geometry shader outmaps ('dummy' output attribute is present). + DVLE_geoShaderMode gshMode; ///< Geometry shader operation mode. + u8 gshFixedVtxStart; ///< Starting float uniform register number for storing the fixed-size primitive vertex array. + u8 gshVariableVtxNum; ///< Number of fully-defined vertices in the variable-size primitive vertex array. 
+ u8 gshFixedVtxNum; ///< Number of vertices in the fixed-size primitive vertex array. DVLP_s* dvlp; ///< Contained DVLPs. u32 mainOffset; ///< Offset of the start of the main function. u32 endmainOffset; ///< Offset of the end of the main function. diff --git a/libctru/source/gpu/shaderProgram.c b/libctru/source/gpu/shaderProgram.c index 8b2c34e..edfcf98 100644 --- a/libctru/source/gpu/shaderProgram.c +++ b/libctru/source/gpu/shaderProgram.c @@ -5,7 +5,7 @@ #include <3ds/gpu/registers.h> #include <3ds/gpu/shaderProgram.h> -static void GPU_SetShaderOutmap(u32 outmapData[8]); +static void GPU_SetShaderOutmap(const u32 outmapData[8]); static void GPU_SendShaderCode(GPU_SHADER_TYPE type, u32* data, u16 offset, u16 length); static void GPU_SendOperandDescriptors(GPU_SHADER_TYPE type, u32* data, u16 offset, u16 length); @@ -168,7 +168,6 @@ Result shaderProgramSetGsh(shaderProgram_s* sp, DVLE_s* dvle, u8 stride) sp->geoShaderInputPermutation[0] = 0x76543210; sp->geoShaderInputPermutation[1] = 0xFEDCBA98; sp->geoShaderInputStride = stride; - sp->geoShaderMode = GSH_NORMAL; return shaderInstanceInit(sp->geometryShader, dvle); } @@ -182,82 +181,149 @@ Result shaderProgramSetGshInputPermutation(shaderProgram_s* sp, u64 permutation) return 0; } -Result shaderProgramSetGshMode(shaderProgram_s* sp, geoShaderMode mode) +static inline void shaderProgramUploadDvle(const DVLE_s* dvle) { - if(!sp || !sp->geometryShader)return -1; + const DVLP_s* dvlp = dvle->dvlp; + // Limit vertex shader code size to the first 512 instructions + int codeSize = dvle->type == GEOMETRY_SHDR ? dvlp->codeSize : (dvlp->codeSize < 512 ? 
dvlp->codeSize : 512); + GPU_SendShaderCode(dvle->type, dvlp->codeData, 0, codeSize); + GPU_SendOperandDescriptors(dvle->type, dvlp->opcdescData, 0, dvlp->opdescSize); +} - sp->geoShaderMode = mode & 3; - return 0; +static inline void shaderProgramMergeOutmaps(u32* outmapData, const u32* vshOutmap, const u32* gshOutmap) +{ + int i, j; + + // Find and copy attributes common to both vertex and geometry shader + u32 vsh_common = 0, gsh_common = 0; + for (i = 1; i < 8; i ++) + { + u32 mask = gshOutmap[i]; + if (mask == 0x1F1F1F1F) + break; + for (j = 1; j < 8; j ++) + { + if (vshOutmap[j] == mask) + { + outmapData[++outmapData[0]] = mask; + vsh_common |= BIT(j); + gsh_common |= BIT(i); + break; + } + } + } + + // Find and copy attributes that are exclusive to the geometry shader + for (i = 1; i < 8; i ++) + { + u32 mask = gshOutmap[i]; + if (mask == 0x1F1F1F1F) + break; + if (!(gsh_common & BIT(i))) + outmapData[++outmapData[0]] = mask; + } + + // Find and copy attributes that are exclusive to the vertex shader + for (i = 1; i < 8; i ++) + { + u32 mask = vshOutmap[i]; + if (mask == 0x1F1F1F1F) + break; + if (!(vsh_common & BIT(i))) + outmapData[++outmapData[0]] = mask; + } } Result shaderProgramConfigure(shaderProgram_s* sp, bool sendVshCode, bool sendGshCode) { - if(!sp)return -1; + if (!sp || !sp->vertexShader) return -1; - if(!sp->vertexShader)return -2; - - // configure geostage - // has to be done first or else VSH registers might only reconfigure 3 of the 4 shader units ! 
- if(!sp->geometryShader) - { - GPUCMD_AddMaskedWrite(GPUREG_GEOSTAGE_CONFIG, 0x1, 0x00000000); - GPUCMD_AddMaskedWrite(GPUREG_VSH_COM_MODE, 0x1, 0x00000000); - }else{ - GPUCMD_AddMaskedWrite(GPUREG_GEOSTAGE_CONFIG, 0x1, 0x00000002); - GPUCMD_AddMaskedWrite(GPUREG_VSH_COM_MODE, 0x1, 0x00000001); - } - - // setup vertex shader stuff no matter what + // Get pointers to relevant structures const DVLE_s* vshDvle = sp->vertexShader->dvle; - const DVLP_s* vshDvlp = vshDvle->dvlp; + const DVLE_s* gshDvle = sp->geometryShader ? sp->geometryShader->dvle : NULL; + const DVLE_s* mainDvle = gshDvle ? gshDvle : vshDvle; + + // Variables for working with the outmap + u32 outmapData[8]; + u32 outmapMode = mainDvle->outmapMode; + u32 outmapClock = mainDvle->outmapClock; + + // Initialize geometry engine - do this early in order to ensure all 4 units are correctly initialized + GPUCMD_AddMaskedWrite(GPUREG_GEOSTAGE_CONFIG, 0x3, gshDvle ? 2 : 0); + GPUCMD_AddMaskedWrite(GPUREG_GEOSTAGE_CONFIG2, 0x3, 0); + GPUCMD_AddMaskedWrite(GPUREG_VSH_COM_MODE, 0x1, gshDvle ? 1 : 0); + + // Set up vertex shader code blob (if necessary) if (sendVshCode) - { - GPU_SendShaderCode(vshDvle->type, vshDvlp->codeData, 0, vshDvlp->codeSize); - GPU_SendOperandDescriptors(vshDvle->type, vshDvlp->opcdescData, 0, vshDvlp->opdescSize); - } + shaderProgramUploadDvle(vshDvle); + + // Set up vertex shader entrypoint & outmap mask GPUCMD_AddWrite(GPUREG_VSH_ENTRYPOINT, 0x7FFF0000|(vshDvle->mainOffset&0xFFFF)); GPUCMD_AddWrite(GPUREG_VSH_OUTMAP_MASK, vshDvle->outmapMask); + GPUCMD_AddWrite(GPUREG_VSH_OUTMAP_TOTAL1, vshDvle->outmapData[0]-1); + GPUCMD_AddWrite(GPUREG_VSH_OUTMAP_TOTAL2, vshDvle->outmapData[0]-1); - GPUCMD_AddWrite(GPUREG_VSH_OUTMAP_TOTAL1, vshDvle->outmapData[0]-1); // ? - GPUCMD_AddWrite(GPUREG_VSH_OUTMAP_TOTAL2, vshDvle->outmapData[0]-1); // ? - - bool subdivision = sp->geoShaderMode >= GSH_SUBDIVISION_LOOP; - GPUCMD_AddMaskedWrite(GPUREG_GEOSTAGE_CONFIG, 0x8, subdivision ? 
0x80000000 : 0); // Enable or disable subdivision - u32 gshMisc = 0; - if (subdivision) - gshMisc = 1; - else if (sp->geoShaderMode == GSH_PARTICLE) - gshMisc = 0x01004302; - GPUCMD_AddWrite(GPUREG_GSH_MISC0, gshMisc); - GPUCMD_AddWrite(GPUREG_GSH_MISC1, sp->geoShaderMode); - - if(!sp->geometryShader) + // Set up geometry shader (if present) + if (gshDvle) { - // finish setting up vertex shader alone - GPU_SetShaderOutmap((u32*)vshDvle->outmapData); - - GPUCMD_AddWrite(GPUREG_SH_OUTATTR_MODE, vshDvle->outmapMode); - GPUCMD_AddWrite(GPUREG_SH_OUTATTR_CLOCK, vshDvle->outmapClock); - }else{ - // setup both vertex and geometry shader - const DVLE_s* gshDvle = sp->geometryShader->dvle; - const DVLP_s* gshDvlp = gshDvle->dvlp; + // Set up geometry shader code blob (if necessary) if (sendGshCode) - { - GPU_SendShaderCode(gshDvle->type, gshDvlp->codeData, 0, gshDvlp->codeSize); - GPU_SendOperandDescriptors(gshDvle->type, gshDvlp->opcdescData, 0, gshDvlp->opdescSize); - } + shaderProgramUploadDvle(gshDvle); + + // Set up geometry shader entrypoint & outmap mask GPUCMD_AddWrite(GPUREG_GSH_ENTRYPOINT, 0x7FFF0000|(gshDvle->mainOffset&0xFFFF)); GPUCMD_AddWrite(GPUREG_GSH_OUTMAP_MASK, gshDvle->outmapMask); + } - GPU_SetShaderOutmap((u32*)gshDvle->outmapData); + // Merge vertex shader & geometry shader outmaps if requested + if (gshDvle && gshDvle->mergeOutmaps) + { + // Clear outmap + memset(outmapData, 0x1F, sizeof(outmapData)); + outmapData[0] = 0; - //GSH input attributes stuff - GPUCMD_AddWrite(GPUREG_GSH_INPUTBUFFER_CONFIG, 0x08000000|(sp->geoShaderInputStride-1)|(subdivision?0x100:0)); + // Merge outmaps + shaderProgramMergeOutmaps(outmapData, vshDvle->outmapData, gshDvle->outmapData); + outmapMode |= vshDvle->outmapMode; + outmapClock |= vshDvle->outmapClock; + } else + memcpy(outmapData, mainDvle->outmapData, sizeof(outmapData)); + + // Upload and configure outmap + GPU_SetShaderOutmap(outmapData); + GPUCMD_AddWrite(GPUREG_SH_OUTATTR_MODE, outmapMode); + 
GPUCMD_AddWrite(GPUREG_SH_OUTATTR_CLOCK, outmapClock); + + // Configure geostage + if (gshDvle) + { + // Input stride: use value if specified, otherwise use number of outputs in vertex shader + int stride = sp->geoShaderInputStride ? sp->geoShaderInputStride : vshDvle->outmapData[0]; + + // Enable or disable variable-size primitive processing + GPUCMD_AddMaskedWrite(GPUREG_GEOSTAGE_CONFIG, 0xA, gshDvle->gshMode == GSH_VARIABLE_PRIM ? 0x80000000 : 0); + + // Set up geoshader processing mode + u32 misc = gshDvle->gshMode; + if (misc == GSH_FIXED_PRIM) + misc |= 0x01000000 | ((u32)gshDvle->gshFixedVtxStart<<16) | ((stride-1)<<12) | ((u32)(gshDvle->gshFixedVtxNum-1)<<8); + GPUCMD_AddWrite(GPUREG_GSH_MISC0, misc); + + // Set up variable-size primitive mode parameters + GPUCMD_AddWrite(GPUREG_GSH_MISC1, gshDvle->gshMode == GSH_VARIABLE_PRIM ? (gshDvle->gshVariableVtxNum-1) : 0); + + // Set up geoshader input + GPUCMD_AddWrite(GPUREG_GSH_INPUTBUFFER_CONFIG, 0x08000000 | (gshDvle->gshMode ? 0x0100 : 0) | (stride-1)); + + // Set up geoshader permutation GPUCMD_AddIncrementalWrites(GPUREG_GSH_ATTRIBUTES_PERMUTATION_LOW, sp->geoShaderInputPermutation, 2); - - GPUCMD_AddWrite(GPUREG_SH_OUTATTR_MODE, gshDvle->outmapMode); - GPUCMD_AddWrite(GPUREG_SH_OUTATTR_CLOCK, gshDvle->outmapClock); + } else + { + // Defaults for when geostage is disabled + GPUCMD_AddMaskedWrite(GPUREG_GEOSTAGE_CONFIG, 0xA, 0); + GPUCMD_AddWrite(GPUREG_GSH_MISC0, 0); + GPUCMD_AddWrite(GPUREG_GSH_MISC1, 0); + GPUCMD_AddWrite(GPUREG_GSH_INPUTBUFFER_CONFIG, 0xA0000000); } return 0; @@ -271,12 +337,12 @@ Result shaderProgramUse(shaderProgram_s* sp) int i; // Set up uniforms - GPUCMD_AddWrite(GPUREG_VSH_BOOLUNIFORM, 0x7FFF0000|~sp->vertexShader->boolUniforms); + GPUCMD_AddWrite(GPUREG_VSH_BOOLUNIFORM, 0x7FFF0000|sp->vertexShader->boolUniforms); GPUCMD_AddIncrementalWrites(GPUREG_VSH_INTUNIFORM_I0, sp->vertexShader->intUniforms, 4); for(i=0; i<sp->vertexShader->numFloat24Uniforms; i++) 
GPUCMD_AddIncrementalWrites(GPUREG_VSH_FLOATUNIFORM_CONFIG, (u32*)&sp->vertexShader->float24Uniforms[i], 4); if (sp->geometryShader) { - GPUCMD_AddWrite(GPUREG_GSH_BOOLUNIFORM, 0x7FFF0000|~sp->geometryShader->boolUniforms); + GPUCMD_AddWrite(GPUREG_GSH_BOOLUNIFORM, 0x7FFF0000|sp->geometryShader->boolUniforms); GPUCMD_AddIncrementalWrites(GPUREG_GSH_INTUNIFORM_I0, sp->geometryShader->intUniforms, 4); for(i=0; i<sp->geometryShader->numFloat24Uniforms; i++) GPUCMD_AddIncrementalWrites(GPUREG_GSH_FLOATUNIFORM_CONFIG, (u32*)&sp->geometryShader->float24Uniforms[i], 4); } @@ -284,7 +350,7 @@ Result shaderProgramUse(shaderProgram_s* sp) return 0; } -void GPU_SetShaderOutmap(u32 outmapData[8]) +void GPU_SetShaderOutmap(const u32 outmapData[8]) { GPUCMD_AddMaskedWrite(GPUREG_PRIMITIVE_CONFIG, 0x1, outmapData[0]-1); GPUCMD_AddIncrementalWrites(GPUREG_SH_OUTMAP_TOTAL, outmapData, 8); diff --git a/libctru/source/gpu/shbin.c b/libctru/source/gpu/shbin.c index 5209c3c..7f60150 100644 --- a/libctru/source/gpu/shbin.c +++ b/libctru/source/gpu/shbin.c @@ -38,9 +38,18 @@ DVLB_s* DVLB_ParseFile(u32* shbinData, u32 shbinSize) dvle->dvlp=&ret->DVLP; dvle->type=(dvleData[1]>>16)&0xFF; + dvle->mergeOutmaps=(dvleData[1]>>24)&1; dvle->mainOffset=dvleData[2]; dvle->endmainOffset=dvleData[3]; + if(dvle->type==GEOMETRY_SHDR) + { + dvle->gshMode=dvleData[5]&0xFF; + dvle->gshFixedVtxStart=(dvleData[5]>>8)&0xFF; + dvle->gshVariableVtxNum=(dvleData[5]>>16)&0xFF; + dvle->gshFixedVtxNum=(dvleData[5]>>24)&0xFF; + } + dvle->constTableSize=dvleData[7]; dvle->constTableData=(DVLE_constEntry_s*)&dvleData[dvleData[6]/4]; @@ -89,80 +98,53 @@ s8 DVLE_GetUniformRegister(DVLE_s* dvle, const char* name) void DVLE_GenerateOutmap(DVLE_s* dvle) { - if(!dvle)return; + if (!dvle) return; + // Initialize outmap data memset(dvle->outmapData, 0x1F, sizeof(dvle->outmapData)); + dvle->outmapData[0] = 0; + dvle->outmapMask = 0; + dvle->outmapMode = 0; + dvle->outmapClock = 0; - int i; - u8 numAttr=0; - u8 maxAttr=0; - u8 
attrMask=0; - u32 attrMode=0; - u32 attrClock=0; - - for(i=0;i<dvle->outTableSize;i++) + int i, j, k; + for (i = 0; i < dvle->outTableSize; i ++) { - u32* out=&dvle->outmapData[dvle->outTableData[i].regID+1]; - u32 mask=0x00000000; - u8 tmpmask=dvle->outTableData[i].mask; - mask=(mask<<8)|((tmpmask&8)?0xFF:0x00);tmpmask<<=1; - mask=(mask<<8)|((tmpmask&8)?0xFF:0x00);tmpmask<<=1; - mask=(mask<<8)|((tmpmask&8)?0xFF:0x00);tmpmask<<=1; - mask=(mask<<8)|((tmpmask&8)?0xFF:0x00);tmpmask<<=1; + int type = dvle->outTableData[i].type; + int mask = dvle->outTableData[i].mask; + int regID = dvle->outTableData[i].regID; + u32* out = &dvle->outmapData[regID+1]; - if(*out==0x1F1F1F1F)numAttr++; - - u32 val=0x1F1F1F1F; - switch(dvle->outTableData[i].type) + if (!(dvle->outmapMask & BIT(regID))) { - case RESULT_POSITION: val=0x03020100; break; - case RESULT_NORMALQUAT: val=0x07060504; break; - case RESULT_COLOR: val=0x0B0A0908; break; - case RESULT_TEXCOORD0: val=0x1F1F0D0C; break; - case RESULT_TEXCOORD0W: val=0x10101010; break; - case RESULT_TEXCOORD1: val=0x1F1F0F0E; break; - case RESULT_TEXCOORD2: val=0x1F1F1716; break; - case RESULT_VIEW: val=0x1F141312; break; - } - *out=((*out)&~mask)|(val&mask); - - switch(dvle->outTableData[i].type) - { - case RESULT_POSITION: - if ((*out & 0xFF0000)==0x020000) - attrClock |= BIT(0); - break; - case RESULT_COLOR: - attrClock |= BIT(1); - break; - case RESULT_TEXCOORD0: - attrMode = 1; - attrClock |= BIT(8); - break; - case RESULT_TEXCOORD1: - attrMode = 1; - attrClock |= BIT(9); - break; - case RESULT_TEXCOORD2: - attrMode = 1; - attrClock |= BIT(10); - break; - case RESULT_TEXCOORD0W: - attrMode = 1; - attrClock |= BIT(16); - break; - case RESULT_NORMALQUAT: - case RESULT_VIEW: - attrClock |= BIT(24); - break; + dvle->outmapMask |= BIT(regID); + dvle->outmapData[0] ++; } - attrMask|=1<<dvle->outTableData[i].regID; - if(dvle->outTableData[i].regID+1>maxAttr)maxAttr=dvle->outTableData[i].regID+1; + int sem = 0x1F, num = 0; + switch (type) + { + case 
RESULT_POSITION: sem = 0x00; num = 4; break; + case RESULT_NORMALQUAT: sem = 0x04; num = 4; dvle->outmapClock |= BIT(24); break; + case RESULT_COLOR: sem = 0x08; num = 4; dvle->outmapClock |= BIT(1); break; + case RESULT_TEXCOORD0: sem = 0x0C; num = 2; dvle->outmapClock |= BIT(8); dvle->outmapMode = 1; break; + case RESULT_TEXCOORD0W: sem = 0x10; num = 1; dvle->outmapClock |= BIT(16); dvle->outmapMode = 1; break; + case RESULT_TEXCOORD1: sem = 0x0E; num = 2; dvle->outmapClock |= BIT(9); dvle->outmapMode = 1; break; + case RESULT_TEXCOORD2: sem = 0x16; num = 2; dvle->outmapClock |= BIT(10); dvle->outmapMode = 1; break; + case RESULT_VIEW: sem = 0x12; num = 3; dvle->outmapClock |= BIT(24); break; + default: continue; + } + + for (j = 0, k = 0; j < 4 && k < num; j ++) + { + if (mask & BIT(j)) + { + *out &= ~(0xFF << (j*8)); + *out |= (sem++) << (j*8); + k ++; + if (type==RESULT_POSITION && k==3) + dvle->outmapClock |= BIT(0); + } + } } - - dvle->outmapData[0]=numAttr; - dvle->outmapMask=attrMask; - dvle->outmapMode=attrMode; - dvle->outmapClock=attrClock; }