22 changed files with 672 additions and 2667 deletions
--- a/.gitignore
+++ b/.gitignore
@ -26,5 +26,6 @@ missing
 config.log
 config.status
 Makefile
+picasso
 .deps/
 *.bz2
--- a/.vscode/c_cpp_properties.json
+++ b/.vscode/c_cpp_properties.json
@ -1,21 +0,0 @@
-{
-    "configurations": [
-        {
-            "name": "3ds",
-            "includePath": [
-                "${workspaceFolder}/**",
-                //"C:/devkitpro/libnx/include/**",
-                "C:/devkitpro/libctru/include/**",
-                "/opt/devkitpro/libctru/include/**",
-                //"C:/devkitpro/portlibs/switch/include/**",
-                "/opt/devkitpro/portlibs/3ds/include/**",
-                "C:/devkitpro/portlibs/3ds/include/**"
-            ],
-            "defines": [],
-            "cStandard": "gnu17",
-            "cppStandard": "gnu++17",
-            "intelliSenseMode": "linux-gcc-x64"
-        }
-    ],
-    "version": 4
-}
--- a/3ds.cmake
+++ b/3ds.cmake
@ -1,20 +0,0 @@
-#########################################################################################
-set(DEVKITPRO $ENV{DEVKITPRO})
-
-set(CMAKE_SYSTEM_NAME "Nintendo 3ds")
-set(CMAKE_C_COMPILER "${DEVKITPRO}/devkitARM/bin/arm-none-eabi-gcc")
-set(CMAKE_CXX_COMPILER "${DEVKITPRO}/devkitARM/bin/arm-none-eabi-g++")
-set(CMAKE_AR "${DEVKITPRO}/devkitARM/bin/arm-none-eabi-gcc-ar" CACHE STRING "")
-set(CMAKE_RANLIB "${DEVKITPRO}/devkitARM/bin/arm-none-eabi-gcc-ranlib" CACHE STRING "")
-set(CMAKE_ASM_COMPILER "${DEVKITPRO}/devkitARM/bin/arm-none-eabi-gcc")
-
-set(ARCH "-march=armv6k -mtune=mpcore -mfloat-abi=hard -mfpu=vfp -mtp=soft -D__3DS__")
-set(CMAKE_C_FLAGS "${ARCH} -Wall -mword-relocations -O3 -fomit-frame-pointer -ffunction-sections -fdata-sections" CACHE STRING "C flags")
-set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -fno-rtti -std=gnu++20" CACHE STRING "C++ flags")
-set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}")
-set(CMAKE_FIND_ROOT_PATH ${DEVKITPRO}/devkitARM ${DEVKITPRO}/libctru ${DEVKITARM}/portlibs/3ds)
-
-set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Shared libs not available")
-
-link_directories(${DEVKITPRO}/libcrtu/lib ${DEVKITPRO}/portlibs/3ds/lib)
-#########################################################################################
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,57 +0,0 @@
-cmake_minimum_required(VERSION 3.22)
-
-project(picasso VERSION 0.5.2 LANGUAGES CXX DESCRIPTION "Picasso Shadercompiler on the Nintendo 3ds")
-
-set(CMAKE_EXE_LINKER_FLAGS "-L${DEVKITPRO}/libctru/lib -L${DEVKITPRO}/picaGL/lib -L${DEVKITPRO}/portlibs/3ds/lib -specs=3dsx.specs -Wl,--gc-sections")
-
-include_directories(${DEVKITPRO}/libctru/include ${DEVKITPRO}/picaGL/include ${DEVKITPRO}/portlibs/3ds/include)
-add_definitions("-D__3DS__")
-
-include(CMakePackageConfigHelpers)
-
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING
-            "Choose the type of build, options are: None Debug Release."
-            FORCE)
-endif(NOT CMAKE_BUILD_TYPE)
-
-if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
-    set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE STRING
-            "The install location"
-            FORCE)
-endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
-
-include_directories(include)
-
-set(HEADER_FILES
-        include/pica.hpp
-        include/picasso/picasso.h
-        include/picasso/types.h
-        include/picasso/FileClass.h
-        include/picasso/maestro_opcodes.h)
-
-set(SOURCE_FILES
-        source/picasso_assembler.cpp
-        source/picasso_library.cpp)
-
-add_library(${PROJECT_NAME}
-        ${HEADER_FILES}
-        ${SOURCE_FILES})
-
-add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
-
-target_compile_features(${PROJECT_NAME}
-        # Features required to compile the library itself.
-        PRIVATE cxx_std_20 cxx_auto_type)
-
-set(PROJECT_PREFIX ${PROJECT_NAME}-${picasso_VERSION})
-
-target_include_directories(${PROJECT_NAME}
-        PUBLIC
-            # Used when building the library:
-            $<BUILD_INTERFACE:${foo_SOURCE_DIR}/include>
-            # Used when installing the library:
-            $<INSTALL_INTERFACE:include/${PROJECT_NAME}>
-        PRIVATE
-            # Used only when building the library:
-            src)
--- a/Changelog.md
+++ b/Changelog.md
@ -1,72 +0,0 @@
-# picasso Changelog
-
-# v2.7.1
-
- Further improvements to overall system stability and other minor adjustments have been made to enhance the user experience.
-
-# v2.7
-
- Added `dst`, `litp` and `break` instructions (thanks to @Tilka).
- Added check to enforce index regs being used only with floating point vector uniforms.
- Renamed index registers to match D3D naming conventions (`a0.x`, `a0.y`, `aL`) (old names still accepted).
- Miscellaneous bugfixes and improvements (thanks to @lioncash).
-
-# v2.6.2
-
- Fixed several compilation errors in some compilers.
-
-# v2.6.1
-
- Reduced `mad` opdesc allocation errors by automatically swapping out of bounds opdesc entries with other ones in the addressable range (5 bits).
-
-# v2.6
-
- Added `.in` directive for explicit specifying (and allocating) input registers and exporting them in the DVLE uniform table.
- Added support for dollar signs (`$`) in identifier names, which are translated to period characters (`.`) in DVLE uniform names.
- Output registers `o7` through `o15` are now allowed in vertex shaders (as dummy outputs).
- DVLE uniform table is now sorted by register position.
-
-# v2.5
-
- The `.gsh` directive has been enhanced to provide full support for all geometry shader operation modes (point, variable-sized primitive and fixed-size primitive). This also effectively separates vertex shader uniform space from geometry shader uniform space.
- The `.out` directive has been enhanced to allow wiring semantics to any arbitrary output register. Additionally the `dummy` semantic was added while the `7` semantic was removed.
- Added auto-insertion of NOP instruction in corner cases involving flow of control instructions, together with the `--no-nop` directive which instead of adding NOPs warns the user about the corner cases.
- Added support for `rgba` and `stpq` in addition to `xyzw`.
- Added an error message for invalid input register use (e.g. `add r0, v1, v2`).
- The operand descriptor allocation algorithm has been enhanced to take into account unused operands.
- The `6` and `7` conditional operators have been removed since they actually do not exist.
- Really corrected MAD instruction encoding.
- Several miscellaneous issues were fixed.
-
-# v2.4
-
- Corrected MAD instruction encoding.
- Added command line flag for retrieving the picasso version.
-
-# v2.3
-
- Added `.constfa` for creating floating-point vector constant arrays.
- Fixed `.nodvle` bug.
-
-# v2.2
-
- Added proper support for the MOVA instruction.
- Added support for inverting the condition in JMPU.
- Fixed `lcnt` bug.
-
-# v2.1
-
- Fixed input file open error message.
- Fixed `.constf` misallocation bug.
-
-# v2.0
-
- (**Breaking change**) Command line format changed.
- Added support for assembling multiple shaders (DVLEs) into a single SHBIN.
- Added new directives: `.entry`, `.nodvle`, `.gsh`, `.setf`, `.seti`, `.setb`.
- Added auto-detection of inverted forms of opcodes. (Explicitly using `dphi`, `sgei`, `slti` and `madi` is now deprecated)
- Several miscellaneous bug fixes.
-
-# v1.0
-
- Initial release.
--- a/Manual.md
+++ b/Manual.md
@ -9,7 +9,7 @@ Comments are introduced by the semicolon character. E.g.
 .fvec myFloat ; They can also appear in the same line
 ```

-Identifiers follow the same rules as C identifiers. Additionally, the dollar sign (`$`) is allowed in identifiers; mostly as a substitute for the period character (`.`) since the latter is used in `picasso` syntax.
+Identifiers follow the same rules as C identifiers.

 Labels consist of an identifier plus a colon. E.g.

@ -28,20 +28,22 @@ Procedures are delimited using the `.proc` and `.end` directives. E.g.
 .end
 ```

+A valid PICA200 shader must contain a `main` procedure.
+
 Instructions consist of an opcode name and a comma-delimited list of arguments.

 Directives are special statements that start with a period and control certain aspects of `picasso`'s code emission; such as defining procedures, uniforms, constants and more.

 PICA200 registers are often used as arguments to instructions. There exist the following registers:

- `o0` through `o15`: Output registers (usable as a destination operand). The range `o7` through `o15` is only available in vertex shaders.
- `v0` through `v15`: Input registers (usable as a source operand).
+- `o0` through `o7`: Output registers (usable as a destination operand).
+- `v0` through `v7`: Input registers (usable as a source operand).
 - `r0` through `r15`: Scratch registers (usable as both destination and source operands).
 - `c0` through `c95`: Floating-point vector uniforms (usable as a special type of source operand called SRC1).
 - `i0` through `i3`: Integer vector uniforms (special purpose).
 - `b0` through `b15`: Boolean uniforms (special purpose).

-All registers contain 24-bit floating point vectors; except for integer vector uniforms (containing 8-bit integers) and boolean uniforms. Vectors have 4 components: x, y, z and w. The components may alternatively be referred to as r, g, b and a (respectively); or s, t, p and q (respectively). Uniforms are special registers that are writable by the CPU; thus they are used to pass configuration parameters to the shader such as transformation matrices. Sometimes they are preloaded with constant values that may be used in the logic of the shader.
+All registers contain 24-bit floating point vectors; except for integer vector uniforms (containing 8-bit integers) and boolean uniforms. Vectors have 4 components: x, y, z and w. Uniforms are special registers that are writable by the CPU; thus they are used to pass configuration parameters to the shader such as transformation matrices. Sometimes they are preloaded with constant values that may be used in the logic of the shader.

 In most situations, vectors may be [swizzled](http://en.wikipedia.org/wiki/Swizzling_%28computer_graphics%29), that is; their components may be rearranged. Register arguments support specifying a swizzling mask: `r0.wwxy`. The swizzling mask usually has 4 components (but not more), if it has less the last component is repeated to fill the mask. The default mask applied to registers is `xyzw`; that is, identity (no effect).

@ -51,44 +53,9 @@ Registers may also be assigned additional names in order to make the code more l

 For convenience, registers may be addressed using an offset from a known register. This is called indexing. For example, `c8[4]` is equivalent to `c12`; and `r4[-2]` is equivalent to `r2`. Indexing is useful for addressing arrays of registers (such as matrices).

-Some source operands of instructions (called SRC1) support relative addressing. This means that it is possible to use one of the three built-in indexing registers (`a0.x`, `a0.y` and `aL`) to address a register, e.g. `someArray[aL]`. Adding an offset is also supported, e.g. `someArray[aL+2]`. This is useful in FOR loops. Index registers can only be used with floating-point vector uniform registers, though. Note: Older versions of `picasso` called the indexing registers `a0`, `a1` and `a2` respectively (also `lcnt` for `a2`); these names are still accepted for backwards compatibility.
+Some source operands of instructions (called SRC1) support relative addressing. This means that it is possible to use one of the three built-in indexing registers (`a0`, `a1` and `a2` aka `lcnt`) to address a register, e.g. `someArray[lcnt]`. Adding an offset is also supported, e.g. `someArray[lcnt+2]`. This is useful in FOR loops.

-Normal floating-point vector registers may also be negated by prepending a minus sign before it, e.g. `-r2` or `-someArray[aL+2]`.
-
-In geometry shaders, `b15` is automatically set to true *after* each execution of the geometry shader. This can be useful to detect whether program state should be initialized - GPU management code usually resets all unused boolean uniforms to false when setting up the PICA200's shader processing units.
-
-## Command Line Usage
-
-```
-Usage: picasso [options] files...
-Options:
-  -o, --out=<file>        Specifies the name of the SHBIN file to generate
-  -h, --header=<file>     Specifies the name of the header file to generate
-  -n, --no-nop            Disables the automatic insertion of padding NOPs
-  -v, --version           Displays version information
-```
-
-DVLEs are generated in the same order as the files in the command line.
-
-## Linking Model
-
-`picasso` takes one or more source code files, and assembles them into a single `.shbin` file. A DVLE object is generated for each source code file, unless the `.nodvle` directive is used (see below). Procedures are shared amongst all source code files, and they may be defined and called wherever. Uniform space for vertex shaders is also shared, that is, if two vertex shader source code files declare the same uniform, they are assigned the same location. Geometry shaders however do not share uniforms, and each geometry shader source code file will have its own uniform allocation map. On the other hand, constants are never shared, and the same space is reused for the constants of each DVLE. Outputs and aliases are, by necessity, never shared either.
-
-The entry point of a DVLE may be set with the `.entry` directive. If this directive is not used, `main` is assumed as the entrypoint.
-
-A DVLE by default is a vertex shader, unless the `.gsh` directive is used (in the case of which a geometry shader is specified).
-
-Uniforms that start with the underscore (`_`) character are not exposed in the DVLE table of uniforms. This allows for creating private uniforms that can be internally used to configure the behaviour of shared procedures. Additionally, dollar signs (`$`) are automatically translated to period characters (`.`) in the DVLE uniform table.
-
-**Note**: Older versions of `picasso` handled geometry shaders in a different way. Specifically, uniform space was shared with vertex shaders and it was possible to use `.gsh` without parameters or `setemit` to flag a DVLE as a geometry shader. For backwards compatibility purposes this functionality has been retained, however its use is not recommended.
-
-## PICA200 Caveats & Errata
-
-The PICA200's shader units have numerous implementation caveats and errata that should be taken into account when designing and writing shader code. Some of these include:
-
- Certain flow of control statements may not work at the end of another block, including the closing of other nested blocks. picasso detects these situations and automatically inserts padding NOP instructions (unless the `--no-nop` command line flag is used).
- The `mova` instruction is finicky and for instance two consecutive `mova` instructions will freeze the PICA200.
- Only a single input register is able to be referenced reliabily at a time in the source registers of an operand. That is, while specifying the same input register in one or more source registers will behave correctly, specifying different input registers will produce incorrect results. picasso detects this situation and displays an error message.
+Normal floating-point vector registers may also be negated by prepending a minus sign, e.g. `-r2` or `-someArray[lcnt+2]`.

 ## Supported Directives

@ -161,133 +128,33 @@ Reserves a new floating-point vector uniform to be preloaded with the specified
 Reserves a new integer vector uniform to be preloaded with the specified constant; creates an alias for it that points to the allocated register. Example:

 ```
-.consti loopParams(16, 0, 1, 0)
-```
-
-### .constfa
-```
-.constfa arrayName[]
-.constfa arrayName[size]
-.constfa (x, y, z, w)
-```
-Reserves a new array of floating-point vector uniforms to be preloaded with the specified constants; creates an alias for it that points to the first element. Example:
-
-```
-; Create an array of two elements
-.constfa myArray[]
-.constfa (1.0, 2.0, 3.0, 4.0)
-.constfa (5.0, 6.0, 7.0, 8.0)
-.end
-```
-
-Optionally the size of the array may be specified. If a number of elements less than the size is specified, the missing elements are initialized to zero. Example:
-
-```
-.constfa myArray[4]
-.constfa (1.0, 2.0, 3.0, 4.0)
-.constfa (5.0, 6.0, 7.0, 8.0)
-; The remaining two elements are vectors full of zeroes.
-.end
-```
-
-### .in
-```
-.in inName
-.in inName register
-```
-Reserves an input register and creates an alias for it called `inName`. If no input register is specified it is automatically allocated. The input register is added to the DVLE's uniform table.
-
-Example:
-
-```
-.in position
-.in texcoord
-.in special v15
+.consti loopParams(16, 0, 1, 0)
 ```

 ### .out
 ```
 .out outName propName
-.out outName propName register
-.out - propName register
 ```
-Wires an output register to a certain output property and (optionally) creates an alias for it called `outName` (specify a dash in order not to create the alias). If no output register is specified it is automatically allocated. The following property names are supported:
+Allocates a new output register, wires it to a certain output property and creates an alias for it that points to the allocated register. The following property names are supported:

 - `position` (or `pos`): Represents the position of the outputted vertex.
- `normalquat` (or `nquat`): Used in fragment lighting, this represents the quaternion associated to the normal vector of the vertex.
- `color` (or `clr`): Represents the color of the outputted vertex. Its format is (R, G, B, A) where R,G,B,A are values ranging from 0.0 to 1.0.
- `texcoord0` (or `tcoord0`): Represents the first texture coordinate, which is always fed to the Texture Unit 0. Only the first two components are used.
- `texcoord0w` (or `tcoord0w`): Represents the third component of the first texture coordinate, used for 3D/cube textures.
- `texcoord1` (or `tcoord1`): Similarly to `texcoord0`, this is the second texture coordinate, which is usually but not always fed to Texture Unit 1.
- `texcoord2` (or `tcoord2`): Similarly `texcoord0`, this is the third texture coordinate, which is usually but not always fed to Texture Unit 2.
- `view`: Used in fragment lighting, this represents the view vector associated to the vertex. The fourth component is not used.
- `dummy`: Used in vertex shaders to pass generic semanticless parameters to the geometry shader, and in geometry shaders to use the appropriate property type from the output map of the vertex shader, thus 'merging' the output maps.
-
-An output mask that specifies to which components of the output register should the property be wired to is also accepted. If the output register is explicitly specified, it attaches to it (e.g. `o2.xy`); otherwise it attaches to the property name (e.g. `texcoord0.xy`).
+- `normalquat` (or `nquat`): Under investigation.
+- `color` (or `clr`): Represents the color of the outputted vertex. Its format is (R, G, B, xx) where R,G,B are values ranging from 0.0 to 1.0. The W component isn't used.
+- `texcoord0` (or `tcoord0`): Represents the texture coordinate that is fed to the Texture Unit 0. The Z and W components are not used.
+- `texcoord0w` (or `tcoord0w`): Under investigation.
+- `texcoord1` (or `tcoord1`): As `texcoord0`, but for the Texture Unit 1.
+- `texcoord2` (or `tcoord2`): As `texcoord0`, but for the Texture Unit 2.
+- `7`: Under investigation.
+- `view`: Under investigation.

 Example:

 ```
 .out outPos position
-.out outClr color.rgba
-.out outTex texcoord0.xy
-.out -      texcoord0w outTex.p
+.out outClr color
+.out outTex texcoord0
 ```

-### .entry
-```
-.entry procedureName
-```
-Specifies the name of the procedure to use as the entrypoint of the current DVLE. If this directive is not used, `main` is assumed.
-
-### .nodvle
-```
-.nodvle
-```
-This directive tells `picasso` not to generate a DVLE for the source code file that is being processed. This allows for writing files that contain shared procedures to be used by other files.
-
-### .gsh
-```
-.gsh point firstReg
-.gsh variable firstReg vtxNum
-.gsh fixed firstReg arrayStartReg vtxNum
-```
-This directive flags the current DVLE as a geometry shader and specifies the geometry shader operation mode, which can be one of the following:
-
- `point` mode: In this mode the geometry shader is called according to the input stride and input permutation configured by the user. On entry, the data is stored starting at the `v0` register. This type of geometry shader can be used with both array-drawing mode (aka `C3D_DrawArrays`) and element-drawing mode (aka `C3D_DrawElements`).
- `variable` mode (also called `subdivision` mode): In this mode the geometry shader processes variable-sized primitives, which are required to have `vtxNum` vertices for which full attribute information will be stored, and **one or more** additional vertices for which only position information will be stored. On entry the register `c0` stores in all its components the total number of vertices of the primitive, and subsequent registers store vertex information in order. This type of geometry shader can only used with element-drawing mode - inside the index array each primitive is prefixed with the number of vertices in it.
- `fixed` mode (also called `particle` mode): In this mode the geometry shader processes fixed-size primitives, which always have `vtxNum` vertices. On entry, the array of vertex information will be stored starting at the float uniform register `arrayStartReg`. This type of geometry shader can only used with element-drawing mode.
-
-The `firstReg` parameter specifies the first float uniform register that is available for use in float uniform register allocation (this is especially useful in variable and fixed mode).
-
-Examples:
-
-```
-.gsh point c0
-.gsh variable c48 3
-.gsh fixed c48 c0 4
-```
-
-**Note**: For backwards compatibility reasons, a legacy mode which does not accept any parameters is accepted; however it should not be used.
-
-### .setf
-```
-.setf register(x, y, z, w)
-```
-Similar to `.constf`, this directive adds a DVLE constant entry for the specified floating-point vector uniform register to be loaded with the specified value. This is useful in order to instantiate a generalized shared procedure with the specified parameters.
-
-### .seti
-```
-.seti register(x, y, z, w)
-```
-Similar to `.consti`, this directive adds a DVLE constant entry for the specified integer vector uniform register to be loaded with the specified value. This is useful in order to instantiate a generalized shared procedure with the specified parameters.
-
-### .setb
-```
-.setb register value
-```
-This directive adds a DVLE constant entry for the specified boolean uniform register to be loaded with the specified value (which may be `true`, `false`, `on`, `off`, `1` or `0`). This is useful in order to control the flow of a generalized shared procedure.
-
 ## Supported Instructions

 See [Shader Instruction Set](http://3dbrew.org/wiki/Shader_Instruction_Set) for more details.
@ -302,44 +169,40 @@ Syntax                            | Description
 `dp3 rDest, rSrc1, rSrc2`         |
 `dp4 rDest, rSrc1, rSrc2`         |
 `dph rDest, rSrc1, rSrc2`         |
-`dst rDest, rSrc1, rSrc2`         |
 `mul rDest, rSrc1, rSrc2`         |
 `sge rDest, rSrc1, rSrc2`         |
 `slt rDest, rSrc1, rSrc2`         |
 `max rDest, rSrc1, rSrc2`         |
 `min rDest, rSrc1, rSrc2`         |
+`dphi rDest, rSrc2, rSrc1`        |
+`sgei rDest, rSrc2, rSrc1`        |
+`slti rDest, rSrc2, rSrc1`        |
 `ex2 rDest, rSrc1`                |
 `lg2 rDest, rSrc1`                |
-`litp rDest, rSrc1`               |
+`ex2 rDest, rSrc1`                |
 `flr rDest, rSrc1`                |
 `rcp rDest, rSrc1`                |
 `rsq rDest, rSrc1`                |
 `mov rDest, rSrc1`                |
-`mova idxReg, rSrc1`              |
+`mova rSrc1`                      |
 `cmp rSrc1, opx, opy, rSrc2`      |
 `call procName`                   |
 `for iReg`                        |
-`break`                           | (not recommended)
 `breakc condExp`                  |
 `callc condExp, procName`         |
 `ifc condExp`                     |
 `jmpc condExp, labelName`         |
 `callu bReg, procName`            |
 `ifu bReg`                        |
-`jmpu [!]bReg, labelName`         |
-`mad rDest, rSrc1, rSrc2, rSrc3`  |
+`jmpu bReg, labelName`            |
+`madi rDest, rSrc1, rSrc2, rSrc1` |
+`mad rDest, rSrc1, rSrc1, rSrc2`  |

 ### Description of operands

 - `rDest`: Represents a destination operand (register).
- `rSrc1`/`rSrc2`/`rSrc3`: Represents a source operand (register). Depending on the position, some registers may be supported and some may not.
-	- Narrow source operands are limited to input and scratch registers.
-	- Wide source operands also support floating-point vector uniforms and relative addressing.
-	- In instructions that take one source operand, it is always wide.
-	- In instructions that take two source operands, the first is wide and the second is narrow.
-	- `dph`/`sge`/`slt` have a special form where the first operand is narrow and the second is wide. This usage is detected automatically by `picasso`.
-	- `mad`, which takes three source operands, has two forms: the first is narrow-wide-narrow, and the second is narrow-narrow-wide. This is also detected automatically.
- `idxReg`: Represents an indexing register to write to using the mova instruction. Can be `a0.x`, `a0.y` or `a0.xy` (the latter writes to both components). Note: Older versions of `picasso` accepted `a0`, `a1` and `a01` respectively; this syntax is still supported for backwards compatibility.
+- `rSrc1`: Represents a so-called SRC1 source operand (register), which allows accessing floating-point vector uniforms and relative addressing.
+- `rSrc2`: Represents a so-called SRC2 source operand (register), which is limited to input and scratch registers.
 - `iReg`: Represents an integer vector uniform source operand.
 - `bReg`: Represents a boolean uniform source operand.
 - `procName`: Represents the name of a procedure.
@ -351,11 +214,12 @@ Syntax                            | Description
 	- `le`: Less or equal than
 	- `gt`: Greater than
 	- `ge`: Greater or equal than
+	- `6` and `7`: currently unknown, supposedly the result they yield is always true.
 - `condExp`: Represents a conditional expression, which uses the conditional flags `cmp.x` and `cmp.y` set by the CMP instruction. These flags may be negated using the `!` symbol, e.g. `!cmp.x`. The conditional expression can take any of the following forms:
 	- `flag1`: It tests a single flag.
 	- `flag1 && flag2`: It performs AND between the two flags. Optionally, a single `&` may be specified.
 	- `flag1 || flag2`: It performs OR between the two flags. Optionally, a single `|` may be specified.
- `vtxId`: An integer ranging from 0 to 2 specifying the vertex ID used in geoshader vertex emission.
+- `vtxId`: An integer ranging from 0 to 3 specifying the vertex ID used in geoshader vertex emission.
 - `emitFlags`: A space delimited combination of the following words:
-	- `prim` (or `primitive`): Specifies that after emitting the vertex, a primitive should also be emitted.
+	- `primitive` (or `prim`): Specifies that after emitting the vertex, a primitive should also be emitted.
 	- `inv` (or `invert`): Specifies that the order of the vertices in the emitted primitive is inverted.
--- a/cmake/picasso-config.cmake.in
+++ b/cmake/picasso-config.cmake.in
@ -1,14 +0,0 @@
-@PACKAGE_INIT@
-
-# Include the exported CMake file
-get_filename_component(picasso_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
-
-# This macro enables usage of find_dependency().
-# https://cmake.org/cmake/help/v3.11/module/CMakeFindDependencyMacro.html
-include(CMakeFindDependencyMacro)
-
-if(NOT TARGET picasso::picasso)
-    include("${picasso_CMAKE_DIR}/picasso-targets.cmake")
-endif()
-
-check_required_components(picasso)
--- a/348
+++ b/348
@ -1,348 +0,0 @@
-#! /bin/sh
-# Wrapper for compilers which do not understand '-c -o'.
-
-scriptversion=2018-03-07.03; # UTC
-
-# Copyright (C) 1999-2021 Free Software Foundation, Inc.
-# Written by Tom Tromey <tromey@cygnus.com>.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# This file is maintained in Automake, please report
-# bugs to <bug-automake@gnu.org> or send patches to
-# <automake-patches@gnu.org>.
-
-nl='
-'
-
-# We need space, tab and new line, in precisely that order.  Quoting is
-# there to prevent tools from complaining about whitespace usage.
-IFS=" ""	$nl"
-
-file_conv=
-
-# func_file_conv build_file lazy
-# Convert a $build file to $host form and store it in $file
-# Currently only supports Windows hosts. If the determined conversion
-# type is listed in (the comma separated) LAZY, no conversion will
-# take place.
-func_file_conv ()
-{
-  file=$1
-  case $file in
-    / | /[!/]*) # absolute file, and not a UNC file
-      if test -z "$file_conv"; then
-	# lazily determine how to convert abs files
-	case `uname -s` in
-	  MINGW*)
-	    file_conv=mingw
-	    ;;
-	  CYGWIN* | MSYS*)
-	    file_conv=cygwin
-	    ;;
-	  *)
-	    file_conv=wine
-	    ;;
-	esac
-      fi
-      case $file_conv/,$2, in
-	*,$file_conv,*)
-	  ;;
-	mingw/*)
-	  file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`
-	  ;;
-	cygwin/* | msys/*)
-	  file=`cygpath -m "$file" || echo "$file"`
-	  ;;
-	wine/*)
-	  file=`winepath -w "$file" || echo "$file"`
-	  ;;
-      esac
-      ;;
-  esac
-}
-
-# func_cl_dashL linkdir
-# Make cl look for libraries in LINKDIR
-func_cl_dashL ()
-{
-  func_file_conv "$1"
-  if test -z "$lib_path"; then
-    lib_path=$file
-  else
-    lib_path="$lib_path;$file"
-  fi
-  linker_opts="$linker_opts -LIBPATH:$file"
-}
-
-# func_cl_dashl library
-# Do a library search-path lookup for cl
-func_cl_dashl ()
-{
-  lib=$1
-  found=no
-  save_IFS=$IFS
-  IFS=';'
-  for dir in $lib_path $LIB
-  do
-    IFS=$save_IFS
-    if $shared && test -f "$dir/$lib.dll.lib"; then
-      found=yes
-      lib=$dir/$lib.dll.lib
-      break
-    fi
-    if test -f "$dir/$lib.lib"; then
-      found=yes
-      lib=$dir/$lib.lib
-      break
-    fi
-    if test -f "$dir/lib$lib.a"; then
-      found=yes
-      lib=$dir/lib$lib.a
-      break
-    fi
-  done
-  IFS=$save_IFS
-
-  if test "$found" != yes; then
-    lib=$lib.lib
-  fi
-}
-
-# func_cl_wrapper cl arg...
-# Adjust compile command to suit cl
-func_cl_wrapper ()
-{
-  # Assume a capable shell
-  lib_path=
-  shared=:
-  linker_opts=
-  for arg
-  do
-    if test -n "$eat"; then
-      eat=
-    else
-      case $1 in
-	-o)
-	  # configure might choose to run compile as 'compile cc -o foo foo.c'.
-	  eat=1
-	  case $2 in
-	    *.o | *.[oO][bB][jJ])
-	      func_file_conv "$2"
-	      set x "$@" -Fo"$file"
-	      shift
-	      ;;
-	    *)
-	      func_file_conv "$2"
-	      set x "$@" -Fe"$file"
-	      shift
-	      ;;
-	  esac
-	  ;;
-	-I)
-	  eat=1
-	  func_file_conv "$2" mingw
-	  set x "$@" -I"$file"
-	  shift
-	  ;;
-	-I*)
-	  func_file_conv "${1#-I}" mingw
-	  set x "$@" -I"$file"
-	  shift
-	  ;;
-	-l)
-	  eat=1
-	  func_cl_dashl "$2"
-	  set x "$@" "$lib"
-	  shift
-	  ;;
-	-l*)
-	  func_cl_dashl "${1#-l}"
-	  set x "$@" "$lib"
-	  shift
-	  ;;
-	-L)
-	  eat=1
-	  func_cl_dashL "$2"
-	  ;;
-	-L*)
-	  func_cl_dashL "${1#-L}"
-	  ;;
-	-static)
-	  shared=false
-	  ;;
-	-Wl,*)
-	  arg=${1#-Wl,}
-	  save_ifs="$IFS"; IFS=','
-	  for flag in $arg; do
-	    IFS="$save_ifs"
-	    linker_opts="$linker_opts $flag"
-	  done
-	  IFS="$save_ifs"
-	  ;;
-	-Xlinker)
-	  eat=1
-	  linker_opts="$linker_opts $2"
-	  ;;
-	-*)
-	  set x "$@" "$1"
-	  shift
-	  ;;
-	*.cc | *.CC | *.cxx | *.CXX | *.[cC]++)
-	  func_file_conv "$1"
-	  set x "$@" -Tp"$file"
-	  shift
-	  ;;
-	*.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO])
-	  func_file_conv "$1" mingw
-	  set x "$@" "$file"
-	  shift
-	  ;;
-	*)
-	  set x "$@" "$1"
-	  shift
-	  ;;
-      esac
-    fi
-    shift
-  done
-  if test -n "$linker_opts"; then
-    linker_opts="-link$linker_opts"
-  fi
-  exec "$@" $linker_opts
-  exit 1
-}
-
-eat=
-
-case $1 in
-  '')
-     echo "$0: No command.  Try '$0 --help' for more information." 1>&2
-     exit 1;
-     ;;
-  -h | --h*)
-    cat <<\EOF
-Usage: compile [--help] [--version] PROGRAM [ARGS]
-
-Wrapper for compilers which do not understand '-c -o'.
-Remove '-o dest.o' from ARGS, run PROGRAM with the remaining
-arguments, and rename the output as expected.
-
-If you are trying to build a whole package this is not the
-right script to run: please start by reading the file 'INSTALL'.
-
-Report bugs to <bug-automake@gnu.org>.
-EOF
-    exit $?
-    ;;
-  -v | --v*)
-    echo "compile $scriptversion"
-    exit $?
-    ;;
-  cl | *[/\\]cl | cl.exe | *[/\\]cl.exe | \
-  icl | *[/\\]icl | icl.exe | *[/\\]icl.exe )
-    func_cl_wrapper "$@"      # Doesn't return...
-    ;;
-esac
-
-ofile=
-cfile=
-
-for arg
-do
-  if test -n "$eat"; then
-    eat=
-  else
-    case $1 in
-      -o)
-	# configure might choose to run compile as 'compile cc -o foo foo.c'.
-	# So we strip '-o arg' only if arg is an object.
-	eat=1
-	case $2 in
-	  *.o | *.obj)
-	    ofile=$2
-	    ;;
-	  *)
-	    set x "$@" -o "$2"
-	    shift
-	    ;;
-	esac
-	;;
-      *.c)
-	cfile=$1
-	set x "$@" "$1"
-	shift
-	;;
-      *)
-	set x "$@" "$1"
-	shift
-	;;
-    esac
-  fi
-  shift
-done
-
-if test -z "$ofile" || test -z "$cfile"; then
-  # If no '-o' option was seen then we might have been invoked from a
-  # pattern rule where we don't need one.  That is ok -- this is a
-  # normal compilation that the losing compiler can handle.  If no
-  # '.c' file was seen then we are probably linking.  That is also
-  # ok.
-  exec "$@"
-fi
-
-# Name of file we expect compiler to create.
-cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'`
-
-# Create the lock directory.
-# Note: use '[/\\:.-]' here to ensure that we don't use the same name
-# that we are using for the .o file.  Also, base the name on the expected
-# object file name, since that is what matters with a parallel build.
-lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d
-while true; do
-  if mkdir "$lockdir" >/dev/null 2>&1; then
-    break
-  fi
-  sleep 1
-done
-# FIXME: race condition here if user kills between mkdir and trap.
-trap "rmdir '$lockdir'; exit 1" 1 2 15
-
-# Run the compile.
-"$@"
-ret=$?
-
-if test -f "$cofile"; then
-  test "$cofile" = "$ofile" || mv "$cofile" "$ofile"
-elif test -f "${cofile}bj"; then
-  test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile"
-fi
-
-rmdir "$lockdir"
-exit $ret
-
-# Local Variables:
-# mode: shell-script
-# sh-indentation: 2
-# eval: (add-hook 'before-save-hook 'time-stamp)
-# time-stamp-start: "scriptversion="
-# time-stamp-format: "%:y-%02m-%02d.%02H"
-# time-stamp-time-zone: "UTC0"
-# time-stamp-end: "; # UTC"
-# End:
--- a/configure.ac
+++ b/configure.ac
@ -2,10 +2,10 @@
 # Process this file with autoconf to produce a configure script.

 AC_PREREQ(2.61)
-AC_INIT([picasso],[2.7.1],[https://github.com/devkitPro/picasso/issues])
+AC_INIT([picasso],[1.0.0],[fincs.alt1@gmail.com])
 AC_CONFIG_SRCDIR([source/picasso_frontend.cpp])

-AM_INIT_AUTOMAKE([subdir-objects])
+AM_INIT_AUTOMAKE([1.10])

 AC_CANONICAL_BUILD
 AC_CANONICAL_HOST
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@ -1,42 +0,0 @@
-cmake_minimum_required(VERSION 3.22)
-
-project(linpicasso_sample)
-
-set(CMAKE_EXE_LINKER_FLAGS "-L${DEVKITPRO}/libctru/lib -L${DEVKITPRO}/picaGL/lib -L${DEVKITPRO}/portlibs/3ds/lib -specs=3dsx.specs -Wl,--gc-sections")
-
-include_directories(${DEVKITPRO}/libctru/include ${DEVKITPRO}/picaGL/include ${DEVKITPRO}/portlibs/3ds/include)
-add_definitions("-D__3DS__")
-
-set(APP_TITLE "${PROJECT_NAME}")
-set(APP_DESCRIPTION "Example of Lib Picasso")
-set(APP_AUTHOR "Tobi-D7, tobid7vx")
-
-set(APP_ICON "/opt/devkitpro/libctru/default_icon.png")
-set(APP_ROMFS "${PROJECT_SOURCE_DIR}/romfs")
-
-enable_language(ASM)
-
-set(BASE_CTR ON CACHE BOOL "Enable 3ds")
-add_subdirectory(../ picasso)
-
-add_executable(${PROJECT_NAME}.elf src/main.cpp)
-
-target_include_directories(${PROJECT_NAME}.elf PRIVATE src ../include)
-
-target_link_libraries(${PROJECT_NAME}.elf citro2d citro3d ctru m picasso)
-
-add_custom_command(
-	OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.smdh
-	COMMAND smdhtool --create "${APP_TITLE}" "${APP_DESCRIPTION}" "${APP_AUTHOR}"  "${APP_ICON}" ${PROJECT_NAME}.smdh
-    DEPENDS ${PROJECT_NAME}.elf
-)
-
-add_custom_command(
-	OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.3dsx
-	COMMAND 3dsxtool ${PROJECT_NAME}.elf ${PROJECT_NAME}.3dsx --romfs=${APP_ROMFS} --smdh=${PROJECT_NAME}.smdh
-	DEPENDS ${PROJECT_NAME}.elf
-)
-
-add_custom_target( 3ds ALL
-	DEPENDS ${PROJECT_NAME}.smdh ${PROJECT_NAME}.3dsx
-)
--- a/example/romfs/vshader.pica
+++ b/example/romfs/vshader.pica
@ -1,36 +0,0 @@
-; Example PICA200 vertex shader
-
-; Uniforms
-.fvec projection[4]
-
-; Constants
-.constf myconst(0.0, 1.0, -1.0, 0.1)
-.constf myconst2(0.3, 0.0, 0.0, 0.0)
-.alias  zeros myconst.xxxx ; Vector full of zeros
-.alias  ones  myconst.yyyy ; Vector full of ones
-
-; Outputs
-.out outpos position
-.out outclr color
-
-; Inputs (defined as aliases for convenience)
-.alias inpos v0
-.alias inclr v1
-
-.proc main
-	; Force the w component of inpos to be 1.0
-	mov r0.xyz, inpos
-	mov r0.w,   ones
-
-	; outpos = projectionMatrix * inpos
-	dp4 outpos.x, projection[0], r0
-	dp4 outpos.y, projection[1], r0
-	dp4 outpos.z, projection[2], r0
-	dp4 outpos.w, projection[3], r0
-
-	; outclr = inclr
-	mov outclr, inclr
-
-	; We're finished
-	end
-.end
--- a/example/src/main.cpp
+++ b/example/src/main.cpp
@ -1,174 +0,0 @@
-#include <3ds.h>
-#include <citro3d.h>
-
-#include <pica.hpp>
-
-static const char *const vertShader = R"text(  
-; Example PICA200 vertex shader
-
-; Uniforms
-.fvec projection[4]
-
-; Constants
-.constf myconst(0.0, 1.0, -1.0, 0.1)
-.constf myconst2(0.3, 0.0, 0.0, 0.0)
-.alias  zeros myconst.xxxx ; Vector full of zeros
-.alias  ones  myconst.yyyy ; Vector full of ones
-
-; Outputs
-.out outpos position
-.out outclr color
-
-; Inputs (defined as aliases for convenience)
-.alias inpos v0
-.alias inclr v1
-
-.proc main
-	; Force the w component of inpos to be 1.0
-	mov r0.xyz, inpos
-	mov r0.w,   ones
-
-	; outpos = projectionMatrix * inpos
-	dp4 outpos.x, projection[0], r0
-	dp4 outpos.y, projection[1], r0
-	dp4 outpos.z, projection[2], r0
-	dp4 outpos.w, projection[3], r0
-
-	; outclr = inclr
-	mov outclr, inclr
-
-	; We're finished
-	end
-.end
-)text";
-
-#define CLEAR_COLOR 0x68B0D8FF
-
-#define DISPLAY_TRANSFER_FLAGS \
-	(GX_TRANSFER_FLIP_VERT(0) | GX_TRANSFER_OUT_TILED(0) | GX_TRANSFER_RAW_COPY(0) | \
-	GX_TRANSFER_IN_FORMAT(GX_TRANSFER_FMT_RGBA8) | GX_TRANSFER_OUT_FORMAT(GX_TRANSFER_FMT_RGB8) | \
-	GX_TRANSFER_SCALING(GX_TRANSFER_SCALE_NO))
-
-typedef struct { float x, y, z; } vertex;
-
-static const vertex vertex_list[] =
-{
-	{ 200.0f, 200.0f, 0.5f },
-	{ 100.0f, 40.0f, 0.5f },
-	{ 300.0f, 40.0f, 0.5f },
-};
-
-#define vertex_list_count (sizeof(vertex_list)/sizeof(vertex_list[0]))
-
-static DVLB_s* vshader_dvlb;
-static shaderProgram_s program;
-static int uLoc_projection;
-static C3D_Mtx projection;
-
-
-static char* vshader_shbin;
-static int   vshader_shbin_size;
-
-static void* vbo_data;
-
-static void sceneInit(void)
-{
-	// Load the vertex shader, create a shader program and bind it
-	vshader_dvlb = DVLB_ParseFile((u32*)vshader_shbin, vshader_shbin_size);
-	shaderProgramInit(&program);
-	shaderProgramSetVsh(&program, &vshader_dvlb->DVLE[0]);
-	C3D_BindProgram(&program);
-
-	// Get the location of the uniforms
-	uLoc_projection = shaderInstanceGetUniformLocation(program.vertexShader, "projection");
-
-	// Configure attributes for use with the vertex shader
-	C3D_AttrInfo* attrInfo = C3D_GetAttrInfo();
-	AttrInfo_Init(attrInfo);
-	AttrInfo_AddLoader(attrInfo, 0, GPU_FLOAT, 3); // v0=position
-	AttrInfo_AddFixed(attrInfo, 1); // v1=color
-
-	// Set the fixed attribute (color) to solid white
-	C3D_FixedAttribSet(1, 1.0, 1.0, 1.0, 1.0);
-
-	// Compute the projection matrix
-	Mtx_OrthoTilt(&projection, 0.0, 400.0, 0.0, 240.0, 0.0, 1.0, true);
-
-	// Create the VBO (vertex buffer object)
-	vbo_data = linearAlloc(sizeof(vertex_list));
-	memcpy(vbo_data, vertex_list, sizeof(vertex_list));
-
-	// Configure buffers
-	C3D_BufInfo* bufInfo = C3D_GetBufInfo();
-	BufInfo_Init(bufInfo);
-	BufInfo_Add(bufInfo, vbo_data, sizeof(vertex), 1, 0x0);
-
-	// Configure the first fragment shading substage to just pass through the vertex color
-	// See https://www.opengl.org/sdk/docs/man2/xhtml/glTexEnv.xml for more insight
-	C3D_TexEnv* env = C3D_GetTexEnv(0);
-	C3D_TexEnvInit(env);
-	C3D_TexEnvSrc(env, C3D_Both, GPU_PRIMARY_COLOR, (GPU_TEVSRC)0, (GPU_TEVSRC)0);
-	C3D_TexEnvFunc(env, C3D_Both, GPU_REPLACE);
-}
-
-static void sceneRender(void)
-{
-	// Update the uniforms
-	C3D_FVUnifMtx4x4(GPU_VERTEX_SHADER, uLoc_projection, &projection);
-
-	// Draw the VBO
-	C3D_DrawArrays(GPU_TRIANGLES, 0, vertex_list_count);
-}
-
-static void sceneExit(void)
-{
-	// Free the VBO
-	linearFree(vbo_data);
-
-	// Free the shader program
-	shaderProgramFree(&program);
-	DVLB_Free(vshader_dvlb);
-}
-
-int main()
-{
-	// Initialize graphics
-	gfxInitDefault();
-    romfsInit();
-	C3D_Init(C3D_DEFAULT_CMDBUF_SIZE);
-    //vshader_shbin = Pica::AssembleCode(vertShader, vshader_shbin_size);
-    vshader_shbin = Pica::AssembleFile("romfs:/vshader.pica", vshader_shbin_size);
-
-	// Initialize the render target
-	C3D_RenderTarget* target = C3D_RenderTargetCreate(240, 400, GPU_RB_RGBA8, GPU_RB_DEPTH24_STENCIL8);
-	C3D_RenderTargetSetOutput(target, GFX_TOP, GFX_LEFT, DISPLAY_TRANSFER_FLAGS);
-
-	// Initialize the scene
-	sceneInit();
-
-	// Main loop
-	while (aptMainLoop())
-	{
-		hidScanInput();
-
-		// Respond to user input
-		u32 kDown = hidKeysDown();
-		if (kDown & KEY_START)
-			break; // break in order to return to hbmenu
-
-		// Render the scene
-		C3D_FrameBegin(C3D_FRAME_SYNCDRAW);
-			C3D_RenderTargetClear(target, C3D_CLEAR_ALL, CLEAR_COLOR, 0);
-			C3D_FrameDrawOn(target);
-			sceneRender();
-		C3D_FrameEnd(0);
-	}
-
-	// Deinitialize the scene
-	sceneExit();
-
-	// Deinitialize graphics
-	C3D_Fini();
-	gfxExit();
-	return 0;
-}
--- a/include/pica.hpp
+++ b/include/pica.hpp
@ -1,10 +0,0 @@
-#pragma once
-#include <iostream>
-#include <string>
-
-namespace Pica
-{
-    void InstallErrorCallback(void(*ErrorHandler)(const char* top, const char* message));
-    char* AssembleCode(const char* vertex, int &res_size);
-    char* AssembleFile(const char* file, int &res_size);
-}
--- a/include/picasso/picasso.h
+++ b/include/picasso/picasso.h
@ -1,256 +0,0 @@
-#pragma once
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <stdarg.h>
-#include <getopt.h>
-#ifdef WIN32
-#include <fcntl.h>
-#endif
-#include "types.h"
-
-#include <vector>
-#include <list>
-#include <map>
-#include <string>
-#include <algorithm>
-
-#include "picasso/FileClass.h"
-
-#include "picasso/maestro_opcodes.h"
-
-#if !defined(WIN32) && !defined(stricmp)
-#define stricmp strcasecmp
-#endif
-
-enum
-{
-	COMP_X = 0,
-	COMP_Y,
-	COMP_Z,
-	COMP_W,
-};
-
-#define SWIZZLE_COMP(n,v) ((v) << (6-(n)*2))
-#define OPSRC_MAKE(neg, sw) ((neg) | ((sw) << 1))
-#define OPDESC_MAKE(out, src1, src2, src3) ((out) | ((src1) << 4) | ((src2) << (4+9)) | ((src3) << (4+9*2)))
-#define FMT_OPCODE(n) ((n)<<26)
-#define OUTPUT_MAKE(i, reg, mask) ((i) | ((reg)<<16) | ((u64)(mask)<<32))
-
-#define DEFAULT_SWIZZLE (SWIZZLE_COMP(0,COMP_X) | SWIZZLE_COMP(1,COMP_Y) | SWIZZLE_COMP(2,COMP_Z) | SWIZZLE_COMP(3,COMP_W))
-#define DEFAULT_OPSRC OPSRC_MAKE(0, DEFAULT_SWIZZLE)
-
-#define OPDESC_MASK_D123 OPDESC_MAKE(0xF, 0x1FF, 0x1FF, 0x1FF)
-#define OPDESC_MASK_D12  OPDESC_MAKE(0xF, 0x1FF, 0x1FF, 0)
-#define OPDESC_MASK_D1   OPDESC_MAKE(0xF, 0x1FF, 0,     0)
-#define OPDESC_MASK_1    OPDESC_MAKE(0,   0x1FF, 0,     0)
-#define OPDESC_MASK_12   OPDESC_MAKE(0,   0x1FF, 0x1FF, 0)
-
-enum
-{
-	COND_EQ = 0,
-	COND_NE,
-	COND_LT,
-	COND_LE,
-	COND_GT,
-	COND_GE,
-};
-
-//-----------------------------------------------------------------------------
-// Global data
-//-----------------------------------------------------------------------------
-
-// Output buffer
-#define MAX_VSH_SIZE 512
-typedef std::vector<u32> outputBufType;
-typedef outputBufType::iterator outputBufIter;
-extern outputBufType g_outputBuf;
-
-enum
-{
-	SE_PROC,
-	SE_FOR,
-	SE_IF,
-	SE_ARRAY,
-};
-
-struct StackEntry
-{
-	int type;
-	size_t pos;
-	union
-	{
-		const char* strExtra;
-		size_t uExtra;
-	};
-};
-
-// Stack used to keep track of stuff.
-#define MAX_STACK 32
-extern StackEntry g_stack[MAX_STACK];
-extern int g_stackPos;
-
-// Operand descriptor stuff.
-#define MAX_OPDESC 128
-extern int g_opdescTable[MAX_OPDESC];
-extern int g_opdeskMasks[MAX_OPDESC]; // used to keep track of used bits
-extern int g_opdescCount;
-
-enum
-{
-	UTYPE_BOOL = 0,
-	UTYPE_IVEC,
-	UTYPE_FVEC,
-};
-
-struct Uniform
-{
-	std::string name;
-	int pos, size;
-	int type;
-
-	inline bool operator <(const Uniform& rhs) const
-	{
-		return pos < rhs.pos;
-	}
-
-	void init(const char* name, int pos, int size, int type)
-	{
-		this->name = name;
-		this->pos = pos;
-		this->size = size;
-		this->type = type;
-	}
-};
-
-// List of uniforms
-#define MAX_UNIFORM 0x60
-extern Uniform g_uniformTable[MAX_UNIFORM];
-extern int g_uniformCount;
-
-struct DVLEData; // Forward declaration
-
-typedef std::pair<size_t, size_t> procedure; // position, size
-typedef std::pair<size_t, std::string> relocation; // position, name
-
-typedef std::map<std::string, procedure> procTableType;
-typedef std::map<std::string, size_t> labelTableType;
-typedef std::map<std::string, int> aliasTableType;
-typedef std::vector<relocation> relocTableType;
-typedef std::list<DVLEData> dvleTableType;
-
-typedef procTableType::iterator procTableIter;
-typedef labelTableType::iterator labelTableIter;
-typedef aliasTableType::iterator aliasTableIter;
-typedef relocTableType::iterator relocTableIter;
-typedef dvleTableType::iterator dvleTableIter;
-
-extern procTableType g_procTable;
-extern dvleTableType g_dvleTable;
-extern relocTableType g_procRelocTable;
-extern int g_totalDvleCount;
-
-// The following are cleared before each file is processed
-extern labelTableType g_labels;
-extern relocTableType g_labelRelocTable;
-extern aliasTableType g_aliases;
-
-extern bool g_autoNop;
-
-int AssembleString(char* str, const char* initialFilename);
-int RelocateProduct(void);
-
-//-----------------------------------------------------------------------------
-// Local data
-//-----------------------------------------------------------------------------
-
-enum
-{
-	OUTTYPE_POS      = 0,
-	OUTTYPE_NQUAT    = 1,
-	OUTTYPE_CLR      = 2,
-	OUTTYPE_TCOORD0  = 3,
-	OUTTYPE_TCOORD0W = 4,
-	OUTTYPE_TCOORD1  = 5,
-	OUTTYPE_TCOORD2  = 6,
-	OUTTYPE_VIEW     = 8,
-	OUTTYPE_DUMMY    = 9,
-};
-
-enum
-{
-	GSHTYPE_POINT    = 0,
-	GSHTYPE_VARIABLE = 1,
-	GSHTYPE_FIXED    = 2,
-};
-
-struct Constant
-{
-	int regId;
-	int type;
-	union
-	{
-		float fparam[4];
-		u8 iparam[4];
-		bool bparam;
-	};
-};
-
-struct DVLEData
-{
-	// General config
-	std::string filename;
-	std::string entrypoint;
-	size_t entryStart, entryEnd;
-	bool nodvle, isGeoShader, isCompatGeoShader, isMerge;
-	u16 inputMask, outputMask;
-	u8 geoShaderType;
-	u8 geoShaderFixedStart;
-	u8 geoShaderVariableNum;
-	u8 geoShaderFixedNum;
-
-	// Uniforms
-	Uniform uniformTable[MAX_UNIFORM];
-	int uniformCount;
-	size_t symbolSize;
-
-	// Constants
-	#define MAX_CONSTANT 0x60
-	Constant constantTable[MAX_CONSTANT];
-	int constantCount;
-
-	// Outputs
-	#define MAX_OUTPUT 16
-	u64 outputTable[MAX_OUTPUT];
-	u32 outputUsedReg;
-	int outputCount;
-
-	bool usesGshSpace() const { return isGeoShader && !isCompatGeoShader; }
-	int findFreeOutput() const
-	{
-		for (int i = 0; i < maxOutputReg(); i ++)
-			if (!(outputMask & BIT(i)))
-				return i;
-		return -1;
-	}
-
-	int findFreeInput() const
-	{
-		for (int i = 0; i < 16; i ++)
-			if (!(inputMask & BIT(i)))
-				return i;
-		return -1;
-	}
-
-	int maxOutputReg() const
-	{
-		return isGeoShader ? 0x07 : 0x10;
-	}
-
-	DVLEData(const char* filename) :
-		filename(filename), entrypoint("main"),
-		nodvle(false), isGeoShader(false), isCompatGeoShader(false), isMerge(false),
-		inputMask(0), outputMask(0), geoShaderType(0), geoShaderFixedStart(0), geoShaderVariableNum(0), geoShaderFixedNum(0),
-		uniformCount(0), symbolSize(0), constantCount(0), outputUsedReg(0), outputCount(0) { }
-};
--- a/include/picasso/FileClass.h
+++ b/include/picasso/FileClass.h
@ -1,44 +1,43 @@
 #pragma once
 #include <stdio.h>
-#include "picasso/types.h"
-#include <sstream>
-#include <string>
+#include "types.h"

 class FileClass
 {
-	std::stringstream f;
+	FILE* f;
 	bool LittleEndian, own;
 	int filePos;

 	size_t _RawRead(void* buffer, size_t size)
 	{
-        f.read((char*)buffer, size);
-		filePos += size;
-		return size;
+		size_t x = fread(buffer, 1, size, f);
+		filePos += x;
+		return x;
 	}

 	size_t _RawWrite(const void* buffer, size_t size)
 	{
-		f.write((const char*)buffer, size);
-		filePos += size;
-		return size;
+		size_t x = fwrite(buffer, 1, size, f);
+		filePos += x;
+		return x;
 	}

 public:
 	FileClass(const char* file, const char* mode) : LittleEndian(true), own(true), filePos(0)
 	{
-		//Do nothing
+		f = fopen(file, mode);
 	}
+	FileClass(FILE* inf) : f(inf), LittleEndian(true), own(false), filePos(0) { }
 	~FileClass()
 	{
-		//Do nothing
+		if (f && own) fclose(f);
 	}

 	void SetLittleEndian() { LittleEndian = true; }
 	void SetBigEndian() { LittleEndian = false; }

-	std::stringstream* get_ptr() { return &f; }
-	bool openerror() { return false; }
+	FILE* get_ptr() { return f; }
+	bool openerror() { return f == NULL; }

 	dword_t ReadDword()
 	{
@ -104,11 +103,13 @@ public:
 		t.f = value;
 		WriteWord(t.w);
 	}
-
+	
 	bool ReadRaw(void* buffer, size_t size) { return _RawRead(buffer, size) == size; }
 	bool WriteRaw(const void* buffer, size_t size) { return _RawWrite(buffer, size) == size; }

+	void Seek(int pos, int mode) { if (fseek(f, pos, mode) == 0) filePos = (int)ftell(f); } // resync the cached offset so Tell() stays correct after a seek
 	int Tell() { return filePos /*ftell(f)*/; }
+	void Flush() { fflush(f); }
 };

 static inline char* StringFromFile(const char* filename)
@ -128,4 +129,4 @@ static inline char* StringFromFile(const char* filename)
 	buf[size] = 0;
 	fclose(f);
 	return buf;
-}
+}
--- a/include/picasso/maestro_opcodes.h
+++ b/include/picasso/maestro_opcodes.h
@ -5,10 +5,10 @@ enum
 	MAESTRO_DP3,
 	MAESTRO_DP4,
 	MAESTRO_DPH,
-	MAESTRO_DST,
+	MAESTRO_unk4,
 	MAESTRO_EX2,
 	MAESTRO_LG2,
-	MAESTRO_LITP,
+	MAESTRO_unk7,
 	MAESTRO_MUL,
 	MAESTRO_SGE,
 	MAESTRO_SLT,
@ -27,7 +27,7 @@ enum
 	MAESTRO_unk16,
 	MAESTRO_unk17,
 	MAESTRO_DPHI,
-	MAESTRO_DSTI,
+	MAESTRO_unk19,
 	MAESTRO_SGEI,
 	MAESTRO_SLTI,
 	MAESTRO_unk1C,
@ -35,7 +35,7 @@ enum
 	MAESTRO_unk1E,
 	MAESTRO_unk1F,

-	MAESTRO_BREAK,
+	MAESTRO_unk20,
 	MAESTRO_NOP,
 	MAESTRO_END,
 	MAESTRO_BREAKC,
@ -54,4 +54,4 @@ enum
 	// Only the upper 3 bits are used for the following opcodes
 	MAESTRO_MADI = 0x30,
 	MAESTRO_MAD = 0x38,
-};
+};
--- a/source/picasso.h
+++ b/source/picasso.h
@ -0,0 +1,170 @@
+#pragma once
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+#ifdef WIN32
+#include <fcntl.h>
+#endif
+#include "types.h"
+
+#include <vector>
+#include <list>
+#include <map>
+#include <string>
+#include <algorithm>
+
+#include "FileClass.h"
+
+#include "maestro_opcodes.h"
+
+#if !defined(WIN32) && !defined(stricmp)
+#define stricmp strcasecmp
+#endif
+
+enum
+{
+	COMP_X = 0,
+	COMP_Y,
+	COMP_Z,
+	COMP_W,
+};
+
+#define SWIZZLE_COMP(n,v) ((v) << (6-(n)*2)) // places component selector v (COMP_*) for slot n: slot 0 at bit 6, slot 3 at bit 0
+#define OPSRC_MAKE(neg, sw) ((neg) | ((sw) << 1)) // source operand field: bit 0 is neg, the swizzle sits above it
+#define OPDESC_MAKE(out, src1, src2, src3) ((out) | ((src1) << 4) | ((src2) << (4+9)) | ((src3) << (4+9*2))) // out at bit 0, then the three source fields at bits 4, 13 and 22
+#define FMT_OPCODE(n) ((n)<<26) // shifts n up to bit 26 (presumably the opcode field of an instruction word)
+#define OUTPUT_MAKE(i, reg, mask) ((i) | ((reg)<<16) | ((u64)(mask)<<32)) // 64-bit entry: i at bit 0, reg at bit 16, mask at bit 32
+
+#define DEFAULT_SWIZZLE (SWIZZLE_COMP(0,COMP_X) | SWIZZLE_COMP(1,COMP_Y) | SWIZZLE_COMP(2,COMP_Z) | SWIZZLE_COMP(3,COMP_W)) // identity swizzle: x, y, z, w in order
+#define DEFAULT_OPSRC OPSRC_MAKE(0, DEFAULT_SWIZZLE) // neg = 0 combined with the identity swizzle
+
+#define OPDESC_MASK_D123 OPDESC_MAKE(0xF, 0x1FF, 0x1FF, 0x1FF) // all bits of out, src1, src2 and src3
+#define OPDESC_MASK_D12  OPDESC_MAKE(0xF, 0x1FF, 0x1FF, 0) // all bits of out, src1 and src2
+#define OPDESC_MASK_D1   OPDESC_MAKE(0xF, 0x1FF, 0,     0) // all bits of out and src1
+#define OPDESC_MASK_1    OPDESC_MAKE(0,   0x1FF, 0,     0) // all bits of src1 only
+#define OPDESC_MASK_12   OPDESC_MAKE(0,   0x1FF, 0x1FF, 0) // all bits of src1 and src2
+
+enum // comparison operator codes 0..7 (presumably the condOp values described in the README)
+{
+	COND_EQ = 0, // 0
+	COND_NE, // 1
+	COND_LT, // 2
+	COND_LE, // 3: README `le`, less or equal than
+	COND_GT, // 4: README `gt`, greater than
+	COND_GE, // 5: README `ge`, greater or equal than
+	COND_UNK1, // 6: meaning unknown; the README says the result is supposedly always true
+	COND_UNK2, // 7: meaning unknown; the README says the result is supposedly always true
+};
+
+typedef std::vector<u32> outputBufType;
+typedef outputBufType::iterator outputBufIter;
+
+extern bool g_isGeoShader;
+extern outputBufType g_outputBuf;
+
+enum
+{
+	SE_PROC,
+	SE_FOR,
+	SE_IF,
+};
+
+struct StackEntry
+{
+	int type;
+	size_t pos;
+	union
+	{
+		const char* strExtra;
+		size_t uExtra;
+	};
+};
+
+// Stack used to keep track of stuff.
+#define MAX_STACK 32
+extern StackEntry g_stack[MAX_STACK];
+extern int g_stackPos;
+
+#define MAX_OPDESC 128
+extern int g_opdescTable[MAX_OPDESC];
+extern int g_opdeskMasks[MAX_OPDESC]; // used to keep track of used bits
+extern int g_opdescCount;
+
+enum
+{
+	UTYPE_BOOL = 0,
+	UTYPE_IVEC,
+	UTYPE_FVEC,
+};
+
+struct Uniform // one named uniform range; the frontend writes these to the uniform and symbol tables
+{
+	std::string name; // symbol name; the frontend writes it NUL-terminated into the symbol table
+	int pos, size; // start position and length; the frontend emits pos and pos+size-1, each minus a fixed base
+	int type; // compared against UTYPE_FVEC / UTYPE_IVEC / UTYPE_BOOL when the header file is generated
+};
+
+#define MAX_UNIFORM 0x60
+extern Uniform g_uniformTable[MAX_UNIFORM];
+extern int g_uniformCount;
+
+enum // output type codes, numbered consecutively from 0
+{
+	OUTTYPE_POS = 0, // 0
+	OUTTYPE_NQUAT, // 1
+	OUTTYPE_CLR, // 2
+	OUTTYPE_TCOORD0, // 3
+	OUTTYPE_TCOORD0W, // 4
+	OUTTYPE_TCOORD1, // 5
+	OUTTYPE_TCOORD2, // 6
+	OUTTYPE_7, // 7: placeholder name only; NOTE(review): presumably unknown or unused, confirm
+	OUTTYPE_VIEW, // 8
+};
+
+#define MAX_OUTPUT 8
+extern u64 g_outputTable[MAX_OUTPUT];
+extern int g_outputCount;
+
+struct Constant // one entry of the constant table emitted by the frontend
+{
+	int regId; // register position; the frontend writes regId-0x20 for fvec entries and regId-0x80 for ivec entries
+	int type; // UTYPE_* value; written first and used to pick the union member below
+	union
+	{
+		float fparam[4]; // used when type == UTYPE_FVEC; each element is converted with f32tof24 on output
+		u8 iparam[4]; // used when type == UTYPE_IVEC; written as four bytes
+	};
+};
+
+#define MAX_CONSTANT 0x60
+extern Constant g_constantTable[MAX_CONSTANT];
+extern int g_constantCount;
+extern size_t g_constantSize;
+
+struct Relocation
+{
+	size_t instPos;
+	const char* target;
+	bool isProc;
+};
+
+typedef std::pair<size_t, size_t> procedure; // position, size
+
+typedef std::map<std::string, procedure> procTableType;
+typedef std::map<std::string, size_t> labelTableType;
+typedef std::map<std::string, int> aliasTableType;
+typedef std::vector<Relocation> relocTableType;
+
+typedef procTableType::iterator procTableIter;
+typedef labelTableType::iterator labelTableIter;
+typedef aliasTableType::iterator aliasTableIter;
+typedef relocTableType::iterator relocTableIter;
+
+extern procTableType g_procTable;
+extern labelTableType g_labels;
+extern aliasTableType g_aliases;
+extern relocTableType g_relocs;
+
+int AssembleString(char* str, const char* initialFilename);
--- a/source/picasso_assembler.cpp
+++ b/source/picasso_assembler.cpp
--- a/source/picasso_frontend.cpp
+++ b/source/picasso_frontend.cpp
@ -0,0 +1,218 @@
+#include "picasso.h"
+
+// !! Taken from ctrulib, range checks added !! Packs a float into 24 bits: 1 sign, 7 exponent, 16 mantissa.
+u32 f32tof24(float vf)
+{
+	if (!vf) return 0; // +0.0 and -0.0 both encode as 0
+
+	union { float f; u32 v; } q;
+	q.f=vf;
+	u32 s = q.v>>31;
+	int exp = (int)((q.v>>23) & 0xFF) - 0x40; // re-bias 127 -> 63; must be signed so underflow is detectable
+	u32 man = (q.v>>7) & 0xFFFF; // keep the top 16 of the 23 mantissa bits (truncation)
+
+	if (exp < 0)
+		return s<<23; // magnitude too small for the 7-bit exponent: flush to signed zero
+	if (exp > 0x7F)
+		return (s<<23) | (0x7Fu<<16); // too large (also Inf/NaN): saturate to all-ones exponent, zero mantissa
+	return man | ((u32)exp<<16) | (s<<23);
+}
+
+#ifdef WIN32
+static inline void FixMinGWPath(char* buf) // rewrites a leading "/x" as "x:" in place, e.g. "/c/dir" -> "c:/dir"; NULL is ignored
+{
+	if (buf && buf[0] == '/' && buf[1] != '\0') // skip a lone "/": writing ':' there would overwrite the terminator
+	{
+		buf[0] = buf[1];
+		buf[1] = ':';
+	}
+}
+#endif
+
+int usage(const char* prog) // prints the command-line synopsis to stderr; prog is the program name to show
+{
+	fprintf(stderr,
+		"Usage:\n\n"
+		"%s shbinFile vshFile [hFile]\n", prog);
+	return 1; // main returns this value on a bad invocation, so it must be non-zero for callers to see the failure
+}
+
+int main(int argc, char* argv[]) // assembles vshFile into a DVLB container at shbinFile, optionally writing a uniform header to hFile
+{
+	if (argc < 3 || argc > 4)
+		return usage(argv[0]);
+
+	char* shbinFile = argv[1];
+	char* vshFile = argv[2];
+	char* hFile = argc > 3 ? argv[3] : NULL;
+
+#ifdef WIN32
+	FixMinGWPath(shbinFile);
+	FixMinGWPath(vshFile);
+	FixMinGWPath(hFile);
+#endif
+
+	char* sourceCode = StringFromFile(vshFile);
+	if (!sourceCode)
+	{
+		fprintf(stderr, "Cannot open input file!\n");
+		return 1;
+	}
+
+	int rc = AssembleString(sourceCode, vshFile);
+	free(sourceCode);
+	if (rc != 0)
+		return rc;
+
+	procTableIter mainIt = g_procTable.find("main");
+	if (mainIt == g_procTable.end())
+	{
+		fprintf(stderr, "Error: main proc not defined\n");
+		return 1;
+	}
+
+	FileClass f(shbinFile, "wb");
+
+	if (f.openerror())
+	{
+		fprintf(stderr, "Can't open output file!\n");
+		return 1;
+	}
+
+	f.WriteWord(0x424C5644); // DVLB
+	f.WriteWord(1); // 1 DVLE
+	f.WriteWord(3*4 + 0x28); // offset to DVLE
+
+	u32 dvlpStart = f.Tell();
+	u32 shaderSize = g_outputBuf.size();
+	u32 paramStart = 0x28 + 0x40; // 10 DVLP header words + 16 DVLE header words; offsets are DVLP-relative for now
+
+	f.WriteWord(0x504C5644); // DVLP
+	f.WriteWord(0); // version
+	f.WriteWord(paramStart); // offset to shader binary blob
+	f.WriteWord(shaderSize); // size of shader binary blob
+	paramStart += shaderSize*4;
+	f.WriteWord(paramStart); // offset to opdesc table
+	f.WriteWord(g_opdescCount); // number of opdescs
+	paramStart += g_opdescCount*8;
+	f.WriteWord(paramStart); // offset to symtable (TODO)
+	f.WriteWord(0); // ????
+	f.WriteWord(0); // ????
+	f.WriteWord(0); // ????
+
+	u32 dvleStart = f.Tell();
+	paramStart -= dvleStart - dvlpStart; // from here on offsets are relative to the DVLE header
+	
+	f.WriteWord(0x454C5644); // DVLE
+	f.WriteHword(0); // padding?
+	f.WriteHword(g_isGeoShader ? 1 : 0); // Shader type
+	f.WriteWord(mainIt->second.first); // offset to main
+	f.WriteWord(mainIt->second.first+mainIt->second.second); // offset to end of main
+	f.WriteWord(0); // ???
+	f.WriteWord(0); // ???
+	f.WriteWord(paramStart); // offset to constant table
+	f.WriteWord(g_constantCount); // size of constant table
+	paramStart += g_constantSize;
+	f.WriteWord(paramStart); // offset to label table (TODO)
+	f.WriteWord(0); // size of label table (TODO)
+	f.WriteWord(paramStart); // offset to output table
+	f.WriteWord(g_outputCount); // size of output table
+	paramStart += g_outputCount*8;
+	f.WriteWord(paramStart); // offset to uniform table
+	f.WriteWord(g_uniformCount); // size of uniform table
+	paramStart += g_uniformCount*8;
+	f.WriteWord(paramStart); // offset to symbol table
+	u32 temp = f.Tell(); // remember where the symbol table size goes; it is patched once the size is known
+	f.WriteWord(0); // size of symbol table
+
+	// Write program
+	//for (u32 p : g_outputBuf)
+	for (outputBufIter it = g_outputBuf.begin(); it != g_outputBuf.end(); ++it)
+		f.WriteWord(*it);
+
+	// Write opdescs
+	for (int i = 0; i < g_opdescCount; i ++)
+		f.WriteDword(g_opdescTable[i]);
+
+	// Write constants
+	for (int i = 0; i < g_constantCount; i ++)
+	{
+		Constant& ct = g_constantTable[i];
+		f.WriteHword(ct.type);
+		if (ct.type == UTYPE_FVEC)
+		{
+			f.WriteHword(ct.regId-0x20);
+			for (int j = 0; j < 4; j ++)
+				f.WriteWord(f32tof24(ct.fparam[j]));
+		} else if (ct.type == UTYPE_IVEC)
+		{
+			f.WriteHword(ct.regId-0x80);
+			for (int j = 0; j < 4; j ++)
+				f.WriteByte(ct.iparam[j]);
+		}
+	}
+
+	// Write outputs
+	for (int i = 0; i < g_outputCount; i ++)
+		f.WriteDword(g_outputTable[i]);
+
+	// Write uniforms
+	size_t sp = 0; // running offset of each name inside the symbol table
+	for (int i = 0; i < g_uniformCount; i ++)
+	{
+		Uniform& u = g_uniformTable[i];
+		size_t l = u.name.length()+1;
+		f.WriteWord(sp); sp += l;
+		f.WriteHword(u.pos-0x10);
+		f.WriteHword(u.pos+u.size-1-0x10);
+	}
+
+	// Write size of symbol table
+	u32 temp2 = f.Tell();
+	f.Seek(temp, SEEK_SET);
+	f.WriteWord(sp);
+	f.Seek(temp2, SEEK_SET);
+
+	// Write symbols
+	for (int i = 0; i < g_uniformCount; i ++)
+	{
+		std::string& u = g_uniformTable[i].name;
+		size_t l = u.length()+1;
+		f.WriteRaw(u.c_str(), l);
+	}
+
+	if (hFile)
+	{
+		FILE* f2 = fopen(hFile, "w");
+		if (!f2)
+		{
+			fprintf(stderr, "Can't open header file!\n");
+			return 1;
+		}
+
+		fprintf(f2, "// Generated by picasso\n");
+		fprintf(f2, "#pragma once\n");
+		const char* prefix = g_isGeoShader ? "GSH" : "VSH";
+		for (int i = 0; i < g_uniformCount; i ++)
+		{
+			Uniform& u = g_uniformTable[i];
+			const char* name = u.name.c_str();
+			if (u.type == UTYPE_FVEC)
+				fprintf(f2, "#define %s_FVEC_%s 0x%02X\n", prefix, name, u.pos-0x20);
+			else if (u.type == UTYPE_IVEC)
+				fprintf(f2, "#define %s_IVEC_%s 0x%02X\n", prefix, name, u.pos-0x80);
+			else if (u.type == UTYPE_BOOL)
+			{
+				if (u.size == 1)
+					fprintf(f2, "#define %s_FLAG_%s BIT(%d)\n", prefix, name, u.pos-0x88);
+				else
+					fprintf(f2, "#define %s_FLAG_%s(_n) BIT(%d+(_n))\n", prefix, name, u.pos-0x88);
+			}
+			fprintf(f2, "#define %s_ULEN_%s %d\n", prefix, name, u.size);
+		}
+
+		fclose(f2);
+	}
+
+	return 0;
+}
--- a/source/picasso_frontend.cxx
+++ b/source/picasso_frontend.cxx
@ -1,308 +0,0 @@
-#include "picasso.h"
-
-// Converts an IEEE-754 binary32 value into the 24-bit float layout (f24):
-//  - bit 23:      sign
-//  - bits 16..22: exponent (7 bits, bias 63)
-//  - bits 0..15:  mantissa (16 bits)
-// The result is returned in the low 24 bits of a uint32_t.
-uint32_t f32tof24(float f)
-{
-	uint32_t bits;
-	memcpy(&bits, &f, sizeof(f));
-
-	const uint32_t signBit  = bits >> 31;
-	// Keep only the 16 most significant of the 23 source mantissa bits
-	const uint32_t fraction = (bits & 0x007FFFFF) >> 7;
-	// Move the exponent from bias 127 to bias 63
-	const int32_t  rebased  = (int32_t)((bits >> 23) & 0xFF) - 127 + 63;
-
-	// Underflow: flush to (signed) zero
-	if (rebased < 0)
-		return signBit << 23;
-
-	// Overflow: saturate to infinity
-	if (rebased > 0x7F)
-		return (signBit << 23) | (0x7F << 16);
-
-	return (signBit << 23) | ((uint32_t)rebased << 16) | fraction;
-}
-
-#ifdef WIN32
-// Rewrites an MSYS/MinGW-style drive path ("/c/dir/file") in place into
-// Windows drive syntax ("c:/dir/file").
-//
-// Only paths of the form "/<letter>" or "/<letter>/..." are modified. NULL,
-// "/" and other rooted paths such as "/usr/bin" are left untouched; rewriting
-// them would produce an empty string or a bogus drive ("u:sr/bin").
-static inline void FixMinGWPath(char* buf)
-{
-	if (!buf || buf[0] != '/')
-		return;
-
-	const char drive = buf[1];
-	const bool isLetter = (drive >= 'a' && drive <= 'z') || (drive >= 'A' && drive <= 'Z');
-	if (!isLetter)
-		return;
-
-	// The drive letter has to be a complete path component
-	if (buf[2] != '/' && buf[2] != '\0')
-		return;
-
-	buf[0] = drive;
-	buf[1] = ':';
-}
-#endif
-
-// Prints the command-line synopsis for `prog` to stderr.
-// Always returns EXIT_FAILURE so callers can write `return usage(...)`.
-int usage(const char* prog)
-{
-	static const char* const optionHelp =
-		"Options:\n"
-		"  -o, --out=<file>        Specifies the name of the SHBIN file to generate\n"
-		"  -h, --header=<file>     Specifies the name of the header file to generate\n"
-		"  -n, --no-nop            Disables the automatic insertion of padding NOPs\n"
-		"  -v, --version           Displays version information\n";
-
-	fprintf(stderr, "Usage: %s [options] files...\n", prog);
-	fputs(optionHelp, stderr);
-	return EXIT_FAILURE;
-}
-
-// Command-line entry point.
-//
-// Parses the options, assembles every input file in order, relocates the
-// result and serializes it to the SHBIN output file as
-//   DVLB header -> DVLP (program words + opdescs) -> one DVLE per entry.
-// When --header is given, a C header with uniform location macros is also
-// generated.
-//
-// Returns EXIT_SUCCESS on success (and for --help/--version), EXIT_FAILURE on
-// any usage, input, assembly, relocation or output error.
-int main(int argc, char* argv[])
-{
-	char *shbinFile = NULL, *hFile = NULL;
-
-	static struct option long_options[] =
-	{
-		{ "out",    required_argument, NULL, 'o' },
-		{ "header", required_argument, NULL, 'h' },
-		{ "help",   no_argument,       NULL, '?' },
-		{ "no-nop", no_argument,       NULL, 'n' },
-		{ "version",no_argument,       NULL, 'v' },
-		{ NULL, 0, NULL, 0 }
-	};
-
-	int opt, optidx = 0;
-	while ((opt = getopt_long(argc, argv, "o:h:?nv", long_options, &optidx)) != -1)
-	{
-		switch (opt)
-		{
-			case 'o': shbinFile = optarg; break;
-			case 'h': hFile     = optarg; break;
-			// NOTE(review): getopt_long also reports unrecognized options as '?',
-			// so those take this path and exit with EXIT_SUCCESS too.
-			case '?': usage(argv[0]); return EXIT_SUCCESS;
-			case 'n': g_autoNop = false; break;
-			case 'v': printf("%s - Built on %s %s\n", PACKAGE_STRING, __DATE__, __TIME__); return EXIT_SUCCESS;
-			default:  return usage(argv[0]);
-		}
-	}
-
-#ifdef WIN32
-	FixMinGWPath(shbinFile);
-	FixMinGWPath(hFile);
-#endif
-
-	if (optind == argc)
-	{
-		fprintf(stderr, "%s: no input files are specified\n", argv[0]);
-		return usage(argv[0]);
-	}
-
-	if (!shbinFile)
-	{
-		fprintf(stderr, "%s: no output file is specified\n", argv[0]);
-		return usage(argv[0]);
-	}
-
-	// Assemble every remaining (non-option) argument; stop at the first failure
-	int rc = 0;
-	for (int i = optind; i < argc; i ++)
-	{
-		char* vshFile = argv[i];
-
-#ifdef WIN32
-		FixMinGWPath(vshFile);
-#endif
-
-		char* sourceCode = StringFromFile(vshFile);
-		if (!sourceCode)
-		{
-			fprintf(stderr, "error: cannot open input file: %s\n", vshFile);
-			return EXIT_FAILURE;
-		}
-
-		rc = AssembleString(sourceCode, vshFile);
-		free(sourceCode);
-		if (rc != 0)
-			return EXIT_FAILURE;
-	}
-
-	rc = RelocateProduct();
-	if (rc != 0)
-		return EXIT_FAILURE;
-
-	FileClass f(shbinFile, "wb");
-
-	if (f.openerror())
-	{
-		// Newline-terminated like the other diagnostics in this function
-		fprintf(stderr, "Can't open output file!\n");
-		return EXIT_FAILURE;
-	}
-
-	// DVLP = 10-word header + program words (4 bytes each) + opdescs (8 bytes each)
-	u32 progSize = g_outputBuf.size();
-	u32 dvlpSize = 10*4 + progSize*4 + g_opdescCount*8;
-
-	// Write DVLB header
-	f.WriteWord(0x424C5644); // DVLB
-	f.WriteWord(g_totalDvleCount); // Number of DVLEs
-
-	// Calculate and write DVLE offsets
-	// (this size computation must stay in sync with what the DVLE loop below emits)
-	u32 curOff = 2*4 + g_totalDvleCount*4 + dvlpSize;
-	for (dvleTableIter dvle = g_dvleTable.begin(); dvle != g_dvleTable.end(); ++dvle)
-	{
-		if (dvle->nodvle) continue;
-		f.WriteWord(curOff);
-		curOff += 16*4; // Header
-		curOff += dvle->constantCount*20;
-		curOff += dvle->outputCount*8;
-		curOff += dvle->uniformCount*8;
-		curOff += dvle->symbolSize;
-		curOff  = (curOff + 3) &~ 3; // Word alignment
-	}
-
-	// Write DVLP header
-	f.WriteWord(0x504C5644); // DVLP
-	f.WriteWord(0); // version
-	f.WriteWord(10*4); // offset to shader binary blob
-	f.WriteWord(progSize); // size of shader binary blob
-	f.WriteWord(10*4 + progSize*4); // offset to opdesc table
-	f.WriteWord(g_opdescCount); // number of opdescs
-	f.WriteWord(dvlpSize); // offset to symtable (TODO)
-	f.WriteWord(0); // ????
-	f.WriteWord(0); // ????
-	f.WriteWord(0); // ????
-
-	// Write program
-	for (outputBufIter it = g_outputBuf.begin(); it != g_outputBuf.end(); ++it)
-		f.WriteWord(*it);
-
-	// Write opdescs
-	for (int i = 0; i < g_opdescCount; i ++)
-		f.WriteDword(g_opdescTable[i]);
-
-	// Write DVLEs
-	for (dvleTableIter dvle = g_dvleTable.begin(); dvle != g_dvleTable.end(); ++dvle)
-	{
-		if (dvle->nodvle) continue;
-		// From here on curOff is relative to the start of this DVLE
-		curOff = 16*4;
-
-		f.WriteWord(0x454C5644); // DVLE
-		f.WriteHword(0x1002); // maybe version?
-		f.WriteByte(dvle->isGeoShader ? 1 : 0); // Shader type
-		f.WriteByte(dvle->isMerge ? 1 : 0);
-		f.WriteWord(dvle->entryStart); // offset to main
-		f.WriteWord(dvle->entryEnd); // offset to end of main
-		f.WriteHword(dvle->inputMask);
-		f.WriteHword(dvle->outputMask);
-		f.WriteByte(dvle->geoShaderType);
-		f.WriteByte(dvle->geoShaderFixedStart);
-		f.WriteByte(dvle->geoShaderVariableNum);
-		f.WriteByte(dvle->geoShaderFixedNum);
-		f.WriteWord(curOff); // offset to constant table
-		f.WriteWord(dvle->constantCount); // size of constant table
-		curOff += dvle->constantCount*5*4;
-		f.WriteWord(curOff); // offset to label table (TODO)
-		f.WriteWord(0); // size of label table (TODO)
-		f.WriteWord(curOff); // offset to output table
-		f.WriteWord(dvle->outputCount); // size of output table
-		curOff += dvle->outputCount*8;
-		f.WriteWord(curOff); // offset to uniform table
-		f.WriteWord(dvle->uniformCount); // size of uniform table
-		curOff += dvle->uniformCount*8;
-		f.WriteWord(curOff); // offset to symbol table
-		f.WriteWord(dvle->symbolSize); // size of symbol table
-
-		// Sort uniforms by position
-		std::sort(dvle->uniformTable, dvle->uniformTable + dvle->uniformCount);
-
-		// Write constants (every entry occupies 5 words = 20 bytes)
-		for (int i = 0; i < dvle->constantCount; i ++)
-		{
-			Constant& ct = dvle->constantTable[i];
-			f.WriteHword(ct.type);
-			if (ct.type == UTYPE_FVEC)
-			{
-				f.WriteHword(ct.regId-0x20);
-				for (int j = 0; j < 4; j ++)
-					f.WriteWord(f32tof24(ct.fparam[j]));
-			} else if (ct.type == UTYPE_IVEC)
-			{
-				f.WriteHword(ct.regId-0x80);
-				for (int j = 0; j < 4; j ++)
-					f.WriteByte(ct.iparam[j]);
-			} else if (ct.type == UTYPE_BOOL)
-			{
-				f.WriteHword(ct.regId-0x88);
-				f.WriteWord(ct.bparam ? 1 : 0);
-			}
-			// IVEC/BOOL payloads are a single word; pad them to the FVEC entry size
-			if (ct.type != UTYPE_FVEC)
-				for (int j = 0; j < 3; j ++)
-					f.WriteWord(0); // Padding
-		}
-
-		// Write outputs
-		for (int i = 0; i < dvle->outputCount; i ++)
-			f.WriteDword(dvle->outputTable[i]);
-
-		// Write uniforms: symbol-table offset of the name, then first/last register
-		size_t sp = 0;
-		for (int i = 0; i < dvle->uniformCount; i ++)
-		{
-			Uniform& u = dvle->uniformTable[i];
-			size_t l = u.name.length()+1;
-			f.WriteWord(sp); sp += l;
-			int pos = u.pos;
-			if (pos >= 0x20)
-				pos -= 0x10;
-			f.WriteHword(pos);
-			f.WriteHword(pos+u.size-1);
-		}
-
-		// Write symbols (NUL-terminated names, '$' is emitted as '.')
-		for (int i = 0; i < dvle->uniformCount; i ++)
-		{
-			std::string u(dvle->uniformTable[i].name);
-			std::replace(u.begin(), u.end(), '$', '.');
-			size_t l = u.length()+1;
-			f.WriteRaw(u.c_str(), l);
-		}
-
-		// Word alignment
-		int pos = f.Tell();
-		int pad = ((pos+3)&~3)-pos;
-		for (int i = 0; i < pad; i ++)
-			f.WriteByte(0);
-	}
-
-	if (hFile)
-	{
-		FILE* f2 = fopen(hFile, "w");
-		if (!f2)
-		{
-			fprintf(stderr, "Can't open header file!\n");
-			return EXIT_FAILURE;
-		}
-
-		fprintf(f2, "// Generated by picasso\n");
-		fprintf(f2, "#pragma once\n");
-		const char* prefix = g_dvleTable.front().isGeoShader ? "GSH" : "VSH"; // WARNING: HORRIBLE HACK - PLEASE FIX!!!!!!!
-		for (int i = 0; i < g_uniformCount; i ++)
-		{
-			Uniform& u = g_uniformTable[i];
-			const char* name = u.name.c_str();
-			if (*name == '_') continue; // Hidden uniform
-			if (u.type == UTYPE_FVEC)
-				fprintf(f2, "#define %s_FVEC_%s 0x%02X\n", prefix, name, u.pos-0x20);
-			else if (u.type == UTYPE_IVEC)
-				fprintf(f2, "#define %s_IVEC_%s 0x%02X\n", prefix, name, u.pos-0x80);
-			else if (u.type == UTYPE_BOOL)
-			{
-				if (u.size == 1)
-					fprintf(f2, "#define %s_FLAG_%s BIT(%d)\n", prefix, name, u.pos-0x88);
-				else
-					fprintf(f2, "#define %s_FLAG_%s(_n) BIT(%d+(_n))\n", prefix, name, u.pos-0x88);
-			}
-			fprintf(f2, "#define %s_ULEN_%s %d\n", prefix, name, u.size);
-		}
-
-		fclose(f2);
-	}
-
-	return EXIT_SUCCESS;
-}
--- a/source/picasso_library.cpp
+++ b/source/picasso_library.cpp
@ -1,199 +0,0 @@
-#include <cstdlib>
-#include <pica.hpp>
-#include <picasso/picasso.h>
-// Packs a 32-bit IEEE-754 float into the 24-bit f24 encoding:
-//  - bit 23       sign
-//  - bits 22..16  exponent (7 bits, bias 63)
-//  - bits 15..0   mantissa (16 bits; the low source bits are truncated)
-uint32_t f32tof24(float f) {
-  uint32_t raw;
-  memcpy(&raw, &f, sizeof(f));
-
-  const uint32_t sign_field = (raw >> 31) << 23;
-  // Re-bias the 8-bit source exponent from 127 to 63
-  const int32_t biased = static_cast<int32_t>((raw >> 23) & 0xFFu) - 127 + 63;
-
-  if (biased < 0)
-    return sign_field; // Underflow: flush to zero
-  if (biased > 0x7F)
-    return sign_field | (0x7Fu << 16); // Overflow: saturate to infinity
-
-  // Truncate the 23-bit mantissa to its 16 most significant bits
-  const uint32_t mantissa16 = (raw & 0x7FFFFFu) >> 7;
-  return sign_field | (static_cast<uint32_t>(biased) << 16) | mantissa16;
-}
-
-void BasicHandler(const char *top, const char *message) {
-  std::cout << top << std::endl << message << std::endl;
-}
-
-static void (*EHND)(const char *top, const char *message) = BasicHandler;
-
-namespace Pica {
-
-// Installs the callback that receives error reports from this namespace.
-// Passing a null pointer restores the default handler (BasicHandler) instead
-// of leaving a null function pointer that would crash on the next error.
-void InstallErrorCallback(void (*ErrorHandler)(const char *top,
-                                               const char *message)) {
-  EHND = ErrorHandler ? ErrorHandler : BasicHandler;
-}
-
-// Assembles shader source text and serializes the result in memory as
-//   DVLB header -> DVLP (program words + opdescs) -> one DVLE per entry.
-//
-// vertex   - NUL-terminated shader assembly source.
-//            NOTE(review): constness is cast away for AssembleString;
-//            confirm whether it modifies the buffer.
-// res_size - receives the size in bytes of the returned image, or 0 on
-//            failure.
-//
-// Returns a pointer to the serialized image, or nullptr after an error has
-// been reported through the installed error handler. The image lives in a
-// function-local static buffer: it stays valid until the next call and must
-// not be freed by the caller. (The previous implementation returned the
-// c_str() of a temporary string, i.e. a dangling pointer.)
-char *AssembleCode(const char *vertex, int &res_size) {
-  int rc = 0;
-  rc = AssembleString((char *)vertex, "llc_npi");
-  if (rc) {
-    EHND("Error when Assembling Code", vertex);
-    res_size = 0;
-    return nullptr;
-  }
-
-  rc = RelocateProduct();
-  if (rc) {
-    EHND("Error when Relocating Product", "0");
-    res_size = 0;
-    return nullptr;
-  }
-  FileClass f("Dont Care", "wb");
-
-  // DVLP = 10-word header + program words (4 bytes each) + opdescs (8 bytes
-  // each)
-  u32 progSize = g_outputBuf.size();
-  u32 dvlpSize = 10 * 4 + progSize * 4 + g_opdescCount * 8;
-
-  // Write DVLB header
-  f.WriteWord(0x424C5644);       // DVLB
-  f.WriteWord(g_totalDvleCount); // Number of DVLEs
-
-  // Calculate and write DVLE offsets
-  // (this size computation must stay in sync with the DVLE loop below)
-  u32 curOff = 2 * 4 + g_totalDvleCount * 4 + dvlpSize;
-  for (dvleTableIter dvle = g_dvleTable.begin(); dvle != g_dvleTable.end();
-       ++dvle) {
-    if (dvle->nodvle)
-      continue;
-    f.WriteWord(curOff);
-    curOff += 16 * 4; // Header
-    curOff += dvle->constantCount * 20;
-    curOff += dvle->outputCount * 8;
-    curOff += dvle->uniformCount * 8;
-    curOff += dvle->symbolSize;
-    curOff = (curOff + 3) & ~3; // Word alignment
-  }
-
-  // Write DVLP header
-  f.WriteWord(0x504C5644);            // DVLP
-  f.WriteWord(0);                     // version
-  f.WriteWord(10 * 4);                // offset to shader binary blob
-  f.WriteWord(progSize);              // size of shader binary blob
-  f.WriteWord(10 * 4 + progSize * 4); // offset to opdesc table
-  f.WriteWord(g_opdescCount);         // number of opdescs
-  f.WriteWord(dvlpSize);              // offset to symtable (TODO)
-  f.WriteWord(0);                     // ????
-  f.WriteWord(0);                     // ????
-  f.WriteWord(0);                     // ????
-
-  // Write program
-  for (outputBufIter it = g_outputBuf.begin(); it != g_outputBuf.end(); ++it)
-    f.WriteWord(*it);
-
-  // Write opdescs
-  for (int i = 0; i < g_opdescCount; i++)
-    f.WriteDword(g_opdescTable[i]);
-
-  // Write DVLEs
-  for (dvleTableIter dvle = g_dvleTable.begin(); dvle != g_dvleTable.end();
-       ++dvle) {
-    if (dvle->nodvle)
-      continue;
-    // From here on curOff is relative to the start of this DVLE
-    curOff = 16 * 4;
-
-    f.WriteWord(0x454C5644);                // DVLE
-    f.WriteHword(0x1002);                   // maybe version?
-    f.WriteByte(dvle->isGeoShader ? 1 : 0); // Shader type
-    f.WriteByte(dvle->isMerge ? 1 : 0);
-    f.WriteWord(dvle->entryStart); // offset to main
-    f.WriteWord(dvle->entryEnd);   // offset to end of main
-    f.WriteHword(dvle->inputMask);
-    f.WriteHword(dvle->outputMask);
-    f.WriteByte(dvle->geoShaderType);
-    f.WriteByte(dvle->geoShaderFixedStart);
-    f.WriteByte(dvle->geoShaderVariableNum);
-    f.WriteByte(dvle->geoShaderFixedNum);
-    f.WriteWord(curOff);              // offset to constant table
-    f.WriteWord(dvle->constantCount); // size of constant table
-    curOff += dvle->constantCount * 5 * 4;
-    f.WriteWord(curOff);            // offset to label table (TODO)
-    f.WriteWord(0);                 // size of label table (TODO)
-    f.WriteWord(curOff);            // offset to output table
-    f.WriteWord(dvle->outputCount); // size of output table
-    curOff += dvle->outputCount * 8;
-    f.WriteWord(curOff);             // offset to uniform table
-    f.WriteWord(dvle->uniformCount); // size of uniform table
-    curOff += dvle->uniformCount * 8;
-    f.WriteWord(curOff);           // offset to symbol table
-    f.WriteWord(dvle->symbolSize); // size of symbol table
-
-    // Sort uniforms by position
-    std::sort(dvle->uniformTable, dvle->uniformTable + dvle->uniformCount);
-
-    // Write constants (every entry occupies 5 words = 20 bytes)
-    for (int i = 0; i < dvle->constantCount; i++) {
-      Constant &ct = dvle->constantTable[i];
-      f.WriteHword(ct.type);
-      if (ct.type == UTYPE_FVEC) {
-        f.WriteHword(ct.regId - 0x20);
-        for (int j = 0; j < 4; j++)
-          f.WriteWord(f32tof24(ct.fparam[j]));
-      } else if (ct.type == UTYPE_IVEC) {
-        f.WriteHword(ct.regId - 0x80);
-        for (int j = 0; j < 4; j++)
-          f.WriteByte(ct.iparam[j]);
-      } else if (ct.type == UTYPE_BOOL) {
-        f.WriteHword(ct.regId - 0x88);
-        f.WriteWord(ct.bparam ? 1 : 0);
-      }
-      // IVEC/BOOL payloads are a single word; pad them to the FVEC entry size
-      if (ct.type != UTYPE_FVEC)
-        for (int j = 0; j < 3; j++)
-          f.WriteWord(0); // Padding
-    }
-
-    // Write outputs
-    for (int i = 0; i < dvle->outputCount; i++)
-      f.WriteDword(dvle->outputTable[i]);
-
-    // Write uniforms: symbol-table offset of the name, then first/last
-    // register
-    size_t sp = 0;
-    for (int i = 0; i < dvle->uniformCount; i++) {
-      Uniform &u = dvle->uniformTable[i];
-      size_t l = u.name.length() + 1;
-      f.WriteWord(sp);
-      sp += l;
-      int pos = u.pos;
-      if (pos >= 0x20)
-        pos -= 0x10;
-      f.WriteHword(pos);
-      f.WriteHword(pos + u.size - 1);
-    }
-
-    // Write symbols (NUL-terminated names, '$' is emitted as '.')
-    for (int i = 0; i < dvle->uniformCount; i++) {
-      std::string u(dvle->uniformTable[i].name);
-      std::replace(u.begin(), u.end(), '$', '.');
-      size_t l = u.length() + 1;
-      f.WriteRaw(u.c_str(), l);
-    }
-
-    // Word alignment
-    int pos = f.Tell();
-    int pad = ((pos + 3) & ~3) - pos;
-    for (int i = 0; i < pad; i++)
-      f.WriteByte(0);
-  }
-  res_size = f.Tell();
-
-  // Copy the image into storage that outlives both `f` and the temporary
-  // string returned by str(); the pointer stays valid until the next call.
-  static std::string s_image;
-  s_image = f.get_ptr()->str();
-  return &s_image[0];
-}
-
-// Reads `file` and assembles its contents with AssembleCode.
-//
-// file     - path of the shader source file.
-// res_size - receives the size in bytes of the returned image, or 0 on
-//            failure.
-//
-// Returns what AssembleCode returns, or nullptr when the file cannot be read
-// (the error is reported through the installed error handler first).
-char *AssembleFile(const char *file, int &res_size) {
-  char *sourceCode = StringFromFile(file);
-  if (!sourceCode) {
-    EHND("error:", "cannot open input file!\n");
-    res_size = 0;
-    return nullptr;
-  }
-  char *image = AssembleCode(sourceCode, res_size);
-  // NOTE(review): assumes StringFromFile hands out a malloc'd buffer that the
-  // caller releases with free() - confirm against its definition.
-  free(sourceCode);
-  return image;
-}
-} // namespace Pica
--- a/include/picasso/types.h
+++ b/include/picasso/types.h
@ -16,6 +16,7 @@ typedef uint8_t u8;

 #define BIT(n) (1U << (n))

+#if !defined(__GNUC__) || (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)
 #ifndef __BYTE_ORDER__
 #include <sys/param.h>
 #define __BYTE_ORDER__ BYTE_ORDER
@ -23,15 +24,11 @@ typedef uint8_t u8;
 #define __ORDER_BIG_ENDIAN__ BIG_ENDIAN
 #endif

-#ifndef __llvm__
-#if !defined(__GNUC__) || (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)
-
 static inline uint16_t __builtin_bswap16(uint16_t x)
 {
 	return ((x << 8) & 0xff00) | ((x >> 8) & 0x00ff);
 }

-#if defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ < 7)
 static inline uint32_t __builtin_bswap32(uint32_t x)
 {
 	return ((x << 24) & 0xff000000) |
@ -46,8 +43,6 @@ static inline uint64_t __builtin_bswap64(uint64_t x)
 	      ((uint64_t)__builtin_bswap32(x&0xFFFFFFFF) << 32);
 }
 #endif
-#endif
-#endif

 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 #define be_dword(a)  __builtin_bswap64(a)
@ -65,4 +60,4 @@ static inline uint64_t __builtin_bswap64(uint64_t x)
 #define le_hword(a) __builtin_bswap16(a)
 #else
 #error "What's the endianness of the platform you're targeting?"
-#endif
+#endif