diff --git a/examples/18_particles/main.c b/examples/18_particles/main.c
index d36008ba..ff1aa781 100644
--- a/examples/18_particles/main.c
+++ b/examples/18_particles/main.c
@@ -43,10 +43,10 @@ int main()
   rdpq_init();
   //rdpq_debug_start();
 
+  uint64_t rspTimeTPX = 0;
+  uint64_t rdpTimeBusy = 0;
   #if RSPQ_PROFILE
     rspq_profile_data_t profile_data = (rspq_profile_data_t){};
-    uint64_t rdpTimeBusy = 0;
-    uint64_t rspTimeTPX = 0;
     rspq_profile_start();
   #endif
 
@@ -65,13 +65,20 @@ int main()
   // Meaning you only have to allocate an buffer of arbitrary size here and fill it with data.
   uint32_t particleCountMax = 100'000;
   uint32_t particleCount = 2000;
+
   // NOTE: just like with vertices, particles are interleaved in pairs of 2.
-  // So one TPXParticle struct always contains 2 particles.
+  // So one TPXParticleS8 struct always contains 2 particles.
   // If you need an odd number, just set the second particle size to 0.
-  uint32_t allocSize = sizeof(TPXParticle) * particleCountMax / 2;
-  TPXParticle *particles = malloc_uncached(allocSize);
+  uint32_t allocSize = sizeof(TPXParticleS8) * particleCountMax / 2;
+  TPXParticleS8 *particlesS8 = malloc_uncached(allocSize);
   debugf("Particle-Buffer %ldkb\n", allocSize / 1024);
-  generate_particles_random(particles, particleCount);
+
+  // Additionally, a 16-bit version of particles is available.
+  // It takes up more space (24 bytes vs. 16 bytes per pair) and is slightly slower.
+  // In return, it covers a larger range, which can be useful for 3D sprites placed in a scene.
+  // The 8-bit variant should be preferred when possible (e.g. for local particle effects).
+  allocSize = sizeof(TPXParticleS16) * particleCountMax / 2;
+  TPXParticleS16 *particlesS16 = malloc_uncached(allocSize);
 
   // Now some regular 3D stuff, not related to particles.
   T3DModel *model = t3d_model_load("rom://scene.t3dm");
@@ -111,6 +118,7 @@ int main()
   float time = 0;
   bool needRebuild = true;
   int frameIdx = 0;
+  bool measureTime = false;
 
   for(;;)
   {
@@ -132,6 +140,10 @@ int main()
     if(joypad.btn.c_up)partSizeY += deltaTime * 0.6f;
     if(joypad.btn.c_down)partSizeY -= deltaTime * 0.6f;
 
+  #if RSPQ_PROFILE
+    measureTime = joypad.btn.z;
+  #endif
+
     partSizeX = fmaxf(0.01f, fminf(1.0f, partSizeX));
     partSizeY = fmaxf(0.01f, fminf(1.0f, partSizeY));
 
@@ -173,6 +185,7 @@ int main()
 
     // A few example particles systems.
     // This will modify the particle buffer on the CPU side.
+    bool isS16 = false;
     switch(example)
     {
       case 0: // Random
@@ -180,9 +193,9 @@ int main()
         particleRot = (T3DVec3){{time,time*0.77f,time*1.42f}};
         particleMatScale = (T3DVec3){{partMatScaleVal, partMatScaleVal, partMatScaleVal}};
 
-        if(needRebuild)generate_particles_random(particles, particleCount);
+        if(needRebuild)generate_particles_random(particlesS8, particleCount);
         rdpq_set_env_color((color_t){0xFF, 0xFF, 0xFF, 0xFF});
-      break;
+        break;
       case 1: // Flame
         particleRot = (T3DVec3){{0,0,0}};
         if(!joypad.btn.z)time += deltaTime * 1.0f;
@@ -190,17 +203,17 @@ int main()
         float posX = fm_cosf(time) * 80.0f;
         float posZ = fm_sinf(2*time) * 40.0f;
 
-        simulate_particles_fire(particles, particleCount, posX, posZ);
+        simulate_particles_fire(particlesS8, particleCount, posX, posZ);
         particleMatScale = (T3DVec3){{0.9f, partMatScaleVal, 0.9f}};
         particlePos.y = partMatScaleVal * 130.0f;
         rdpq_set_env_color((color_t){0xFF, 0xFF, 0xFF, 0xFF});
-      break;
+        break;
       case 2: // Grass
         time += deltaTime * 1.0f;
         particleRot = (T3DVec3){{0,0,0}};
         particlePos.y = 0;
         if(needRebuild) {
-          particleCount = simulate_particles_grass(particles, particleCount);
+          particleCount = simulate_particles_grass(particlesS16, particleCount);
         }
         particleMatScale = (T3DVec3){{partMatScaleVal, partSizeY * 2.9f, partMatScaleVal}};
         rdpq_set_env_color(blend_colors(
@@ -208,7 +221,8 @@ int main()
           (color_t){0xFF, 0xAA, 0x55, 0xFF},
           fm_sinf(time)*0.5f+0.5f
         ));
-      break;
+        isS16 = true;
+        break;
     }
     needRebuild = false;
 
@@ -267,10 +281,29 @@ int main()
     // This can only scale particles down, so the range is 0.0 - 1.0.
     tpx_state_set_scale(partSizeX, partSizeY);
 
+    if(measureTime) {
+      rspq_wait();
+      rspq_highpri_begin();
+      wait_ms(2);
+      rspTimeTPX = get_ticks();
+    }
+
     // Now draw particles. internally this will load, transform and draw them in one go on the RSP.
     // While the ucode can only handle a 344 at a time, this function will automatically batch them
     // so you can specify an arbitrary amount of particles (as long as it's an even count)
-    tpx_particle_draw(particles, particleCount);
+    if(isS16) {
+      tpx_particle_draw_s16(particlesS16, particleCount);
+    } else {
+      tpx_particle_draw_s8(particlesS8, particleCount);
+    }
+
+    if(measureTime)
+    {
+      rspq_highpri_end();
+      rspq_highpri_sync();
+      rspTimeTPX = get_ticks() - rspTimeTPX;
+      rspTimeTPX = TICKS_TO_US(rspTimeTPX);
+    }
 
     // Make sure end up at the same stack level as before.
     tpx_matrix_pop(1);
@@ -283,7 +316,8 @@ int main()
     t3d_debug_printf(20,  30, "[C] %.2f %.2f", partSizeX, partSizeY);
     t3d_debug_printf(220, 18, "FPS: %.2f", display_get_fps());
 
-    #if RSPQ_PROFILE
+    if(measureTime)
+    {
       double timePerPart = 0;
       if(particleCount > 0) {
         timePerPart = (double)rspTimeTPX / (double)particleCount * 1000;
@@ -291,10 +325,9 @@ int main()
       t3d_debug_printf(20, 240-34, "RSP/tpx: %6lldus %.1f", rspTimeTPX, timePerPart);
       //t3d_debug_printf(20, 240-34, "RSP/tpx: %6lldus", rspTimeTPX);
       t3d_debug_printf(20, 240-24, "RDP    : %6lldus", rdpTimeBusy);
-    #else
+    } else {
       t3d_debug_printf(20, 240-24, "[L/R]: %s", EXAMPLE_NAMES[example]);
-    #endif
-
+    }
     rdpq_detach_show();
 
     #if RSPQ_PROFILE
diff --git a/examples/18_particles/partSim.h b/examples/18_particles/partSim.h
index 1239baa7..7df53355 100644
--- a/examples/18_particles/partSim.h
+++ b/examples/18_particles/partSim.h
@@ -5,7 +5,7 @@ static int currentPart  = 0;
 /**
  * Basic static particles with random positions and colors.
  */
-static void generate_particles_random(TPXParticle *particles, uint32_t count) {
+static void generate_particles_random(TPXParticleS8 *particles, uint32_t count) {
   for (int i = 0; i < count; i++) {
     int p = i / 2;
     int8_t *ptPos = i % 2 == 0 ? particles[p].posA : particles[p].posB;
@@ -47,7 +47,7 @@ static int noise_2d(int x, int y) {
  * Static particles simulating grass.
  * This will create a random grid of 3 particles stacked on top of each other representing grass-blades.
  */
-static int simulate_particles_grass(TPXParticle *particles, uint32_t partCount) {
+static int simulate_particles_grass(TPXParticleS16 *particles, uint32_t partCount) {
 
   int dist = 3;
   int heightParts = 3;
@@ -56,14 +56,14 @@ static int simulate_particles_grass(TPXParticle *particles, uint32_t partCount)
   int p = 0;
   for(int y=heightParts-1; y>=0; --y)
   {
-    int8_t ptPosX = -(dist * sideLen) / 2;
+    int16_t ptPosX = -(dist * sideLen) / 2;
     for(int x=0; x<sideLen; ++x)
     {
-      int8_t ptPosZ = -(dist * sideLen) / 2;
+      int16_t ptPosZ = -(dist * sideLen) / 2;
       for(int z=0; z<sideLen; ++z)
       {
-        int8_t *ptPos = tpx_buffer_get_pos(particles, p);
-        uint8_t *ptColor = tpx_buffer_get_rgba(particles, p);
+        int16_t *ptPos = tpx_buffer_s16_get_pos(particles, p);
+        uint8_t *ptColor = tpx_buffer_s16_get_rgba(particles, p);
 
         int rnd = noise_2d(x, z);
         float height = fm_sinf((x + z) * 0.1f) * 0.5f + 0.5f;
@@ -75,7 +75,7 @@ static int simulate_particles_grass(TPXParticle *particles, uint32_t partCount)
         ptPos[0] = ptPosX + ((rnd % 3) - 1);
         ptPos[1] = y + height;
         ptPos[2] = ptPosZ + ((rnd % 3) - 1);
-        *tpx_buffer_get_size(particles, p) = size;
+        *tpx_buffer_s16_get_size(particles, p) = size;
 
         ptPosZ += dist;
 
@@ -119,7 +119,7 @@ static void gradient_fire(uint8_t *color, float t) {
  * This will simulate particles over time by moving them up and changing their color.
  * The current position is used to spawn new particles, so it can move over time leaving a trail behind.
  */
-static void simulate_particles_fire(TPXParticle *particles, uint32_t partCount, float posX, float posZ) {
+static void simulate_particles_fire(TPXParticleS8 *particles, uint32_t partCount, float posX, float posZ) {
   uint32_t p = currentPart / 2;
   if(currentPart % (1+(rand() % 3)) == 0) {
     int8_t *ptPos = currentPart % 2 == 0 ? particles[p].posA : particles[p].posB;
diff --git a/examples/19_particles_tex/main.c b/examples/19_particles_tex/main.c
index e3f60143..ca71f5e0 100644
--- a/examples/19_particles_tex/main.c
+++ b/examples/19_particles_tex/main.c
@@ -65,10 +65,10 @@ int main()
   rdpq_init();
   //rdpq_debug_start();
 
+  uint64_t rdpTimeBusy = 0;
+  uint64_t rspTimeTPX = 0;
   #if RSPQ_PROFILE
     rspq_profile_data_t profile_data = (rspq_profile_data_t){};
-    uint64_t rdpTimeBusy = 0;
-    uint64_t rspTimeTPX = 0;
     rspq_profile_start();
   #endif
 
@@ -87,10 +87,17 @@ int main()
   // There is no special struct for textured particles compared to colored ones.
   // The only difference is that the alpha channel of the color is used for the texture offset.
   // You can still define a global alpha value via the CC ofc.
-  uint32_t allocSize = sizeof(TPXParticle) * particleCountMax / 2;
-  TPXParticle *particles = malloc_uncached(allocSize);
+  uint32_t allocSize = sizeof(TPXParticleS8) * particleCountMax / 2;
+  TPXParticleS8 *particlesS8 = malloc_uncached(allocSize);
   debugf("Particle-Buffer %ldkb\n", allocSize / 1024);
 
+  // Additionally, a 16-bit version of particles is available.
+  // It takes up more space (24 bytes vs. 16 bytes per pair) and is slightly slower.
+  // In return, it covers a larger range, which can be useful for 3D sprites placed in a scene.
+  // The 8-bit variant should be preferred when possible (e.g. for local particle effects).
+  allocSize = sizeof(TPXParticleS16) * particleCountMax / 2;
+  TPXParticleS16 *particlesS16 = malloc_uncached(allocSize);
+
   sprite_t *texTest[] = {
       sprite_load("rom://tex8.i8.sprite"),
       sprite_load("rom://tex16.i8.sprite"),
@@ -137,6 +144,7 @@ int main()
   float time = 0;
   float timeTile = 0;
   bool needRebuild = true;
+  bool measureTime = false;
   int frameIdx = 0;
 
   for(;;)
@@ -160,6 +168,10 @@ int main()
     if(joypad.btn.c_up)partSizeY += deltaTime * 0.6f;
     if(joypad.btn.c_down)partSizeY -= deltaTime * 0.6f;
 
+#if RSPQ_PROFILE
+    measureTime = joypad.btn.z;
+#endif
+
     partSizeX = fmaxf(0.01f, fminf(1.0f, partSizeX));
     partSizeY = fmaxf(0.01f, fminf(1.0f, partSizeY));
 
@@ -200,6 +212,7 @@ int main()
       camTarget.v[2] = camPos.v[2] + camDir.v[2];
     }
 
+    bool is16Bit = false;
     bool isSpriteRot = false;
     switch(example)
     {
@@ -211,7 +224,7 @@ int main()
         float posX = fm_cosf(time) * 80.0f;
         float posZ = fm_sinf(2*time) * 40.0f;
 
-        simulate_particles_fire(particles, particleCount, posX, posZ);
+        simulate_particles_fire(particlesS8, particleCount, posX, posZ);
         particleMatScale = (T3DVec3){{0.9f, partMatScaleVal, 0.9f}};
         particlePos.y = partMatScaleVal * 130.0f;
         rdpq_set_env_color((color_t){0xFF, 0xFF, 0xFF, 0xFF});
@@ -223,10 +236,11 @@ int main()
         particleRot = (T3DVec3){{0,0,0}};
         particlePos.y = 0;
         if(needRebuild) {
-          particleCount = simulate_particles_coins(particles, particleCount);
+          particleCount = simulate_particles_coins(particlesS16, particleCount);
         }
         particleMatScale = (T3DVec3){{partMatScaleVal, partSizeY * 2.9f, partMatScaleVal}};
         rdpq_set_env_color((color_t){0xFF, 0xFF, 0xFF, 0xFF});
+        is16Bit = true;
       break;
       default: // Random
         time += deltaTime * 0.2f;
@@ -234,7 +248,7 @@ int main()
         particleRot = (T3DVec3){{time,time*0.77f,time*1.42f}};
         particleMatScale = (T3DVec3){{partMatScaleVal, partMatScaleVal, partMatScaleVal}};
 
-        if(needRebuild)generate_particles_random(particles, particleCount);
+        if(needRebuild)generate_particles_random(particlesS8, particleCount);
         rdpq_set_env_color((color_t){0xFF, 0xFF, 0xFF, 0xFF});
         isSpriteRot = true;
       break;
@@ -326,7 +340,26 @@ int main()
       case 5: tpx_state_set_tex_params((int16_t)tileIdx, 0); break;
     }
 
-    tpx_particle_draw_tex(particles, particleCount);
+    if(measureTime) {
+      rspq_wait();
+      rspq_highpri_begin();
+      wait_ms(2);
+      rspTimeTPX = get_ticks();
+    }
+
+    if(is16Bit) {
+      tpx_particle_draw_tex_s16(particlesS16, particleCount);
+    } else {
+      tpx_particle_draw_tex_s8(particlesS8, particleCount);
+    }
+
+    if(measureTime)
+    {
+      rspq_highpri_end();
+      rspq_highpri_sync();
+      rspTimeTPX = get_ticks() - rspTimeTPX;
+      rspTimeTPX = TICKS_TO_US(rspTimeTPX);
+    }
 
     tpx_matrix_pop(1);
 
@@ -335,7 +368,8 @@ int main()
     t3d_debug_printf(20,  30, "[C] %.2f %.2f", partSizeX, partSizeY);
     t3d_debug_printf(220, 18, "FPS: %.2f", display_get_fps());
 
-    #if RSPQ_PROFILE
+    if(measureTime)
+    {
       double timePerPart = 0;
       if(particleCount > 0) {
         timePerPart = (double)rspTimeTPX / (double)particleCount * 1000;
@@ -343,9 +377,9 @@ int main()
       t3d_debug_printf(20, 240-34, "RSP/tpx: %6lldus %.1f", rspTimeTPX, timePerPart);
       //t3d_debug_printf(20, 240-34, "RSP/tpx: %6lldus", rspTimeTPX);
       t3d_debug_printf(20, 240-24, "RDP    : %6lldus", rdpTimeBusy);
-    #else
+    } else {
       t3d_debug_printf(20, 240-24, "[L/R]: %s", EXAMPLE_NAMES[example]);
-    #endif
+    }
 
     rdpq_detach_show();
 
diff --git a/examples/19_particles_tex/partSim.h b/examples/19_particles_tex/partSim.h
index cf967883..7bc1bf72 100644
--- a/examples/19_particles_tex/partSim.h
+++ b/examples/19_particles_tex/partSim.h
@@ -2,16 +2,6 @@
 
 static int currentPart  = 0;
 
-static color_t blend_colors(color_t colorA, color_t colorB, float t) {
-  color_t color;
-  color.r = (uint8_t)(colorA.r * (1.0f - t) + colorB.r * t);
-  color.g = (uint8_t)(colorA.g * (1.0f - t) + colorB.g * t);
-  color.b = (uint8_t)(colorA.b * (1.0f - t) + colorB.b * t);
-  color.a = (uint8_t)(colorA.a * (1.0f - t) + colorB.a * t);
-  return color;
-}
-
-
 static color_t get_rainbow_color(float s, float brightness) {
   float r = fm_sinf(s) * 0.5f + 0.5f;
   float g = fm_sinf(s + 2.094f) * 0.5f + 0.5f;
@@ -82,7 +72,7 @@ static int noise_2d(int x, int y) {
   return (n * (n * n * 60493 + 19990303) + 89);
 }
 
-static void generate_particles_random(TPXParticle *particles, uint32_t count) {
+static void generate_particles_random(TPXParticleS8 *particles, uint32_t count) {
   for (int i = 0; i < count; i++) {
     int p = i / 2;
     int8_t *ptPos = i % 2 == 0 ? particles[p].posA : particles[p].posB;
@@ -114,7 +104,7 @@ static void generate_particles_random(TPXParticle *particles, uint32_t count) {
   }
 }
 
-static int simulate_particles_coins(TPXParticle *particles, uint32_t partCount) {
+static int simulate_particles_coins(TPXParticleS16 *particles, uint32_t partCount) {
 
   int dist = 3;
   int heightParts = 1;
@@ -122,14 +112,15 @@ static int simulate_particles_coins(TPXParticle *particles, uint32_t partCount)
 
   int p = 0;
 
-  int8_t ptPosX = -(dist * sideLen) / 2;
+  int16_t ptPosX = -(dist * sideLen) / 2;
   for(int x=0; x<sideLen; ++x)
   {
-    int8_t ptPosZ = -(dist * sideLen) / 2;
+    int16_t ptPosZ = -(dist * sideLen) / 2;
     for(int z=0; z<sideLen; ++z)
     {
-      int8_t *ptPos = tpx_buffer_get_pos(particles, p);
-      color_t *ptColor = (color_t*)tpx_buffer_get_rgba(particles, p);
+      int16_t *ptPos = tpx_buffer_s16_get_pos(particles, p);
+      color_t *ptColor = (color_t*)tpx_buffer_s16_get_rgba(particles, p);
+      *tpx_buffer_s16_get_tex_offset(particles, p) = (rand() % 8) * 32;
 
       int rnd = noise_2d(x, z);
       float height = fm_sinf((x + z) * 0.1f) * 0.5f + 0.5f;
@@ -137,12 +128,11 @@ static int simulate_particles_coins(TPXParticle *particles, uint32_t partCount)
       int8_t size = (rand() % 8)*4 + 50;
 
       *ptColor = (rnd & 1) ? get_rand_color(20) : get_rainbow_color((x + z) * 0.1f, 1.0f);
-      ptColor->a = (rand() % 8) * 32;
 
       ptPos[0] = ptPosX + ((rnd % 3) - 1);
       ptPos[1] = height;
       ptPos[2] = ptPosZ + ((rnd % 3) - 1);
-      *tpx_buffer_get_size(particles, p) = size;
+      *tpx_buffer_s16_get_size(particles, p) = size;
 
       ptPosZ += dist;
 
@@ -160,17 +150,17 @@ static int simulate_particles_coins(TPXParticle *particles, uint32_t partCount)
  * This will simulate particles over time by moving them up and changing their color.
  * The current position is used to spawn new particles, so it can move over time leaving a trail behind.
  */
-static void simulate_particles_fire(TPXParticle *particles, uint32_t partCount, float posX, float posZ) {
-  uint32_t p = currentPart / 2;
+static void simulate_particles_fire(TPXParticleS8 *particles, uint32_t partCount, float posX, float posZ) {
+  int p = currentPart / 2;
   if(currentPart % (1+(rand() % 3)) == 0) {
-    int8_t *ptPos = currentPart % 2 == 0 ? particles[p].posA : particles[p].posB;
-    int8_t *size = currentPart % 2 == 0 ? &particles[p].sizeA : &particles[p].sizeB;
-    uint8_t *color = currentPart % 2 == 0 ? particles[p].colorA : particles[p].colorB;
+    int8_t *ptPos  = tpx_buffer_s8_get_pos(particles, p);
+    int8_t *size   = tpx_buffer_s8_get_size(particles, p);
+    uint8_t *color = tpx_buffer_s8_get_rgba(particles, p);
 
     ptPos[0] = posX + (rand() % 16) - 8;
     ptPos[1] = -126;
     gradient_fire(color, 0);
-    color[3] = (PhysicalAddr(ptPos) % 8) * 32;
+    color[3] = ((PhysicalAddr(ptPos) % 8) * 32);
 
     ptPos[2] = posZ + (rand() % 16) - 8;
     *size = 60 + (rand() % 10);
diff --git a/examples/24_hdr_bloom/src/actors/magicSpell.cpp b/examples/24_hdr_bloom/src/actors/magicSpell.cpp
index 9a36ec04..9517f25d 100644
--- a/examples/24_hdr_bloom/src/actors/magicSpell.cpp
+++ b/examples/24_hdr_bloom/src/actors/magicSpell.cpp
@@ -40,9 +40,9 @@ namespace Actor
     args.scale *= BASE_SCALE;
 
     for(uint32_t i=0; i<particles.countMax; ++i) {
-      auto p = tpx_buffer_get_pos(particles.particles, i);
-      auto col = tpx_buffer_get_rgba(particles.particles, i);
-      *tpx_buffer_get_size(particles.particles, i) = 6 + (rand()%4);
+      auto p = tpx_buffer_s8_get_pos(particles.particles, i);
+      auto col = tpx_buffer_s8_get_rgba(particles.particles, i);
+      *tpx_buffer_s8_get_size(particles.particles, i) = 6 + (rand()%4);
 
       float randAngle = (rand() % 1024) / 1024.0f * T3D_PI * 2.0f;
       float randX = fm_sinf(randAngle);
@@ -90,8 +90,8 @@ namespace Actor
     );
 
     for(uint32_t i=0; i<particles.countMax; ++i) {
-      auto p = tpx_buffer_get_pos(particles.particles, i);
-      auto col = tpx_buffer_get_rgba(particles.particles, i);
+      auto p = tpx_buffer_s8_get_pos(particles.particles, i);
+      auto col = tpx_buffer_s8_get_rgba(particles.particles, i);
       int8_t wiggleX = displace[(p[1] + 127) & 0xFF];
       int8_t wiggleZ = displace[(p[1] + 200) & 0xFF];
       p[1] += col[3];
@@ -103,7 +103,6 @@ namespace Actor
 
   void MagicSpell::draw3D(float deltaTime)
   {
-    auto &fr = state.activeScene->getCam().getFrustum();
     if(!checkFrustumSphere(pos, args.scale * 90.0f))return;
 
     t3d_matrix_set(matFP.get(), true);
diff --git a/examples/24_hdr_bloom/src/actors/pointGlobe.cpp b/examples/24_hdr_bloom/src/actors/pointGlobe.cpp
index c775cbed..5317d5bf 100644
--- a/examples/24_hdr_bloom/src/actors/pointGlobe.cpp
+++ b/examples/24_hdr_bloom/src/actors/pointGlobe.cpp
@@ -96,8 +96,8 @@ namespace Actor
     float latIncr = (T3D_PI * 32.0f) / sampleCount;
 
     for(uint32_t i=0; i<sampleCount; ++i) {
-      auto p = tpx_buffer_get_pos(particles.particles, particles.count);
-      auto col = tpx_buffer_get_rgba(particles.particles, particles.count);
+      auto p = tpx_buffer_s8_get_pos(particles.particles, particles.count);
+      auto col = tpx_buffer_s8_get_rgba(particles.particles, particles.count);
 
       float y = 1.0f - (i / (float)(sampleCount - 1)) * 2.0f;//  # y goes from 1 to -1
       float radius = sqrtf(1.0f - y * y);
@@ -123,7 +123,7 @@ namespace Actor
       p[1] = pt.y;
       p[2] = pt.z;
 
-      *tpx_buffer_get_size(particles.particles, particles.count) = 35 + (rand()%5);
+      *tpx_buffer_s8_get_size(particles.particles, particles.count) = 35 + (rand()%5);
 
       col[0] = colImg.r;
       col[1] = colImg.g;
diff --git a/examples/24_hdr_bloom/src/main.cpp b/examples/24_hdr_bloom/src/main.cpp
index 7977f73a..dd1580ff 100644
--- a/examples/24_hdr_bloom/src/main.cpp
+++ b/examples/24_hdr_bloom/src/main.cpp
@@ -43,6 +43,8 @@ namespace {
   constexpr int BUFF_COUNT = 3;
 
   rspq_profile_data_t profileData{};
+
+  [[maybe_unused]]
   uint64_t lastUcodeTime = 0;
 }
 
diff --git a/examples/24_hdr_bloom/src/render/ptSystem.cpp b/examples/24_hdr_bloom/src/render/ptSystem.cpp
index 35f6a984..1c4aa5d6 100644
--- a/examples/24_hdr_bloom/src/render/ptSystem.cpp
+++ b/examples/24_hdr_bloom/src/render/ptSystem.cpp
@@ -28,7 +28,7 @@ void PTSystem::resize(uint32_t maxSize)
   assert(sizeof(countMax) % 2 == 0);
   if(countMax > 0) {
     mat = (T3DMat4FP*)malloc_uncached(sizeof(T3DMat4FP));
-    particles = static_cast<TPXParticle*>(malloc_uncached(countMax * sizeof(TPXParticle) / 2));
+    particles = static_cast<TPXParticleS8*>(malloc_uncached(countMax * sizeof(TPXParticleS8) / 2));
   }
 }
 
@@ -36,7 +36,7 @@ void PTSystem::draw() const {
   if(count == 0)return;
   tpx_matrix_push(mat);
   uint32_t safeCount = count & ~1;
-  tpx_particle_draw(particles, safeCount);
+  tpx_particle_draw_s8(particles, safeCount);
   tpx_matrix_pop(1);
 }
 
@@ -44,7 +44,7 @@ void PTSystem::drawTextured() const {
   if(count == 0)return;
   tpx_matrix_push(mat);
   uint32_t safeCount = count & ~1;
-  tpx_particle_draw_tex(particles, safeCount);
+  tpx_particle_draw_tex_s8(particles, safeCount);
   tpx_matrix_pop(1);
 }
 
@@ -59,7 +59,7 @@ int PTSystem::drawTexturedSlice(int begin, int end) const
   auto size = end - begin;
   if(size <= 0)return 0;
   tpx_matrix_push(mat);
-  tpx_particle_draw_tex(particles + (begin/2), size);
+  tpx_particle_draw_tex_s8(particles + (begin/2), size);
   tpx_matrix_pop(1);
 
   return size;
diff --git a/examples/24_hdr_bloom/src/render/ptSystem.h b/examples/24_hdr_bloom/src/render/ptSystem.h
index 546d7134..adac36d0 100644
--- a/examples/24_hdr_bloom/src/render/ptSystem.h
+++ b/examples/24_hdr_bloom/src/render/ptSystem.h
@@ -10,7 +10,7 @@ struct PTSystem
 {
   T3DVec3 pos{};
   T3DMat4FP *mat{};
-  TPXParticle *particles{};
+  TPXParticleS8 *particles{};
   uint32_t countMax{};
   uint32_t count{};
 
@@ -21,9 +21,9 @@ struct PTSystem
   [[nodiscard]] bool isFull() const { return count == countMax; }
 
   void removeParticle(uint32_t index) {
-    tpx_buffer_copy(particles, index, --count);
+    tpx_buffer_s8_copy(particles, index, --count);
     if(count & 1) {
-      *tpx_buffer_get_size(particles, count + 1u) = 0;
+      *tpx_buffer_s8_get_size(particles, count + 1u) = 0;
     }
   }
 
diff --git a/src/t3d/rsp/rsp_tinypx.S b/src/t3d/rsp/rsp_tinypx.S
index d2084609..051016d5 100644
--- a/src/t3d/rsp/rsp_tinypx.S
+++ b/src/t3d/rsp/rsp_tinypx.S
@@ -152,199 +152,310 @@ TPXCmd_SyncT3D:
   j RSPQ_Loop
   nop
 TPXCmd_DrawColor:
-  or $s0, $zero, $a1
-  ori $s4, $zero, %lo(PARTICLE_BUFF)
-  andi $t0, $a0, 65535
-  addu $s7, $s4, $t0
-  or $t2, $zero, $zero
-  jal DMAExec ## Args: $t0, $t1, $s0, $s4, $t2
-  addiu $t0, $t0, -1
-  ori $at, $zero, %lo(MATRIX_MVP)
-  ldv $v26, 0, 24, $at
-  ldv $v26, 8, 24, $at
-  lw $s2, %lo(RDPQ_SCISSOR_RECT + 4)
-  ldv $v24, 0, 40, $at
-  ldv $v28, 0, 8, $at
-  srl $t4, $s2, 12
-  ldv $v21, 0, 48, $at
-  ldv $v22, 0, 56, $at
-  ldv $v24, 8, 40, $at
-  vmudl $v20, $v00, $v31.e3
-  ldv $v23, 0, 32, $at
-  ldv $v27, 0, 0, $at
-  ldv $v22, 8, 56, $at
-  ldv $v21, 8, 48, $at
-  ldv $v25, 0, 16, $at
-  or $s6, $zero, $s4
-  ldv $v25, 8, 16, $at
-  mtc2 $s2, $v15.e1
-  ldv $v28, 8, 8, $at
-  ldv $v23, 8, 32, $at
-  lw $s1, %lo(RDPQ_SCISSOR_RECT + 0)
-  ldv $v27, 8, 0, $at
-  ori $at, $zero, %lo(SCREEN_SCALE_OFFSET)
-  lui $t7, 0x3A00
-  ori $s4, $zero, %lo(RDP_BUFF)
-  mtc2 $t4, $v15.e0
-  mtc2 $t4, $v15.e4
-  ldv $v19, 0, 0, $at
-  ldv $v18, 0, 8, $at
-  srl $t4, $s1, 12
-  ldv $v18, 8, 8, $at
-  mtc2 $s2, $v15.e5
-  ldv $v19, 8, 0, $at
-  ori $at, $zero, %lo(NORM_SCALE_W)
-  ldv $v17, 0, 0, $at
-  mtc2 $s1, $v16.e1
-  ldv $v17, 8, 0, $at
-  addiu $at, $zero, 4095
-  vmadm $v19, $v19, $v31.e3
-  ori $s3, $zero, %lo(RDP_BUFF)
-  vmadn $v20, $v00, $v00
-  mtc2 $at, $v15.e3
-  mtc2 $t4, $v16.e0
-  ori $at, $zero, %lo(PARTICLE_SCALE)
-  ori $s2, $zero, %lo(RDP_BUFF)
-  addiu $s5, $s3, 528
-  mtc2 $t4, $v16.e4
-  vand $v15, $v15, $v15.e3
-  mtc2 $s1, $v16.e5
-  llv $v13, 0, 0, $at
-  llv $v13, 8, 0, $at
-  addiu $a0, $zero, 46
-  addiu $s1, $s5, 24
-  vand $v16, $v16, $v15.e3
-  LABEL_0001:
-  sw $t7, 0($s2)
-  sb $a0, 8($s2)
-  sh $zero, 14($s2)
-  addiu $s2, $s2, 24
-  bne $s2, $s1, LABEL_0001
-  nop
-  LABEL_0002:
-  jal DMAWaitIdle
-  nop
-  lpv $v10, 0, 0, $s6
-  vmulf $v14, $v13, $v10.h3
-  vmudm $v10, $v10, $v31.e7
-  vmov $v10.e3, $v30.e7
-  vmov $v10.e7, $v30.e7
-  vxor $v09, $v00, $v30.e7
-  vmudn $v12, $v28, $v10.h0
-  vmadh $v11, $v27, $v10.h0
-  vmadn $v12, $v26, $v10.h1
-  vmadh $v11, $v25, $v10.h1
-  vmadn $v12, $v24, $v10.h2
-  vmadh $v11, $v23, $v10.h2
-  vmadn $v12, $v22, $v10.h3
-  vmadh $v11, $v21, $v10.h3
-  vch $v29, $v11, $v11.h3
-  vcl $v29, $v12, $v12.h3
-  cfc2 $t4, $vcc
-  LABEL_0003:
-  vmudl $v12, $v12, $v17.v
-  ori $at, $zero, %lo(BASE_SIZE)
-  vmadm $v11, $v11, $v17.v
-  ldv $v03, 0, 8, $s6
-  vmadn $v12, $v00, $v00
-  vrcph $v07.e3, $v11.e3
-  andi $t5, $t4, 16448
-  vrcpl $v08.e3, $v12.e3
-  addiu $s6, $s6, 16
-  vrcph $v07.e3, $v11.e7
-  vrcpl $v08.e7, $v12.e7
-  vrcph $v07.e7, $v00.e7
-  lsv $v11, 6, 0, $at
-  vmov $v12.e3, $v00.e0
-  lsv $v11, 14, 0, $at
-  vmov $v12.e7, $v00.e0
-  addiu $at, $zero, 3
-  vmudl $v29, $v12, $v08.h3
-  vmadm $v29, $v11, $v08.h3
-  vmadn $v12, $v12, $v07.h3
-  addiu $t2, $zero, 54
-  vmadh $v11, $v11, $v07.h3
-  vmulf $v14, $v14, $v11.h3
-  vmudl $v29, $v12, $v20.v
-  vmadm $v29, $v11, $v20.v
-  slv $v03, 0, 4, $s3
-  vmadn $v08, $v12, $v19.v
-  andi $t1, $t4, 1028
-  vmadh $v07, $v11, $v19.v
-  vmadh $v06, $v18, $v09.v
-  vmadh $v05, $v09, $v14.v
-  vsubc $v10, $v06, $v14.v
-  vlt $v05, $v05, $v15
-  vge $v10, $v10, $v16
-  ssv $v06, 4, 12, $s3
-  mfc2 $sp, $v05.e5
-  mfc2 $k1, $v10.e0
-  mfc2 $s2, $v10.e1
-  andi $sp, $sp, 4095
-  sll $k1, $k1, 12
-  mfc2 $fp, $v05.e1
-  andi $s2, $s2, 4095
-  or $s2, $s2, $k1
-  mfc2 $k1, $v10.e4
-  mfc2 $s1, $v10.e5
-  andi $fp, $fp, 4095
-  sll $k1, $k1, 12
-  andi $s1, $s1, 4095
-  sw $s2, 4 + 16($s3)
-  or $s1, $s1, $k1
-  vlt $v04, $v10, $v05
-  mfc2 $k1, $v05.e0
-  cfc2 $t4, $vcc
-  sll $k1, $k1, 12
-  or $fp, $fp, $k1
-  andi $a2, $t4, 3
-  lpv $v10, 0, 0, $s6
-  sw $fp, 0 + 16($s3) ## Barrier: 0x1
-  or $t1, $t1, $a2
-  mfc2 $k1, $v05.e4
-  sb $t2, 0 + 16($s3) ## Barrier: 0x1
-  andi $a2, $t4, 48
-  vmulf $v14, $v13, $v10.h3
-  sll $k1, $k1, 12
-  vmudm $v10, $v10, $v31.e7
-  or $sp, $sp, $k1
-  vmov $v10.e3, $v30.e7
-  bne $t1, $at, LABEL_0005
-  or $t5, $t5, $a2
-  addiu $s3, $s3, 24
-  LABEL_0005:
-  addiu $at, $zero, 48
-  vmov $v10.e7, $v30.e7
-  bne $t5, $at, LABEL_0006
-  nop
-  slv $v03, 4, 4, $s3
-  sw $sp, 0 + 16($s3) ## Barrier: 0x1
-  ssv $v06, 12, 12, $s3
-  sb $t2, 0 + 16($s3) ## Barrier: 0x1
-  sw $s1, 4 + 16($s3)
-  addiu $s3, $s3, 24
-  LABEL_0006:
-  vmudn $v12, $v28, $v10.h0
-  vmadh $v11, $v27, $v10.h0
-  vmadn $v12, $v26, $v10.h1
-  vmadh $v11, $v25, $v10.h1
-  sltu $at, $s3, $s5
-  vmadn $v12, $v24, $v10.h2
-  vmadh $v11, $v23, $v10.h2
-  vmadn $v12, $v22, $v10.h3
-  bne $at, $zero, LABEL_0007
-  vmadh $v11, $v21, $v10.h3
-  jal RDPQ_Send ## Args: $s4, $s3
-  nop
-  or $s3, $zero, $s4
-  LABEL_0007:
-  vch $v29, $v11, $v11.h3
-  vcl $v29, $v12, $v12.h3
-  bne $s6, $s7, LABEL_0003
-  cfc2 $t4, $vcc
-  LABEL_0004:
-  j RDPQ_Send
-  ori $ra, $zero, %lo(RSPQ_Loop)
+  or $s0, $zero, $a1                                 ## L:850  |      ^ | dma_in_async(dmaDmem, rdramAddr, dmaSize);
+  andi $t0, $a0, 65535                               ## L:834  |      2 | u32<$t0> dmaSize = dataSize & 0xFFFF;
+  ori $s4, $zero, %lo(PARTICLE_BUFF)                 ## L:833  |      3 | u16<$s4> dmaDmem = PARTICLE_BUFF;
+  addu $s7, $s4, $t0                                 ## L:849  |      4 | u32 ptrInEnd = dmaDmem + dmaSize;
+  addiu $t0, $t0, -1                                 ## L:850  |      5 | dma_in_async(dmaDmem, rdramAddr, dmaSize);
+  jal DMAExec                                        ## L:850  |      6 | dma_in_async(dmaDmem, rdramAddr, dmaSize); ## Args: $t0, $t1, $s0, $s4, $t2
+  or $t2, $zero, $zero                               ## L:850  |     *8 | dma_in_async(dmaDmem, rdramAddr, dmaSize);
+  vmudl $v20, $v00, $v31.e3                          ## L:865  |      9 | screenSize >>= 4;
+  lw $s2, %lo(RDPQ_SCISSOR_RECT + 4)                 ## L:1213 |      ^ | u32 extMax = load(RDPQ_SCISSOR_RECT, 4);
+  lw $s1, %lo(RDPQ_SCISSOR_RECT + 0)                 ## L:1214 |     10 | u32 extMin = load(RDPQ_SCISSOR_RECT, 0);
+  ori $at, $zero, %lo(MATRIX_MVP)                    ## L:858  |     11 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw;
+  ldv $v21, 0, 48, $at                               ## L:861  |     12 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw;
+  mtc2 $s2, $v15.e5                                  ## L:1218 |     13 | screenMax.y = extMax; screenMax.Y = extMax;
+  ldv $v23, 0, 32, $at                               ## L:860  |     14 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw;
+  ldv $v28, 0, 8, $at                                ## L:858  |     15 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw;
+  ldv $v22, 0, 56, $at                               ## L:861  |     16 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw;
+  ldv $v26, 0, 24, $at                               ## L:859  |     17 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw;
+  ori $s3, $zero, %lo(RDP_BUFF)                      ## L:855  |     18 | u16<$s3> dmaDmemEnd = RDP_BUFF;
+  ldv $v25, 0, 16, $at                               ## L:859  |     19 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw;
+  ldv $v25, 8, 16, $at                               ## L:859  |     20 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw;
+  ldv $v24, 0, 40, $at                               ## L:860  |     21 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw;
+  ldv $v21, 8, 48, $at                               ## L:861  |     22 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw;
+  ldv $v26, 8, 24, $at                               ## L:859  |     23 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw;
+  ldv $v28, 8, 8, $at                                ## L:858  |     24 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw;
+  ldv $v27, 0, 0, $at                                ## L:858  |     25 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw;
+  ldv $v24, 8, 40, $at                               ## L:860  |     26 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw;
+  ldv $v23, 8, 32, $at                               ## L:860  |     27 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw;
+  ldv $v27, 8, 0, $at                                ## L:858  |     28 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw;
+  ldv $v22, 8, 56, $at                               ## L:861  |     29 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw;
+  ori $at, $zero, %lo(SCREEN_SCALE_OFFSET)           ## L:863  |     30 | vec32 screenSize:sint = load(SCREEN_SCALE_OFFSET).xyzwxyzw;
+  ldv $v18, 0, 8, $at                                ## L:867  |     31 | vec16 screenOffset = load(SCREEN_SCALE_OFFSET, 0x08).xyzwxyzw;
+  mtc2 $s1, $v16.e5                                  ## L:1222 |     32 | screenMin.y = extMin; screenMin.Y = extMin;
+  ldv $v18, 8, 8, $at                                ## L:867  |     33 | vec16 screenOffset = load(SCREEN_SCALE_OFFSET, 0x08).xyzwxyzw;
+  srl $t4, $s2, 12                                   ## L:1216 |     34 | temp1 = extMax >> 12;
+  ldv $v19, 0, 0, $at                                ## L:863  |     35 | vec32 screenSize:sint = load(SCREEN_SCALE_OFFSET).xyzwxyzw;
+  mtc2 $s1, $v16.e1                                  ## L:1222 |     36 | screenMin.y = extMin; screenMin.Y = extMin;
+  ldv $v19, 8, 0, $at                                ## L:863  |     37 | vec32 screenSize:sint = load(SCREEN_SCALE_OFFSET).xyzwxyzw;
+  ori $at, $zero, %lo(NORM_SCALE_W)                  ## L:868  |     38 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw;
+  addiu $s5, $s3, 528                                ## L:856  |     39 | u16 dmaDmemFlush = dmaDmemEnd + 528;
+  mtc2 $s2, $v15.e1                                  ## L:1218 |     40 | screenMax.y = extMax; screenMax.Y = extMax;
+  ori $s2, $zero, %lo(RDP_BUFF)                      ## L:881  |     41 | u16 buffRdp = RDP_BUFF;
+  ldv $v17, 0, 0, $at                                ## L:868  |     42 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw;
+  mtc2 $t4, $v15.e4                                  ## L:1217 |     43 | screenMax.x = temp1;  screenMax.X = temp1;
+  ldv $v17, 8, 0, $at                                ## L:868  |     44 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw;
+  lui $t7, 0x3A00                                    ## L:883  |     45 | cmdRdpColor = 0x3A00'0000;
+  addiu $at, $zero, 4095                             ## L:1212 |     46 | screenMax.w = 0b1111'1111'1111;
+  mtc2 $at, $v15.e3                                  ## L:1212 |     47 | screenMax.w = 0b1111'1111'1111;
+  mtc2 $t4, $v15.e0                                  ## L:1217 |     48 | screenMax.x = temp1;  screenMax.X = temp1;
+  ori $at, $zero, %lo(PARTICLE_SCALE)                ## L:875  |     49 | globalPartSize.xy = load(PARTICLE_SCALE).xy;
+  addiu $a0, $zero, 46                               ## L:884  |     50 | cmdRdpDepth = 0x2E;
+  srl $t4, $s1, 12                                   ## L:1220 |     51 | temp1 = extMin >> 12;
+  vmadm $v19, $v19, $v31.e3                          ## L:865  |      ^ | screenSize >>= 4;
+  vmadn $v20, $v00, $v00                             ## L:865  |     52 | screenSize >>= 4;
+  mtc2 $t4, $v16.e0                                  ## L:1221 |      ^ | screenMin.x = temp1;  screenMin.X = temp1;
+  mtc2 $t4, $v16.e4                                  ## L:1221 |     53 | screenMin.x = temp1;  screenMin.X = temp1;
+  vand $v15, $v15, $v15.e3                           ## L:1224 |      ^ | screenMax &= screenMax.w;
+  llv $v13, 0, 0, $at                                ## L:875  |     54 | globalPartSize.xy = load(PARTICLE_SCALE).xy;
+  llv $v13, 8, 0, $at                                ## L:876  |     55 | globalPartSize.XY = load(PARTICLE_SCALE).xy;
+  addiu $s1, $s5, 24                                 ## L:882  |     56 | u16 buffRdpEnd = dmaDmemFlush + 24;
+  or $s6, $zero, $s4                                 ## L:852  |     57 | u32 ptrIn = dmaDmem;
+  ori $s4, $zero, %lo(RDP_BUFF)                      ## L:854  |     58 | dmaDmem = RDP_BUFF;
+  vand $v16, $v16, $v15.e3                           ## L:1225 |      ^ | screenMin &= screenMax.w;
+  LABEL_TPXCmd_DrawColor_0001:
+  sh $zero, 14($s2)                                  ## L:888  |     59 | store(ZERO:u16, buffRdp, 0x0E);
+  sw $t7, 0($s2)                                     ## L:886  |     60 | store(cmdRdpColor, buffRdp, 0);
+  sb $a0, 8($s2)                                     ## L:887  |     61 | store(cmdRdpDepth, buffRdp, 8);
+  addiu $s2, $s2, 24                                 ## L:889  |     62 | buffRdp += 24;
+  bne $s2, $s1, LABEL_TPXCmd_DrawColor_0001          ## L:889  |     63 | buffRdp += 24;
+  nop                                                ## L:889  |    *65 | buffRdp += 24;
+  LABEL_TPXCmd_DrawColor_0002:
+  jal DMAWaitIdle                                    ## L:893  |     66 | dma_await();
+  nop                                                ## L:893  |    *68 | dma_await();
+  bgez $a1, LABEL_TPXCmd_DrawColor_0003              ## L:900  |     69 | if(rdramAddr < 0) {
+  vxor $v10, $v00, $v30.e7                           ## L:897  |    *71 | const vec16 vecOne = 1;
+  lqv $v09, 0, 0, $s6                                ## L:466  |     72 | vec16 posStart = load(ptrIn, 0x00);
+  vmulf $v14, $v13, $v09.h3                          ## L:467  |  ***76 | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+  vmudn $v12, $v28, $v09.h0                          ## L:93   |     77 | out = mat0  * vec.xxxxXXXX;
+  vmadh $v11, $v27, $v09.h0                          ## L:93   |     78 | out = mat0  * vec.xxxxXXXX;
+  vmadn $v12, $v26, $v09.h1                          ## L:94   |     79 | out = mat1 +* vec.yyyyYYYY;
+  vmadh $v11, $v25, $v09.h1                          ## L:94   |     80 | out = mat1 +* vec.yyyyYYYY;
+  vmadn $v12, $v24, $v09.h2                          ## L:95   |     81 | out = mat2 +* vec.zzzzZZZZ;
+  vmadh $v11, $v23, $v09.h2                          ## L:95   |     82 | out = mat2 +* vec.zzzzZZZZ;
+  vmadn $v12, $v22, $v30.e7                          ## L:96   |     83 | out = mat3 +* 1;
+  vmadh $v11, $v21, $v30.e7                          ## L:96   |     84 | out = mat3 +* 1;
+  vch $v29, $v11, $v11.h3                            ## L:475  |  ***88 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  vcl $v29, $v12, $v12.h3                            ## L:475  |     89 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  cfc2 $t4, $vcc                                     ## L:475  |     90 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  LABEL_TPXCmd_DrawColor_0005:
+  vmudl $v12, $v12, $v17.v                           ## L:486  |      ^ | posClip *= normScaleW:ufract;
+  vmadm $v11, $v11, $v17.v                           ## L:486  |     91 | posClip *= normScaleW:ufract;
+  ldv $v03, 0, 16, $s6                               ## L:531  |      ^ | vec16 color = load(ptrIn, 16).xyzw;
+  vmadn $v12, $v00, $v00                             ## L:486  |     92 | posClip *= normScaleW:ufract;
+  vrcph $v07.e3, $v11.e3                             ## L:488  |   **95 | invW.w = invert_half(posClip).w;
+  andi $t1, $t4, 1028                                ## L:482  |      ^ | clipA = temp1 & 0b0000'0100'0000'0100;
+  vrcpl $v08.e3, $v12.e3                             ## L:488  |     96 | invW.w = invert_half(posClip).w;
+  addiu $t2, $zero, 54                               ## L:535  |      ^ | cmdRdpRect = 0x36;
+  ori $at, $zero, %lo(BASE_SIZE)                     ## L:492  |     97 | posClip:sint.w = load(BASE_SIZE).x;
+  vrcph $v07.e3, $v11.e7                             ## L:489  |      ^ | invW.W = invert_half(posClip).W;
+  vrcpl $v08.e7, $v12.e7                             ## L:489  |     98 | invW.W = invert_half(posClip).W;
+  vmov $v12.e3, $v00.e0                              ## L:494  |     99 | posClip:sfract.w = 0;
+  vmov $v12.e7, $v00.e0                              ## L:495  |    100 | posClip:sfract.W = 0;
+  lsv $v11, 6, 0, $at                                ## L:492  |      ^ | posClip:sint.w = load(BASE_SIZE).x;
+  lsv $v11, 14, 0, $at                               ## L:493  |    101 | posClip:sint.W = load(BASE_SIZE).x;
+  vrcph $v07.e7, $v00.e7                             ## L:489  |      ^ | invW.W = invert_half(posClip).W;
+  andi $t5, $t4, 16448                               ## L:483  |  **104 | clipB = temp1 & 0b0100'0000'0100'0000;
+  vmudl $v29, $v12, $v08.h3                          ## L:498  |      ^ | posClip *= invW.wwwwWWWW;
+  vmadm $v29, $v11, $v08.h3                          ## L:498  |    105 | posClip *= invW.wwwwWWWW;
+  vmadn $v12, $v12, $v07.h3                          ## L:498  |    106 | posClip *= invW.wwwwWWWW;
+  vmadh $v11, $v11, $v07.h3                          ## L:498  |    107 | posClip *= invW.wwwwWWWW;
+  slv $v03, 0, 4, $s3                                ## L:543  |      ^ | store(color.xy, dmaDmemEnd, 4);
+  vmulf $v14, $v14, $v11.h3                          ## L:502  | ***111 | localPartSize:sfract *= posClip:sint.wwwwWWWW;
+  vmudl $v29, $v12, $v20.v                           ## L:505  |    112 | vec32 posScreen = posClip * screenSize;
+  vmadm $v29, $v11, $v20.v                           ## L:505  |    113 | vec32 posScreen = posClip * screenSize;
+  vmadn $v08, $v12, $v19.v                           ## L:505  |    114 | vec32 posScreen = posClip * screenSize;
+  vmadh $v07, $v11, $v19.v                           ## L:505  |    115 | vec32 posScreen = posClip * screenSize;
+  vmadh $v06, $v18, $v10.v                           ## L:506  |    116 | vec16 posCenter = screenOffset:sint +* vecOne;
+  vmadh $v05, $v10, $v14.v                           ## L:509  |    117 | vec16 posEnd = vecOne +* localPartSize:sint;
+  vsubc $v09, $v06, $v14.v                           ## L:510  |  **120 | posStart = posCenter - localPartSize:sint;
+  vlt $v05, $v05, $v15                               ## L:513  |    121 | posEnd = min(posEnd, screenMax);
+  addiu $at, $zero, 3                                ## L:567  |      ^ | if(clipA == 0b0000'0011) {
+  ssv $v06, 4, 12, $s3                               ## L:547  |  **124 | store(posCenter.z, dmaDmemEnd, 0x04, 8);
+  vge $v09, $v09, $v16                               ## L:514  |      ^ | posStart = max(posStart, screenMin);
+  mfc2 $fp, $v05.e1                                  ## L:1191 |    125 | outA = pos.y;
+  mfc2 $sp, $v05.e5                                  ## L:1197 |    126 | outB = pos.Y;
+  andi $fp, $fp, 4095                                ## L:1192 |   *128 | outA &= 0b1111'1111'1111;
+  mfc2 $s2, $v09.e1                                  ## L:1191 |    129 | outA = pos.y;
+  mfc2 $k1, $v09.e0                                  ## L:1193 |    130 | u32 tmp = pos.x;
+  andi $sp, $sp, 4095                                ## L:1198 |    131 | outB &= 0b1111'1111'1111;
+  andi $s2, $s2, 4095                                ## L:1192 |    132 | outA &= 0b1111'1111'1111;
+  mfc2 $s1, $v09.e5                                  ## L:1197 |    133 | outB = pos.Y;
+  vlt $v04, $v09, $v05                               ## L:520  |      ^ | vec16 extend = posStart < posEnd;
+  sll $k1, $k1, 12                                   ## L:1194 |    134 | tmp <<= 12;
+  or $s2, $s2, $k1                                   ## L:1195 |    135 | outA |= tmp;
+  andi $s1, $s1, 4095                                ## L:1198 |    136 | outB &= 0b1111'1111'1111;
+  mfc2 $k1, $v09.e4                                  ## L:1199 |    137 | tmp = pos.X;
+  sw $s2, 4 + 16($s3)                                ## L:551  |    138 | store(posA, dmaDmemEnd, 0x04, 16);
+  ldv $v09, 0, 24, $s6                               ## L:558  |    139 | posStart.xyzw = load(ptrIn, 24).xyzw;
+  sll $k1, $k1, 12                                   ## L:1200 |    140 | tmp <<= 12;
+  or $s1, $s1, $k1                                   ## L:1201 |    141 | outB |= tmp;
+  mfc2 $k1, $v05.e0                                  ## L:1193 |    142 | u32 tmp = pos.x;
+  cfc2 $t4, $vcc                                     ## L:521  |    143 | temp1 = get_vcc();
+  ldv $v09, 8, 32, $s6                               ## L:559  |    144 | posStart.XYZW = load(ptrIn, 24).XYZW;
+  sll $k1, $k1, 12                                   ## L:1194 |    145 | tmp <<= 12;
+  or $fp, $fp, $k1                                   ## L:1195 |    146 | outA |= tmp;
+  sw $fp, 0 + 16($s3)                                ## L:552  |    147 | @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  andi $a2, $t4, 3                                   ## L:523  |    148 | temp0 = temp1 & 0b0000'0011;
+  mfc2 $k1, $v05.e4                                  ## L:1199 |    149 | tmp = pos.X;
+  or $t1, $t1, $a2                                   ## L:524  |    150 | clipA |= temp0;
+  andi $a2, $t4, 48                                  ## L:526  |    151 | temp0 = temp1 & 0b0011'0000;
+  vmulf $v14, $v13, $v09.h3                          ## L:560  |      ^ | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+  sll $k1, $k1, 12                                   ## L:1200 |    152 | tmp <<= 12;
+  sb $t2, 0 + 16($s3)                                ## L:553  |    153 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  or $t5, $t5, $a2                                   ## L:527  |    154 | clipB |= temp0;
+  bne $t1, $at, LABEL_TPXCmd_DrawColor_0007          ## L:567  |    155 | if(clipA == 0b0000'0011) {
+  or $sp, $sp, $k1                                   ## L:1201 |   *157 | outB |= tmp;
+  addiu $s3, $s3, 24                                 ## L:568  |    158 | dmaDmemEnd += 24;
+  LABEL_TPXCmd_DrawColor_0007:
+  addiu $at, $zero, 48                               ## L:578  |    159 | if(clipB == 0b0011'0000) {
+  bne $t5, $at, LABEL_TPXCmd_DrawColor_0008          ## L:578  |    160 | if(clipB == 0b0011'0000) {
+  addiu $s6, $s6, 24                                 ## L:572  |   *162 | ptrIn += 24;
+  sw $s1, 4 + 16($s3)                                ## L:582  |    163 | store(posB, dmaDmemEnd, 0x04, 16);
+  ssv $v06, 12, 12, $s3                              ## L:580  |    164 | store(posCenter.Z, dmaDmemEnd, 0x04, 8);
+  sw $sp, 0 + 16($s3)                                ## L:583  |    165 | @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  sb $t2, 0 + 16($s3)                                ## L:584  |    166 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  slv $v03, 4, 4, $s3                                ## L:579  |    167 | store(color.zw, dmaDmemEnd, 4);
+  addiu $s3, $s3, 24                                 ## L:586  |    168 | dmaDmemEnd += 24;
+  LABEL_TPXCmd_DrawColor_0008:
+  vmudn $v12, $v28, $v09.h0                          ## L:93   |      ^ | out = mat0  * vec.xxxxXXXX;
+  sltu $at, $s3, $s5                                 ## L:593  |    169 | if(dmaDmemEnd >= dmaDmemFlush) {
+  vmadh $v11, $v27, $v09.h0                          ## L:93   |      ^ | out = mat0  * vec.xxxxXXXX;
+  vmadn $v12, $v26, $v09.h1                          ## L:94   |    170 | out = mat1 +* vec.yyyyYYYY;
+  vmadh $v11, $v25, $v09.h1                          ## L:94   |    171 | out = mat1 +* vec.yyyyYYYY;
+  vmadn $v12, $v24, $v09.h2                          ## L:95   |    172 | out = mat2 +* vec.zzzzZZZZ;
+  vmadh $v11, $v23, $v09.h2                          ## L:95   |    173 | out = mat2 +* vec.zzzzZZZZ;
+  vmadn $v12, $v22, $v30.e7                          ## L:96   |    174 | out = mat3 +* 1;
+  bne $at, $zero, LABEL_TPXCmd_DrawColor_0009        ## L:593  |      ^ | if(dmaDmemEnd >= dmaDmemFlush) {
+  vmadh $v11, $v21, $v30.e7                          ## L:96   |   *176 | out = mat3 +* 1;
+  jal RDPQ_Send                                      ## L:594  |    177 | RDPQ_Send(dmaDmem, dmaDmemEnd); ## Args: $s4, $s3
+  nop                                                ## L:594  |   *179 | RDPQ_Send(dmaDmem, dmaDmemEnd);
+  or $s3, $zero, $s4                                 ## L:595  |    180 | dmaDmemEnd = dmaDmem;
+  LABEL_TPXCmd_DrawColor_0009:
+  vch $v29, $v11, $v11.h3                            ## L:598  |      ^ | temp1 = clip(posClip, posClip.wwwwWWWW);
+  vcl $v29, $v12, $v12.h3                            ## L:598  |    181 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  bne $s6, $s7, LABEL_TPXCmd_DrawColor_0005          ## L:598  |      ^ | temp1 = clip(posClip, posClip.wwwwWWWW);
+  cfc2 $t4, $vcc                                     ## L:598  |   *183 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  LABEL_TPXCmd_DrawColor_0006:
+  beq $zero, $zero, LABEL_TPXCmd_DrawColor_0004      ## L:598  |    184 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  nop                                                ## L:598  |   *186 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  LABEL_TPXCmd_DrawColor_0003:
+  lpv $v09, 0, 0, $s6                                ## L:116  |    187 | vec16 posStart = load_vec_s8(ptrIn, 0x00);
+  vmulf $v14, $v13, $v09.h3                          ## L:117  | ***191 | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+  vmudm $v09, $v09, $v31.e7                          ## L:118  |    192 | posStart >>= 8;
+  vmudn $v12, $v28, $v09.h0                          ## L:93   | ***196 | out = mat0  * vec.xxxxXXXX;
+  vmadh $v11, $v27, $v09.h0                          ## L:93   |    197 | out = mat0  * vec.xxxxXXXX;
+  vmadn $v12, $v26, $v09.h1                          ## L:94   |    198 | out = mat1 +* vec.yyyyYYYY;
+  vmadh $v11, $v25, $v09.h1                          ## L:94   |    199 | out = mat1 +* vec.yyyyYYYY;
+  vmadn $v12, $v24, $v09.h2                          ## L:95   |    200 | out = mat2 +* vec.zzzzZZZZ;
+  vmadh $v11, $v23, $v09.h2                          ## L:95   |    201 | out = mat2 +* vec.zzzzZZZZ;
+  vmadn $v12, $v22, $v30.e7                          ## L:96   |    202 | out = mat3 +* 1;
+  vmadh $v11, $v21, $v30.e7                          ## L:96   |    203 | out = mat3 +* 1;
+  vch $v29, $v11, $v11.h3                            ## L:122  | ***207 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  vcl $v29, $v12, $v12.h3                            ## L:122  |    208 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  cfc2 $t4, $vcc                                     ## L:122  |    209 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  LABEL_TPXCmd_DrawColor_000A:
+  vmudl $v12, $v12, $v17.v                           ## L:133  |      ^ | posClip *= normScaleW:ufract;
+  vmadm $v11, $v11, $v17.v                           ## L:133  |    210 | posClip *= normScaleW:ufract;
+  vmadn $v12, $v00, $v00                             ## L:133  |    211 | posClip *= normScaleW:ufract;
+  ldv $v03, 0, 8, $s6                                ## L:180  |      ^ | vec16 color = load(ptrIn, 8).xyzw;
+  andi $t1, $t4, 1028                                ## L:129  |    212 | clipA = temp1 & 0b0000'0100'0000'0100;
+  addiu $t2, $zero, 54                               ## L:182  |   *214 | cmdRdpRect = 0x36;
+  vrcph $v07.e3, $v11.e3                             ## L:135  |      ^ | invW.w = invert_half(posClip).w;
+  vrcpl $v08.e3, $v12.e3                             ## L:135  |    215 | invW.w = invert_half(posClip).w;
+  vrcph $v07.e3, $v11.e7                             ## L:136  |    216 | invW.W = invert_half(posClip).W;
+  ori $at, $zero, %lo(BASE_SIZE)                     ## L:139  |      ^ | posClip:sint.w = load(BASE_SIZE).x;
+  vrcpl $v08.e7, $v12.e7                             ## L:136  |    217 | invW.W = invert_half(posClip).W;
+  lsv $v11, 6, 0, $at                                ## L:139  |      ^ | posClip:sint.w = load(BASE_SIZE).x;
+  vmov $v12.e3, $v00.e0                              ## L:141  |    218 | posClip:sfract.w = 0;
+  lsv $v11, 14, 0, $at                               ## L:140  |      ^ | posClip:sint.W = load(BASE_SIZE).x;
+  addiu $at, $zero, 3                                ## L:214  |    219 | if(clipA == 0b0000'0011) {
+  vmov $v12.e7, $v00.e0                              ## L:142  |      ^ | posClip:sfract.W = 0;
+  vrcph $v07.e7, $v00.e7                             ## L:136  |    220 | invW.W = invert_half(posClip).W;
+  vmudl $v29, $v12, $v08.h3                          ## L:145  |  **223 | posClip *= invW.wwwwWWWW;
+  vmadm $v29, $v11, $v08.h3                          ## L:145  |    224 | posClip *= invW.wwwwWWWW;
+  vmadn $v12, $v12, $v07.h3                          ## L:145  |    225 | posClip *= invW.wwwwWWWW;
+  vmadh $v11, $v11, $v07.h3                          ## L:145  |    226 | posClip *= invW.wwwwWWWW;
+  vmulf $v14, $v14, $v11.h3                          ## L:149  | ***230 | localPartSize:sfract *= posClip:sint.wwwwWWWW;
+  andi $t5, $t4, 16448                               ## L:130  |      ^ | clipB = temp1 & 0b0100'0000'0100'0000;
+  vmudl $v29, $v12, $v20.v                           ## L:152  |    231 | vec32 posScreen = posClip * screenSize;
+  slv $v03, 0, 4, $s3                                ## L:190  |      ^ | store(color.xy, dmaDmemEnd, 4);
+  vmadm $v29, $v11, $v20.v                           ## L:152  |    232 | vec32 posScreen = posClip * screenSize;
+  vmadn $v08, $v12, $v19.v                           ## L:152  |    233 | vec32 posScreen = posClip * screenSize;
+  vmadh $v07, $v11, $v19.v                           ## L:152  |    234 | vec32 posScreen = posClip * screenSize;
+  vmadh $v06, $v18, $v10.v                           ## L:153  |    235 | vec16 posCenter = screenOffset:sint +* vecOne;
+  vmadh $v05, $v10, $v14.v                           ## L:156  |    236 | vec16 posEnd = vecOne +* localPartSize:sint;
+  vsubc $v09, $v06, $v14.v                           ## L:157  |  **239 | posStart = posCenter - localPartSize:sint;
+  vlt $v05, $v05, $v15                               ## L:160  |    240 | posEnd = min(posEnd, screenMax);
+  vge $v09, $v09, $v16                               ## L:161  |  **243 | posStart = max(posStart, screenMin);
+  ssv $v06, 4, 12, $s3                               ## L:194  |      ^ | store(posCenter.z, dmaDmemEnd, 0x04, 8);
+  mfc2 $fp, $v05.e1                                  ## L:1191 |    244 | outA = pos.y;
+  mfc2 $sp, $v05.e5                                  ## L:1197 |    245 | outB = pos.Y;
+  andi $fp, $fp, 4095                                ## L:1192 |   *247 | outA &= 0b1111'1111'1111;
+  mfc2 $k1, $v09.e0                                  ## L:1193 |    248 | u32 tmp = pos.x;
+  mfc2 $s2, $v09.e1                                  ## L:1191 |    249 | outA = pos.y;
+  andi $sp, $sp, 4095                                ## L:1198 |    250 | outB &= 0b1111'1111'1111;
+  vlt $v04, $v09, $v05                               ## L:167  |      ^ | vec16 extend = posStart < posEnd;
+  sll $k1, $k1, 12                                   ## L:1194 |    251 | tmp <<= 12;
+  mfc2 $s1, $v09.e5                                  ## L:1197 |    252 | outB = pos.Y;
+  andi $s2, $s2, 4095                                ## L:1192 |    253 | outA &= 0b1111'1111'1111;
+  or $s2, $s2, $k1                                   ## L:1195 |    254 | outA |= tmp;
+  mfc2 $k1, $v09.e4                                  ## L:1199 |    255 | tmp = pos.X;
+  cfc2 $t4, $vcc                                     ## L:168  |    256 | temp1 = get_vcc();
+  lpv $v09, 0, 16, $s6                               ## L:209  |    257 | posStart = load_vec_s8(ptrIn, 16);
+  sll $k1, $k1, 12                                   ## L:1200 |    258 | tmp <<= 12;
+  andi $s1, $s1, 4095                                ## L:1198 |    259 | outB &= 0b1111'1111'1111;
+  andi $a2, $t4, 3                                   ## L:170  |    260 | temp0 = temp1 & 0b0000'0011;
+  or $s1, $s1, $k1                                   ## L:1201 |    261 | outB |= tmp;
+  mfc2 $k1, $v05.e0                                  ## L:1193 |    262 | u32 tmp = pos.x;
+  vmulf $v14, $v13, $v09.h3                          ## L:210  |      ^ | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+  vmudm $v09, $v09, $v31.e7                          ## L:211  |    263 | posStart >>= 8;
+  or $t1, $t1, $a2                                   ## L:171  |      ^ | clipA |= temp0;
+  andi $a2, $t4, 48                                  ## L:173  |    264 | temp0 = temp1 & 0b0011'0000;
+  sll $k1, $k1, 12                                   ## L:1194 |    265 | tmp <<= 12;
+  or $fp, $fp, $k1                                   ## L:1195 |    266 | outA |= tmp;
+  mfc2 $k1, $v05.e4                                  ## L:1199 |    267 | tmp = pos.X;
+  sw $s2, 4 + 16($s3)                                ## L:198  |    268 | store(posA, dmaDmemEnd, 0x04, 16);
+  or $t5, $t5, $a2                                   ## L:174  |    269 | clipB |= temp0;
+  sw $fp, 0 + 16($s3)                                ## L:199  |    270 | @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  sb $t2, 0 + 16($s3)                                ## L:200  |    271 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  sll $k1, $k1, 12                                   ## L:1200 |    272 | tmp <<= 12;
+  bne $t1, $at, LABEL_TPXCmd_DrawColor_000C          ## L:214  |    273 | if(clipA == 0b0000'0011) {
+  or $sp, $sp, $k1                                   ## L:1201 |   *275 | outB |= tmp;
+  addiu $s3, $s3, 24                                 ## L:215  |    276 | dmaDmemEnd += 24;
+  LABEL_TPXCmd_DrawColor_000C:
+  addiu $at, $zero, 48                               ## L:225  |    277 | if(clipB == 0b0011'0000) {
+  bne $t5, $at, LABEL_TPXCmd_DrawColor_000D          ## L:225  |    278 | if(clipB == 0b0011'0000) {
+  addiu $s6, $s6, 16                                 ## L:221  |   *280 | ptrIn += 16;
+  ssv $v06, 12, 12, $s3                              ## L:227  |    281 | store(posCenter.Z, dmaDmemEnd, 0x04, 8);
+  slv $v03, 4, 4, $s3                                ## L:226  |    282 | store(color.zw, dmaDmemEnd, 4);
+  sw $sp, 0 + 16($s3)                                ## L:230  |    283 | @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  sw $s1, 4 + 16($s3)                                ## L:229  |    284 | store(posB, dmaDmemEnd, 0x04, 16);
+  sb $t2, 0 + 16($s3)                                ## L:231  |    285 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  addiu $s3, $s3, 24                                 ## L:233  |    286 | dmaDmemEnd += 24;
+  LABEL_TPXCmd_DrawColor_000D:
+  vmudn $v12, $v28, $v09.h0                          ## L:93   |      ^ | out = mat0  * vec.xxxxXXXX;
+  vmadh $v11, $v27, $v09.h0                          ## L:93   |    287 | out = mat0  * vec.xxxxXXXX;
+  vmadn $v12, $v26, $v09.h1                          ## L:94   |    288 | out = mat1 +* vec.yyyyYYYY;
+  vmadh $v11, $v25, $v09.h1                          ## L:94   |    289 | out = mat1 +* vec.yyyyYYYY;
+  sltu $at, $s3, $s5                                 ## L:240  |      ^ | if(dmaDmemEnd >= dmaDmemFlush) {
+  vmadn $v12, $v24, $v09.h2                          ## L:95   |    290 | out = mat2 +* vec.zzzzZZZZ;
+  vmadh $v11, $v23, $v09.h2                          ## L:95   |    291 | out = mat2 +* vec.zzzzZZZZ;
+  vmadn $v12, $v22, $v30.e7                          ## L:96   |    292 | out = mat3 +* 1;
+  bne $at, $zero, LABEL_TPXCmd_DrawColor_000E        ## L:240  |      ^ | if(dmaDmemEnd >= dmaDmemFlush) {
+  vmadh $v11, $v21, $v30.e7                          ## L:96   |   *294 | out = mat3 +* 1;
+  jal RDPQ_Send                                      ## L:241  |    295 | RDPQ_Send(dmaDmem, dmaDmemEnd); ## Args: $s4, $s3
+  nop                                                ## L:241  |   *297 | RDPQ_Send(dmaDmem, dmaDmemEnd);
+  or $s3, $zero, $s4                                 ## L:242  |    298 | dmaDmemEnd = dmaDmem;
+  LABEL_TPXCmd_DrawColor_000E:
+  vch $v29, $v11, $v11.h3                            ## L:245  |      ^ | temp1 = clip(posClip, posClip.wwwwWWWW);
+  vcl $v29, $v12, $v12.h3                            ## L:245  |    299 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  bne $s6, $s7, LABEL_TPXCmd_DrawColor_000A          ## L:245  |      ^ | temp1 = clip(posClip, posClip.wwwwWWWW);
+  cfc2 $t4, $vcc                                     ## L:245  |   *301 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  LABEL_TPXCmd_DrawColor_0004:
+  j RDPQ_Send                                        ## L:908  |    302 | goto RDPQ_Send;
+  ori $ra, $zero, %lo(RSPQ_Loop)                     ## L:907  |    303 | RA = RSPQ_Loop;
 mulMat4Mat4:
   ldv $v01, 0, 0, $s3
   ldv $v10, 0, 8, $s4
@@ -458,250 +569,398 @@ TPXCmd_SetDMEM:
   j RSPQ_Loop
   sw $a1, ($a0)
 TPXCmd_DrawTextured:
-  andi $t0, $a0, 65535
-  or $s0, $zero, $a1
-  ori $s4, $zero, %lo(PARTICLE_BUFF)
-  addu $s7, $s4, $t0
-  addiu $t0, $t0, -1
-  jal DMAExec ## Args: $t0, $t1, $s0, $s4, $t2
-  or $t2, $zero, $zero
-  ori $at, $zero, %lo(MATRIX_MVP)
-  lw $s2, %lo(RDPQ_SCISSOR_RECT + 4)
-  ldv $v21, 0, 48, $at
-  ldv $v27, 0, 0, $at
-  ldv $v24, 0, 40, $at
-  ldv $v21, 8, 48, $at
-  ldv $v25, 0, 16, $at
-  ldv $v22, 0, 56, $at
-  ldv $v26, 0, 24, $at
-  lw $s1, %lo(RDPQ_SCISSOR_RECT + 0)
-  ldv $v28, 0, 8, $at
-  ldv $v25, 8, 16, $at
-  ldv $v28, 8, 8, $at
-  srl $t4, $s2, 12
-  ldv $v27, 8, 0, $at
-  or $s6, $zero, $s4
-  ldv $v23, 0, 32, $at
-  ldv $v22, 8, 56, $at
-  ldv $v24, 8, 40, $at
-  ldv $v23, 8, 32, $at
-  ldv $v26, 8, 24, $at
-  ori $at, $zero, %lo(SCREEN_SCALE_OFFSET)
-  ldv $v19, 0, 0, $at
-  lui $t7, 0x3A00
-  ori $s4, $zero, %lo(RDP_BUFF)
-  mtc2 $s2, $v15.e1
-  mtc2 $s1, $v16.e1
-  ldv $v19, 8, 0, $at
-  ldv $v18, 0, 8, $at
-  vmudl $v20, $v00, $v31.e3
-  ldv $v18, 8, 8, $at
-  ori $at, $zero, %lo(NORM_SCALE_W)
-  ldv $v17, 0, 0, $at
-  mtc2 $s1, $v16.e5
-  ldv $v17, 8, 0, $at
-  addiu $at, $zero, 4095
-  ori $s3, $zero, %lo(RDP_BUFF)
-  mtc2 $t4, $v15.e0
-  mtc2 $t4, $v15.e4
-  addiu $s5, $s3, 480
-  srl $t4, $s1, 12
-  vmadm $v19, $v19, $v31.e3
-  mtc2 $s2, $v15.e5
-  mtc2 $at, $v15.e3
-  ori $s2, $zero, %lo(RDP_BUFF)
-  ori $at, $zero, %lo(PARTICLE_SCALE)
-  mtc2 $t4, $v16.e4
-  vmadn $v20, $v00, $v00
-  mtc2 $t4, $v16.e0
-  vand $v15, $v15, $v15.e3
-  llv $v13, 0, 0, $at
-  llv $v13, 8, 0, $at
-  addiu $s1, $s5, 32
-  addiu $a0, $zero, 46
-  vand $v16, $v16, $v15.e3
-  LABEL_000C:
-  sw $t7, 0($s2)
-  sh $zero, 14($s2)
-  sb $a0, 8($s2)
-  addiu $s2, $s2, 32
-  bne $s2, $s1, LABEL_000C
-  nop
-  LABEL_000D:
-  lhu $s1, %lo(TILE_COUNT + 0)
-  vxor $v12, $v00, $v00.e0
-  vmov $v12.e1, $v30.e7
-  vxor $v11, $v00, $v00.e0
-  vmov $v12.e5, $v30.e7
-  addu $fp, $s1, $s1
-  mtc2 $fp, $v12.e3
-  mtc2 $s1, $v12.e2
-  addiu $fp, $fp, 65535
-  vmov $v12.e7, $v31.e7
-  mtc2 $fp, $v12.e0
-  mtc2 $fp, $v12.e4
-  lh $s2, %lo(TEX_OFFSET + 0)
-  vmudn $v12, $v12, $v12.e7
-  vsubc $v29, $v12, $v30.e7
-  vmov $v12.e3, $v29.e3
-  jal DMAWaitIdle
-  vor $v11, $v00, $v29.e2
-  lpv $v08, 0, 0, $s6
-  vmulf $v14, $v13, $v08.h3
-  vmudm $v08, $v08, $v31.e7
-  vmov $v08.e7, $v30.e7
-  vmov $v08.e3, $v30.e7
-  vxor $v07, $v00, $v30.e7
-  vmudn $v10, $v28, $v08.h0
-  vmadh $v09, $v27, $v08.h0
-  vmadn $v10, $v26, $v08.h1
-  vmadh $v09, $v25, $v08.h1
-  vmadn $v10, $v24, $v08.h2
-  vmadh $v09, $v23, $v08.h2
-  vmadn $v10, $v22, $v08.h3
-  vmadh $v09, $v21, $v08.h3
-  vch $v29, $v09, $v09.h3
-  vcl $v29, $v10, $v10.h3
-  cfc2 $t4, $vcc
-  LABEL_000E:
-  vmudl $v10, $v10, $v17.v
-  vmadm $v09, $v09, $v17.v
-  ori $at, $zero, %lo(BASE_SIZE)
-  vmadn $v10, $v00, $v00
-  vrcph $v05.e3, $v09.e3
-  andi $t1, $t4, 1028
-  vrcpl $v06.e3, $v10.e3
-  vrcph $v05.e3, $v09.e7
-  vrcpl $v06.e7, $v10.e7
-  lsv $v09, 6, 0, $at
-  lsv $v09, 14, 0, $at
-  vmov $v10.e3, $v00.e0
-  andi $t5, $t4, 16448
-  vrcph $v05.e7, $v00.e7
-  vmov $v10.e7, $v00.e0
-  vmudl $v29, $v10, $v06.h3
-  lb $a2, 11($s6)
-  vmadm $v29, $v09, $v06.h3
-  vmadn $v10, $v10, $v05.h3
-  vmadh $v09, $v09, $v05.h3
-  vmulf $v14, $v14, $v09.h3
-  vmudl $v29, $v10, $v20.v
-  vmadm $v29, $v09, $v20.v
-  vmadn $v04, $v10, $v19.v
-  vmadh $v03, $v09, $v19.v
-  vmadh $v02, $v18, $v07.v
-  lb $t4, 15($s6)
-  vmadh $v04, $v07, $v14.v
-  vsubc $v08, $v02, $v14.v
-  addu $t4, $t4, $s2
-  addu $a2, $a2, $s2
-  vmudn $v14, $v14, $v30.e6
-  vrcp $v06.e0, $v14.e0
-  vrcph $v05.e0, $v14.e0
-  vrcp $v06.e1, $v14.e1
-  vrcph $v05.e1, $v14.e1
-  vrcp $v06.e4, $v14.e4
-  vrcph $v05.e4, $v14.e4
-  vrcp $v06.e5, $v14.e5
-  vrcph $v05.e5, $v14.e5
-  sll $a2, $a2, 3
-  vor $v03, $v00, $v05
-  vmudh $v01, $v08, $v05.v
-  vmudm $v01, $v01, $v31.e6
-  vxor $v05, $v00, $v00.e0
-  slv $v03, 0, 28, $s3
-  sll $t4, $t4, 3
-  mtc2 $t4, $v05.e4
-  mtc2 $a2, $v05.e0
-  vlt $v01, $v01, $v00.e0
-  addiu $t2, $zero, 36
-  vsubc $v06, $v00, $v01.v
-  vand $v05, $v05, $v12.e3
-  vge $v29, $v11, $v05.h0
-  vmrg $v01, $v06, $v01
-  vlt $v04, $v04, $v15
-  vge $v08, $v08, $v16
-  addiu $at, $zero, 3
-  vaddc $v01, $v01, $v05.v
-  ssv $v02, 4, 12, $s3
-  mfc2 $k1, $v04.e5
-  mfc2 $s1, $v08.e1
-  andi $k1, $k1, 4095
-  vsubc $v06, $v12, $v01.v
-  mfc2 $k0, $v08.e0
-  mfc2 $fp, $v08.e5
-  andi $s1, $s1, 4095
-  sll $k0, $k0, 12
-  vaddc $v06, $v06, $v12.e2
-  or $s1, $s1, $k0
-  mfc2 $k0, $v08.e4
-  vge $v29, $v11, $v05.h0
-  mfc2 $sp, $v04.e1
-  ldv $v05, 0, 8, $s6
-  vmrg $v01, $v01, $v06
-  andi $fp, $fp, 4095
-  sll $k0, $k0, 12
-  sw $s1, 4 + 16($s3)
-  vlt $v06, $v08, $v04
-  cfc2 $t4, $vcc
-  addiu $s6, $s6, 16
-  or $fp, $fp, $k0
-  mfc2 $k0, $v04.e0
-  andi $sp, $sp, 4095
-  andi $a2, $t4, 3
-  slv $v01, 0, 24, $s3
-  lpv $v08, 0, 0, $s6
-  sll $k0, $k0, 12
-  or $sp, $sp, $k0
-  mfc2 $k0, $v04.e4
-  sw $sp, 0 + 16($s3) ## Barrier: 0x1
-  or $t1, $t1, $a2
-  slv $v05, 0, 4, $s3
-  sll $k0, $k0, 12
-  andi $a2, $t4, 48
-  vmulf $v14, $v13, $v08.h3
-  vmudm $v08, $v08, $v31.e7
-  or $t5, $t5, $a2
-  vmov $v08.e3, $v30.e7
-  sb $t2, 0 + 16($s3) ## Barrier: 0x1
-  bne $t1, $at, LABEL_0010
-  or $k1, $k1, $k0
-  addiu $s3, $s3, 32
-  LABEL_0010:
-  vmov $v08.e7, $v30.e7
-  addiu $at, $zero, 48
-  bne $t5, $at, LABEL_0011
-  nop
-  sw $fp, 4 + 16($s3)
-  slv $v05, 4, 4, $s3
-  sw $k1, 0 + 16($s3) ## Barrier: 0x1
-  sb $t2, 0 + 16($s3) ## Barrier: 0x1
-  slv $v03, 8, 28, $s3
-  slv $v01, 8, 24, $s3
-  ssv $v02, 12, 12, $s3
-  addiu $s3, $s3, 32
-  LABEL_0011:
-  vmudn $v10, $v28, $v08.h0
-  vmadh $v09, $v27, $v08.h0
-  vmadn $v10, $v26, $v08.h1
-  vmadh $v09, $v25, $v08.h1
-  sltu $at, $s3, $s5
-  vmadn $v10, $v24, $v08.h2
-  vmadh $v09, $v23, $v08.h2
-  vmadn $v10, $v22, $v08.h3
-  bne $at, $zero, LABEL_0012
-  vmadh $v09, $v21, $v08.h3
-  jal RDPQ_Send ## Args: $s4, $s3
-  nop
-  or $s3, $zero, $s4
-  LABEL_0012:
-  vch $v29, $v09, $v09.h3
-  vcl $v29, $v10, $v10.h3
-  bne $s6, $s7, LABEL_000E
-  cfc2 $t4, $vcc
-  LABEL_000F:
-  j RDPQ_Send
-  ori $ra, $zero, %lo(RSPQ_Loop)
+  andi $t0, $a0, 65535                               ## L:1081 |      ^ | u32<$t0> dmaSize = dataSize & 0xFFFF;
+  ori $s4, $zero, %lo(PARTICLE_BUFF)                 ## L:1080 |      2 | u16<$s4> dmaDmem = PARTICLE_BUFF;
+  or $t2, $zero, $zero                               ## L:1097 |      3 | dma_in_async(dmaDmem, rdramAddr, dmaSize);
+  or $s0, $zero, $a1                                 ## L:1097 |      4 | dma_in_async(dmaDmem, rdramAddr, dmaSize);
+  addu $s7, $s4, $t0                                 ## L:1096 |      5 | u32 ptrInEnd = dmaDmem + dmaSize;
+  jal DMAExec                                        ## L:1097 |      6 | dma_in_async(dmaDmem, rdramAddr, dmaSize); ## Args: $t0, $t1, $s0, $s4, $t2
+  addiu $t0, $t0, -1                                 ## L:1097 |     *8 | dma_in_async(dmaDmem, rdramAddr, dmaSize);
+  vmudl $v20, $v00, $v31.e3                          ## L:1112 |      9 | screenSize >>= 4;
+  ori $at, $zero, %lo(MATRIX_MVP)                    ## L:1105 |      ^ | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw;
+  ldv $v22, 0, 56, $at                               ## L:1108 |     10 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw;
+  lw $s2, %lo(RDPQ_SCISSOR_RECT + 4)                 ## L:1212 |     11 | u32 extMax = load(RDPQ_SCISSOR_RECT, 4);
+  ldv $v23, 0, 32, $at                               ## L:1107 |     12 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw;
+  ldv $v25, 0, 16, $at                               ## L:1106 |     13 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw;
+  ldv $v24, 0, 40, $at                               ## L:1107 |     14 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw;
+  ldv $v21, 0, 48, $at                               ## L:1108 |     15 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw;
+  ldv $v25, 8, 16, $at                               ## L:1106 |     16 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw;
+  ldv $v27, 0, 0, $at                                ## L:1105 |     17 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw;
+  ldv $v26, 0, 24, $at                               ## L:1106 |     18 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw;
+  ldv $v27, 8, 0, $at                                ## L:1105 |     19 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw;
+  ldv $v22, 8, 56, $at                               ## L:1108 |     20 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw;
+  ldv $v26, 8, 24, $at                               ## L:1106 |     21 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw;
+  ldv $v28, 0, 8, $at                                ## L:1105 |     22 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw;
+  ldv $v23, 8, 32, $at                               ## L:1107 |     23 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw;
+  ldv $v28, 8, 8, $at                                ## L:1105 |     24 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw;
+  ldv $v21, 8, 48, $at                               ## L:1108 |     25 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw;
+  ldv $v24, 8, 40, $at                               ## L:1107 |     26 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw;
+  ori $at, $zero, %lo(SCREEN_SCALE_OFFSET)           ## L:1110 |     27 | vec32 screenSize:sint = load(SCREEN_SCALE_OFFSET).xyzwxyzw;
+  ldv $v19, 0, 0, $at                                ## L:1110 |     28 | vec32 screenSize:sint = load(SCREEN_SCALE_OFFSET).xyzwxyzw;
+  mtc2 $s2, $v15.e5                                  ## L:1217 |     29 | screenMax.y = extMax; screenMax.Y = extMax;
+  ldv $v18, 0, 8, $at                                ## L:1114 |     30 | vec16 screenOffset = load(SCREEN_SCALE_OFFSET, 0x08).xyzwxyzw;
+  ldv $v19, 8, 0, $at                                ## L:1110 |     31 | vec32 screenSize:sint = load(SCREEN_SCALE_OFFSET).xyzwxyzw;
+  ldv $v18, 8, 8, $at                                ## L:1114 |     32 | vec16 screenOffset = load(SCREEN_SCALE_OFFSET, 0x08).xyzwxyzw;
+  srl $t4, $s2, 12                                   ## L:1215 |     33 | temp1 = extMax >> 12;
+  lw $s1, %lo(RDPQ_SCISSOR_RECT + 0)                 ## L:1213 |     34 | u32 extMin = load(RDPQ_SCISSOR_RECT, 0);
+  mtc2 $s2, $v15.e1                                  ## L:1217 |     35 | screenMax.y = extMax; screenMax.Y = extMax;
+  ori $at, $zero, %lo(NORM_SCALE_W)                  ## L:1115 |     36 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw;
+  ldv $v17, 0, 0, $at                                ## L:1115 |     37 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw;
+  mtc2 $s1, $v16.e5                                  ## L:1221 |     38 | screenMin.y = extMin; screenMin.Y = extMin;
+  lui $t7, 0x3A00                                    ## L:1130 |     39 | cmdRdpColor = 0x3A00'0000;
+  ori $s3, $zero, %lo(RDP_BUFF)                      ## L:1102 |     40 | u16<$s3> dmaDmemEnd = RDP_BUFF;
+  mtc2 $t4, $v15.e0                                  ## L:1216 |     41 | screenMax.x = temp1;  screenMax.X = temp1;
+  ldv $v17, 8, 0, $at                                ## L:1115 |     42 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw;
+  addiu $s5, $s3, 480                                ## L:1103 |     43 | u16 dmaDmemFlush = dmaDmemEnd + 480;
+  or $s6, $zero, $s4                                 ## L:1099 |     44 | u32 ptrIn = dmaDmem;
+  addiu $at, $zero, 4095                             ## L:1211 |     45 | screenMax.w = 0b1111'1111'1111;
+  mtc2 $t4, $v15.e4                                  ## L:1216 |     46 | screenMax.x = temp1;  screenMax.X = temp1;
+  mtc2 $s1, $v16.e1                                  ## L:1221 |     47 | screenMin.y = extMin; screenMin.Y = extMin;
+  addiu $a0, $zero, 46                               ## L:1131 |     48 | cmdRdpDepth = 0x2E;
+  srl $t4, $s1, 12                                   ## L:1219 |     49 | temp1 = extMin >> 12;
+  mtc2 $at, $v15.e3                                  ## L:1211 |     50 | screenMax.w = 0b1111'1111'1111;
+  mtc2 $t4, $v16.e4                                  ## L:1220 |     51 | screenMin.x = temp1;  screenMin.X = temp1;
+  ori $at, $zero, %lo(PARTICLE_SCALE)                ## L:1122 |     52 | globalPartSize.xy = load(PARTICLE_SCALE).xy;
+  vmadm $v19, $v19, $v31.e3                          ## L:1112 |      ^ | screenSize >>= 4;
+  addiu $s1, $s5, 32                                 ## L:1129 |     53 | u16 buffRdpEnd = dmaDmemFlush + 32;
+  vmadn $v20, $v00, $v00                             ## L:1112 |      ^ | screenSize >>= 4;
+  vand $v15, $v15, $v15.e3                           ## L:1223 |     54 | screenMax &= screenMax.w;
+  mtc2 $t4, $v16.e0                                  ## L:1220 |      ^ | screenMin.x = temp1;  screenMin.X = temp1;
+  llv $v13, 0, 0, $at                                ## L:1122 |     55 | globalPartSize.xy = load(PARTICLE_SCALE).xy;
+  llv $v13, 8, 0, $at                                ## L:1123 |     56 | globalPartSize.XY = load(PARTICLE_SCALE).xy;
+  ori $s2, $zero, %lo(RDP_BUFF)                      ## L:1128 |     57 | u16 buffRdp = RDP_BUFF;
+  ori $s4, $zero, %lo(RDP_BUFF)                      ## L:1101 |     58 | dmaDmem = RDP_BUFF;
+  vand $v16, $v16, $v15.e3                           ## L:1224 |      ^ | screenMin &= screenMax.w;
+  LABEL_TPXCmd_DrawTextured_0013:
+  sh $zero, 14($s2)                                  ## L:1135 |     59 | store(ZERO:u16, buffRdp, 0x0E);
+  sw $t7, 0($s2)                                     ## L:1133 |     60 | store(cmdRdpColor, buffRdp, 0);
+  sb $a0, 8($s2)                                     ## L:1134 |     61 | store(cmdRdpDepth, buffRdp, 8);
+  addiu $s2, $s2, 32                                 ## L:1136 |     62 | buffRdp += 32;
+  bne $s2, $s1, LABEL_TPXCmd_DrawTextured_0013       ## L:1136 |     63 | buffRdp += 32;
+  nop                                                ## L:1136 |    *65 | buffRdp += 32;
+  LABEL_TPXCmd_DrawTextured_0014:
+  lhu $s1, %lo(TILE_COUNT + 0)                       ## L:1147 |     66 | u16 tiles = load(TILE_COUNT);
+  vxor $v12, $v00, $v00.e0                           ## L:1143 |      ^ | vec16 texMirrorMask = 0;
+  addu $fp, $s1, $s1                                 ## L:1148 |   **69 | u16 tilesEnd = tiles + tiles;
+  vmov $v12.e5, $v30.e7                              ## L:1150 |      ^ | texMirrorMask.Y = 1;
+  mtc2 $fp, $v12.e3                                  ## L:1153 |     70 | texMirrorMask.w = tilesEnd;
+  mtc2 $s1, $v12.e2                                  ## L:1152 |     71 | texMirrorMask.z = tiles;
+  vmov $v12.e1, $v30.e7                              ## L:1151 |     72 | texMirrorMask.y = 1;
+  vmov $v12.e7, $v31.e7                              ## L:1158 |     73 | texMirrorMask.W = 0x100;
+  addiu $fp, $fp, 65535                              ## L:1155 |      ^ | tilesEnd -= 1;
+  mtc2 $fp, $v12.e4                                  ## L:1157 |     74 | texMirrorMask.X = tilesEnd;
+  mtc2 $fp, $v12.e0                                  ## L:1156 |     75 | texMirrorMask.x = tilesEnd;
+  vxor $v11, $v00, $v00.e0                           ## L:1144 |      ^ | vec16 texMirrorCompare = 0;
+  vmudn $v12, $v12, $v12.e7                          ## L:1159 |  ***79 | texMirrorMask = texMirrorMask * texMirrorMask.W;
+  vsubc $v29, $v12, $v30.e7                          ## L:1161 |  ***83 | VTEMP = texMirrorMask - 1;
+  lh $s2, %lo(TEX_OFFSET + 0)                        ## L:1141 |      ^ | s16 texOffset = load(TEX_OFFSET);
+  vmov $v12.e3, $v29.e3                              ## L:1162 |  ***87 | texMirrorMask.w = VTEMP.w;
+  jal DMAWaitIdle                                    ## L:1166 |      ^ | dma_await();
+  vor $v11, $v00, $v29.e2                            ## L:1163 |    *89 | texMirrorCompare = VTEMP.z;
+  bgez $a1, LABEL_TPXCmd_DrawTextured_0015           ## L:1171 |     90 | if(rdramAddr < 0) {
+  nop                                                ## L:1171 |    *92 | if(rdramAddr < 0) {
+  vxor $v07, $v00, $v30.e7                           ## L:614  |     93 | const vec16 vecOne = 1;
+  lqv $v08, 0, 0, $s6                                ## L:606  |      ^ | vec16 posStart = load(ptrIn, 0x00);
+  vmulf $v14, $v13, $v08.h3                          ## L:607  |  ***97 | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+  vmudn $v10, $v28, $v08.h0                          ## L:92   |     98 | out = mat0  * vec.xxxxXXXX;
+  vmadh $v09, $v27, $v08.h0                          ## L:92   |     99 | out = mat0  * vec.xxxxXXXX;
+  vmadn $v10, $v26, $v08.h1                          ## L:93   |    100 | out = mat1 +* vec.yyyyYYYY;
+  vmadh $v09, $v25, $v08.h1                          ## L:93   |    101 | out = mat1 +* vec.yyyyYYYY;
+  vmadn $v10, $v24, $v08.h2                          ## L:94   |    102 | out = mat2 +* vec.zzzzZZZZ;
+  vmadh $v09, $v23, $v08.h2                          ## L:94   |    103 | out = mat2 +* vec.zzzzZZZZ;
+  vmadn $v10, $v22, $v30.e7                          ## L:95   |    104 | out = mat3 +* 1;
+  vmadh $v09, $v21, $v30.e7                          ## L:95   |    105 | out = mat3 +* 1;
+  vch $v29, $v09, $v09.h3                            ## L:619  | ***109 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  vcl $v29, $v10, $v10.h3                            ## L:619  |    110 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  cfc2 $t4, $vcc                                     ## L:619  |    111 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  LABEL_TPXCmd_DrawTextured_0017:
+  vmudl $v10, $v10, $v17.v                           ## L:630  |      ^ | posClip *= normScaleW:ufract;
+  lb $a2, 7($s6)                                     ## L:674  |    112 | temp0:s8 = load(ptrIn, 7);
+  vmadm $v09, $v09, $v17.v                           ## L:630  |      ^ | posClip *= normScaleW:ufract;
+  vmadn $v10, $v00, $v00                             ## L:630  |    113 | posClip *= normScaleW:ufract;
+  ori $at, $zero, %lo(BASE_SIZE)                     ## L:636  |      ^ | posClip:sint.w = load(BASE_SIZE).x;
+  vrcph $v05.e3, $v09.e3                             ## L:632  |  **116 | invW.w = invert_half(posClip).w;
+  andi $t1, $t4, 1028                                ## L:626  |      ^ | clipA = temp1 & 0b0000'0100'0000'0100;
+  andi $t5, $t4, 16448                               ## L:627  |    117 | clipB = temp1 & 0b0100'0000'0100'0000;
+  vrcpl $v06.e3, $v10.e3                             ## L:632  |      ^ | invW.w = invert_half(posClip).w;
+  lb $t4, 15($s6)                                    ## L:675  |    118 | temp1:s8 = load(ptrIn, 15);
+  vrcph $v05.e3, $v09.e7                             ## L:633  |      ^ | invW.W = invert_half(posClip).W;
+  addiu $t2, $zero, 36                               ## L:737  |    119 | cmdRdpRect = 0x24;
+  vrcpl $v06.e7, $v10.e7                             ## L:633  |      ^ | invW.W = invert_half(posClip).W;
+  vmov $v10.e7, $v00.e0                              ## L:639  |    120 | posClip:sfract.W = 0;
+  lsv $v09, 6, 0, $at                                ## L:636  |      ^ | posClip:sint.w = load(BASE_SIZE).x;
+  lsv $v09, 14, 0, $at                               ## L:637  |    121 | posClip:sint.W = load(BASE_SIZE).x;
+  vmov $v10.e3, $v00.e0                              ## L:638  |      ^ | posClip:sfract.w = 0;
+  addu $t4, $t4, $s2                                 ## L:681  |    122 | temp1 += texOffset; temp1 <<= 3;
+  sll $t4, $t4, 3                                    ## L:681  |    123 | temp1 += texOffset; temp1 <<= 3;
+  vrcph $v05.e7, $v00.e7                             ## L:633  |      ^ | invW.W = invert_half(posClip).W;
+  addu $a2, $a2, $s2                                 ## L:680  |   *125 | temp0 += texOffset; temp0 <<= 3;
+  vmudl $v29, $v10, $v06.h3                          ## L:642  |      ^ | posClip *= invW.wwwwWWWW;
+  sll $a2, $a2, 3                                    ## L:680  |    126 | temp0 += texOffset; temp0 <<= 3;
+  vmadm $v29, $v09, $v06.h3                          ## L:642  |      ^ | posClip *= invW.wwwwWWWW;
+  addiu $at, $zero, 3                                ## L:771  |    127 | if(clipA == 0b0000'0011) {
+  vmadn $v10, $v10, $v05.h3                          ## L:642  |      ^ | posClip *= invW.wwwwWWWW;
+  vmadh $v09, $v09, $v05.h3                          ## L:642  |    128 | posClip *= invW.wwwwWWWW;
+  vmulf $v14, $v14, $v09.h3                          ## L:645  | ***132 | localPartSize:sfract *= posClip:sint.wwwwWWWW;
+  vmudl $v29, $v10, $v20.v                           ## L:648  |    133 | vec32 posScreen = posClip * screenSize;
+  vmadm $v29, $v09, $v20.v                           ## L:648  |    134 | vec32 posScreen = posClip * screenSize;
+  vmadn $v04, $v10, $v19.v                           ## L:648  |    135 | vec32 posScreen = posClip * screenSize;
+  vmadh $v03, $v09, $v19.v                           ## L:648  |    136 | vec32 posScreen = posClip * screenSize;
+  vmadh $v02, $v18, $v07.v                           ## L:649  |    137 | vec16 posCenter = screenOffset:sint +* vecOne;
+  vmadh $v04, $v07, $v14.v                           ## L:654  |    138 | vec16 posEnd = vecOne +* localPartSize:sint;
+  vsubc $v08, $v02, $v14.v                           ## L:655  |  **141 | posStart = posCenter - localPartSize:sint;
+  vmudn $v14, $v14, $v30.e6                          ## L:659  |    142 | localPartSize *= 2;
+  vrcp $v06.e0, $v14.e0                              ## L:660  | ***146 | invW.x = invert_half(localPartSize).x;
+  vrcph $v05.e0, $v14.e0                             ## L:660  |    147 | invW.x = invert_half(localPartSize).x;
+  vrcp $v06.e1, $v14.e1                              ## L:661  |    148 | invW.y = invert_half(localPartSize).y;
+  vrcph $v05.e1, $v14.e1                             ## L:661  |    149 | invW.y = invert_half(localPartSize).y;
+  vrcp $v06.e4, $v14.e4                              ## L:662  |    150 | invW.X = invert_half(localPartSize).X;
+  vrcph $v05.e4, $v14.e4                             ## L:662  |    151 | invW.X = invert_half(localPartSize).X;
+  vrcp $v06.e5, $v14.e5                              ## L:663  |    152 | invW.Y = invert_half(localPartSize).Y;
+  vrcph $v05.e5, $v14.e5                             ## L:663  |    153 | invW.Y = invert_half(localPartSize).Y;
+  vmudh $v01, $v08, $v05.v                           ## L:668  | ***157 | vec16 uvStart = posStart * invW:sint;
+  vor $v03, $v00, $v05                               ## L:664  |    158 | vec16 uvDelta = invW:sint;
+  vmudm $v01, $v01, $v31.e6                          ## L:669  |  **161 | uvStart >>= 7;
+  vxor $v05, $v00, $v00.e0                           ## L:694  |    162 | vec16 texOffsetTotal = 0;
+  vlt $v01, $v01, $v00.e0                            ## L:670  |  **165 | uvStart = uvStart < 0;
+  mtc2 $a2, $v05.e0                                  ## L:695  |      ^ | texOffsetTotal.x = temp0:s16;
+  mtc2 $t4, $v05.e4                                  ## L:696  |    166 | texOffsetTotal.X = temp1:s16;
+  slv $v03, 0, 28, $s3                               ## L:758  |  **169 | store(uvDelta.xy, dmaDmemEnd, 0x0C, 16);
+  vsubc $v06, $v00, $v01.v                           ## L:692  |      ^ | vec16 uvStartNeg = VZERO - uvStart;
+  vand $v05, $v05, $v12.e3                           ## L:697  |    170 | texOffsetTotal &= texMirrorMask.w;
+  vge $v29, $v11, $v05.h0                            ## L:700  | ***174 | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStartNeg : uvStart;
+  vmrg $v01, $v06, $v01                              ## L:700  |    175 | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStartNeg : uvStart;
+  vlt $v04, $v04, $v15                               ## L:703  |    176 | posEnd = min(posEnd, screenMax);
+  ssv $v02, 4, 12, $s3                               ## L:749  |      ^ | store(posCenter.z, dmaDmemEnd, 0x04, 8);
+  vge $v08, $v08, $v16                               ## L:704  |    177 | posStart = max(posStart, screenMin);
+  vaddc $v01, $v01, $v05.v                           ## L:706  |  **180 | uvStart += texOffsetTotal;
+  mfc2 $k1, $v04.e5                                  ## L:1196 |      ^ | outB = pos.Y;
+  mfc2 $s1, $v08.e1                                  ## L:1190 |    181 | outA = pos.y;
+  mfc2 $fp, $v08.e5                                  ## L:1196 |  **184 | outB = pos.Y;
+  vsubc $v06, $v12, $v01.v                           ## L:711  |      ^ | uvStartNeg = texMirrorMask - uvStart;
+  mfc2 $k0, $v08.e0                                  ## L:1192 |    185 | u32 tmp = pos.x;
+  andi $s1, $s1, 4095                                ## L:1191 |    186 | outA &= 0b1111'1111'1111;
+  andi $k1, $k1, 4095                                ## L:1197 |    187 | outB &= 0b1111'1111'1111;
+  sll $k0, $k0, 12                                   ## L:1193 |    188 | tmp <<= 12;
+  or $s1, $s1, $k0                                   ## L:1194 |    189 | outA |= tmp;
+  mfc2 $k0, $v08.e4                                  ## L:1198 |    190 | tmp = pos.X;
+  vaddc $v06, $v06, $v12.e2                          ## L:712  |      ^ | uvStartNeg += texMirrorMask.z;
+  mfc2 $sp, $v04.e1                                  ## L:1190 |    191 | outA = pos.y;
+  andi $fp, $fp, 4095                                ## L:1197 |    192 | outB &= 0b1111'1111'1111;
+  sll $k0, $k0, 12                                   ## L:1199 |    193 | tmp <<= 12;
+  or $fp, $fp, $k0                                   ## L:1200 |    194 | outB |= tmp;
+  mfc2 $k0, $v04.e0                                  ## L:1192 |    195 | u32 tmp = pos.x;
+  sw $s1, 4 + 16($s3)                                ## L:754  |    196 | store(posA, dmaDmemEnd, 0x04, 16);
+  vge $v29, $v11, $v05.h0                            ## L:714  |      ^ | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStart : uvStartNeg;
+  andi $sp, $sp, 4095                                ## L:1191 |    197 | outA &= 0b1111'1111'1111;
+  vmrg $v01, $v01, $v06                              ## L:714  |      ^ | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStart : uvStartNeg;
+  sll $k0, $k0, 12                                   ## L:1193 |    198 | tmp <<= 12;
+  or $sp, $sp, $k0                                   ## L:1194 |    199 | outA |= tmp;
+  sw $sp, 0 + 16($s3)                                ## L:755  |    200 | @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  mfc2 $k0, $v04.e4                                  ## L:1198 |    201 | tmp = pos.X;
+  sb $t2, 0 + 16($s3)                                ## L:756  |    202 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  vlt $v06, $v08, $v04                               ## L:721  |      ^ | vec16 extend = posStart < posEnd;
+  ldv $v08, 0, 24, $s6                               ## L:762  |    203 | posStart.xyzw = load(ptrIn, 24).xyzw;
+  cfc2 $t4, $vcc                                     ## L:722  |    204 | temp1 = get_vcc();
+  ldv $v05, 0, 16, $s6                               ## L:732  |    205 | vec16 color = load(ptrIn, 16).xyzw;
+  sll $k0, $k0, 12                                   ## L:1199 |    206 | tmp <<= 12;
+  ldv $v08, 8, 32, $s6                               ## L:763  |    207 | posStart.XYZW = load(ptrIn, 24).XYZW;
+  slv $v01, 0, 24, $s3                               ## L:757  |    208 | store(uvStart.xy, dmaDmemEnd, 0x08, 16);
+  andi $a2, $t4, 3                                   ## L:724  |    209 | temp0 = temp1 & 0b0000'0011;
+  or $k1, $k1, $k0                                   ## L:1200 |    210 | outB |= tmp;
+  or $t1, $t1, $a2                                   ## L:725  |    211 | clipA |= temp0;
+  slv $v05, 0, 4, $s3                                ## L:745  |    212 | store(color.xy, dmaDmemEnd, 4);
+  vmulf $v14, $v13, $v08.h3                          ## L:764  |      ^ | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+  andi $a2, $t4, 48                                  ## L:727  |    213 | temp0 = temp1 & 0b0011'0000;
+  bne $t1, $at, LABEL_TPXCmd_DrawTextured_0019       ## L:771  |    214 | if(clipA == 0b0000'0011) {
+  or $t5, $t5, $a2                                   ## L:728  |   *216 | clipB |= temp0;
+  addiu $s3, $s3, 32                                 ## L:772  |    217 | dmaDmemEnd += 32;
+  LABEL_TPXCmd_DrawTextured_0019:
+  addiu $at, $zero, 48                               ## L:782  |    218 | if(clipB == 0b0011'0000) {
+  bne $t5, $at, LABEL_TPXCmd_DrawTextured_001A       ## L:782  |    219 | if(clipB == 0b0011'0000) {
+  addiu $s6, $s6, 24                                 ## L:776  |   *221 | ptrIn += 24;
+  sw $k1, 0 + 16($s3)                                ## L:787  |    222 | @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  sb $t2, 0 + 16($s3)                                ## L:788  |    223 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  ssv $v02, 12, 12, $s3                              ## L:784  |    224 | store(posCenter.Z, dmaDmemEnd, 0x04, 8);
+  slv $v01, 8, 24, $s3                               ## L:790  |    225 | store(uvStart.XY, dmaDmemEnd, 0x08, 16);
+  slv $v05, 4, 4, $s3                                ## L:783  |    226 | store(color.zw, dmaDmemEnd, 4);
+  sw $fp, 4 + 16($s3)                                ## L:786  |    227 | store(posB, dmaDmemEnd, 0x04, 16);
+  slv $v03, 8, 28, $s3                               ## L:791  |    228 | store(uvDelta.XY, dmaDmemEnd, 0x0C, 16);
+  addiu $s3, $s3, 32                                 ## L:793  |    229 | dmaDmemEnd += 32;
+  LABEL_TPXCmd_DrawTextured_001A:
+  vmudn $v10, $v28, $v08.h0                          ## L:92   |      ^ | out = mat0  * vec.xxxxXXXX;
+  vmadh $v09, $v27, $v08.h0                          ## L:92   |    230 | out = mat0  * vec.xxxxXXXX;
+  vmadn $v10, $v26, $v08.h1                          ## L:93   |    231 | out = mat1 +* vec.yyyyYYYY;
+  vmadh $v09, $v25, $v08.h1                          ## L:93   |    232 | out = mat1 +* vec.yyyyYYYY;
+  vmadn $v10, $v24, $v08.h2                          ## L:94   |    233 | out = mat2 +* vec.zzzzZZZZ;
+  vmadh $v09, $v23, $v08.h2                          ## L:94   |    234 | out = mat2 +* vec.zzzzZZZZ;
+  sltu $at, $s3, $s5                                 ## L:800  |      ^ | if(dmaDmemEnd >= dmaDmemFlush) {
+  vmadn $v10, $v22, $v30.e7                          ## L:95   |    235 | out = mat3 +* 1;
+  bne $at, $zero, LABEL_TPXCmd_DrawTextured_001B     ## L:800  |      ^ | if(dmaDmemEnd >= dmaDmemFlush) {
+  vmadh $v09, $v21, $v30.e7                          ## L:95   |   *237 | out = mat3 +* 1;
+  jal RDPQ_Send                                      ## L:801  |    238 | RDPQ_Send(dmaDmem, dmaDmemEnd); ## Args: $s4, $s3
+  nop                                                ## L:801  |   *240 | RDPQ_Send(dmaDmem, dmaDmemEnd);
+  or $s3, $zero, $s4                                 ## L:802  |    241 | dmaDmemEnd = dmaDmem;
+  LABEL_TPXCmd_DrawTextured_001B:
+  vch $v29, $v09, $v09.h3                            ## L:805  |      ^ | temp1 = clip(posClip, posClip.wwwwWWWW);
+  vcl $v29, $v10, $v10.h3                            ## L:805  |    242 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  bne $s6, $s7, LABEL_TPXCmd_DrawTextured_0017       ## L:805  |      ^ | temp1 = clip(posClip, posClip.wwwwWWWW);
+  cfc2 $t4, $vcc                                     ## L:805  |   *244 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  LABEL_TPXCmd_DrawTextured_0018:
+  beq $zero, $zero, LABEL_TPXCmd_DrawTextured_0016   ## L:805  |    245 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  nop                                                ## L:805  |   *247 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  LABEL_TPXCmd_DrawTextured_0015:
+  vxor $v07, $v00, $v30.e7                           ## L:261  |    248 | const vec16 vecOne = 1;
+  lpv $v08, 0, 0, $s6                                ## L:256  |      ^ | vec16 posStart = load_vec_s8(ptrIn, 0x00);
+  vmulf $v14, $v13, $v08.h3                          ## L:257  | ***252 | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+  vmudm $v08, $v08, $v31.e7                          ## L:258  |    253 | posStart >>= 8;
+  vmudn $v10, $v28, $v08.h0                          ## L:92   | ***257 | out = mat0  * vec.xxxxXXXX;
+  vmadh $v09, $v27, $v08.h0                          ## L:92   |    258 | out = mat0  * vec.xxxxXXXX;
+  vmadn $v10, $v26, $v08.h1                          ## L:93   |    259 | out = mat1 +* vec.yyyyYYYY;
+  vmadh $v09, $v25, $v08.h1                          ## L:93   |    260 | out = mat1 +* vec.yyyyYYYY;
+  vmadn $v10, $v24, $v08.h2                          ## L:94   |    261 | out = mat2 +* vec.zzzzZZZZ;
+  vmadh $v09, $v23, $v08.h2                          ## L:94   |    262 | out = mat2 +* vec.zzzzZZZZ;
+  vmadn $v10, $v22, $v30.e7                          ## L:95   |    263 | out = mat3 +* 1;
+  vmadh $v09, $v21, $v30.e7                          ## L:95   |    264 | out = mat3 +* 1;
+  vch $v29, $v09, $v09.h3                            ## L:266  | ***268 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  vcl $v29, $v10, $v10.h3                            ## L:266  |    269 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  cfc2 $t4, $vcc                                     ## L:266  |    270 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  LABEL_TPXCmd_DrawTextured_001C:
+  vmudl $v10, $v10, $v17.v                           ## L:277  |      ^ | posClip *= normScaleW:ufract;
+  lb $a2, 11($s6)                                    ## L:324  |    271 | temp0:s8 = load(ptrIn, 0x0B);
+  vmadm $v09, $v09, $v17.v                           ## L:277  |      ^ | posClip *= normScaleW:ufract;
+  vmadn $v10, $v00, $v00                             ## L:277  |    272 | posClip *= normScaleW:ufract;
+  ori $at, $zero, %lo(BASE_SIZE)                     ## L:283  |      ^ | posClip:sint.w = load(BASE_SIZE).x;
+  vrcph $v05.e3, $v09.e3                             ## L:279  |  **275 | invW.w = invert_half(posClip).w;
+  andi $t5, $t4, 16448                               ## L:274  |      ^ | clipB = temp1 & 0b0100'0000'0100'0000;
+  vrcpl $v06.e3, $v10.e3                             ## L:279  |    276 | invW.w = invert_half(posClip).w;
+  andi $t1, $t4, 1028                                ## L:273  |      ^ | clipA = temp1 & 0b0000'0100'0000'0100;
+  lb $t4, 15($s6)                                    ## L:325  |    277 | temp1:s8 = load(ptrIn, 0x0F);
+  vrcph $v05.e3, $v09.e7                             ## L:280  |      ^ | invW.W = invert_half(posClip).W;
+  vrcpl $v06.e7, $v10.e7                             ## L:280  |    278 | invW.W = invert_half(posClip).W;
+  vmov $v10.e3, $v00.e0                              ## L:285  |    279 | posClip:sfract.w = 0;
+  lsv $v09, 6, 0, $at                                ## L:283  |      ^ | posClip:sint.w = load(BASE_SIZE).x;
+  lsv $v09, 14, 0, $at                               ## L:284  |    280 | posClip:sint.W = load(BASE_SIZE).x;
+  vmov $v10.e7, $v00.e0                              ## L:286  |      ^ | posClip:sfract.W = 0;
+  vrcph $v05.e7, $v00.e7                             ## L:280  |    281 | invW.W = invert_half(posClip).W;
+  vmudl $v29, $v10, $v06.h3                          ## L:289  |  **284 | posClip *= invW.wwwwWWWW;
+  addu $a2, $a2, $s2                                 ## L:327  |      ^ | temp0 += texOffset; temp0 <<= 3;
+  vmadm $v29, $v09, $v06.h3                          ## L:289  |    285 | posClip *= invW.wwwwWWWW;
+  vmadn $v10, $v10, $v05.h3                          ## L:289  |    286 | posClip *= invW.wwwwWWWW;
+  vmadh $v09, $v09, $v05.h3                          ## L:289  |    287 | posClip *= invW.wwwwWWWW;
+  vmulf $v14, $v14, $v09.h3                          ## L:292  | ***291 | localPartSize:sfract *= posClip:sint.wwwwWWWW;
+  vmudl $v29, $v10, $v20.v                           ## L:295  |    292 | vec32 posScreen = posClip * screenSize;
+  vmadm $v29, $v09, $v20.v                           ## L:295  |    293 | vec32 posScreen = posClip * screenSize;
+  vmadn $v04, $v10, $v19.v                           ## L:295  |    294 | vec32 posScreen = posClip * screenSize;
+  vmadh $v03, $v09, $v19.v                           ## L:295  |    295 | vec32 posScreen = posClip * screenSize;
+  vmadh $v02, $v18, $v07.v                           ## L:296  |    296 | vec16 posCenter = screenOffset:sint +* vecOne;
+  vmadh $v04, $v07, $v14.v                           ## L:301  |    297 | vec16 posEnd = vecOne +* localPartSize:sint;
+  vsubc $v08, $v02, $v14.v                           ## L:302  |  **300 | posStart = posCenter - localPartSize:sint;
+  vmudn $v14, $v14, $v30.e6                          ## L:306  |    301 | localPartSize *= 2;
+  vrcp $v06.e0, $v14.e0                              ## L:307  | ***305 | invW.x = invert_half(localPartSize).x;
+  vrcph $v05.e0, $v14.e0                             ## L:307  |    306 | invW.x = invert_half(localPartSize).x;
+  vrcp $v06.e1, $v14.e1                              ## L:308  |    307 | invW.y = invert_half(localPartSize).y;
+  vrcph $v05.e1, $v14.e1                             ## L:308  |    308 | invW.y = invert_half(localPartSize).y;
+  vrcp $v06.e4, $v14.e4                              ## L:309  |    309 | invW.X = invert_half(localPartSize).X;
+  addu $t4, $t4, $s2                                 ## L:328  |      ^ | temp1 += texOffset; temp1 <<= 3;
+  vrcph $v05.e4, $v14.e4                             ## L:309  |    310 | invW.X = invert_half(localPartSize).X;
+  ssv $v02, 4, 12, $s3                               ## L:396  |      ^ | store(posCenter.z, dmaDmemEnd, 0x04, 8);
+  vrcp $v06.e5, $v14.e5                              ## L:310  |    311 | invW.Y = invert_half(localPartSize).Y;
+  vrcph $v05.e5, $v14.e5                             ## L:310  |    312 | invW.Y = invert_half(localPartSize).Y;
+  vmudh $v01, $v08, $v05.v                           ## L:315  | ***316 | vec16 uvStart = posStart * invW:sint;
+  sll $t4, $t4, 3                                    ## L:328  |      ^ | temp1 += texOffset; temp1 <<= 3;
+  vor $v03, $v00, $v05                               ## L:311  |    317 | vec16 uvDelta = invW:sint;
+  vxor $v05, $v00, $v00.e0                           ## L:341  |    318 | vec16 texOffsetTotal = 0;
+  sll $a2, $a2, 3                                    ## L:327  |      ^ | temp0 += texOffset; temp0 <<= 3;
+  vmudm $v01, $v01, $v31.e6                          ## L:316  |   *320 | uvStart >>= 7;
+  mtc2 $a2, $v05.e0                                  ## L:342  |      ^ | texOffsetTotal.x = temp0:s16;
+  mtc2 $t4, $v05.e4                                  ## L:343  | ***324 | texOffsetTotal.X = temp1:s16;
+  vlt $v01, $v01, $v00.e0                            ## L:317  |      ^ | uvStart = uvStart < 0;
+  vsubc $v06, $v00, $v01.v                           ## L:339  | ***328 | vec16 uvStartNeg = VZERO - uvStart;
+  vand $v05, $v05, $v12.e3                           ## L:344  |    329 | texOffsetTotal &= texMirrorMask.w;
+  vge $v29, $v11, $v05.h0                            ## L:347  | ***333 | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStartNeg : uvStart;
+  vmrg $v01, $v06, $v01                              ## L:347  |    334 | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStartNeg : uvStart;
+  vlt $v04, $v04, $v15                               ## L:350  |    335 | posEnd = min(posEnd, screenMax);
+  slv $v03, 0, 28, $s3                               ## L:405  |      ^ | store(uvDelta.xy, dmaDmemEnd, 0x0C, 16);
+  addiu $t2, $zero, 36                               ## L:384  |    336 | cmdRdpRect = 0x24;
+  vge $v08, $v08, $v16                               ## L:351  |      ^ | posStart = max(posStart, screenMin);
+  vaddc $v01, $v01, $v05.v                           ## L:353  |   *338 | uvStart += texOffsetTotal;
+  addiu $at, $zero, 3                                ## L:418  |      ^ | if(clipA == 0b0000'0011) {
+  mfc2 $k1, $v04.e5                                  ## L:1196 |    339 | outB = pos.Y;
+  mfc2 $k0, $v08.e0                                  ## L:1192 |    340 | u32 tmp = pos.x;
+  andi $k1, $k1, 4095                                ## L:1197 |   *342 | outB &= 0b1111'1111'1111;
+  vsubc $v06, $v12, $v01.v                           ## L:358  |      ^ | uvStartNeg = texMirrorMask - uvStart;
+  mfc2 $s1, $v08.e1                                  ## L:1190 |    343 | outA = pos.y;
+  mfc2 $fp, $v08.e5                                  ## L:1196 |    344 | outB = pos.Y;
+  sll $k0, $k0, 12                                   ## L:1193 |    345 | tmp <<= 12;
+  andi $s1, $s1, 4095                                ## L:1191 |    346 | outA &= 0b1111'1111'1111;
+  vaddc $v06, $v06, $v12.e2                          ## L:359  |      ^ | uvStartNeg += texMirrorMask.z;
+  andi $fp, $fp, 4095                                ## L:1197 |    347 | outB &= 0b1111'1111'1111;
+  or $s1, $s1, $k0                                   ## L:1194 |    348 | outA |= tmp;
+  sw $s1, 4 + 16($s3)                                ## L:401  |    349 | store(posA, dmaDmemEnd, 0x04, 16);
+  vge $v29, $v11, $v05.h0                            ## L:361  |      ^ | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStart : uvStartNeg;
+  mfc2 $k0, $v08.e4                                  ## L:1198 |    350 | tmp = pos.X;
+  vmrg $v01, $v01, $v06                              ## L:361  |      ^ | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStart : uvStartNeg;
+  mfc2 $sp, $v04.e1                                  ## L:1190 |    351 | outA = pos.y;
+  ldv $v05, 0, 8, $s6                                ## L:381  |    352 | vec16 color = load(ptrIn, 8).xyzw;
+  sll $k0, $k0, 12                                   ## L:1199 |    353 | tmp <<= 12;
+  vlt $v06, $v08, $v04                               ## L:368  |      ^ | vec16 extend = posStart < posEnd;
+  or $fp, $fp, $k0                                   ## L:1200 |    354 | outB |= tmp;
+  cfc2 $t4, $vcc                                     ## L:369  |    355 | temp1 = get_vcc();
+  mfc2 $k0, $v04.e0                                  ## L:1192 |    356 | u32 tmp = pos.x;
+  lpv $v08, 0, 16, $s6                               ## L:413  |    357 | posStart = load_vec_s8(ptrIn, 16);
+  andi $sp, $sp, 4095                                ## L:1191 |    358 | outA &= 0b1111'1111'1111;
+  sll $k0, $k0, 12                                   ## L:1193 |    359 | tmp <<= 12;
+  or $sp, $sp, $k0                                   ## L:1194 |    360 | outA |= tmp;
+  slv $v05, 0, 4, $s3                                ## L:392  |    361 | store(color.xy, dmaDmemEnd, 4);
+  mfc2 $k0, $v04.e4                                  ## L:1198 |    362 | tmp = pos.X;
+  andi $a2, $t4, 3                                   ## L:371  |    363 | temp0 = temp1 & 0b0000'0011;
+  or $t1, $t1, $a2                                   ## L:372  |    364 | clipA |= temp0;
+  vmulf $v14, $v13, $v08.h3                          ## L:414  |      ^ | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+  slv $v01, 0, 24, $s3                               ## L:404  |    365 | store(uvStart.xy, dmaDmemEnd, 0x08, 16);
+  andi $a2, $t4, 48                                  ## L:374  |    366 | temp0 = temp1 & 0b0011'0000;
+  sll $k0, $k0, 12                                   ## L:1199 |    367 | tmp <<= 12;
+  or $k1, $k1, $k0                                   ## L:1200 |    368 | outB |= tmp;
+  sw $sp, 0 + 16($s3)                                ## L:402  |    369 | @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  or $t5, $t5, $a2                                   ## L:375  |    370 | clipB |= temp0;
+  vmudm $v08, $v08, $v31.e7                          ## L:415  |      ^ | posStart >>= 8;
+  bne $t1, $at, LABEL_TPXCmd_DrawTextured_001E       ## L:418  |    371 | if(clipA == 0b0000'0011) {
+  sb $t2, 0 + 16($s3)                                ## L:403  |   *373 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  addiu $s3, $s3, 32                                 ## L:419  |    374 | dmaDmemEnd += 32;
+  LABEL_TPXCmd_DrawTextured_001E:
+  addiu $at, $zero, 48                               ## L:429  |    375 | if(clipB == 0b0011'0000) {
+  bne $t5, $at, LABEL_TPXCmd_DrawTextured_001F       ## L:429  |    376 | if(clipB == 0b0011'0000) {
+  addiu $s6, $s6, 16                                 ## L:425  |   *378 | ptrIn += 16;
+  sw $k1, 0 + 16($s3)                                ## L:434  |    379 | @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  slv $v01, 8, 24, $s3                               ## L:437  |    380 | store(uvStart.XY, dmaDmemEnd, 0x08, 16);
+  slv $v05, 4, 4, $s3                                ## L:430  |    381 | store(color.zw, dmaDmemEnd, 4);
+  ssv $v02, 12, 12, $s3                              ## L:431  |    382 | store(posCenter.Z, dmaDmemEnd, 0x04, 8);
+  sw $fp, 4 + 16($s3)                                ## L:433  |    383 | store(posB, dmaDmemEnd, 0x04, 16);
+  slv $v03, 8, 28, $s3                               ## L:438  |    384 | store(uvDelta.XY, dmaDmemEnd, 0x0C, 16);
+  sb $t2, 0 + 16($s3)                                ## L:435  |    385 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1
+  addiu $s3, $s3, 32                                 ## L:440  |    386 | dmaDmemEnd += 32;
+  LABEL_TPXCmd_DrawTextured_001F:
+  vmudn $v10, $v28, $v08.h0                          ## L:92   |      ^ | out = mat0  * vec.xxxxXXXX;
+  sltu $at, $s3, $s5                                 ## L:447  |    387 | if(dmaDmemEnd >= dmaDmemFlush) {
+  vmadh $v09, $v27, $v08.h0                          ## L:92   |      ^ | out = mat0  * vec.xxxxXXXX;
+  vmadn $v10, $v26, $v08.h1                          ## L:93   |    388 | out = mat1 +* vec.yyyyYYYY;
+  vmadh $v09, $v25, $v08.h1                          ## L:93   |    389 | out = mat1 +* vec.yyyyYYYY;
+  vmadn $v10, $v24, $v08.h2                          ## L:94   |    390 | out = mat2 +* vec.zzzzZZZZ;
+  vmadh $v09, $v23, $v08.h2                          ## L:94   |    391 | out = mat2 +* vec.zzzzZZZZ;
+  vmadn $v10, $v22, $v30.e7                          ## L:95   |    392 | out = mat3 +* 1;
+  bne $at, $zero, LABEL_TPXCmd_DrawTextured_0020     ## L:447  |      ^ | if(dmaDmemEnd >= dmaDmemFlush) {
+  vmadh $v09, $v21, $v30.e7                          ## L:95   |   *394 | out = mat3 +* 1;
+  jal RDPQ_Send                                      ## L:448  |    395 | RDPQ_Send(dmaDmem, dmaDmemEnd); ## Args: $s4, $s3
+  nop                                                ## L:448  |   *397 | RDPQ_Send(dmaDmem, dmaDmemEnd);
+  or $s3, $zero, $s4                                 ## L:449  |    398 | dmaDmemEnd = dmaDmem;
+  LABEL_TPXCmd_DrawTextured_0020:
+  vch $v29, $v09, $v09.h3                            ## L:452  |      ^ | temp1 = clip(posClip, posClip.wwwwWWWW);
+  vcl $v29, $v10, $v10.h3                            ## L:452  |    399 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  bne $s6, $s7, LABEL_TPXCmd_DrawTextured_001C       ## L:452  |      ^ | temp1 = clip(posClip, posClip.wwwwWWWW);
+  cfc2 $t4, $vcc                                     ## L:452  |   *401 | temp1 = clip(posClip, posClip.wwwwWWWW);
+  LABEL_TPXCmd_DrawTextured_0016:
+  j RDPQ_Send                                        ## L:1179 |    402 | goto RDPQ_Send;
+  ori $ra, $zero, %lo(RSPQ_Loop)                     ## L:1178 |   *404 | RA = RSPQ_Loop;
 
 OVERLAY_CODE_END:
 
diff --git a/src/t3d/rsp/rsp_tinypx.rspl b/src/t3d/rsp/rsp_tinypx.rspl
index 223d6bc6..af3b7328 100644
--- a/src/t3d/rsp/rsp_tinypx.rspl
+++ b/src/t3d/rsp/rsp_tinypx.rspl
@@ -17,7 +17,8 @@ include "rdpq_macros.h"
 #define RDP_CMD_TEX_RECT_FLIP 0x25
 
 // size of the 'T3DParticle' struct, containing 2 interleaved particles each
-#define PARTICLE_INPUT_SIZE 16
+#define PARTICLE_INPUT_SIZE_S8 16
+#define PARTICLE_INPUT_SIZE_S16 24
 // max particles, this must be a multiple of 2
 #define PARTICLE_MAX_COUNT 344
 
@@ -30,6 +31,7 @@ include "rdpq_macros.h"
 
 #define RDP_POS_MASK 0b1111'1111'1111
 
+
 state
 {
   // external libdragon labels
@@ -65,6 +67,15 @@ temp_state {
 
 #include "inc/math.rspl"
 
+#define LOOP_NAME 8Bit
+#include "tpxLoops.rspl"
+#undef LOOP_NAME
+
+#define LOOP_NAME 16Bit
+#define LOOP_16BIT 1
+#include "tpxLoops.rspl"
+#undef LOOP_NAME
+
 function RDPQ_Send(u16<$s4> dmemStart, u16<$s3> dmaDmemEnd);
 
 command<0> TPXCmd_SyncT3D(u32 rdramMatrix, u32 rdramScreen, u16 wNorm)
@@ -149,132 +160,14 @@ command<1> TPXCmd_DrawColor(s16 dataSize, s32 rdramAddr)
 
   u32 posA, posB, posEndA, posEndB;
   vec32 posClip;
-
-  // de-phase parts of the loop, this part is also at the end of the loop
-  vec16 posStart = load_vec_s8(ptrIn, 0x00);
-  localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
-  posStart >>= 8;
-  posStart.w = 1;
-  posStart.W = 1;
-
   const vec16 vecOne = 1;
   u32<$a2> temp0;
 
-  // point to clip space
-  mulMat4Vec8(mat0, mat1, mat2, mat3, posStart, posClip);
-  temp1 = clip(posClip, posClip.wwwwWWWW);
-
-  // Iterate over all points, transform + clip, save back those that need to be drawn
-  // the transformed amount might be smaller and shifted due to that
-  loop {
-    // only clip-check against Z to prevent overflow, X/Y is clamped in screen-space later
-
-    clipA = temp1 & 0b0000'0100'0000'0100;
-    clipB = temp1 & 0b0100'0000'0100'0000;
-
-    vec32 invW;
-    posClip *= normScaleW:ufract;
-
-    invW.w = invert_half(posClip).w;
-    invW.W = invert_half(posClip).W;
-
-    // store a particle base-size in W to only do one perspective division
-    posClip:sint.w = load(BASE_SIZE).x;
-    posClip:sint.W = load(BASE_SIZE).x;
-    posClip:sfract.w = 0;
-    posClip:sfract.W = 0;
-
-    // perspective division
-    posClip *= invW.wwwwWWWW;
-    undef invW;
-
-    // scale particle size by perspective
-    localPartSize:sfract *= posClip:sint.wwwwWWWW;
-
-    // transform to screen-space, this is the center of the particles and its depth-value
-    vec32 posScreen = posClip * screenSize;
-    vec16 posCenter = screenOffset:sint +* vecOne;
-
-    // extend to both sides for start/end point...
-    vec16 posEnd = vecOne +* localPartSize:sint;
-    posStart = posCenter - localPartSize:sint;
-
-    // ... and clamp to the edges of the screen
-    posEnd = min(posEnd, screenMax);
-    posStart = max(posStart, screenMin);
-
-    encodeRectPos(posA, posB, posStart);
-    encodeRectPos(posEndA, posEndB, posEnd);
-
-    // now check if it's completely outside the screen or has a zero-size
-    vec16 extend = posStart < posEnd;
-    temp1 = get_vcc();
-
-    temp0 = temp1 & 0b0000'0011; // only check X/Y
-    clipA |= temp0;
-
-    temp0 = temp1 & 0b0011'0000;
-    clipB |= temp0;
-
-    // load color and prepare RPD command IDs
-    vec16 color = load(ptrIn, 0x08).xyzw;
-    cmdRdpRect = RDP_CMD_RECT;
-
-    // Save the rectangles now. Each one consists of 3 commands: color, depth, rect
-    // The first one is always saved here to allow better reordering,
-    // however both will only submit it by advancing 'dmaDmemEnd'
-
-    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
-    // "Prim Color": |0x3A |  -  |    LOD    ||      color (RGBA)     |
-    store(color.xy, dmaDmemEnd, 4);
-
-    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
-    // "Prim Depth": |0x2E |  -  |  -  |  -  ||   depth   |  delta-Z  |
-    store(posCenter.z, dmaDmemEnd, 0x04, 8);
-
-    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
-    //  "Fill Rect": |0x36 |   X0/Y0 (10.2)  ||  -  |  X1/Y1 (10.2)   |
-    store(posA, dmaDmemEnd, 0x04, 16);
-    @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16);
-    @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16);
-
-    // load particle for next iteration
-    ptrIn += PARTICLE_INPUT_SIZE;
-    posStart = load_vec_s8(ptrIn, 0x00, PARTICLE_INPUT_SIZE);
-    localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
-    posStart >>= 8;
-
-    posStart.w = 1;
-
-    if(clipA == 0b0000'0011) {
-      dmaDmemEnd += 24;
-    }
-
-    posStart.W = 1;
-    // Second rectangle:
-    if(clipB == 0b0011'0000) {
-      store(color.zw, dmaDmemEnd, 4);
-      store(posCenter.Z, dmaDmemEnd, 0x04, 8);
-
-      store(posB, dmaDmemEnd, 0x04, 16);
-      @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16);
-      @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16);
-
-      dmaDmemEnd += 24;
-    }
-
-    mulMat4Vec8(mat0, mat1, mat2, mat3, posStart, posClip);
-
-    // When the RDP buffer in DMEM is full, submit and DMA them out.
-    // If anything is left, a last call after the loop will submit the rest.
-    if(dmaDmemEnd >= dmaDmemFlush) {
-      RDPQ_Send(dmaDmem, dmaDmemEnd);
-      dmaDmemEnd = dmaDmem;
-    }
-
-    temp1 = clip(posClip, posClip.wwwwWWWW);
-
-  } while (ptrIn != ptrInEnd)
+  if(rdramAddr < 0) {
+    mainLoop_color16Bit();
+  } else {
+    mainLoop_color8Bit();
+  }
 
   // submit the rest of the buffer (if any)
   RA = RSPQ_Loop; // @TODO: add RSPL auto-opt. for this (needs RA assign)
@@ -391,189 +284,11 @@ command<4> TPXCmd_DrawTextured(s16 dataSize, s32 rdramAddr)
   u32 posA, posB, posEndA, posEndB;
   vec32 posClip;
 
-  // de-phase parts of the loop, this part is also at the end of the loop
-  vec16 posStart = load_vec_s8(ptrIn, 0x00);
-  localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
-  posStart >>= 8;
-  posStart.w = 1;
-  posStart.W = 1;
-
-  const vec16 vecOne = 1;
-  u32<$a2> temp0;
-
-  // point to clip space
-  mulMat4Vec8(mat0, mat1, mat2, mat3, posStart, posClip);
-  temp1 = clip(posClip, posClip.wwwwWWWW);
-
-  // Iterate over all points, transform + clip, save back those that need to be drawn
-  // the transformed amount might be smaller and shifted due to that
-  loop {
-    // only clip-check against Z to prevent overflow, X/Y is clamped in screen-space later
-
-    clipA = temp1 & 0b0000'0100'0000'0100;
-    clipB = temp1 & 0b0100'0000'0100'0000;
-
-    vec32 invW;
-    posClip *= normScaleW:ufract;
-
-    invW.w = invert_half(posClip).w;
-    invW.W = invert_half(posClip).W;
-
-    // store a particle base-size in W to only do one perspective division
-    posClip:sint.w = load(BASE_SIZE).x;
-    posClip:sint.W = load(BASE_SIZE).x;
-    posClip:sfract.w = 0;
-    posClip:sfract.W = 0;
-
-    // perspective division
-    posClip *= invW.wwwwWWWW;
-
-    // scale particle size by perspective
-    localPartSize:sfract *= posClip:sint.wwwwWWWW;
-
-    // transform to screen-space, this is the center of the particles and its depth-value
-    vec32 posScreen = posClip * screenSize;
-    vec16 posCenter = screenOffset:sint +* vecOne;
-
-    undef posScreen;
-
-    // extend to both sides for start/end point
-    vec16 posEnd = vecOne +* localPartSize:sint;
-    posStart = posCenter - localPartSize:sint;
-
-    // calculate UV delta, this is the increment in texture-coords per screen-space pixel
-    // even if clipped, this doesn't need further adjustment
-    localPartSize *= 2;
-    invW.x = invert_half(localPartSize).x;
-    invW.y = invert_half(localPartSize).y;
-    invW.X = invert_half(localPartSize).X;
-    invW.Y = invert_half(localPartSize).Y;
-    vec16 uvDelta = invW:sint;
-
-    // offset of the UV, only actually used if it is clipped on the upper or left side
-    // since rect-pos can't be negative, we clamp it and need to adjust UVs instead
-    vec16 uvStart = posStart * invW:sint;
-    uvStart >>= 7;
-    uvStart = uvStart < 0;
-
-    // local UV offset, stored in alpha channel of color, this is added to the global
-    temp0:s8 = load(ptrIn, 0x0B);
-    temp1:s8 = load(ptrIn, 0x0F);
-    temp0 += texOffset; temp0 <<= 3;
-    temp1 += texOffset; temp1 <<= 3;
-
-    undef invW;
-
-    // Repeating & mirroring of UVs for half-rotation effect.
-    // E.g.: given a texture (64x16) animating a half-rotation in 4 frames, it will go through 8 steps:
-    // x-axis goes from 0-112 in steps of 16 repeating the 4 frames two times,
-    // y-axis stays 0 until half-way, then goes to 16 for the other half.
-    // The texture needs to be mirrored, which causes it to mirror on both axis after half the frames.
-    // (Note that clipping which shifts UVs needs to be taken into account here)
-    {
-      vec16 uvStartNeg = VZERO - uvStart;
-
-      vec16 texOffsetTotal = 0;
-      texOffsetTotal.x = temp0:s16;
-      texOffsetTotal.X = temp1:s16;
-      texOffsetTotal &= texMirrorMask.w; // mask to stay within out tile counts
-
-      // this check here is inverted since we would need to negate in the case of clipping beforehand too
-      uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStartNeg : uvStart;
-
-      // clamp Pos to the edges of the screen
-      posEnd = min(posEnd, screenMax);
-      posStart = max(posStart, screenMin);
-
-      uvStart += texOffsetTotal;
-
-      // shift range to middle to invert, only used if we are in the second half (the mirrored one)
-      // E.g.: with 4 frames: (4,5,6,7) becomes (7,6,5,4)
-      // this is needed since mirroring inverts the indices of an animation
-      uvStartNeg = texMirrorMask - uvStart;
-      uvStartNeg += texMirrorMask.z;
-
-      uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStart : uvStartNeg;
-    }
-
-    encodeRectPos(posA, posB, posStart);
-    encodeRectPos(posEndA, posEndB, posEnd);
-
-    // now check if it's completely outside the screen or has a zero-size
-    vec16 extend = posStart < posEnd;
-    temp1 = get_vcc();
-
-    temp0 = temp1 & 0b0000'0011; // only check X/Y
-    clipA |= temp0;
-
-    temp0 = temp1 & 0b0011'0000;
-    clipB |= temp0;
-
-    // load color and prepare RPD command IDs
-    vec16 color = load(ptrIn, 0x08).xyzw;
-    cmdRdpRect = RDP_CMD_TEX_RECT;
-
-    // Save the rectangles now. Each one consists of 3 commands: color, depth, rect
-    // The first one is always saved here to allow better reordering,
-    // however both will only submit it by advancing 'dmaDmemEnd'
-
-    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
-    // "Prim Color": |0x3A |  -  |    LOD    ||      color (RGBA)     |
-    store(color.xy, dmaDmemEnd, 4);
-
-    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
-    // "Prim Depth": |0x2E |  -  |  -  |  -  ||   depth   |  delta-Z  |
-    store(posCenter.z, dmaDmemEnd, 0x04, 8);
-
-    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
-    //   "Tex Rect": |0x24 |   X0/Y0 (10.2)  ||  -  |  X1/Y1 (10.2)   |
-    //               |  S (5.10) | T (5.10)  || Ds (5.10) | Dt (5.10) |
-    store(posA, dmaDmemEnd, 0x04, 16);
-    @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16);
-    @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16);
-    store(uvStart.xy, dmaDmemEnd, 0x08, 16);
-    store(uvDelta.xy, dmaDmemEnd, 0x0C, 16);
-
-    // load particle for next iteration
-    ptrIn += PARTICLE_INPUT_SIZE;
-    posStart = load_vec_s8(ptrIn, 0x00, PARTICLE_INPUT_SIZE);
-    localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
-    posStart >>= 8;
-
-    posStart.w = 1;
-
-    if(clipA == 0b0000'0011) {
-      dmaDmemEnd += 32;
-    }
-
-    posStart.W = 1;
-    // Second rectangle:
-    if(clipB == 0b0011'0000) {
-      store(color.zw, dmaDmemEnd, 4);
-      store(posCenter.Z, dmaDmemEnd, 0x04, 8);
-
-      store(posB, dmaDmemEnd, 0x04, 16);
-      @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16);
-      @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16);
-
-      store(uvStart.XY, dmaDmemEnd, 0x08, 16);
-      store(uvDelta.XY, dmaDmemEnd, 0x0C, 16);
-
-      dmaDmemEnd += 32;
-    }
-
-    mulMat4Vec8(mat0, mat1, mat2, mat3, posStart, posClip);
-
-    // When the RDP buffer in DMEM is full, submit and DMA them out.
-    // If anything is left, a last call after the loop will submit the rest.
-    if(dmaDmemEnd >= dmaDmemFlush) {
-      RDPQ_Send(dmaDmem, dmaDmemEnd);
-      dmaDmemEnd = dmaDmem;
-    }
-
-    temp1 = clip(posClip, posClip.wwwwWWWW);
-
-  } while (ptrIn != ptrInEnd)
+  if(rdramAddr < 0) {
+    mainLoop_tex16Bit();
+  } else {
+    mainLoop_tex8Bit();
+  }
 
   // submit the rest of the buffer (if any)
   RA = RSPQ_Loop; // @TODO: add RSPL auto-opt. for this (needs RA assign)
@@ -601,6 +316,7 @@ macro encodeRectPos(u32 outA, u32 outB, vec16 pos)
   outB |= tmp;
 }
 
+
 /**
  * Loads the current scissor area from the shared 'RDPQ_SCISSOR_RECT' setting
  * @param screenMin minimum (sets .xy & .XY)
diff --git a/src/t3d/rsp/tpxLoops.rspl b/src/t3d/rsp/tpxLoops.rspl
new file mode 100644
index 00000000..a8afa26d
--- /dev/null
+++ b/src/t3d/rsp/tpxLoops.rspl
@@ -0,0 +1,348 @@
+macro mainLoop_color${LOOP_NAME}()
+{
+  // Main loop for color (fill-rect) particles. De-phased: this initial load + transform is repeated at the end of each iteration
+#ifdef LOOP_16BIT
+  vec16 posStart = load(ptrIn, 0x00);
+  localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+#else
+  vec16 posStart = load_vec_s8(ptrIn, 0x00);
+  localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+  posStart >>= 8;
+#endif
+  // transform the point to clip space
+  mulMat4Vec3(mat0, mat1, mat2, mat3, posStart, posClip);
+  temp1 = clip(posClip, posClip.wwwwWWWW);
+
+  // Iterate over all points, transform + clip, and save those that need to be drawn.
+  // Rejected rectangles are overwritten, so fewer rectangles than input particles may be emitted.
+  loop {
+    // only clip-check against Z to prevent overflow, X/Y is clamped in screen-space later
+
+    clipA = temp1 & 0b0000'0100'0000'0100;
+    clipB = temp1 & 0b0100'0000'0100'0000;
+
+    vec32 invW;
+    posClip *= normScaleW:ufract;
+
+    invW.w = invert_half(posClip).w;
+    invW.W = invert_half(posClip).W;
+
+    // store a particle base-size in W to only do one perspective division
+    posClip:sint.w = load(BASE_SIZE).x;
+    posClip:sint.W = load(BASE_SIZE).x;
+    posClip:sfract.w = 0;
+    posClip:sfract.W = 0;
+
+    // perspective division
+    posClip *= invW.wwwwWWWW;
+    undef invW;
+
+    // scale particle size by perspective
+    localPartSize:sfract *= posClip:sint.wwwwWWWW;
+
+    // transform to screen-space, this is the center of the particles and its depth-value
+    vec32 posScreen = posClip * screenSize;
+    vec16 posCenter = screenOffset:sint +* vecOne;
+
+    // extend to both sides for start/end point...
+    vec16 posEnd = vecOne +* localPartSize:sint;
+    posStart = posCenter - localPartSize:sint;
+
+    // ... and clamp to the edges of the screen
+    posEnd = min(posEnd, screenMax);
+    posStart = max(posStart, screenMin);
+
+    encodeRectPos(posA, posB, posStart);
+    encodeRectPos(posEndA, posEndB, posEnd);
+
+    // now check if it's completely outside the screen or has a zero-size
+    vec16 extend = posStart < posEnd;
+    temp1 = get_vcc();
+
+    temp0 = temp1 & 0b0000'0011; // only check X/Y
+    clipA |= temp0;
+
+    temp0 = temp1 & 0b0011'0000;
+    clipB |= temp0;
+
+    // load color and prepare RDP command IDs
+    #ifdef LOOP_16BIT
+      vec16 color = load(ptrIn, 16).xyzw;
+    #else
+      vec16 color = load(ptrIn, 8).xyzw;
+    #endif
+    cmdRdpRect = RDP_CMD_RECT;
+
+    // Save the rectangles now. Each one consists of 3 commands: color, depth, rect
+    // The first one is always written here to allow better reordering,
+    // however a rectangle is only submitted by advancing 'dmaDmemEnd' past it
+
+    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
+    // "Prim Color": |0x3A |  -  |    LOD    ||      color (RGBA)     |
+    store(color.xy, dmaDmemEnd, 4);
+
+    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
+    // "Prim Depth": |0x2E |  -  |  -  |  -  ||   depth   |  delta-Z  |
+    store(posCenter.z, dmaDmemEnd, 0x04, 8);
+
+    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
+    //  "Fill Rect": |0x36 |   X0/Y0 (10.2)  ||  -  |  X1/Y1 (10.2)   |
+    store(posA, dmaDmemEnd, 0x04, 16);
+    @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16);
+    @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16);
+
+    // load particle for next iteration
+    // ('ptrIn' is only advanced further below, hence the input-size offset in the load)
+    #ifdef LOOP_16BIT
+      posStart.xyzw = load(ptrIn, PARTICLE_INPUT_SIZE_S16).xyzw;
+      posStart.XYZW = load(ptrIn, PARTICLE_INPUT_SIZE_S16).XYZW;
+      localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+    #else
+      posStart = load_vec_s8(ptrIn, PARTICLE_INPUT_SIZE_S8);
+      localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+      posStart >>= 8;
+    #endif
+    // First rectangle: keep it only if no Z-clip bit is set and it has a non-zero X/Y extent
+    if(clipA == 0b0000'0011) {
+      dmaDmemEnd += 24;
+    }
+    // advance the input pointer by one interleaved particle pair
+    #ifdef LOOP_16BIT
+      ptrIn += PARTICLE_INPUT_SIZE_S16;
+    #else
+      ptrIn += PARTICLE_INPUT_SIZE_S8;
+    #endif
+
+    // Second rectangle:
+    if(clipB == 0b0011'0000) {
+      store(color.zw, dmaDmemEnd, 4);
+      store(posCenter.Z, dmaDmemEnd, 0x04, 8);
+
+      store(posB, dmaDmemEnd, 0x04, 16);
+      @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16);
+      @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16);
+
+      dmaDmemEnd += 24;
+    }
+
+    mulMat4Vec3(mat0, mat1, mat2, mat3, posStart, posClip);
+
+    // When the RDP buffer in DMEM is full, submit and DMA them out.
+    // If anything is left, a last call after the loop will submit the rest.
+    if(dmaDmemEnd >= dmaDmemFlush) {
+      RDPQ_Send(dmaDmem, dmaDmemEnd);
+      dmaDmemEnd = dmaDmem;
+    }
+
+    temp1 = clip(posClip, posClip.wwwwWWWW);
+
+  } while (ptrIn != ptrInEnd)
+}
+
+macro mainLoop_tex${LOOP_NAME}()
+{
+ // de-phase parts of the loop, this part is also at the end of the loop
+#ifdef LOOP_16BIT
+  vec16 posStart = load(ptrIn, 0x00);
+  localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+#else
+  vec16 posStart = load_vec_s8(ptrIn, 0x00);
+  localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+  posStart >>= 8;
+#endif
+
+  const vec16 vecOne = 1;
+  u32<$a2> temp0;
+
+  // point to clip space
+  mulMat4Vec3(mat0, mat1, mat2, mat3, posStart, posClip);
+  temp1 = clip(posClip, posClip.wwwwWWWW);
+
+  // Iterate over all points, transform + clip, save back those that need to be drawn
+  // the transformed amount might be smaller and shifted due to that
+  loop {
+    // only clip-check against Z to prevent overflow, X/Y is clamped in screen-space later
+
+    clipA = temp1 & 0b0000'0100'0000'0100;
+    clipB = temp1 & 0b0100'0000'0100'0000;
+
+    vec32 invW;
+    posClip *= normScaleW:ufract;
+
+    invW.w = invert_half(posClip).w;
+    invW.W = invert_half(posClip).W;
+
+    // store a particle base-size in W to only do one perspective division
+    posClip:sint.w = load(BASE_SIZE).x;
+    posClip:sint.W = load(BASE_SIZE).x;
+    posClip:sfract.w = 0;
+    posClip:sfract.W = 0;
+
+    // perspective division
+    posClip *= invW.wwwwWWWW;
+
+    // scale particle size by perspective
+    localPartSize:sfract *= posClip:sint.wwwwWWWW;
+
+    // transform to screen-space, this is the center of the particles and its depth-value
+    vec32 posScreen = posClip * screenSize;
+    vec16 posCenter = screenOffset:sint +* vecOne;
+
+    undef posScreen;
+
+    // extend to both sides for start/end point
+    vec16 posEnd = vecOne +* localPartSize:sint;
+    posStart = posCenter - localPartSize:sint;
+
+    // calculate UV delta, this is the increment in texture-coords per screen-space pixel
+    // even if clipped, this doesn't need further adjustment
+    localPartSize *= 2;
+    invW.x = invert_half(localPartSize).x;
+    invW.y = invert_half(localPartSize).y;
+    invW.X = invert_half(localPartSize).X;
+    invW.Y = invert_half(localPartSize).Y;
+    vec16 uvDelta = invW:sint;
+
+    // offset of the UV, only actually used if it is clipped on the upper or left side
+    // since rect-pos can't be negative, we clamp it and need to adjust UVs instead
+    vec16 uvStart = posStart * invW:sint;
+    uvStart >>= 7;
+    uvStart = uvStart < 0;
+
+    // local UV offset, stored in alpha channel of color, this is added to the global
+    #ifdef LOOP_16BIT
+      temp0:s8 = load(ptrIn, 7);
+      temp1:s8 = load(ptrIn, 15);
+    #else
+      temp0:s8 = load(ptrIn, 0x0B);
+      temp1:s8 = load(ptrIn, 0x0F);
+    #endif
+    temp0 += texOffset; temp0 <<= 3;
+    temp1 += texOffset; temp1 <<= 3;
+
+    undef invW;
+
+    // Repeating & mirroring of UVs for half-rotation effect.
+    // E.g.: given a texture (64x16) animating a half-rotation in 4 frames, it will go through 8 steps:
+    // x-axis goes from 0-112 in steps of 16 repeating the 4 frames two times,
+    // y-axis stays 0 until half-way, then goes to 16 for the other half.
+    // The texture needs to be mirrored, which causes it to mirror on both axes after half the frames.
+    // (Note that clipping which shifts UVs needs to be taken into account here)
+    {
+      vec16 uvStartNeg = VZERO - uvStart;
+
+      vec16 texOffsetTotal = 0;
+      texOffsetTotal.x = temp0:s16;
+      texOffsetTotal.X = temp1:s16;
+      texOffsetTotal &= texMirrorMask.w; // mask to stay within our tile counts
+
+      // this check here is inverted since we would need to negate in the case of clipping beforehand too
+      uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStartNeg : uvStart;
+
+      // clamp Pos to the edges of the screen
+      posEnd = min(posEnd, screenMax);
+      posStart = max(posStart, screenMin);
+
+      uvStart += texOffsetTotal;
+
+      // shift range to middle to invert, only used if we are in the second half (the mirrored one)
+      // E.g.: with 4 frames: (4,5,6,7) becomes (7,6,5,4)
+      // this is needed since mirroring inverts the indices of an animation
+      uvStartNeg = texMirrorMask - uvStart;
+      uvStartNeg += texMirrorMask.z;
+
+      uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStart : uvStartNeg;
+    }
+
+    encodeRectPos(posA, posB, posStart);
+    encodeRectPos(posEndA, posEndB, posEnd);
+
+    // now check if it's completely outside the screen or has a zero-size
+    vec16 extend = posStart < posEnd;
+    temp1 = get_vcc();
+
+    temp0 = temp1 & 0b0000'0011; // only check X/Y
+    clipA |= temp0;
+
+    temp0 = temp1 & 0b0011'0000;
+    clipB |= temp0;
+
+    // load color and prepare RDP command IDs
+    #ifdef LOOP_16BIT
+      vec16 color = load(ptrIn, 16).xyzw;
+    #else
+      vec16 color = load(ptrIn, 8).xyzw;
+    #endif
+
+    cmdRdpRect = RDP_CMD_TEX_RECT;
+
+    // Save the rectangles now. Each one consists of 3 commands: color, depth, rect
+    // The first one is always saved here to allow better reordering,
+    // however both will only submit it by advancing 'dmaDmemEnd'
+
+    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
+    // "Prim Color": |0x3A |  -  |    LOD    ||      color (RGBA)     |
+    store(color.xy, dmaDmemEnd, 4);
+
+    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
+    // "Prim Depth": |0x2E |  -  |  -  |  -  ||   depth   |  delta-Z  |
+    store(posCenter.z, dmaDmemEnd, 0x04, 8);
+
+    //       Offset: |  0  |  1  |  2  |  3  ||  4  |  5  |  6  |  7  |
+    //   "Tex Rect": |0x24 |   X0/Y0 (10.2)  ||  -  |  X1/Y1 (10.2)   |
+    //               |  S (5.10) | T (5.10)  || Ds (5.10) | Dt (5.10) |
+    store(posA, dmaDmemEnd, 0x04, 16);
+    @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16);
+    @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16);
+    store(uvStart.xy, dmaDmemEnd, 0x08, 16);
+    store(uvDelta.xy, dmaDmemEnd, 0x0C, 16);
+
+    // load particle for next iteration
+    #ifdef LOOP_16BIT
+      posStart.xyzw = load(ptrIn, PARTICLE_INPUT_SIZE_S16).xyzw;
+      posStart.XYZW = load(ptrIn, PARTICLE_INPUT_SIZE_S16).XYZW;
+      localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+    #else
+      posStart = load_vec_s8(ptrIn, PARTICLE_INPUT_SIZE_S8);
+      localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW;
+      posStart >>= 8;
+    #endif
+
+    if(clipA == 0b0000'0011) {
+      dmaDmemEnd += 32;
+    }
+
+    #ifdef LOOP_16BIT
+      ptrIn += PARTICLE_INPUT_SIZE_S16;
+    #else
+      ptrIn += PARTICLE_INPUT_SIZE_S8;
+    #endif
+
+    // Second rectangle:
+    if(clipB == 0b0011'0000) {
+      store(color.zw, dmaDmemEnd, 4);
+      store(posCenter.Z, dmaDmemEnd, 0x04, 8);
+
+      store(posB, dmaDmemEnd, 0x04, 16);
+      @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16);
+      @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16);
+
+      store(uvStart.XY, dmaDmemEnd, 0x08, 16);
+      store(uvDelta.XY, dmaDmemEnd, 0x0C, 16);
+
+      dmaDmemEnd += 32;
+    }
+
+    mulMat4Vec3(mat0, mat1, mat2, mat3, posStart, posClip);
+
+    // When the RDP buffer in DMEM is full, submit and DMA them out.
+    // If anything is left, a last call after the loop will submit the rest.
+    if(dmaDmemEnd >= dmaDmemFlush) {
+      RDPQ_Send(dmaDmem, dmaDmemEnd);
+      dmaDmemEnd = dmaDmem;
+    }
+
+    temp1 = clip(posClip, posClip.wwwwWWWW);
+
+  } while (ptrIn != ptrInEnd)
+}
\ No newline at end of file
diff --git a/src/t3d/t3d.h b/src/t3d/t3d.h
index eb256e7c..c2ddbde0 100644
--- a/src/t3d/t3d.h
+++ b/src/t3d/t3d.h
@@ -50,7 +50,7 @@ typedef struct {
   /* 0x1C */ int16_t stB[2]; // UV fixed point 10.5 (pixel coords)
 } __attribute__((aligned(8))) T3DVertPacked;
 
-_Static_assert(sizeof(T3DVertPacked) == 0x20, "T3DVertPacked has wrong size");
+static_assert(sizeof(T3DVertPacked) == 0x20, "T3DVertPacked has wrong size");
 
 enum T3DDrawFlags {
   T3D_FLAG_DEPTH      = 1 << 0,
diff --git a/src/t3d/t3danim.h b/src/t3d/t3danim.h
index d34ea74d..ffa42cac 100644
--- a/src/t3d/t3danim.h
+++ b/src/t3d/t3danim.h
@@ -122,7 +122,7 @@ void t3d_anim_set_time(T3DAnim* anim, float time);
  * @param anim animation to get time for
  * @return current time in seconds
  */
-static float t3d_anim_get_time(const T3DAnim* anim) {
+inline static float t3d_anim_get_time(const T3DAnim* anim) {
   return anim->time;
 }
 
@@ -131,7 +131,7 @@ static float t3d_anim_get_time(const T3DAnim* anim) {
  * @param anim animation to get length for
  * @return length in seconds
  */
-static float t3d_anim_get_length(const T3DAnim* anim) {
+inline static float t3d_anim_get_length(const T3DAnim* anim) {
   return anim->animRef->duration;
 }
 
diff --git a/src/t3d/tpx.c b/src/t3d/tpx.c
index fc24598b..8c361a2b 100644
--- a/src/t3d/tpx.c
+++ b/src/t3d/tpx.c
@@ -12,8 +12,9 @@ extern rsp_ucode_t rsp_tiny3d;
 DEFINE_RSP_UCODE(rsp_tinypx);
 uint32_t TPX_RSP_ID = 0;
 
-#define SWAP_U32(a, b) {uint32_t tmp = a; a = b; b = tmp;}
-#define MAX_PARTICLES_COLOR 344
+#define SWAP_VALUE(a, b) do { auto tmp = (a); (a) = (b); (b) = tmp; } while(0) // statement-safe swap; operands are evaluated more than once
+#define MAX_PARTICLES_S8 344
+#define MAX_PARTICLES_S16 228
 
 static T3DMat4FP *matrixStack = NULL;
 
@@ -76,29 +77,54 @@ void tpx_state_set_tex_params(int16_t offsetX, uint16_t mirrorPoint)
   tpx_dmem_set_u32(RSP_TPX_TEX_OFFSET, val);
 }
 
-inline static void tpx_particle_draw_generic(TPXParticle *particles, uint32_t count, uint32_t rspCmd)
+inline static void tpx_particle_draw_generic_s8(TPXParticleS8 *particles, uint32_t count, uint32_t rspCmd)
 {
   assert((count & 1) == 0);
 
-  for(uint32_t i = 0; i < count; i += MAX_PARTICLES_COLOR) {
+  for(uint32_t i = 0; i < count; i += MAX_PARTICLES_S8) {
     uint32_t batchSize = (count - i);
-    if(batchSize > MAX_PARTICLES_COLOR)batchSize = MAX_PARTICLES_COLOR;
+    if(batchSize > MAX_PARTICLES_S8)batchSize = MAX_PARTICLES_S8;
 
-    uint32_t loadSize = sizeof(TPXParticle) * batchSize / 2;
+    uint32_t loadSize = sizeof(TPXParticleS8) * batchSize / 2;
     rdpq_write(-1, TPX_RSP_ID, rspCmd,
-        loadSize, (uint32_t)UncachedAddr(particles)
+        loadSize, ((uint32_t)(particles) & 0xFFFFFF)
     );
 
-    particles += MAX_PARTICLES_COLOR / 2;
+    particles += MAX_PARTICLES_S8 / 2;
   }
 }
 
-void tpx_particle_draw(TPXParticle *particles, uint32_t count) {
-  tpx_particle_draw_generic(particles, count, TPX_CMD_DRAW_COLOR);
+inline static void tpx_particle_draw_generic_s16(TPXParticleS16 *particles, uint32_t count, uint32_t rspCmd)
+{
+  assert((count & 1) == 0);
+
+  for(uint32_t i = 0; i < count; i += MAX_PARTICLES_S16) {
+    uint32_t batchSize = (count - i);
+    if(batchSize > MAX_PARTICLES_S16)batchSize = MAX_PARTICLES_S16;
+
+    uint32_t loadSize = sizeof(TPXParticleS16) * batchSize / 2;
+    rdpq_write(-1, TPX_RSP_ID, rspCmd,
+        loadSize, ((uint32_t)(particles) & 0xFFFFFF) | 0x8000'0000
+    );
+
+    particles += MAX_PARTICLES_S16 / 2;
+  }
 }
 
-void tpx_particle_draw_tex(TPXParticle *particles, uint32_t count) {
-  tpx_particle_draw_generic(particles, count, TPX_CMD_DRAW_TEXTURE);
+void tpx_particle_draw_s8(TPXParticleS8 *particles, uint32_t count) {
+  tpx_particle_draw_generic_s8(particles, count, TPX_CMD_DRAW_COLOR);
+}
+
+void tpx_particle_draw_s16(TPXParticleS16 *particles, uint32_t count) {
+  tpx_particle_draw_generic_s16(particles, count, TPX_CMD_DRAW_COLOR);
+}
+
+void tpx_particle_draw_tex_s8(TPXParticleS8 *particles, uint32_t count) {
+  tpx_particle_draw_generic_s8(particles, count, TPX_CMD_DRAW_TEXTURE);
+}
+
+void tpx_particle_draw_tex_s16(TPXParticleS16 *particles, uint32_t count) {
+  tpx_particle_draw_generic_s16(particles, count, TPX_CMD_DRAW_TEXTURE);
 }
 
 inline static void tpx_matrix_stack(void *mat, int32_t stackAdvance, bool doMultiply, bool onlyStackMove) {
@@ -127,18 +153,29 @@ void tpx_matrix_push_pos(int count) {
   tpx_matrix_stack(NULL, stackAdvance, false, true);
 }
 
-void tpx_buffer_swap(TPXParticle pt[], uint32_t idxA, uint32_t idxB) {
+void tpx_buffer_s8_swap(TPXParticleS8 pt[], uint32_t idxA, uint32_t idxB) {
   uint32_t *dataA = (uint32_t*)&pt[idxA/2];
   uint32_t *dataB = (uint32_t*)&pt[idxB/2];
 
   dataA += idxA & 1;
   dataB += idxB & 1;
 
-  SWAP_U32(dataA[0], dataB[0]);
-  SWAP_U32(dataA[2], dataB[2]);
+  SWAP_VALUE(dataA[0], dataB[0]);
+  SWAP_VALUE(dataA[2], dataB[2]);
 }
 
-void tpx_buffer_copy(TPXParticle *pt, uint32_t idxDst, uint32_t idxSrc) {
+void tpx_buffer_s16_swap(TPXParticleS16 pt[], uint32_t idxA, uint32_t idxB)
+{
+  // Exchange the 8 bytes starting at each position first, then the 4 color bytes.
+  uint64_t *slotA = (uint64_t*)tpx_buffer_s16_get_pos(pt, idxA);
+  uint64_t *slotB = (uint64_t*)tpx_buffer_s16_get_pos(pt, idxB);
+  SWAP_VALUE(*slotA, *slotB);
+  uint32_t *colorA = (uint32_t*)tpx_buffer_s16_get_rgba(pt, idxA);
+  uint32_t *colorB = (uint32_t*)tpx_buffer_s16_get_rgba(pt, idxB);
+  SWAP_VALUE(*colorA, *colorB);
+}
+
+void tpx_buffer_s8_copy(TPXParticleS8 *pt, uint32_t idxDst, uint32_t idxSrc) {
   uint32_t *dataDst = (uint32_t*)&pt[idxDst/2];
   uint32_t *dataSrc = (uint32_t*)&pt[idxSrc/2];
 
@@ -149,6 +186,17 @@ void tpx_buffer_copy(TPXParticle *pt, uint32_t idxDst, uint32_t idxSrc) {
   dataDst[2] = dataSrc[2];
 }
 
+void tpx_buffer_s16_copy(TPXParticleS16 pt[], uint32_t idxDst, uint32_t idxSrc)
+{
+  // Copy the 8 bytes starting at the particle position in one go.
+  uint64_t *slotDst = (uint64_t*)tpx_buffer_s16_get_pos(pt, idxDst);
+  *slotDst = *(const uint64_t*)tpx_buffer_s16_get_pos(pt, idxSrc);
+
+  // The color is copied separately as one 4-byte value.
+  uint32_t *colorDst = (uint32_t*)tpx_buffer_s16_get_rgba(pt, idxDst);
+  *colorDst = *(const uint32_t*)tpx_buffer_s16_get_rgba(pt, idxSrc);
+}
+
 void tpx_destroy()
 {
   if(matrixStack)
diff --git a/src/t3d/tpx.h b/src/t3d/tpx.h
index 1efd95fa..48f063c3 100644
--- a/src/t3d/tpx.h
+++ b/src/t3d/tpx.h
@@ -49,9 +49,27 @@ typedef struct {
   int8_t sizeB;
   uint8_t colorA[4];
   uint8_t colorB[4];
-}  __attribute__((packed, aligned(16))) TPXParticle;
+}  __attribute__((packed, aligned(16))) TPXParticleS8;
 
-_Static_assert(sizeof(TPXParticle) == 16, "TPXParticle size mismatch");
+static_assert(sizeof(TPXParticleS8) == 16, "TPXParticleS8 size mismatch");
+
+/**
+ * @deprecated Use 'TPXParticleS8' instead.
+ */
+[[deprecated("Use 'TPXParticleS8' instead")]] typedef TPXParticleS8 TPXParticle;
+
+typedef struct {
+  int16_t posA[3];    // particle A of the pair (even index)
+  int8_t sizeA;
+  uint8_t texOffsetA;
+  int16_t posB[3];    // particle B of the pair (odd index)
+  int8_t sizeB;
+  uint8_t texOffsetB;
+  uint8_t colorA[4];
+  uint8_t colorB[4];
+} __attribute__((packed, aligned(8))) TPXParticleS16;
+
+static_assert(sizeof(TPXParticleS16) == 24, "TPXParticleS16 size mismatch");
 
 /**
  * @brief Initializes the tinyPX library
@@ -103,25 +121,65 @@ void tpx_state_set_base_size(uint16_t baseSize);
 void tpx_state_set_tex_params(int16_t offsetX, uint16_t mirrorPoint);
 
 /**
- * Draws a given amount of particles.
+ * Draws a given amount of particles (8bit position precision).
  * In contrast to triangles in t3d, this works in a single command.
  * So load, transform and draw happens in one go.
  * @param particles pointer to the particle data
  * @param count number of particles to draw
  */
-void tpx_particle_draw(TPXParticle *particles, uint32_t count);
+void tpx_particle_draw_s8(TPXParticleS8 *particles, uint32_t count);
+
+[[deprecated("Use 'tpx_particle_draw_s8' instead")]]
+inline static void tpx_particle_draw(TPXParticleS8 *particles, uint32_t count) {
+  tpx_particle_draw_s8(particles, count); // legacy name, forwards unchanged to the S8 variant
+}
+
+/**
+ * Draws a given amount of particles (16bit position precision).
+ * 16bit Precision gives you larger range but comes with slightly more memory and runtime cost.
+ * Whenever possible use the 8bit version instead.
+ * It is most useful if you need to cover large ranges, e.g. when using it for billboards in scene.
+ *
+ * In contrast to triangles in t3d, this works in a single command.
+ * So load, transform and draw happens in one go.
+ * @param particles pointer to the particle data
+ * @param count number of particles to draw
+ */
+void tpx_particle_draw_s16(TPXParticleS16 *particles, uint32_t count);
 
 /**
  * Draws a given amount of particles with a texture.
  * In contrast to triangles in t3d, this works in a single command.
  * So load, transform and draw happens in one go.
+ *
  * Note: this expects that you already setup textures.
  * It will also always use TILE0 for the rect-commands.
+ * The color's alpha channel acts as a texture offset.
  *
  * @param particles pointer to the particle data
  * @param count number of particles to draw
  */
-void tpx_particle_draw_tex(TPXParticle *particles, uint32_t count);
+void tpx_particle_draw_tex_s8(TPXParticleS8 *particles, uint32_t count);
+
+[[deprecated("Use 'tpx_particle_draw_tex_s8' instead")]]
+inline static void tpx_particle_draw_tex(TPXParticleS8 *particles, uint32_t count) {
+  tpx_particle_draw_tex_s8(particles, count); // legacy name, forwards unchanged to the S8 variant
+}
+
+/**
+ * Draws a given amount of particles with a texture (16bit position precision).
+ * 16bit Precision gives you larger range but comes with slightly more memory and runtime cost.
+ * Whenever possible use the 8bit version instead.
+ * It is most useful if you need to cover large ranges, e.g. when using it for billboards in scene.
+ *
+ * Note: this expects that you already setup textures.
+ * It will also always use TILE0 for the rect-commands.
+ * A per-particle texture offset can be set in 'texOffsetA'/'texOffsetB'.
+ *
+ * @param particles pointer to the particle data
+ * @param count number of particles to draw
+ */
+void tpx_particle_draw_tex_s16(TPXParticleS16 *particles, uint32_t count);
 
 /**
  * Directly loads a matrix, overwriting the current stack position.
@@ -165,44 +223,128 @@ void tpx_matrix_push_pos(int count);
  * @param vert particle buffer
  * @param idx particle index
  */
-static inline int8_t* tpx_buffer_get_pos(TPXParticle pt[], int idx) {
+static inline int8_t* tpx_buffer_s8_get_pos(TPXParticleS8 pt[], int idx) {
   return (idx & 1) ? pt[idx/2].posB : pt[idx/2].posA;
 }
 
+[[deprecated("Use 'tpx_buffer_s8_get_pos' instead")]]
+static inline int8_t* tpx_buffer_get_pos(TPXParticleS8 pt[], int idx) {
+  return tpx_buffer_s8_get_pos(pt, idx);
+}
+
 /**
  * Returns the pointer to the size of a particle in a buffer
  * @param pt particle buffer
  * @param idx particle index
  */
-static inline int8_t* tpx_buffer_get_size(TPXParticle pt[], int idx) {
+static inline int8_t* tpx_buffer_s8_get_size(TPXParticleS8 pt[], int idx) {
   return (idx & 1) ? &pt[idx/2].sizeB : &pt[idx/2].sizeA;
 }
 
+[[deprecated("Use 'tpx_buffer_s8_get_size' instead")]]
+static inline int8_t* tpx_buffer_get_size(TPXParticleS8 pt[], int idx) {
+  return tpx_buffer_s8_get_size(pt, idx);
+}
+
 /**
  * Returns the pointer to the color (as a u32) of a particle in a buffer
  * @param pt particle buffer
  * @param idx particle index
  */
-static inline uint32_t* tpx_buffer_get_color(TPXParticle pt[], int idx) {
+static inline uint32_t* tpx_buffer_s8_get_color(TPXParticleS8 pt[], int idx) {
   return (idx & 1) ? (uint32_t*)&pt[idx/2].colorB : (uint32_t*)&pt[idx/2].colorA;
 }
 
+[[deprecated("Use 'tpx_buffer_s8_get_color' instead")]]
+static inline uint32_t* tpx_buffer_get_color(TPXParticleS8 pt[], int idx) {
+  return tpx_buffer_s8_get_color(pt, idx);
+}
+
 /**
  * Returns the pointer to the color (as a u8[4]) of a particle in a buffer
  * @param pt particle buffer
  * @param idx particle index
  */
-static inline uint8_t* tpx_buffer_get_rgba(TPXParticle pt[], int idx) {
+static inline uint8_t* tpx_buffer_s8_get_rgba(TPXParticleS8 pt[], int idx) {
   return (idx & 1) ? pt[idx/2].colorB : pt[idx/2].colorA;
 }
 
+[[deprecated("Use 'tpx_buffer_s8_get_rgba' instead")]]
+static inline uint8_t* tpx_buffer_get_rgba(TPXParticleS8 pt[], int idx) {
+  return tpx_buffer_s8_get_rgba(pt, idx);
+}
+
+/**
+ * Returns the pointer to a position of a particle in a buffer
+ * @param pt particle buffer
+ * @param idx particle index
+ */
+static inline int16_t* tpx_buffer_s16_get_pos(TPXParticleS16 pt[], int idx) {
+  return (idx & 1) ? pt[idx/2].posB : pt[idx/2].posA;
+}
+
+/**
+ * Returns the pointer to the size of a particle in a buffer
+ * @param pt particle buffer
+ * @param idx particle index
+ */
+static inline int8_t* tpx_buffer_s16_get_size(TPXParticleS16 pt[], int idx) {
+  return (idx & 1) ? &pt[idx/2].sizeB : &pt[idx/2].sizeA;
+}
+
+/**
+ * Returns the pointer to the color (as a u8[4]) of a particle in a buffer
+ * @param pt particle buffer
+ * @param idx particle index
+ */
+static inline uint8_t* tpx_buffer_s16_get_rgba(TPXParticleS16 pt[], int idx) {
+  return (idx & 1) ? pt[idx/2].colorB : pt[idx/2].colorA;
+}
+
+/**
+ * Returns the pointer to the texture offset in the buffer.
+ * This is only present in the 16bit buffer, in the 8bit version this is stored in the alpha channel.
+ * @param pt particle buffer
+ * @param idx particle index
+ */
+static inline uint8_t* tpx_buffer_s16_get_tex_offset(TPXParticleS16 pt[], int idx) {
+  return (idx & 1) ? &pt[idx/2].texOffsetB : &pt[idx/2].texOffsetA; // odd index = second (B) particle of the pair
+}
+
+/**
+ * Swaps two particles in a buffer
+ * @param pt buffer to swap particles in
+ * @param idxA index of the first particle
+ * @param idxB index of the second particle
+ */
+void tpx_buffer_s8_swap(TPXParticleS8 pt[], uint32_t idxA, uint32_t idxB);
+
+[[deprecated("Use 'tpx_buffer_s8_swap' instead")]]
+static inline void tpx_buffer_swap(TPXParticleS8 pt[], uint32_t idxA, uint32_t idxB) {
+  tpx_buffer_s8_swap(pt, idxA, idxB);
+}
+
 /**
  * Swaps two particles in a buffer
  * @param pt buffer to swap particles in
  * @param idxA index of the first particle
  * @param idxB index of the second particle
  */
-void tpx_buffer_swap(TPXParticle pt[], uint32_t idxA, uint32_t idxB);
+void tpx_buffer_s16_swap(TPXParticleS16 pt[], uint32_t idxA, uint32_t idxB);
+
+/**
+ * Copies a particle into another place in a buffer
+ * This will overwrite the destination particle and keep the source particle unchanged.
+ * @param pt buffer to copy particles in
+ * @param idxDst destination index
+ * @param idxSrc source index
+ */
+void tpx_buffer_s8_copy(TPXParticleS8 pt[], uint32_t idxDst, uint32_t idxSrc);
+
+[[deprecated("Use 'tpx_buffer_s8_copy' instead")]]
+static inline void tpx_buffer_copy(TPXParticleS8 pt[], uint32_t idxDst, uint32_t idxSrc) {
+  tpx_buffer_s8_copy(pt, idxDst, idxSrc);
+}
 
 /**
  * Copies a particle into another place in a buffer
@@ -211,7 +353,7 @@ void tpx_buffer_swap(TPXParticle pt[], uint32_t idxA, uint32_t idxB);
  * @param idxDst destination index
  * @param idxSrc source index
  */
-void tpx_buffer_copy(TPXParticle pt[], uint32_t idxDst, uint32_t idxSrc);
+void tpx_buffer_s16_copy(TPXParticleS16 pt[], uint32_t idxDst, uint32_t idxSrc);
 
 /**
  * Destroys the tinyPX library and frees all resources