diff --git a/examples/18_particles/main.c b/examples/18_particles/main.c index d36008ba..ff1aa781 100644 --- a/examples/18_particles/main.c +++ b/examples/18_particles/main.c @@ -43,10 +43,10 @@ int main() rdpq_init(); //rdpq_debug_start(); + uint64_t rspTimeTPX = 0; + uint64_t rdpTimeBusy = 0; #if RSPQ_PROFILE rspq_profile_data_t profile_data = (rspq_profile_data_t){}; - uint64_t rdpTimeBusy = 0; - uint64_t rspTimeTPX = 0; rspq_profile_start(); #endif @@ -65,13 +65,20 @@ int main() // Meaning you only have to allocate an buffer of arbitrary size here and fill it with data. uint32_t particleCountMax = 100'000; uint32_t particleCount = 2000; + // NOTE: just like with vertices, particles are interleaved in pairs of 2. - // So one TPXParticle struct always contains 2 particles. + // So one TPXParticleS8 struct always contains 2 particles. // If you need an odd number, just set the second particle size to 0. - uint32_t allocSize = sizeof(TPXParticle) * particleCountMax / 2; - TPXParticle *particles = malloc_uncached(allocSize); + uint32_t allocSize = sizeof(TPXParticleS8) * particleCountMax / 2; + TPXParticleS8 *particlesS8 = malloc_uncached(allocSize); debugf("Particle-Buffer %ldkb\n", allocSize / 1024); - generate_particles_random(particles, particleCount); + + // Additionally, a 16bit version of particles is available. + // This one takes up more space (24 bytes vs 16 bytes per pair) and is slightly slower. + // In return, it can cover a larger range which can be useful for 3D sprites placed in a scene. + // The 8bit variant should be preferred when possible (e.g. in local particle effects) + allocSize = sizeof(TPXParticleS16) * particleCountMax / 2; + TPXParticleS16 *particlesS16 = malloc_uncached(allocSize); // Now some regular 3D stuff, not related to particles. 
T3DModel *model = t3d_model_load("rom://scene.t3dm"); @@ -111,6 +118,7 @@ int main() float time = 0; bool needRebuild = true; int frameIdx = 0; + bool measureTime = false; for(;;) { @@ -132,6 +140,10 @@ int main() if(joypad.btn.c_up)partSizeY += deltaTime * 0.6f; if(joypad.btn.c_down)partSizeY -= deltaTime * 0.6f; + #if RSPQ_PROFILE + measureTime = joypad.btn.z; + #endif + partSizeX = fmaxf(0.01f, fminf(1.0f, partSizeX)); partSizeY = fmaxf(0.01f, fminf(1.0f, partSizeY)); @@ -173,6 +185,7 @@ int main() // A few example particles systems. // This will modify the particle buffer on the CPU side. + bool isS16 = false; switch(example) { case 0: // Random @@ -180,9 +193,9 @@ int main() particleRot = (T3DVec3){{time,time*0.77f,time*1.42f}}; particleMatScale = (T3DVec3){{partMatScaleVal, partMatScaleVal, partMatScaleVal}}; - if(needRebuild)generate_particles_random(particles, particleCount); + if(needRebuild)generate_particles_random(particlesS8, particleCount); rdpq_set_env_color((color_t){0xFF, 0xFF, 0xFF, 0xFF}); - break; + break; case 1: // Flame particleRot = (T3DVec3){{0,0,0}}; if(!joypad.btn.z)time += deltaTime * 1.0f; @@ -190,17 +203,17 @@ int main() float posX = fm_cosf(time) * 80.0f; float posZ = fm_sinf(2*time) * 40.0f; - simulate_particles_fire(particles, particleCount, posX, posZ); + simulate_particles_fire(particlesS8, particleCount, posX, posZ); particleMatScale = (T3DVec3){{0.9f, partMatScaleVal, 0.9f}}; particlePos.y = partMatScaleVal * 130.0f; rdpq_set_env_color((color_t){0xFF, 0xFF, 0xFF, 0xFF}); - break; + break; case 2: // Grass time += deltaTime * 1.0f; particleRot = (T3DVec3){{0,0,0}}; particlePos.y = 0; if(needRebuild) { - particleCount = simulate_particles_grass(particles, particleCount); + particleCount = simulate_particles_grass(particlesS16, particleCount); } particleMatScale = (T3DVec3){{partMatScaleVal, partSizeY * 2.9f, partMatScaleVal}}; rdpq_set_env_color(blend_colors( @@ -208,7 +221,8 @@ int main() (color_t){0xFF, 0xAA, 0x55, 0xFF}, 
fm_sinf(time)*0.5f+0.5f )); - break; + isS16 = true; + break; } needRebuild = false; @@ -267,10 +281,29 @@ int main() // This can only scale particles down, so the range is 0.0 - 1.0. tpx_state_set_scale(partSizeX, partSizeY); + if(measureTime) { + rspq_wait(); + rspq_highpri_begin(); + wait_ms(2); + rspTimeTPX = get_ticks(); + } + // Now draw particles. internally this will load, transform and draw them in one go on the RSP. // While the ucode can only handle a 344 at a time, this function will automatically batch them // so you can specify an arbitrary amount of particles (as long as it's an even count) - tpx_particle_draw(particles, particleCount); + if(isS16) { + tpx_particle_draw_s16(particlesS16, particleCount); + } else { + tpx_particle_draw_s8(particlesS8, particleCount); + } + + if(measureTime) + { + rspq_highpri_end(); + rspq_highpri_sync(); + rspTimeTPX = get_ticks() - rspTimeTPX; + rspTimeTPX = TICKS_TO_US(rspTimeTPX); + } // Make sure end up at the same stack level as before. tpx_matrix_pop(1); @@ -283,7 +316,8 @@ int main() t3d_debug_printf(20, 30, "[C] %.2f %.2f", partSizeX, partSizeY); t3d_debug_printf(220, 18, "FPS: %.2f", display_get_fps()); - #if RSPQ_PROFILE + if(measureTime) + { double timePerPart = 0; if(particleCount > 0) { timePerPart = (double)rspTimeTPX / (double)particleCount * 1000; @@ -291,10 +325,9 @@ int main() t3d_debug_printf(20, 240-34, "RSP/tpx: %6lldus %.1f", rspTimeTPX, timePerPart); //t3d_debug_printf(20, 240-34, "RSP/tpx: %6lldus", rspTimeTPX); t3d_debug_printf(20, 240-24, "RDP : %6lldus", rdpTimeBusy); - #else + } else { t3d_debug_printf(20, 240-24, "[L/R]: %s", EXAMPLE_NAMES[example]); - #endif - + } rdpq_detach_show(); #if RSPQ_PROFILE diff --git a/examples/18_particles/partSim.h b/examples/18_particles/partSim.h index 1239baa7..7df53355 100644 --- a/examples/18_particles/partSim.h +++ b/examples/18_particles/partSim.h @@ -5,7 +5,7 @@ static int currentPart = 0; /** * Basic static particles with random positions and colors. 
*/ -static void generate_particles_random(TPXParticle *particles, uint32_t count) { +static void generate_particles_random(TPXParticleS8 *particles, uint32_t count) { for (int i = 0; i < count; i++) { int p = i / 2; int8_t *ptPos = i % 2 == 0 ? particles[p].posA : particles[p].posB; @@ -47,7 +47,7 @@ static int noise_2d(int x, int y) { * Static particles simulating grass. * This will create a random grid of 3 particles stacked on top of each other representing grass-blades. */ -static int simulate_particles_grass(TPXParticle *particles, uint32_t partCount) { +static int simulate_particles_grass(TPXParticleS16 *particles, uint32_t partCount) { int dist = 3; int heightParts = 3; @@ -56,14 +56,14 @@ static int simulate_particles_grass(TPXParticle *particles, uint32_t partCount) int p = 0; for(int y=heightParts-1; y>=0; --y) { - int8_t ptPosX = -(dist * sideLen) / 2; + int16_t ptPosX = -(dist * sideLen) / 2; for(int x=0; x 0) { timePerPart = (double)rspTimeTPX / (double)particleCount * 1000; @@ -343,9 +377,9 @@ int main() t3d_debug_printf(20, 240-34, "RSP/tpx: %6lldus %.1f", rspTimeTPX, timePerPart); //t3d_debug_printf(20, 240-34, "RSP/tpx: %6lldus", rspTimeTPX); t3d_debug_printf(20, 240-24, "RDP : %6lldus", rdpTimeBusy); - #else + } else { t3d_debug_printf(20, 240-24, "[L/R]: %s", EXAMPLE_NAMES[example]); - #endif + } rdpq_detach_show(); diff --git a/examples/19_particles_tex/partSim.h b/examples/19_particles_tex/partSim.h index cf967883..7bc1bf72 100644 --- a/examples/19_particles_tex/partSim.h +++ b/examples/19_particles_tex/partSim.h @@ -2,16 +2,6 @@ static int currentPart = 0; -static color_t blend_colors(color_t colorA, color_t colorB, float t) { - color_t color; - color.r = (uint8_t)(colorA.r * (1.0f - t) + colorB.r * t); - color.g = (uint8_t)(colorA.g * (1.0f - t) + colorB.g * t); - color.b = (uint8_t)(colorA.b * (1.0f - t) + colorB.b * t); - color.a = (uint8_t)(colorA.a * (1.0f - t) + colorB.a * t); - return color; -} - - static color_t get_rainbow_color(float 
s, float brightness) { float r = fm_sinf(s) * 0.5f + 0.5f; float g = fm_sinf(s + 2.094f) * 0.5f + 0.5f; @@ -82,7 +72,7 @@ static int noise_2d(int x, int y) { return (n * (n * n * 60493 + 19990303) + 89); } -static void generate_particles_random(TPXParticle *particles, uint32_t count) { +static void generate_particles_random(TPXParticleS8 *particles, uint32_t count) { for (int i = 0; i < count; i++) { int p = i / 2; int8_t *ptPos = i % 2 == 0 ? particles[p].posA : particles[p].posB; @@ -114,7 +104,7 @@ static void generate_particles_random(TPXParticle *particles, uint32_t count) { } } -static int simulate_particles_coins(TPXParticle *particles, uint32_t partCount) { +static int simulate_particles_coins(TPXParticleS16 *particles, uint32_t partCount) { int dist = 3; int heightParts = 1; @@ -122,14 +112,15 @@ static int simulate_particles_coins(TPXParticle *particles, uint32_t partCount) int p = 0; - int8_t ptPosX = -(dist * sideLen) / 2; + int16_t ptPosX = -(dist * sideLen) / 2; for(int x=0; xa = (rand() % 8) * 32; ptPos[0] = ptPosX + ((rnd % 3) - 1); ptPos[1] = height; ptPos[2] = ptPosZ + ((rnd % 3) - 1); - *tpx_buffer_get_size(particles, p) = size; + *tpx_buffer_s16_get_size(particles, p) = size; ptPosZ += dist; @@ -160,17 +150,17 @@ static int simulate_particles_coins(TPXParticle *particles, uint32_t partCount) * This will simulate particles over time by moving them up and changing their color. * The current position is used to spawn new particles, so it can move over time leaving a trail behind. */ -static void simulate_particles_fire(TPXParticle *particles, uint32_t partCount, float posX, float posZ) { - uint32_t p = currentPart / 2; +static void simulate_particles_fire(TPXParticleS8 *particles, uint32_t partCount, float posX, float posZ) { + int p = currentPart / 2; if(currentPart % (1+(rand() % 3)) == 0) { - int8_t *ptPos = currentPart % 2 == 0 ? particles[p].posA : particles[p].posB; - int8_t *size = currentPart % 2 == 0 ? 
&particles[p].sizeA : &particles[p].sizeB; - uint8_t *color = currentPart % 2 == 0 ? particles[p].colorA : particles[p].colorB; + int8_t *ptPos = tpx_buffer_s8_get_pos(particles, p); + int8_t *size = tpx_buffer_s8_get_size(particles, p); + uint8_t *color = tpx_buffer_s8_get_rgba(particles, p); ptPos[0] = posX + (rand() % 16) - 8; ptPos[1] = -126; gradient_fire(color, 0); - color[3] = (PhysicalAddr(ptPos) % 8) * 32; + color[3] = ((PhysicalAddr(ptPos) % 8) * 32); ptPos[2] = posZ + (rand() % 16) - 8; *size = 60 + (rand() % 10); diff --git a/examples/24_hdr_bloom/src/actors/magicSpell.cpp b/examples/24_hdr_bloom/src/actors/magicSpell.cpp index 9a36ec04..9517f25d 100644 --- a/examples/24_hdr_bloom/src/actors/magicSpell.cpp +++ b/examples/24_hdr_bloom/src/actors/magicSpell.cpp @@ -40,9 +40,9 @@ namespace Actor args.scale *= BASE_SCALE; for(uint32_t i=0; igetCam().getFrustum(); if(!checkFrustumSphere(pos, args.scale * 90.0f))return; t3d_matrix_set(matFP.get(), true); diff --git a/examples/24_hdr_bloom/src/actors/pointGlobe.cpp b/examples/24_hdr_bloom/src/actors/pointGlobe.cpp index c775cbed..5317d5bf 100644 --- a/examples/24_hdr_bloom/src/actors/pointGlobe.cpp +++ b/examples/24_hdr_bloom/src/actors/pointGlobe.cpp @@ -96,8 +96,8 @@ namespace Actor float latIncr = (T3D_PI * 32.0f) / sampleCount; for(uint32_t i=0; i 0) { mat = (T3DMat4FP*)malloc_uncached(sizeof(T3DMat4FP)); - particles = static_cast(malloc_uncached(countMax * sizeof(TPXParticle) / 2)); + particles = static_cast(malloc_uncached(countMax * sizeof(TPXParticleS8) / 2)); } } @@ -36,7 +36,7 @@ void PTSystem::draw() const { if(count == 0)return; tpx_matrix_push(mat); uint32_t safeCount = count & ~1; - tpx_particle_draw(particles, safeCount); + tpx_particle_draw_s8(particles, safeCount); tpx_matrix_pop(1); } @@ -44,7 +44,7 @@ void PTSystem::drawTextured() const { if(count == 0)return; tpx_matrix_push(mat); uint32_t safeCount = count & ~1; - tpx_particle_draw_tex(particles, safeCount); + 
tpx_particle_draw_tex_s8(particles, safeCount); tpx_matrix_pop(1); } @@ -59,7 +59,7 @@ int PTSystem::drawTexturedSlice(int begin, int end) const auto size = end - begin; if(size <= 0)return 0; tpx_matrix_push(mat); - tpx_particle_draw_tex(particles + (begin/2), size); + tpx_particle_draw_tex_s8(particles + (begin/2), size); tpx_matrix_pop(1); return size; diff --git a/examples/24_hdr_bloom/src/render/ptSystem.h b/examples/24_hdr_bloom/src/render/ptSystem.h index 546d7134..adac36d0 100644 --- a/examples/24_hdr_bloom/src/render/ptSystem.h +++ b/examples/24_hdr_bloom/src/render/ptSystem.h @@ -10,7 +10,7 @@ struct PTSystem { T3DVec3 pos{}; T3DMat4FP *mat{}; - TPXParticle *particles{}; + TPXParticleS8 *particles{}; uint32_t countMax{}; uint32_t count{}; @@ -21,9 +21,9 @@ struct PTSystem [[nodiscard]] bool isFull() const { return count == countMax; } void removeParticle(uint32_t index) { - tpx_buffer_copy(particles, index, --count); + tpx_buffer_s8_copy(particles, index, --count); if(count & 1) { - *tpx_buffer_get_size(particles, count + 1u) = 0; + *tpx_buffer_s8_get_size(particles, count + 1u) = 0; } } diff --git a/src/t3d/rsp/rsp_tinypx.S b/src/t3d/rsp/rsp_tinypx.S index d2084609..051016d5 100644 --- a/src/t3d/rsp/rsp_tinypx.S +++ b/src/t3d/rsp/rsp_tinypx.S @@ -152,199 +152,310 @@ TPXCmd_SyncT3D: j RSPQ_Loop nop TPXCmd_DrawColor: - or $s0, $zero, $a1 - ori $s4, $zero, %lo(PARTICLE_BUFF) - andi $t0, $a0, 65535 - addu $s7, $s4, $t0 - or $t2, $zero, $zero - jal DMAExec ## Args: $t0, $t1, $s0, $s4, $t2 - addiu $t0, $t0, -1 - ori $at, $zero, %lo(MATRIX_MVP) - ldv $v26, 0, 24, $at - ldv $v26, 8, 24, $at - lw $s2, %lo(RDPQ_SCISSOR_RECT + 4) - ldv $v24, 0, 40, $at - ldv $v28, 0, 8, $at - srl $t4, $s2, 12 - ldv $v21, 0, 48, $at - ldv $v22, 0, 56, $at - ldv $v24, 8, 40, $at - vmudl $v20, $v00, $v31.e3 - ldv $v23, 0, 32, $at - ldv $v27, 0, 0, $at - ldv $v22, 8, 56, $at - ldv $v21, 8, 48, $at - ldv $v25, 0, 16, $at - or $s6, $zero, $s4 - ldv $v25, 8, 16, $at - mtc2 $s2, $v15.e1 - 
ldv $v28, 8, 8, $at - ldv $v23, 8, 32, $at - lw $s1, %lo(RDPQ_SCISSOR_RECT + 0) - ldv $v27, 8, 0, $at - ori $at, $zero, %lo(SCREEN_SCALE_OFFSET) - lui $t7, 0x3A00 - ori $s4, $zero, %lo(RDP_BUFF) - mtc2 $t4, $v15.e0 - mtc2 $t4, $v15.e4 - ldv $v19, 0, 0, $at - ldv $v18, 0, 8, $at - srl $t4, $s1, 12 - ldv $v18, 8, 8, $at - mtc2 $s2, $v15.e5 - ldv $v19, 8, 0, $at - ori $at, $zero, %lo(NORM_SCALE_W) - ldv $v17, 0, 0, $at - mtc2 $s1, $v16.e1 - ldv $v17, 8, 0, $at - addiu $at, $zero, 4095 - vmadm $v19, $v19, $v31.e3 - ori $s3, $zero, %lo(RDP_BUFF) - vmadn $v20, $v00, $v00 - mtc2 $at, $v15.e3 - mtc2 $t4, $v16.e0 - ori $at, $zero, %lo(PARTICLE_SCALE) - ori $s2, $zero, %lo(RDP_BUFF) - addiu $s5, $s3, 528 - mtc2 $t4, $v16.e4 - vand $v15, $v15, $v15.e3 - mtc2 $s1, $v16.e5 - llv $v13, 0, 0, $at - llv $v13, 8, 0, $at - addiu $a0, $zero, 46 - addiu $s1, $s5, 24 - vand $v16, $v16, $v15.e3 - LABEL_0001: - sw $t7, 0($s2) - sb $a0, 8($s2) - sh $zero, 14($s2) - addiu $s2, $s2, 24 - bne $s2, $s1, LABEL_0001 - nop - LABEL_0002: - jal DMAWaitIdle - nop - lpv $v10, 0, 0, $s6 - vmulf $v14, $v13, $v10.h3 - vmudm $v10, $v10, $v31.e7 - vmov $v10.e3, $v30.e7 - vmov $v10.e7, $v30.e7 - vxor $v09, $v00, $v30.e7 - vmudn $v12, $v28, $v10.h0 - vmadh $v11, $v27, $v10.h0 - vmadn $v12, $v26, $v10.h1 - vmadh $v11, $v25, $v10.h1 - vmadn $v12, $v24, $v10.h2 - vmadh $v11, $v23, $v10.h2 - vmadn $v12, $v22, $v10.h3 - vmadh $v11, $v21, $v10.h3 - vch $v29, $v11, $v11.h3 - vcl $v29, $v12, $v12.h3 - cfc2 $t4, $vcc - LABEL_0003: - vmudl $v12, $v12, $v17.v - ori $at, $zero, %lo(BASE_SIZE) - vmadm $v11, $v11, $v17.v - ldv $v03, 0, 8, $s6 - vmadn $v12, $v00, $v00 - vrcph $v07.e3, $v11.e3 - andi $t5, $t4, 16448 - vrcpl $v08.e3, $v12.e3 - addiu $s6, $s6, 16 - vrcph $v07.e3, $v11.e7 - vrcpl $v08.e7, $v12.e7 - vrcph $v07.e7, $v00.e7 - lsv $v11, 6, 0, $at - vmov $v12.e3, $v00.e0 - lsv $v11, 14, 0, $at - vmov $v12.e7, $v00.e0 - addiu $at, $zero, 3 - vmudl $v29, $v12, $v08.h3 - vmadm $v29, $v11, $v08.h3 - vmadn $v12, $v12, 
$v07.h3 - addiu $t2, $zero, 54 - vmadh $v11, $v11, $v07.h3 - vmulf $v14, $v14, $v11.h3 - vmudl $v29, $v12, $v20.v - vmadm $v29, $v11, $v20.v - slv $v03, 0, 4, $s3 - vmadn $v08, $v12, $v19.v - andi $t1, $t4, 1028 - vmadh $v07, $v11, $v19.v - vmadh $v06, $v18, $v09.v - vmadh $v05, $v09, $v14.v - vsubc $v10, $v06, $v14.v - vlt $v05, $v05, $v15 - vge $v10, $v10, $v16 - ssv $v06, 4, 12, $s3 - mfc2 $sp, $v05.e5 - mfc2 $k1, $v10.e0 - mfc2 $s2, $v10.e1 - andi $sp, $sp, 4095 - sll $k1, $k1, 12 - mfc2 $fp, $v05.e1 - andi $s2, $s2, 4095 - or $s2, $s2, $k1 - mfc2 $k1, $v10.e4 - mfc2 $s1, $v10.e5 - andi $fp, $fp, 4095 - sll $k1, $k1, 12 - andi $s1, $s1, 4095 - sw $s2, 4 + 16($s3) - or $s1, $s1, $k1 - vlt $v04, $v10, $v05 - mfc2 $k1, $v05.e0 - cfc2 $t4, $vcc - sll $k1, $k1, 12 - or $fp, $fp, $k1 - andi $a2, $t4, 3 - lpv $v10, 0, 0, $s6 - sw $fp, 0 + 16($s3) ## Barrier: 0x1 - or $t1, $t1, $a2 - mfc2 $k1, $v05.e4 - sb $t2, 0 + 16($s3) ## Barrier: 0x1 - andi $a2, $t4, 48 - vmulf $v14, $v13, $v10.h3 - sll $k1, $k1, 12 - vmudm $v10, $v10, $v31.e7 - or $sp, $sp, $k1 - vmov $v10.e3, $v30.e7 - bne $t1, $at, LABEL_0005 - or $t5, $t5, $a2 - addiu $s3, $s3, 24 - LABEL_0005: - addiu $at, $zero, 48 - vmov $v10.e7, $v30.e7 - bne $t5, $at, LABEL_0006 - nop - slv $v03, 4, 4, $s3 - sw $sp, 0 + 16($s3) ## Barrier: 0x1 - ssv $v06, 12, 12, $s3 - sb $t2, 0 + 16($s3) ## Barrier: 0x1 - sw $s1, 4 + 16($s3) - addiu $s3, $s3, 24 - LABEL_0006: - vmudn $v12, $v28, $v10.h0 - vmadh $v11, $v27, $v10.h0 - vmadn $v12, $v26, $v10.h1 - vmadh $v11, $v25, $v10.h1 - sltu $at, $s3, $s5 - vmadn $v12, $v24, $v10.h2 - vmadh $v11, $v23, $v10.h2 - vmadn $v12, $v22, $v10.h3 - bne $at, $zero, LABEL_0007 - vmadh $v11, $v21, $v10.h3 - jal RDPQ_Send ## Args: $s4, $s3 - nop - or $s3, $zero, $s4 - LABEL_0007: - vch $v29, $v11, $v11.h3 - vcl $v29, $v12, $v12.h3 - bne $s6, $s7, LABEL_0003 - cfc2 $t4, $vcc - LABEL_0004: - j RDPQ_Send - ori $ra, $zero, %lo(RSPQ_Loop) + or $s0, $zero, $a1 ## L:850 | ^ | dma_in_async(dmaDmem, 
rdramAddr, dmaSize); + andi $t0, $a0, 65535 ## L:834 | 2 | u32<$t0> dmaSize = dataSize & 0xFFFF; + ori $s4, $zero, %lo(PARTICLE_BUFF) ## L:833 | 3 | u16<$s4> dmaDmem = PARTICLE_BUFF; + addu $s7, $s4, $t0 ## L:849 | 4 | u32 ptrInEnd = dmaDmem + dmaSize; + addiu $t0, $t0, -1 ## L:850 | 5 | dma_in_async(dmaDmem, rdramAddr, dmaSize); + jal DMAExec ## L:850 | 6 | dma_in_async(dmaDmem, rdramAddr, dmaSize); ## Args: $t0, $t1, $s0, $s4, $t2 + or $t2, $zero, $zero ## L:850 | *8 | dma_in_async(dmaDmem, rdramAddr, dmaSize); + vmudl $v20, $v00, $v31.e3 ## L:865 | 9 | screenSize >>= 4; + lw $s2, %lo(RDPQ_SCISSOR_RECT + 4) ## L:1213 | ^ | u32 extMax = load(RDPQ_SCISSOR_RECT, 4); + lw $s1, %lo(RDPQ_SCISSOR_RECT + 0) ## L:1214 | 10 | u32 extMin = load(RDPQ_SCISSOR_RECT, 0); + ori $at, $zero, %lo(MATRIX_MVP) ## L:858 | 11 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw; + ldv $v21, 0, 48, $at ## L:861 | 12 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw; + mtc2 $s2, $v15.e5 ## L:1218 | 13 | screenMax.y = extMax; screenMax.Y = extMax; + ldv $v23, 0, 32, $at ## L:860 | 14 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw; + ldv $v28, 0, 8, $at ## L:858 | 15 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw; + ldv $v22, 0, 56, $at ## L:861 | 16 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw; + ldv $v26, 0, 24, $at ## L:859 | 17 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw; + ori $s3, $zero, %lo(RDP_BUFF) ## L:855 | 18 | u16<$s3> dmaDmemEnd = RDP_BUFF; + ldv $v25, 0, 16, $at ## L:859 | 19 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw; + ldv $v25, 8, 16, $at ## L:859 | 20 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw; + ldv $v24, 0, 40, $at ## L:860 | 21 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw; + ldv $v21, 8, 48, $at ## L:861 | 22 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw; + ldv $v26, 8, 24, $at ## L:859 | 23 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw; + ldv $v28, 8, 8, $at ## L:858 | 24 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw; + ldv $v27, 0, 0, $at ## L:858 | 25 | 
vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw; + ldv $v24, 8, 40, $at ## L:860 | 26 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw; + ldv $v23, 8, 32, $at ## L:860 | 27 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw; + ldv $v27, 8, 0, $at ## L:858 | 28 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw; + ldv $v22, 8, 56, $at ## L:861 | 29 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw; + ori $at, $zero, %lo(SCREEN_SCALE_OFFSET) ## L:863 | 30 | vec32 screenSize:sint = load(SCREEN_SCALE_OFFSET).xyzwxyzw; + ldv $v18, 0, 8, $at ## L:867 | 31 | vec16 screenOffset = load(SCREEN_SCALE_OFFSET, 0x08).xyzwxyzw; + mtc2 $s1, $v16.e5 ## L:1222 | 32 | screenMin.y = extMin; screenMin.Y = extMin; + ldv $v18, 8, 8, $at ## L:867 | 33 | vec16 screenOffset = load(SCREEN_SCALE_OFFSET, 0x08).xyzwxyzw; + srl $t4, $s2, 12 ## L:1216 | 34 | temp1 = extMax >> 12; + ldv $v19, 0, 0, $at ## L:863 | 35 | vec32 screenSize:sint = load(SCREEN_SCALE_OFFSET).xyzwxyzw; + mtc2 $s1, $v16.e1 ## L:1222 | 36 | screenMin.y = extMin; screenMin.Y = extMin; + ldv $v19, 8, 0, $at ## L:863 | 37 | vec32 screenSize:sint = load(SCREEN_SCALE_OFFSET).xyzwxyzw; + ori $at, $zero, %lo(NORM_SCALE_W) ## L:868 | 38 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw; + addiu $s5, $s3, 528 ## L:856 | 39 | u16 dmaDmemFlush = dmaDmemEnd + 528; + mtc2 $s2, $v15.e1 ## L:1218 | 40 | screenMax.y = extMax; screenMax.Y = extMax; + ori $s2, $zero, %lo(RDP_BUFF) ## L:881 | 41 | u16 buffRdp = RDP_BUFF; + ldv $v17, 0, 0, $at ## L:868 | 42 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw; + mtc2 $t4, $v15.e4 ## L:1217 | 43 | screenMax.x = temp1; screenMax.X = temp1; + ldv $v17, 8, 0, $at ## L:868 | 44 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw; + lui $t7, 0x3A00 ## L:883 | 45 | cmdRdpColor = 0x3A00'0000; + addiu $at, $zero, 4095 ## L:1212 | 46 | screenMax.w = 0b1111'1111'1111; + mtc2 $at, $v15.e3 ## L:1212 | 47 | screenMax.w = 0b1111'1111'1111; + mtc2 $t4, $v15.e0 ## L:1217 | 48 | screenMax.x = temp1; screenMax.X = temp1; + ori $at, 
$zero, %lo(PARTICLE_SCALE) ## L:875 | 49 | globalPartSize.xy = load(PARTICLE_SCALE).xy; + addiu $a0, $zero, 46 ## L:884 | 50 | cmdRdpDepth = 0x2E; + srl $t4, $s1, 12 ## L:1220 | 51 | temp1 = extMin >> 12; + vmadm $v19, $v19, $v31.e3 ## L:865 | ^ | screenSize >>= 4; + vmadn $v20, $v00, $v00 ## L:865 | 52 | screenSize >>= 4; + mtc2 $t4, $v16.e0 ## L:1221 | ^ | screenMin.x = temp1; screenMin.X = temp1; + mtc2 $t4, $v16.e4 ## L:1221 | 53 | screenMin.x = temp1; screenMin.X = temp1; + vand $v15, $v15, $v15.e3 ## L:1224 | ^ | screenMax &= screenMax.w; + llv $v13, 0, 0, $at ## L:875 | 54 | globalPartSize.xy = load(PARTICLE_SCALE).xy; + llv $v13, 8, 0, $at ## L:876 | 55 | globalPartSize.XY = load(PARTICLE_SCALE).xy; + addiu $s1, $s5, 24 ## L:882 | 56 | u16 buffRdpEnd = dmaDmemFlush + 24; + or $s6, $zero, $s4 ## L:852 | 57 | u32 ptrIn = dmaDmem; + ori $s4, $zero, %lo(RDP_BUFF) ## L:854 | 58 | dmaDmem = RDP_BUFF; + vand $v16, $v16, $v15.e3 ## L:1225 | ^ | screenMin &= screenMax.w; + LABEL_TPXCmd_DrawColor_0001: + sh $zero, 14($s2) ## L:888 | 59 | store(ZERO:u16, buffRdp, 0x0E); + sw $t7, 0($s2) ## L:886 | 60 | store(cmdRdpColor, buffRdp, 0); + sb $a0, 8($s2) ## L:887 | 61 | store(cmdRdpDepth, buffRdp, 8); + addiu $s2, $s2, 24 ## L:889 | 62 | buffRdp += 24; + bne $s2, $s1, LABEL_TPXCmd_DrawColor_0001 ## L:889 | 63 | buffRdp += 24; + nop ## L:889 | *65 | buffRdp += 24; + LABEL_TPXCmd_DrawColor_0002: + jal DMAWaitIdle ## L:893 | 66 | dma_await(); + nop ## L:893 | *68 | dma_await(); + bgez $a1, LABEL_TPXCmd_DrawColor_0003 ## L:900 | 69 | if(rdramAddr < 0) { + vxor $v10, $v00, $v30.e7 ## L:897 | *71 | const vec16 vecOne = 1; + lqv $v09, 0, 0, $s6 ## L:466 | 72 | vec16 posStart = load(ptrIn, 0x00); + vmulf $v14, $v13, $v09.h3 ## L:467 | ***76 | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + vmudn $v12, $v28, $v09.h0 ## L:93 | 77 | out = mat0 * vec.xxxxXXXX; + vmadh $v11, $v27, $v09.h0 ## L:93 | 78 | out = mat0 * vec.xxxxXXXX; + vmadn $v12, $v26, $v09.h1 ## L:94 
| 79 | out = mat1 +* vec.yyyyYYYY; + vmadh $v11, $v25, $v09.h1 ## L:94 | 80 | out = mat1 +* vec.yyyyYYYY; + vmadn $v12, $v24, $v09.h2 ## L:95 | 81 | out = mat2 +* vec.zzzzZZZZ; + vmadh $v11, $v23, $v09.h2 ## L:95 | 82 | out = mat2 +* vec.zzzzZZZZ; + vmadn $v12, $v22, $v30.e7 ## L:96 | 83 | out = mat3 +* 1; + vmadh $v11, $v21, $v30.e7 ## L:96 | 84 | out = mat3 +* 1; + vch $v29, $v11, $v11.h3 ## L:475 | ***88 | temp1 = clip(posClip, posClip.wwwwWWWW); + vcl $v29, $v12, $v12.h3 ## L:475 | 89 | temp1 = clip(posClip, posClip.wwwwWWWW); + cfc2 $t4, $vcc ## L:475 | 90 | temp1 = clip(posClip, posClip.wwwwWWWW); + LABEL_TPXCmd_DrawColor_0005: + vmudl $v12, $v12, $v17.v ## L:486 | ^ | posClip *= normScaleW:ufract; + vmadm $v11, $v11, $v17.v ## L:486 | 91 | posClip *= normScaleW:ufract; + ldv $v03, 0, 16, $s6 ## L:531 | ^ | vec16 color = load(ptrIn, 16).xyzw; + vmadn $v12, $v00, $v00 ## L:486 | 92 | posClip *= normScaleW:ufract; + vrcph $v07.e3, $v11.e3 ## L:488 | **95 | invW.w = invert_half(posClip).w; + andi $t1, $t4, 1028 ## L:482 | ^ | clipA = temp1 & 0b0000'0100'0000'0100; + vrcpl $v08.e3, $v12.e3 ## L:488 | 96 | invW.w = invert_half(posClip).w; + addiu $t2, $zero, 54 ## L:535 | ^ | cmdRdpRect = 0x36; + ori $at, $zero, %lo(BASE_SIZE) ## L:492 | 97 | posClip:sint.w = load(BASE_SIZE).x; + vrcph $v07.e3, $v11.e7 ## L:489 | ^ | invW.W = invert_half(posClip).W; + vrcpl $v08.e7, $v12.e7 ## L:489 | 98 | invW.W = invert_half(posClip).W; + vmov $v12.e3, $v00.e0 ## L:494 | 99 | posClip:sfract.w = 0; + vmov $v12.e7, $v00.e0 ## L:495 | 100 | posClip:sfract.W = 0; + lsv $v11, 6, 0, $at ## L:492 | ^ | posClip:sint.w = load(BASE_SIZE).x; + lsv $v11, 14, 0, $at ## L:493 | 101 | posClip:sint.W = load(BASE_SIZE).x; + vrcph $v07.e7, $v00.e7 ## L:489 | ^ | invW.W = invert_half(posClip).W; + andi $t5, $t4, 16448 ## L:483 | **104 | clipB = temp1 & 0b0100'0000'0100'0000; + vmudl $v29, $v12, $v08.h3 ## L:498 | ^ | posClip *= invW.wwwwWWWW; + vmadm $v29, $v11, $v08.h3 ## L:498 | 105 | posClip *= 
invW.wwwwWWWW; + vmadn $v12, $v12, $v07.h3 ## L:498 | 106 | posClip *= invW.wwwwWWWW; + vmadh $v11, $v11, $v07.h3 ## L:498 | 107 | posClip *= invW.wwwwWWWW; + slv $v03, 0, 4, $s3 ## L:543 | ^ | store(color.xy, dmaDmemEnd, 4); + vmulf $v14, $v14, $v11.h3 ## L:502 | ***111 | localPartSize:sfract *= posClip:sint.wwwwWWWW; + vmudl $v29, $v12, $v20.v ## L:505 | 112 | vec32 posScreen = posClip * screenSize; + vmadm $v29, $v11, $v20.v ## L:505 | 113 | vec32 posScreen = posClip * screenSize; + vmadn $v08, $v12, $v19.v ## L:505 | 114 | vec32 posScreen = posClip * screenSize; + vmadh $v07, $v11, $v19.v ## L:505 | 115 | vec32 posScreen = posClip * screenSize; + vmadh $v06, $v18, $v10.v ## L:506 | 116 | vec16 posCenter = screenOffset:sint +* vecOne; + vmadh $v05, $v10, $v14.v ## L:509 | 117 | vec16 posEnd = vecOne +* localPartSize:sint; + vsubc $v09, $v06, $v14.v ## L:510 | **120 | posStart = posCenter - localPartSize:sint; + vlt $v05, $v05, $v15 ## L:513 | 121 | posEnd = min(posEnd, screenMax); + addiu $at, $zero, 3 ## L:567 | ^ | if(clipA == 0b0000'0011) { + ssv $v06, 4, 12, $s3 ## L:547 | **124 | store(posCenter.z, dmaDmemEnd, 0x04, 8); + vge $v09, $v09, $v16 ## L:514 | ^ | posStart = max(posStart, screenMin); + mfc2 $fp, $v05.e1 ## L:1191 | 125 | outA = pos.y; + mfc2 $sp, $v05.e5 ## L:1197 | 126 | outB = pos.Y; + andi $fp, $fp, 4095 ## L:1192 | *128 | outA &= 0b1111'1111'1111; + mfc2 $s2, $v09.e1 ## L:1191 | 129 | outA = pos.y; + mfc2 $k1, $v09.e0 ## L:1193 | 130 | u32 tmp = pos.x; + andi $sp, $sp, 4095 ## L:1198 | 131 | outB &= 0b1111'1111'1111; + andi $s2, $s2, 4095 ## L:1192 | 132 | outA &= 0b1111'1111'1111; + mfc2 $s1, $v09.e5 ## L:1197 | 133 | outB = pos.Y; + vlt $v04, $v09, $v05 ## L:520 | ^ | vec16 extend = posStart < posEnd; + sll $k1, $k1, 12 ## L:1194 | 134 | tmp <<= 12; + or $s2, $s2, $k1 ## L:1195 | 135 | outA |= tmp; + andi $s1, $s1, 4095 ## L:1198 | 136 | outB &= 0b1111'1111'1111; + mfc2 $k1, $v09.e4 ## L:1199 | 137 | tmp = pos.X; + sw $s2, 4 + 16($s3) ## 
L:551 | 138 | store(posA, dmaDmemEnd, 0x04, 16); + ldv $v09, 0, 24, $s6 ## L:558 | 139 | posStart.xyzw = load(ptrIn, 24).xyzw; + sll $k1, $k1, 12 ## L:1200 | 140 | tmp <<= 12; + or $s1, $s1, $k1 ## L:1201 | 141 | outB |= tmp; + mfc2 $k1, $v05.e0 ## L:1193 | 142 | u32 tmp = pos.x; + cfc2 $t4, $vcc ## L:521 | 143 | temp1 = get_vcc(); + ldv $v09, 8, 32, $s6 ## L:559 | 144 | posStart.XYZW = load(ptrIn, 24).XYZW; + sll $k1, $k1, 12 ## L:1194 | 145 | tmp <<= 12; + or $fp, $fp, $k1 ## L:1195 | 146 | outA |= tmp; + sw $fp, 0 + 16($s3) ## L:552 | 147 | @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + andi $a2, $t4, 3 ## L:523 | 148 | temp0 = temp1 & 0b0000'0011; + mfc2 $k1, $v05.e4 ## L:1199 | 149 | tmp = pos.X; + or $t1, $t1, $a2 ## L:524 | 150 | clipA |= temp0; + andi $a2, $t4, 48 ## L:526 | 151 | temp0 = temp1 & 0b0011'0000; + vmulf $v14, $v13, $v09.h3 ## L:560 | ^ | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + sll $k1, $k1, 12 ## L:1200 | 152 | tmp <<= 12; + sb $t2, 0 + 16($s3) ## L:553 | 153 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + or $t5, $t5, $a2 ## L:527 | 154 | clipB |= temp0; + bne $t1, $at, LABEL_TPXCmd_DrawColor_0007 ## L:567 | 155 | if(clipA == 0b0000'0011) { + or $sp, $sp, $k1 ## L:1201 | *157 | outB |= tmp; + addiu $s3, $s3, 24 ## L:568 | 158 | dmaDmemEnd += 24; + LABEL_TPXCmd_DrawColor_0007: + addiu $at, $zero, 48 ## L:578 | 159 | if(clipB == 0b0011'0000) { + bne $t5, $at, LABEL_TPXCmd_DrawColor_0008 ## L:578 | 160 | if(clipB == 0b0011'0000) { + addiu $s6, $s6, 24 ## L:572 | *162 | ptrIn += 24; + sw $s1, 4 + 16($s3) ## L:582 | 163 | store(posB, dmaDmemEnd, 0x04, 16); + ssv $v06, 12, 12, $s3 ## L:580 | 164 | store(posCenter.Z, dmaDmemEnd, 0x04, 8); + sw $sp, 0 + 16($s3) ## L:583 | 165 | @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + sb $t2, 0 + 16($s3) ## L:584 | 166 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 
0x1 + slv $v03, 4, 4, $s3 ## L:579 | 167 | store(color.zw, dmaDmemEnd, 4); + addiu $s3, $s3, 24 ## L:586 | 168 | dmaDmemEnd += 24; + LABEL_TPXCmd_DrawColor_0008: + vmudn $v12, $v28, $v09.h0 ## L:93 | ^ | out = mat0 * vec.xxxxXXXX; + sltu $at, $s3, $s5 ## L:593 | 169 | if(dmaDmemEnd >= dmaDmemFlush) { + vmadh $v11, $v27, $v09.h0 ## L:93 | ^ | out = mat0 * vec.xxxxXXXX; + vmadn $v12, $v26, $v09.h1 ## L:94 | 170 | out = mat1 +* vec.yyyyYYYY; + vmadh $v11, $v25, $v09.h1 ## L:94 | 171 | out = mat1 +* vec.yyyyYYYY; + vmadn $v12, $v24, $v09.h2 ## L:95 | 172 | out = mat2 +* vec.zzzzZZZZ; + vmadh $v11, $v23, $v09.h2 ## L:95 | 173 | out = mat2 +* vec.zzzzZZZZ; + vmadn $v12, $v22, $v30.e7 ## L:96 | 174 | out = mat3 +* 1; + bne $at, $zero, LABEL_TPXCmd_DrawColor_0009 ## L:593 | ^ | if(dmaDmemEnd >= dmaDmemFlush) { + vmadh $v11, $v21, $v30.e7 ## L:96 | *176 | out = mat3 +* 1; + jal RDPQ_Send ## L:594 | 177 | RDPQ_Send(dmaDmem, dmaDmemEnd); ## Args: $s4, $s3 + nop ## L:594 | *179 | RDPQ_Send(dmaDmem, dmaDmemEnd); + or $s3, $zero, $s4 ## L:595 | 180 | dmaDmemEnd = dmaDmem; + LABEL_TPXCmd_DrawColor_0009: + vch $v29, $v11, $v11.h3 ## L:598 | ^ | temp1 = clip(posClip, posClip.wwwwWWWW); + vcl $v29, $v12, $v12.h3 ## L:598 | 181 | temp1 = clip(posClip, posClip.wwwwWWWW); + bne $s6, $s7, LABEL_TPXCmd_DrawColor_0005 ## L:598 | ^ | temp1 = clip(posClip, posClip.wwwwWWWW); + cfc2 $t4, $vcc ## L:598 | *183 | temp1 = clip(posClip, posClip.wwwwWWWW); + LABEL_TPXCmd_DrawColor_0006: + beq $zero, $zero, LABEL_TPXCmd_DrawColor_0004 ## L:598 | 184 | temp1 = clip(posClip, posClip.wwwwWWWW); + nop ## L:598 | *186 | temp1 = clip(posClip, posClip.wwwwWWWW); + LABEL_TPXCmd_DrawColor_0003: + lpv $v09, 0, 0, $s6 ## L:116 | 187 | vec16 posStart = load_vec_s8(ptrIn, 0x00); + vmulf $v14, $v13, $v09.h3 ## L:117 | ***191 | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + vmudm $v09, $v09, $v31.e7 ## L:118 | 192 | posStart >>= 8; + vmudn $v12, $v28, $v09.h0 ## L:93 | ***196 | out = mat0 * 
vec.xxxxXXXX; + vmadh $v11, $v27, $v09.h0 ## L:93 | 197 | out = mat0 * vec.xxxxXXXX; + vmadn $v12, $v26, $v09.h1 ## L:94 | 198 | out = mat1 +* vec.yyyyYYYY; + vmadh $v11, $v25, $v09.h1 ## L:94 | 199 | out = mat1 +* vec.yyyyYYYY; + vmadn $v12, $v24, $v09.h2 ## L:95 | 200 | out = mat2 +* vec.zzzzZZZZ; + vmadh $v11, $v23, $v09.h2 ## L:95 | 201 | out = mat2 +* vec.zzzzZZZZ; + vmadn $v12, $v22, $v30.e7 ## L:96 | 202 | out = mat3 +* 1; + vmadh $v11, $v21, $v30.e7 ## L:96 | 203 | out = mat3 +* 1; + vch $v29, $v11, $v11.h3 ## L:122 | ***207 | temp1 = clip(posClip, posClip.wwwwWWWW); + vcl $v29, $v12, $v12.h3 ## L:122 | 208 | temp1 = clip(posClip, posClip.wwwwWWWW); + cfc2 $t4, $vcc ## L:122 | 209 | temp1 = clip(posClip, posClip.wwwwWWWW); + LABEL_TPXCmd_DrawColor_000A: + vmudl $v12, $v12, $v17.v ## L:133 | ^ | posClip *= normScaleW:ufract; + vmadm $v11, $v11, $v17.v ## L:133 | 210 | posClip *= normScaleW:ufract; + vmadn $v12, $v00, $v00 ## L:133 | 211 | posClip *= normScaleW:ufract; + ldv $v03, 0, 8, $s6 ## L:180 | ^ | vec16 color = load(ptrIn, 8).xyzw; + andi $t1, $t4, 1028 ## L:129 | 212 | clipA = temp1 & 0b0000'0100'0000'0100; + addiu $t2, $zero, 54 ## L:182 | *214 | cmdRdpRect = 0x36; + vrcph $v07.e3, $v11.e3 ## L:135 | ^ | invW.w = invert_half(posClip).w; + vrcpl $v08.e3, $v12.e3 ## L:135 | 215 | invW.w = invert_half(posClip).w; + vrcph $v07.e3, $v11.e7 ## L:136 | 216 | invW.W = invert_half(posClip).W; + ori $at, $zero, %lo(BASE_SIZE) ## L:139 | ^ | posClip:sint.w = load(BASE_SIZE).x; + vrcpl $v08.e7, $v12.e7 ## L:136 | 217 | invW.W = invert_half(posClip).W; + lsv $v11, 6, 0, $at ## L:139 | ^ | posClip:sint.w = load(BASE_SIZE).x; + vmov $v12.e3, $v00.e0 ## L:141 | 218 | posClip:sfract.w = 0; + lsv $v11, 14, 0, $at ## L:140 | ^ | posClip:sint.W = load(BASE_SIZE).x; + addiu $at, $zero, 3 ## L:214 | 219 | if(clipA == 0b0000'0011) { + vmov $v12.e7, $v00.e0 ## L:142 | ^ | posClip:sfract.W = 0; + vrcph $v07.e7, $v00.e7 ## L:136 | 220 | invW.W = invert_half(posClip).W; + 
vmudl $v29, $v12, $v08.h3 ## L:145 | **223 | posClip *= invW.wwwwWWWW; + vmadm $v29, $v11, $v08.h3 ## L:145 | 224 | posClip *= invW.wwwwWWWW; + vmadn $v12, $v12, $v07.h3 ## L:145 | 225 | posClip *= invW.wwwwWWWW; + vmadh $v11, $v11, $v07.h3 ## L:145 | 226 | posClip *= invW.wwwwWWWW; + vmulf $v14, $v14, $v11.h3 ## L:149 | ***230 | localPartSize:sfract *= posClip:sint.wwwwWWWW; + andi $t5, $t4, 16448 ## L:130 | ^ | clipB = temp1 & 0b0100'0000'0100'0000; + vmudl $v29, $v12, $v20.v ## L:152 | 231 | vec32 posScreen = posClip * screenSize; + slv $v03, 0, 4, $s3 ## L:190 | ^ | store(color.xy, dmaDmemEnd, 4); + vmadm $v29, $v11, $v20.v ## L:152 | 232 | vec32 posScreen = posClip * screenSize; + vmadn $v08, $v12, $v19.v ## L:152 | 233 | vec32 posScreen = posClip * screenSize; + vmadh $v07, $v11, $v19.v ## L:152 | 234 | vec32 posScreen = posClip * screenSize; + vmadh $v06, $v18, $v10.v ## L:153 | 235 | vec16 posCenter = screenOffset:sint +* vecOne; + vmadh $v05, $v10, $v14.v ## L:156 | 236 | vec16 posEnd = vecOne +* localPartSize:sint; + vsubc $v09, $v06, $v14.v ## L:157 | **239 | posStart = posCenter - localPartSize:sint; + vlt $v05, $v05, $v15 ## L:160 | 240 | posEnd = min(posEnd, screenMax); + vge $v09, $v09, $v16 ## L:161 | **243 | posStart = max(posStart, screenMin); + ssv $v06, 4, 12, $s3 ## L:194 | ^ | store(posCenter.z, dmaDmemEnd, 0x04, 8); + mfc2 $fp, $v05.e1 ## L:1191 | 244 | outA = pos.y; + mfc2 $sp, $v05.e5 ## L:1197 | 245 | outB = pos.Y; + andi $fp, $fp, 4095 ## L:1192 | *247 | outA &= 0b1111'1111'1111; + mfc2 $k1, $v09.e0 ## L:1193 | 248 | u32 tmp = pos.x; + mfc2 $s2, $v09.e1 ## L:1191 | 249 | outA = pos.y; + andi $sp, $sp, 4095 ## L:1198 | 250 | outB &= 0b1111'1111'1111; + vlt $v04, $v09, $v05 ## L:167 | ^ | vec16 extend = posStart < posEnd; + sll $k1, $k1, 12 ## L:1194 | 251 | tmp <<= 12; + mfc2 $s1, $v09.e5 ## L:1197 | 252 | outB = pos.Y; + andi $s2, $s2, 4095 ## L:1192 | 253 | outA &= 0b1111'1111'1111; + or $s2, $s2, $k1 ## L:1195 | 254 | outA |= tmp; + 
mfc2 $k1, $v09.e4 ## L:1199 | 255 | tmp = pos.X; + cfc2 $t4, $vcc ## L:168 | 256 | temp1 = get_vcc(); + lpv $v09, 0, 16, $s6 ## L:209 | 257 | posStart = load_vec_s8(ptrIn, 16); + sll $k1, $k1, 12 ## L:1200 | 258 | tmp <<= 12; + andi $s1, $s1, 4095 ## L:1198 | 259 | outB &= 0b1111'1111'1111; + andi $a2, $t4, 3 ## L:170 | 260 | temp0 = temp1 & 0b0000'0011; + or $s1, $s1, $k1 ## L:1201 | 261 | outB |= tmp; + mfc2 $k1, $v05.e0 ## L:1193 | 262 | u32 tmp = pos.x; + vmulf $v14, $v13, $v09.h3 ## L:210 | ^ | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + vmudm $v09, $v09, $v31.e7 ## L:211 | 263 | posStart >>= 8; + or $t1, $t1, $a2 ## L:171 | ^ | clipA |= temp0; + andi $a2, $t4, 48 ## L:173 | 264 | temp0 = temp1 & 0b0011'0000; + sll $k1, $k1, 12 ## L:1194 | 265 | tmp <<= 12; + or $fp, $fp, $k1 ## L:1195 | 266 | outA |= tmp; + mfc2 $k1, $v05.e4 ## L:1199 | 267 | tmp = pos.X; + sw $s2, 4 + 16($s3) ## L:198 | 268 | store(posA, dmaDmemEnd, 0x04, 16); + or $t5, $t5, $a2 ## L:174 | 269 | clipB |= temp0; + sw $fp, 0 + 16($s3) ## L:199 | 270 | @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + sb $t2, 0 + 16($s3) ## L:200 | 271 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + sll $k1, $k1, 12 ## L:1200 | 272 | tmp <<= 12; + bne $t1, $at, LABEL_TPXCmd_DrawColor_000C ## L:214 | 273 | if(clipA == 0b0000'0011) { + or $sp, $sp, $k1 ## L:1201 | *275 | outB |= tmp; + addiu $s3, $s3, 24 ## L:215 | 276 | dmaDmemEnd += 24; + LABEL_TPXCmd_DrawColor_000C: + addiu $at, $zero, 48 ## L:225 | 277 | if(clipB == 0b0011'0000) { + bne $t5, $at, LABEL_TPXCmd_DrawColor_000D ## L:225 | 278 | if(clipB == 0b0011'0000) { + addiu $s6, $s6, 16 ## L:221 | *280 | ptrIn += 16; + ssv $v06, 12, 12, $s3 ## L:227 | 281 | store(posCenter.Z, dmaDmemEnd, 0x04, 8); + slv $v03, 4, 4, $s3 ## L:226 | 282 | store(color.zw, dmaDmemEnd, 4); + sw $sp, 0 + 16($s3) ## L:230 | 283 | @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); ## Barrier: 
0x1 + sw $s1, 4 + 16($s3) ## L:229 | 284 | store(posB, dmaDmemEnd, 0x04, 16); + sb $t2, 0 + 16($s3) ## L:231 | 285 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + addiu $s3, $s3, 24 ## L:233 | 286 | dmaDmemEnd += 24; + LABEL_TPXCmd_DrawColor_000D: + vmudn $v12, $v28, $v09.h0 ## L:93 | ^ | out = mat0 * vec.xxxxXXXX; + vmadh $v11, $v27, $v09.h0 ## L:93 | 287 | out = mat0 * vec.xxxxXXXX; + vmadn $v12, $v26, $v09.h1 ## L:94 | 288 | out = mat1 +* vec.yyyyYYYY; + vmadh $v11, $v25, $v09.h1 ## L:94 | 289 | out = mat1 +* vec.yyyyYYYY; + sltu $at, $s3, $s5 ## L:240 | ^ | if(dmaDmemEnd >= dmaDmemFlush) { + vmadn $v12, $v24, $v09.h2 ## L:95 | 290 | out = mat2 +* vec.zzzzZZZZ; + vmadh $v11, $v23, $v09.h2 ## L:95 | 291 | out = mat2 +* vec.zzzzZZZZ; + vmadn $v12, $v22, $v30.e7 ## L:96 | 292 | out = mat3 +* 1; + bne $at, $zero, LABEL_TPXCmd_DrawColor_000E ## L:240 | ^ | if(dmaDmemEnd >= dmaDmemFlush) { + vmadh $v11, $v21, $v30.e7 ## L:96 | *294 | out = mat3 +* 1; + jal RDPQ_Send ## L:241 | 295 | RDPQ_Send(dmaDmem, dmaDmemEnd); ## Args: $s4, $s3 + nop ## L:241 | *297 | RDPQ_Send(dmaDmem, dmaDmemEnd); + or $s3, $zero, $s4 ## L:242 | 298 | dmaDmemEnd = dmaDmem; + LABEL_TPXCmd_DrawColor_000E: + vch $v29, $v11, $v11.h3 ## L:245 | ^ | temp1 = clip(posClip, posClip.wwwwWWWW); + vcl $v29, $v12, $v12.h3 ## L:245 | 299 | temp1 = clip(posClip, posClip.wwwwWWWW); + bne $s6, $s7, LABEL_TPXCmd_DrawColor_000A ## L:245 | ^ | temp1 = clip(posClip, posClip.wwwwWWWW); + cfc2 $t4, $vcc ## L:245 | *301 | temp1 = clip(posClip, posClip.wwwwWWWW); + LABEL_TPXCmd_DrawColor_0004: + j RDPQ_Send ## L:908 | 302 | goto RDPQ_Send; + ori $ra, $zero, %lo(RSPQ_Loop) ## L:907 | 303 | RA = RSPQ_Loop; mulMat4Mat4: ldv $v01, 0, 0, $s3 ldv $v10, 0, 8, $s4 @@ -458,250 +569,398 @@ TPXCmd_SetDMEM: j RSPQ_Loop sw $a1, ($a0) TPXCmd_DrawTextured: - andi $t0, $a0, 65535 - or $s0, $zero, $a1 - ori $s4, $zero, %lo(PARTICLE_BUFF) - addu $s7, $s4, $t0 - addiu $t0, $t0, -1 - jal DMAExec ## Args: 
$t0, $t1, $s0, $s4, $t2 - or $t2, $zero, $zero - ori $at, $zero, %lo(MATRIX_MVP) - lw $s2, %lo(RDPQ_SCISSOR_RECT + 4) - ldv $v21, 0, 48, $at - ldv $v27, 0, 0, $at - ldv $v24, 0, 40, $at - ldv $v21, 8, 48, $at - ldv $v25, 0, 16, $at - ldv $v22, 0, 56, $at - ldv $v26, 0, 24, $at - lw $s1, %lo(RDPQ_SCISSOR_RECT + 0) - ldv $v28, 0, 8, $at - ldv $v25, 8, 16, $at - ldv $v28, 8, 8, $at - srl $t4, $s2, 12 - ldv $v27, 8, 0, $at - or $s6, $zero, $s4 - ldv $v23, 0, 32, $at - ldv $v22, 8, 56, $at - ldv $v24, 8, 40, $at - ldv $v23, 8, 32, $at - ldv $v26, 8, 24, $at - ori $at, $zero, %lo(SCREEN_SCALE_OFFSET) - ldv $v19, 0, 0, $at - lui $t7, 0x3A00 - ori $s4, $zero, %lo(RDP_BUFF) - mtc2 $s2, $v15.e1 - mtc2 $s1, $v16.e1 - ldv $v19, 8, 0, $at - ldv $v18, 0, 8, $at - vmudl $v20, $v00, $v31.e3 - ldv $v18, 8, 8, $at - ori $at, $zero, %lo(NORM_SCALE_W) - ldv $v17, 0, 0, $at - mtc2 $s1, $v16.e5 - ldv $v17, 8, 0, $at - addiu $at, $zero, 4095 - ori $s3, $zero, %lo(RDP_BUFF) - mtc2 $t4, $v15.e0 - mtc2 $t4, $v15.e4 - addiu $s5, $s3, 480 - srl $t4, $s1, 12 - vmadm $v19, $v19, $v31.e3 - mtc2 $s2, $v15.e5 - mtc2 $at, $v15.e3 - ori $s2, $zero, %lo(RDP_BUFF) - ori $at, $zero, %lo(PARTICLE_SCALE) - mtc2 $t4, $v16.e4 - vmadn $v20, $v00, $v00 - mtc2 $t4, $v16.e0 - vand $v15, $v15, $v15.e3 - llv $v13, 0, 0, $at - llv $v13, 8, 0, $at - addiu $s1, $s5, 32 - addiu $a0, $zero, 46 - vand $v16, $v16, $v15.e3 - LABEL_000C: - sw $t7, 0($s2) - sh $zero, 14($s2) - sb $a0, 8($s2) - addiu $s2, $s2, 32 - bne $s2, $s1, LABEL_000C - nop - LABEL_000D: - lhu $s1, %lo(TILE_COUNT + 0) - vxor $v12, $v00, $v00.e0 - vmov $v12.e1, $v30.e7 - vxor $v11, $v00, $v00.e0 - vmov $v12.e5, $v30.e7 - addu $fp, $s1, $s1 - mtc2 $fp, $v12.e3 - mtc2 $s1, $v12.e2 - addiu $fp, $fp, 65535 - vmov $v12.e7, $v31.e7 - mtc2 $fp, $v12.e0 - mtc2 $fp, $v12.e4 - lh $s2, %lo(TEX_OFFSET + 0) - vmudn $v12, $v12, $v12.e7 - vsubc $v29, $v12, $v30.e7 - vmov $v12.e3, $v29.e3 - jal DMAWaitIdle - vor $v11, $v00, $v29.e2 - lpv $v08, 0, 0, $s6 - vmulf $v14, 
$v13, $v08.h3 - vmudm $v08, $v08, $v31.e7 - vmov $v08.e7, $v30.e7 - vmov $v08.e3, $v30.e7 - vxor $v07, $v00, $v30.e7 - vmudn $v10, $v28, $v08.h0 - vmadh $v09, $v27, $v08.h0 - vmadn $v10, $v26, $v08.h1 - vmadh $v09, $v25, $v08.h1 - vmadn $v10, $v24, $v08.h2 - vmadh $v09, $v23, $v08.h2 - vmadn $v10, $v22, $v08.h3 - vmadh $v09, $v21, $v08.h3 - vch $v29, $v09, $v09.h3 - vcl $v29, $v10, $v10.h3 - cfc2 $t4, $vcc - LABEL_000E: - vmudl $v10, $v10, $v17.v - vmadm $v09, $v09, $v17.v - ori $at, $zero, %lo(BASE_SIZE) - vmadn $v10, $v00, $v00 - vrcph $v05.e3, $v09.e3 - andi $t1, $t4, 1028 - vrcpl $v06.e3, $v10.e3 - vrcph $v05.e3, $v09.e7 - vrcpl $v06.e7, $v10.e7 - lsv $v09, 6, 0, $at - lsv $v09, 14, 0, $at - vmov $v10.e3, $v00.e0 - andi $t5, $t4, 16448 - vrcph $v05.e7, $v00.e7 - vmov $v10.e7, $v00.e0 - vmudl $v29, $v10, $v06.h3 - lb $a2, 11($s6) - vmadm $v29, $v09, $v06.h3 - vmadn $v10, $v10, $v05.h3 - vmadh $v09, $v09, $v05.h3 - vmulf $v14, $v14, $v09.h3 - vmudl $v29, $v10, $v20.v - vmadm $v29, $v09, $v20.v - vmadn $v04, $v10, $v19.v - vmadh $v03, $v09, $v19.v - vmadh $v02, $v18, $v07.v - lb $t4, 15($s6) - vmadh $v04, $v07, $v14.v - vsubc $v08, $v02, $v14.v - addu $t4, $t4, $s2 - addu $a2, $a2, $s2 - vmudn $v14, $v14, $v30.e6 - vrcp $v06.e0, $v14.e0 - vrcph $v05.e0, $v14.e0 - vrcp $v06.e1, $v14.e1 - vrcph $v05.e1, $v14.e1 - vrcp $v06.e4, $v14.e4 - vrcph $v05.e4, $v14.e4 - vrcp $v06.e5, $v14.e5 - vrcph $v05.e5, $v14.e5 - sll $a2, $a2, 3 - vor $v03, $v00, $v05 - vmudh $v01, $v08, $v05.v - vmudm $v01, $v01, $v31.e6 - vxor $v05, $v00, $v00.e0 - slv $v03, 0, 28, $s3 - sll $t4, $t4, 3 - mtc2 $t4, $v05.e4 - mtc2 $a2, $v05.e0 - vlt $v01, $v01, $v00.e0 - addiu $t2, $zero, 36 - vsubc $v06, $v00, $v01.v - vand $v05, $v05, $v12.e3 - vge $v29, $v11, $v05.h0 - vmrg $v01, $v06, $v01 - vlt $v04, $v04, $v15 - vge $v08, $v08, $v16 - addiu $at, $zero, 3 - vaddc $v01, $v01, $v05.v - ssv $v02, 4, 12, $s3 - mfc2 $k1, $v04.e5 - mfc2 $s1, $v08.e1 - andi $k1, $k1, 4095 - vsubc $v06, $v12, $v01.v - 
mfc2 $k0, $v08.e0 - mfc2 $fp, $v08.e5 - andi $s1, $s1, 4095 - sll $k0, $k0, 12 - vaddc $v06, $v06, $v12.e2 - or $s1, $s1, $k0 - mfc2 $k0, $v08.e4 - vge $v29, $v11, $v05.h0 - mfc2 $sp, $v04.e1 - ldv $v05, 0, 8, $s6 - vmrg $v01, $v01, $v06 - andi $fp, $fp, 4095 - sll $k0, $k0, 12 - sw $s1, 4 + 16($s3) - vlt $v06, $v08, $v04 - cfc2 $t4, $vcc - addiu $s6, $s6, 16 - or $fp, $fp, $k0 - mfc2 $k0, $v04.e0 - andi $sp, $sp, 4095 - andi $a2, $t4, 3 - slv $v01, 0, 24, $s3 - lpv $v08, 0, 0, $s6 - sll $k0, $k0, 12 - or $sp, $sp, $k0 - mfc2 $k0, $v04.e4 - sw $sp, 0 + 16($s3) ## Barrier: 0x1 - or $t1, $t1, $a2 - slv $v05, 0, 4, $s3 - sll $k0, $k0, 12 - andi $a2, $t4, 48 - vmulf $v14, $v13, $v08.h3 - vmudm $v08, $v08, $v31.e7 - or $t5, $t5, $a2 - vmov $v08.e3, $v30.e7 - sb $t2, 0 + 16($s3) ## Barrier: 0x1 - bne $t1, $at, LABEL_0010 - or $k1, $k1, $k0 - addiu $s3, $s3, 32 - LABEL_0010: - vmov $v08.e7, $v30.e7 - addiu $at, $zero, 48 - bne $t5, $at, LABEL_0011 - nop - sw $fp, 4 + 16($s3) - slv $v05, 4, 4, $s3 - sw $k1, 0 + 16($s3) ## Barrier: 0x1 - sb $t2, 0 + 16($s3) ## Barrier: 0x1 - slv $v03, 8, 28, $s3 - slv $v01, 8, 24, $s3 - ssv $v02, 12, 12, $s3 - addiu $s3, $s3, 32 - LABEL_0011: - vmudn $v10, $v28, $v08.h0 - vmadh $v09, $v27, $v08.h0 - vmadn $v10, $v26, $v08.h1 - vmadh $v09, $v25, $v08.h1 - sltu $at, $s3, $s5 - vmadn $v10, $v24, $v08.h2 - vmadh $v09, $v23, $v08.h2 - vmadn $v10, $v22, $v08.h3 - bne $at, $zero, LABEL_0012 - vmadh $v09, $v21, $v08.h3 - jal RDPQ_Send ## Args: $s4, $s3 - nop - or $s3, $zero, $s4 - LABEL_0012: - vch $v29, $v09, $v09.h3 - vcl $v29, $v10, $v10.h3 - bne $s6, $s7, LABEL_000E - cfc2 $t4, $vcc - LABEL_000F: - j RDPQ_Send - ori $ra, $zero, %lo(RSPQ_Loop) + andi $t0, $a0, 65535 ## L:1081 | ^ | u32<$t0> dmaSize = dataSize & 0xFFFF; + ori $s4, $zero, %lo(PARTICLE_BUFF) ## L:1080 | 2 | u16<$s4> dmaDmem = PARTICLE_BUFF; + or $t2, $zero, $zero ## L:1097 | 3 | dma_in_async(dmaDmem, rdramAddr, dmaSize); + or $s0, $zero, $a1 ## L:1097 | 4 | dma_in_async(dmaDmem, 
rdramAddr, dmaSize); + addu $s7, $s4, $t0 ## L:1096 | 5 | u32 ptrInEnd = dmaDmem + dmaSize; + jal DMAExec ## L:1097 | 6 | dma_in_async(dmaDmem, rdramAddr, dmaSize); ## Args: $t0, $t1, $s0, $s4, $t2 + addiu $t0, $t0, -1 ## L:1097 | *8 | dma_in_async(dmaDmem, rdramAddr, dmaSize); + vmudl $v20, $v00, $v31.e3 ## L:1112 | 9 | screenSize >>= 4; + ori $at, $zero, %lo(MATRIX_MVP) ## L:1105 | ^ | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw; + ldv $v22, 0, 56, $at ## L:1108 | 10 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw; + lw $s2, %lo(RDPQ_SCISSOR_RECT + 4) ## L:1212 | 11 | u32 extMax = load(RDPQ_SCISSOR_RECT, 4); + ldv $v23, 0, 32, $at ## L:1107 | 12 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw; + ldv $v25, 0, 16, $at ## L:1106 | 13 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw; + ldv $v24, 0, 40, $at ## L:1107 | 14 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw; + ldv $v21, 0, 48, $at ## L:1108 | 15 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw; + ldv $v25, 8, 16, $at ## L:1106 | 16 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw; + ldv $v27, 0, 0, $at ## L:1105 | 17 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw; + ldv $v26, 0, 24, $at ## L:1106 | 18 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw; + ldv $v27, 8, 0, $at ## L:1105 | 19 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw; + ldv $v22, 8, 56, $at ## L:1108 | 20 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw; + ldv $v26, 8, 24, $at ## L:1106 | 21 | vec32 mat1 = load(MATRIX_MVP, 0x10).xyzwxyzw; + ldv $v28, 0, 8, $at ## L:1105 | 22 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw; + ldv $v23, 8, 32, $at ## L:1107 | 23 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw; + ldv $v28, 8, 8, $at ## L:1105 | 24 | vec32 mat0 = load(MATRIX_MVP, 0x00).xyzwxyzw; + ldv $v21, 8, 48, $at ## L:1108 | 25 | vec32 mat3 = load(MATRIX_MVP, 0x30).xyzwxyzw; + ldv $v24, 8, 40, $at ## L:1107 | 26 | vec32 mat2 = load(MATRIX_MVP, 0x20).xyzwxyzw; + ori $at, $zero, %lo(SCREEN_SCALE_OFFSET) ## L:1110 | 27 | vec32 screenSize:sint = 
load(SCREEN_SCALE_OFFSET).xyzwxyzw; + ldv $v19, 0, 0, $at ## L:1110 | 28 | vec32 screenSize:sint = load(SCREEN_SCALE_OFFSET).xyzwxyzw; + mtc2 $s2, $v15.e5 ## L:1217 | 29 | screenMax.y = extMax; screenMax.Y = extMax; + ldv $v18, 0, 8, $at ## L:1114 | 30 | vec16 screenOffset = load(SCREEN_SCALE_OFFSET, 0x08).xyzwxyzw; + ldv $v19, 8, 0, $at ## L:1110 | 31 | vec32 screenSize:sint = load(SCREEN_SCALE_OFFSET).xyzwxyzw; + ldv $v18, 8, 8, $at ## L:1114 | 32 | vec16 screenOffset = load(SCREEN_SCALE_OFFSET, 0x08).xyzwxyzw; + srl $t4, $s2, 12 ## L:1215 | 33 | temp1 = extMax >> 12; + lw $s1, %lo(RDPQ_SCISSOR_RECT + 0) ## L:1213 | 34 | u32 extMin = load(RDPQ_SCISSOR_RECT, 0); + mtc2 $s2, $v15.e1 ## L:1217 | 35 | screenMax.y = extMax; screenMax.Y = extMax; + ori $at, $zero, %lo(NORM_SCALE_W) ## L:1115 | 36 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw; + ldv $v17, 0, 0, $at ## L:1115 | 37 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw; + mtc2 $s1, $v16.e5 ## L:1221 | 38 | screenMin.y = extMin; screenMin.Y = extMin; + lui $t7, 0x3A00 ## L:1130 | 39 | cmdRdpColor = 0x3A00'0000; + ori $s3, $zero, %lo(RDP_BUFF) ## L:1102 | 40 | u16<$s3> dmaDmemEnd = RDP_BUFF; + mtc2 $t4, $v15.e0 ## L:1216 | 41 | screenMax.x = temp1; screenMax.X = temp1; + ldv $v17, 8, 0, $at ## L:1115 | 42 | vec16 normScaleW = load(NORM_SCALE_W).xyzwxyzw; + addiu $s5, $s3, 480 ## L:1103 | 43 | u16 dmaDmemFlush = dmaDmemEnd + 480; + or $s6, $zero, $s4 ## L:1099 | 44 | u32 ptrIn = dmaDmem; + addiu $at, $zero, 4095 ## L:1211 | 45 | screenMax.w = 0b1111'1111'1111; + mtc2 $t4, $v15.e4 ## L:1216 | 46 | screenMax.x = temp1; screenMax.X = temp1; + mtc2 $s1, $v16.e1 ## L:1221 | 47 | screenMin.y = extMin; screenMin.Y = extMin; + addiu $a0, $zero, 46 ## L:1131 | 48 | cmdRdpDepth = 0x2E; + srl $t4, $s1, 12 ## L:1219 | 49 | temp1 = extMin >> 12; + mtc2 $at, $v15.e3 ## L:1211 | 50 | screenMax.w = 0b1111'1111'1111; + mtc2 $t4, $v16.e4 ## L:1220 | 51 | screenMin.x = temp1; screenMin.X = temp1; + ori $at, $zero, 
%lo(PARTICLE_SCALE) ## L:1122 | 52 | globalPartSize.xy = load(PARTICLE_SCALE).xy; + vmadm $v19, $v19, $v31.e3 ## L:1112 | ^ | screenSize >>= 4; + addiu $s1, $s5, 32 ## L:1129 | 53 | u16 buffRdpEnd = dmaDmemFlush + 32; + vmadn $v20, $v00, $v00 ## L:1112 | ^ | screenSize >>= 4; + vand $v15, $v15, $v15.e3 ## L:1223 | 54 | screenMax &= screenMax.w; + mtc2 $t4, $v16.e0 ## L:1220 | ^ | screenMin.x = temp1; screenMin.X = temp1; + llv $v13, 0, 0, $at ## L:1122 | 55 | globalPartSize.xy = load(PARTICLE_SCALE).xy; + llv $v13, 8, 0, $at ## L:1123 | 56 | globalPartSize.XY = load(PARTICLE_SCALE).xy; + ori $s2, $zero, %lo(RDP_BUFF) ## L:1128 | 57 | u16 buffRdp = RDP_BUFF; + ori $s4, $zero, %lo(RDP_BUFF) ## L:1101 | 58 | dmaDmem = RDP_BUFF; + vand $v16, $v16, $v15.e3 ## L:1224 | ^ | screenMin &= screenMax.w; + LABEL_TPXCmd_DrawTextured_0013: + sh $zero, 14($s2) ## L:1135 | 59 | store(ZERO:u16, buffRdp, 0x0E); + sw $t7, 0($s2) ## L:1133 | 60 | store(cmdRdpColor, buffRdp, 0); + sb $a0, 8($s2) ## L:1134 | 61 | store(cmdRdpDepth, buffRdp, 8); + addiu $s2, $s2, 32 ## L:1136 | 62 | buffRdp += 32; + bne $s2, $s1, LABEL_TPXCmd_DrawTextured_0013 ## L:1136 | 63 | buffRdp += 32; + nop ## L:1136 | *65 | buffRdp += 32; + LABEL_TPXCmd_DrawTextured_0014: + lhu $s1, %lo(TILE_COUNT + 0) ## L:1147 | 66 | u16 tiles = load(TILE_COUNT); + vxor $v12, $v00, $v00.e0 ## L:1143 | ^ | vec16 texMirrorMask = 0; + addu $fp, $s1, $s1 ## L:1148 | **69 | u16 tilesEnd = tiles + tiles; + vmov $v12.e5, $v30.e7 ## L:1150 | ^ | texMirrorMask.Y = 1; + mtc2 $fp, $v12.e3 ## L:1153 | 70 | texMirrorMask.w = tilesEnd; + mtc2 $s1, $v12.e2 ## L:1152 | 71 | texMirrorMask.z = tiles; + vmov $v12.e1, $v30.e7 ## L:1151 | 72 | texMirrorMask.y = 1; + vmov $v12.e7, $v31.e7 ## L:1158 | 73 | texMirrorMask.W = 0x100; + addiu $fp, $fp, 65535 ## L:1155 | ^ | tilesEnd -= 1; + mtc2 $fp, $v12.e4 ## L:1157 | 74 | texMirrorMask.X = tilesEnd; + mtc2 $fp, $v12.e0 ## L:1156 | 75 | texMirrorMask.x = tilesEnd; + vxor $v11, $v00, $v00.e0 ## L:1144 | 
^ | vec16 texMirrorCompare = 0; + vmudn $v12, $v12, $v12.e7 ## L:1159 | ***79 | texMirrorMask = texMirrorMask * texMirrorMask.W; + vsubc $v29, $v12, $v30.e7 ## L:1161 | ***83 | VTEMP = texMirrorMask - 1; + lh $s2, %lo(TEX_OFFSET + 0) ## L:1141 | ^ | s16 texOffset = load(TEX_OFFSET); + vmov $v12.e3, $v29.e3 ## L:1162 | ***87 | texMirrorMask.w = VTEMP.w; + jal DMAWaitIdle ## L:1166 | ^ | dma_await(); + vor $v11, $v00, $v29.e2 ## L:1163 | *89 | texMirrorCompare = VTEMP.z; + bgez $a1, LABEL_TPXCmd_DrawTextured_0015 ## L:1171 | 90 | if(rdramAddr < 0) { + nop ## L:1171 | *92 | if(rdramAddr < 0) { + vxor $v07, $v00, $v30.e7 ## L:614 | 93 | const vec16 vecOne = 1; + lqv $v08, 0, 0, $s6 ## L:606 | ^ | vec16 posStart = load(ptrIn, 0x00); + vmulf $v14, $v13, $v08.h3 ## L:607 | ***97 | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + vmudn $v10, $v28, $v08.h0 ## L:92 | 98 | out = mat0 * vec.xxxxXXXX; + vmadh $v09, $v27, $v08.h0 ## L:92 | 99 | out = mat0 * vec.xxxxXXXX; + vmadn $v10, $v26, $v08.h1 ## L:93 | 100 | out = mat1 +* vec.yyyyYYYY; + vmadh $v09, $v25, $v08.h1 ## L:93 | 101 | out = mat1 +* vec.yyyyYYYY; + vmadn $v10, $v24, $v08.h2 ## L:94 | 102 | out = mat2 +* vec.zzzzZZZZ; + vmadh $v09, $v23, $v08.h2 ## L:94 | 103 | out = mat2 +* vec.zzzzZZZZ; + vmadn $v10, $v22, $v30.e7 ## L:95 | 104 | out = mat3 +* 1; + vmadh $v09, $v21, $v30.e7 ## L:95 | 105 | out = mat3 +* 1; + vch $v29, $v09, $v09.h3 ## L:619 | ***109 | temp1 = clip(posClip, posClip.wwwwWWWW); + vcl $v29, $v10, $v10.h3 ## L:619 | 110 | temp1 = clip(posClip, posClip.wwwwWWWW); + cfc2 $t4, $vcc ## L:619 | 111 | temp1 = clip(posClip, posClip.wwwwWWWW); + LABEL_TPXCmd_DrawTextured_0017: + vmudl $v10, $v10, $v17.v ## L:630 | ^ | posClip *= normScaleW:ufract; + lb $a2, 7($s6) ## L:674 | 112 | temp0:s8 = load(ptrIn, 7); + vmadm $v09, $v09, $v17.v ## L:630 | ^ | posClip *= normScaleW:ufract; + vmadn $v10, $v00, $v00 ## L:630 | 113 | posClip *= normScaleW:ufract; + ori $at, $zero, %lo(BASE_SIZE) ## L:636 
| ^ | posClip:sint.w = load(BASE_SIZE).x; + vrcph $v05.e3, $v09.e3 ## L:632 | **116 | invW.w = invert_half(posClip).w; + andi $t1, $t4, 1028 ## L:626 | ^ | clipA = temp1 & 0b0000'0100'0000'0100; + andi $t5, $t4, 16448 ## L:627 | 117 | clipB = temp1 & 0b0100'0000'0100'0000; + vrcpl $v06.e3, $v10.e3 ## L:632 | ^ | invW.w = invert_half(posClip).w; + lb $t4, 15($s6) ## L:675 | 118 | temp1:s8 = load(ptrIn, 15); + vrcph $v05.e3, $v09.e7 ## L:633 | ^ | invW.W = invert_half(posClip).W; + addiu $t2, $zero, 36 ## L:737 | 119 | cmdRdpRect = 0x24; + vrcpl $v06.e7, $v10.e7 ## L:633 | ^ | invW.W = invert_half(posClip).W; + vmov $v10.e7, $v00.e0 ## L:639 | 120 | posClip:sfract.W = 0; + lsv $v09, 6, 0, $at ## L:636 | ^ | posClip:sint.w = load(BASE_SIZE).x; + lsv $v09, 14, 0, $at ## L:637 | 121 | posClip:sint.W = load(BASE_SIZE).x; + vmov $v10.e3, $v00.e0 ## L:638 | ^ | posClip:sfract.w = 0; + addu $t4, $t4, $s2 ## L:681 | 122 | temp1 += texOffset; temp1 <<= 3; + sll $t4, $t4, 3 ## L:681 | 123 | temp1 += texOffset; temp1 <<= 3; + vrcph $v05.e7, $v00.e7 ## L:633 | ^ | invW.W = invert_half(posClip).W; + addu $a2, $a2, $s2 ## L:680 | *125 | temp0 += texOffset; temp0 <<= 3; + vmudl $v29, $v10, $v06.h3 ## L:642 | ^ | posClip *= invW.wwwwWWWW; + sll $a2, $a2, 3 ## L:680 | 126 | temp0 += texOffset; temp0 <<= 3; + vmadm $v29, $v09, $v06.h3 ## L:642 | ^ | posClip *= invW.wwwwWWWW; + addiu $at, $zero, 3 ## L:771 | 127 | if(clipA == 0b0000'0011) { + vmadn $v10, $v10, $v05.h3 ## L:642 | ^ | posClip *= invW.wwwwWWWW; + vmadh $v09, $v09, $v05.h3 ## L:642 | 128 | posClip *= invW.wwwwWWWW; + vmulf $v14, $v14, $v09.h3 ## L:645 | ***132 | localPartSize:sfract *= posClip:sint.wwwwWWWW; + vmudl $v29, $v10, $v20.v ## L:648 | 133 | vec32 posScreen = posClip * screenSize; + vmadm $v29, $v09, $v20.v ## L:648 | 134 | vec32 posScreen = posClip * screenSize; + vmadn $v04, $v10, $v19.v ## L:648 | 135 | vec32 posScreen = posClip * screenSize; + vmadh $v03, $v09, $v19.v ## L:648 | 136 | vec32 posScreen = 
posClip * screenSize; + vmadh $v02, $v18, $v07.v ## L:649 | 137 | vec16 posCenter = screenOffset:sint +* vecOne; + vmadh $v04, $v07, $v14.v ## L:654 | 138 | vec16 posEnd = vecOne +* localPartSize:sint; + vsubc $v08, $v02, $v14.v ## L:655 | **141 | posStart = posCenter - localPartSize:sint; + vmudn $v14, $v14, $v30.e6 ## L:659 | 142 | localPartSize *= 2; + vrcp $v06.e0, $v14.e0 ## L:660 | ***146 | invW.x = invert_half(localPartSize).x; + vrcph $v05.e0, $v14.e0 ## L:660 | 147 | invW.x = invert_half(localPartSize).x; + vrcp $v06.e1, $v14.e1 ## L:661 | 148 | invW.y = invert_half(localPartSize).y; + vrcph $v05.e1, $v14.e1 ## L:661 | 149 | invW.y = invert_half(localPartSize).y; + vrcp $v06.e4, $v14.e4 ## L:662 | 150 | invW.X = invert_half(localPartSize).X; + vrcph $v05.e4, $v14.e4 ## L:662 | 151 | invW.X = invert_half(localPartSize).X; + vrcp $v06.e5, $v14.e5 ## L:663 | 152 | invW.Y = invert_half(localPartSize).Y; + vrcph $v05.e5, $v14.e5 ## L:663 | 153 | invW.Y = invert_half(localPartSize).Y; + vmudh $v01, $v08, $v05.v ## L:668 | ***157 | vec16 uvStart = posStart * invW:sint; + vor $v03, $v00, $v05 ## L:664 | 158 | vec16 uvDelta = invW:sint; + vmudm $v01, $v01, $v31.e6 ## L:669 | **161 | uvStart >>= 7; + vxor $v05, $v00, $v00.e0 ## L:694 | 162 | vec16 texOffsetTotal = 0; + vlt $v01, $v01, $v00.e0 ## L:670 | **165 | uvStart = uvStart < 0; + mtc2 $a2, $v05.e0 ## L:695 | ^ | texOffsetTotal.x = temp0:s16; + mtc2 $t4, $v05.e4 ## L:696 | 166 | texOffsetTotal.X = temp1:s16; + slv $v03, 0, 28, $s3 ## L:758 | **169 | store(uvDelta.xy, dmaDmemEnd, 0x0C, 16); + vsubc $v06, $v00, $v01.v ## L:692 | ^ | vec16 uvStartNeg = VZERO - uvStart; + vand $v05, $v05, $v12.e3 ## L:697 | 170 | texOffsetTotal &= texMirrorMask.w; + vge $v29, $v11, $v05.h0 ## L:700 | ***174 | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStartNeg : uvStart; + vmrg $v01, $v06, $v01 ## L:700 | 175 | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? 
uvStartNeg : uvStart; + vlt $v04, $v04, $v15 ## L:703 | 176 | posEnd = min(posEnd, screenMax); + ssv $v02, 4, 12, $s3 ## L:749 | ^ | store(posCenter.z, dmaDmemEnd, 0x04, 8); + vge $v08, $v08, $v16 ## L:704 | 177 | posStart = max(posStart, screenMin); + vaddc $v01, $v01, $v05.v ## L:706 | **180 | uvStart += texOffsetTotal; + mfc2 $k1, $v04.e5 ## L:1196 | ^ | outB = pos.Y; + mfc2 $s1, $v08.e1 ## L:1190 | 181 | outA = pos.y; + mfc2 $fp, $v08.e5 ## L:1196 | **184 | outB = pos.Y; + vsubc $v06, $v12, $v01.v ## L:711 | ^ | uvStartNeg = texMirrorMask - uvStart; + mfc2 $k0, $v08.e0 ## L:1192 | 185 | u32 tmp = pos.x; + andi $s1, $s1, 4095 ## L:1191 | 186 | outA &= 0b1111'1111'1111; + andi $k1, $k1, 4095 ## L:1197 | 187 | outB &= 0b1111'1111'1111; + sll $k0, $k0, 12 ## L:1193 | 188 | tmp <<= 12; + or $s1, $s1, $k0 ## L:1194 | 189 | outA |= tmp; + mfc2 $k0, $v08.e4 ## L:1198 | 190 | tmp = pos.X; + vaddc $v06, $v06, $v12.e2 ## L:712 | ^ | uvStartNeg += texMirrorMask.z; + mfc2 $sp, $v04.e1 ## L:1190 | 191 | outA = pos.y; + andi $fp, $fp, 4095 ## L:1197 | 192 | outB &= 0b1111'1111'1111; + sll $k0, $k0, 12 ## L:1199 | 193 | tmp <<= 12; + or $fp, $fp, $k0 ## L:1200 | 194 | outB |= tmp; + mfc2 $k0, $v04.e0 ## L:1192 | 195 | u32 tmp = pos.x; + sw $s1, 4 + 16($s3) ## L:754 | 196 | store(posA, dmaDmemEnd, 0x04, 16); + vge $v29, $v11, $v05.h0 ## L:714 | ^ | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStart : uvStartNeg; + andi $sp, $sp, 4095 ## L:1191 | 197 | outA &= 0b1111'1111'1111; + vmrg $v01, $v01, $v06 ## L:714 | ^ | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? 
uvStart : uvStartNeg; + sll $k0, $k0, 12 ## L:1193 | 198 | tmp <<= 12; + or $sp, $sp, $k0 ## L:1194 | 199 | outA |= tmp; + sw $sp, 0 + 16($s3) ## L:755 | 200 | @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + mfc2 $k0, $v04.e4 ## L:1198 | 201 | tmp = pos.X; + sb $t2, 0 + 16($s3) ## L:756 | 202 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + vlt $v06, $v08, $v04 ## L:721 | ^ | vec16 extend = posStart < posEnd; + ldv $v08, 0, 24, $s6 ## L:762 | 203 | posStart.xyzw = load(ptrIn, 24).xyzw; + cfc2 $t4, $vcc ## L:722 | 204 | temp1 = get_vcc(); + ldv $v05, 0, 16, $s6 ## L:732 | 205 | vec16 color = load(ptrIn, 16).xyzw; + sll $k0, $k0, 12 ## L:1199 | 206 | tmp <<= 12; + ldv $v08, 8, 32, $s6 ## L:763 | 207 | posStart.XYZW = load(ptrIn, 24).XYZW; + slv $v01, 0, 24, $s3 ## L:757 | 208 | store(uvStart.xy, dmaDmemEnd, 0x08, 16); + andi $a2, $t4, 3 ## L:724 | 209 | temp0 = temp1 & 0b0000'0011; + or $k1, $k1, $k0 ## L:1200 | 210 | outB |= tmp; + or $t1, $t1, $a2 ## L:725 | 211 | clipA |= temp0; + slv $v05, 0, 4, $s3 ## L:745 | 212 | store(color.xy, dmaDmemEnd, 4); + vmulf $v14, $v13, $v08.h3 ## L:764 | ^ | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + andi $a2, $t4, 48 ## L:727 | 213 | temp0 = temp1 & 0b0011'0000; + bne $t1, $at, LABEL_TPXCmd_DrawTextured_0019 ## L:771 | 214 | if(clipA == 0b0000'0011) { + or $t5, $t5, $a2 ## L:728 | *216 | clipB |= temp0; + addiu $s3, $s3, 32 ## L:772 | 217 | dmaDmemEnd += 32; + LABEL_TPXCmd_DrawTextured_0019: + addiu $at, $zero, 48 ## L:782 | 218 | if(clipB == 0b0011'0000) { + bne $t5, $at, LABEL_TPXCmd_DrawTextured_001A ## L:782 | 219 | if(clipB == 0b0011'0000) { + addiu $s6, $s6, 24 ## L:776 | *221 | ptrIn += 24; + sw $k1, 0 + 16($s3) ## L:787 | 222 | @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + sb $t2, 0 + 16($s3) ## L:788 | 223 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + ssv $v02, 12, 12, $s3 
## L:784 | 224 | store(posCenter.Z, dmaDmemEnd, 0x04, 8); + slv $v01, 8, 24, $s3 ## L:790 | 225 | store(uvStart.XY, dmaDmemEnd, 0x08, 16); + slv $v05, 4, 4, $s3 ## L:783 | 226 | store(color.zw, dmaDmemEnd, 4); + sw $fp, 4 + 16($s3) ## L:786 | 227 | store(posB, dmaDmemEnd, 0x04, 16); + slv $v03, 8, 28, $s3 ## L:791 | 228 | store(uvDelta.XY, dmaDmemEnd, 0x0C, 16); + addiu $s3, $s3, 32 ## L:793 | 229 | dmaDmemEnd += 32; + LABEL_TPXCmd_DrawTextured_001A: + vmudn $v10, $v28, $v08.h0 ## L:92 | ^ | out = mat0 * vec.xxxxXXXX; + vmadh $v09, $v27, $v08.h0 ## L:92 | 230 | out = mat0 * vec.xxxxXXXX; + vmadn $v10, $v26, $v08.h1 ## L:93 | 231 | out = mat1 +* vec.yyyyYYYY; + vmadh $v09, $v25, $v08.h1 ## L:93 | 232 | out = mat1 +* vec.yyyyYYYY; + vmadn $v10, $v24, $v08.h2 ## L:94 | 233 | out = mat2 +* vec.zzzzZZZZ; + vmadh $v09, $v23, $v08.h2 ## L:94 | 234 | out = mat2 +* vec.zzzzZZZZ; + sltu $at, $s3, $s5 ## L:800 | ^ | if(dmaDmemEnd >= dmaDmemFlush) { + vmadn $v10, $v22, $v30.e7 ## L:95 | 235 | out = mat3 +* 1; + bne $at, $zero, LABEL_TPXCmd_DrawTextured_001B ## L:800 | ^ | if(dmaDmemEnd >= dmaDmemFlush) { + vmadh $v09, $v21, $v30.e7 ## L:95 | *237 | out = mat3 +* 1; + jal RDPQ_Send ## L:801 | 238 | RDPQ_Send(dmaDmem, dmaDmemEnd); ## Args: $s4, $s3 + nop ## L:801 | *240 | RDPQ_Send(dmaDmem, dmaDmemEnd); + or $s3, $zero, $s4 ## L:802 | 241 | dmaDmemEnd = dmaDmem; + LABEL_TPXCmd_DrawTextured_001B: + vch $v29, $v09, $v09.h3 ## L:805 | ^ | temp1 = clip(posClip, posClip.wwwwWWWW); + vcl $v29, $v10, $v10.h3 ## L:805 | 242 | temp1 = clip(posClip, posClip.wwwwWWWW); + bne $s6, $s7, LABEL_TPXCmd_DrawTextured_0017 ## L:805 | ^ | temp1 = clip(posClip, posClip.wwwwWWWW); + cfc2 $t4, $vcc ## L:805 | *244 | temp1 = clip(posClip, posClip.wwwwWWWW); + LABEL_TPXCmd_DrawTextured_0018: + beq $zero, $zero, LABEL_TPXCmd_DrawTextured_0016 ## L:805 | 245 | temp1 = clip(posClip, posClip.wwwwWWWW); + nop ## L:805 | *247 | temp1 = clip(posClip, posClip.wwwwWWWW); + LABEL_TPXCmd_DrawTextured_0015: + vxor 
$v07, $v00, $v30.e7 ## L:261 | 248 | const vec16 vecOne = 1; + lpv $v08, 0, 0, $s6 ## L:256 | ^ | vec16 posStart = load_vec_s8(ptrIn, 0x00); + vmulf $v14, $v13, $v08.h3 ## L:257 | ***252 | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + vmudm $v08, $v08, $v31.e7 ## L:258 | 253 | posStart >>= 8; + vmudn $v10, $v28, $v08.h0 ## L:92 | ***257 | out = mat0 * vec.xxxxXXXX; + vmadh $v09, $v27, $v08.h0 ## L:92 | 258 | out = mat0 * vec.xxxxXXXX; + vmadn $v10, $v26, $v08.h1 ## L:93 | 259 | out = mat1 +* vec.yyyyYYYY; + vmadh $v09, $v25, $v08.h1 ## L:93 | 260 | out = mat1 +* vec.yyyyYYYY; + vmadn $v10, $v24, $v08.h2 ## L:94 | 261 | out = mat2 +* vec.zzzzZZZZ; + vmadh $v09, $v23, $v08.h2 ## L:94 | 262 | out = mat2 +* vec.zzzzZZZZ; + vmadn $v10, $v22, $v30.e7 ## L:95 | 263 | out = mat3 +* 1; + vmadh $v09, $v21, $v30.e7 ## L:95 | 264 | out = mat3 +* 1; + vch $v29, $v09, $v09.h3 ## L:266 | ***268 | temp1 = clip(posClip, posClip.wwwwWWWW); + vcl $v29, $v10, $v10.h3 ## L:266 | 269 | temp1 = clip(posClip, posClip.wwwwWWWW); + cfc2 $t4, $vcc ## L:266 | 270 | temp1 = clip(posClip, posClip.wwwwWWWW); + LABEL_TPXCmd_DrawTextured_001C: + vmudl $v10, $v10, $v17.v ## L:277 | ^ | posClip *= normScaleW:ufract; + lb $a2, 11($s6) ## L:324 | 271 | temp0:s8 = load(ptrIn, 0x0B); + vmadm $v09, $v09, $v17.v ## L:277 | ^ | posClip *= normScaleW:ufract; + vmadn $v10, $v00, $v00 ## L:277 | 272 | posClip *= normScaleW:ufract; + ori $at, $zero, %lo(BASE_SIZE) ## L:283 | ^ | posClip:sint.w = load(BASE_SIZE).x; + vrcph $v05.e3, $v09.e3 ## L:279 | **275 | invW.w = invert_half(posClip).w; + andi $t5, $t4, 16448 ## L:274 | ^ | clipB = temp1 & 0b0100'0000'0100'0000; + vrcpl $v06.e3, $v10.e3 ## L:279 | 276 | invW.w = invert_half(posClip).w; + andi $t1, $t4, 1028 ## L:273 | ^ | clipA = temp1 & 0b0000'0100'0000'0100; + lb $t4, 15($s6) ## L:325 | 277 | temp1:s8 = load(ptrIn, 0x0F); + vrcph $v05.e3, $v09.e7 ## L:280 | ^ | invW.W = invert_half(posClip).W; + vrcpl $v06.e7, $v10.e7 ## L:280 | 278 
| invW.W = invert_half(posClip).W; + vmov $v10.e3, $v00.e0 ## L:285 | 279 | posClip:sfract.w = 0; + lsv $v09, 6, 0, $at ## L:283 | ^ | posClip:sint.w = load(BASE_SIZE).x; + lsv $v09, 14, 0, $at ## L:284 | 280 | posClip:sint.W = load(BASE_SIZE).x; + vmov $v10.e7, $v00.e0 ## L:286 | ^ | posClip:sfract.W = 0; + vrcph $v05.e7, $v00.e7 ## L:280 | 281 | invW.W = invert_half(posClip).W; + vmudl $v29, $v10, $v06.h3 ## L:289 | **284 | posClip *= invW.wwwwWWWW; + addu $a2, $a2, $s2 ## L:327 | ^ | temp0 += texOffset; temp0 <<= 3; + vmadm $v29, $v09, $v06.h3 ## L:289 | 285 | posClip *= invW.wwwwWWWW; + vmadn $v10, $v10, $v05.h3 ## L:289 | 286 | posClip *= invW.wwwwWWWW; + vmadh $v09, $v09, $v05.h3 ## L:289 | 287 | posClip *= invW.wwwwWWWW; + vmulf $v14, $v14, $v09.h3 ## L:292 | ***291 | localPartSize:sfract *= posClip:sint.wwwwWWWW; + vmudl $v29, $v10, $v20.v ## L:295 | 292 | vec32 posScreen = posClip * screenSize; + vmadm $v29, $v09, $v20.v ## L:295 | 293 | vec32 posScreen = posClip * screenSize; + vmadn $v04, $v10, $v19.v ## L:295 | 294 | vec32 posScreen = posClip * screenSize; + vmadh $v03, $v09, $v19.v ## L:295 | 295 | vec32 posScreen = posClip * screenSize; + vmadh $v02, $v18, $v07.v ## L:296 | 296 | vec16 posCenter = screenOffset:sint +* vecOne; + vmadh $v04, $v07, $v14.v ## L:301 | 297 | vec16 posEnd = vecOne +* localPartSize:sint; + vsubc $v08, $v02, $v14.v ## L:302 | **300 | posStart = posCenter - localPartSize:sint; + vmudn $v14, $v14, $v30.e6 ## L:306 | 301 | localPartSize *= 2; + vrcp $v06.e0, $v14.e0 ## L:307 | ***305 | invW.x = invert_half(localPartSize).x; + vrcph $v05.e0, $v14.e0 ## L:307 | 306 | invW.x = invert_half(localPartSize).x; + vrcp $v06.e1, $v14.e1 ## L:308 | 307 | invW.y = invert_half(localPartSize).y; + vrcph $v05.e1, $v14.e1 ## L:308 | 308 | invW.y = invert_half(localPartSize).y; + vrcp $v06.e4, $v14.e4 ## L:309 | 309 | invW.X = invert_half(localPartSize).X; + addu $t4, $t4, $s2 ## L:328 | ^ | temp1 += texOffset; temp1 <<= 3; + vrcph $v05.e4, 
$v14.e4 ## L:309 | 310 | invW.X = invert_half(localPartSize).X; + ssv $v02, 4, 12, $s3 ## L:396 | ^ | store(posCenter.z, dmaDmemEnd, 0x04, 8); + vrcp $v06.e5, $v14.e5 ## L:310 | 311 | invW.Y = invert_half(localPartSize).Y; + vrcph $v05.e5, $v14.e5 ## L:310 | 312 | invW.Y = invert_half(localPartSize).Y; + vmudh $v01, $v08, $v05.v ## L:315 | ***316 | vec16 uvStart = posStart * invW:sint; + sll $t4, $t4, 3 ## L:328 | ^ | temp1 += texOffset; temp1 <<= 3; + vor $v03, $v00, $v05 ## L:311 | 317 | vec16 uvDelta = invW:sint; + vxor $v05, $v00, $v00.e0 ## L:341 | 318 | vec16 texOffsetTotal = 0; + sll $a2, $a2, 3 ## L:327 | ^ | temp0 += texOffset; temp0 <<= 3; + vmudm $v01, $v01, $v31.e6 ## L:316 | *320 | uvStart >>= 7; + mtc2 $a2, $v05.e0 ## L:342 | ^ | texOffsetTotal.x = temp0:s16; + mtc2 $t4, $v05.e4 ## L:343 | ***324 | texOffsetTotal.X = temp1:s16; + vlt $v01, $v01, $v00.e0 ## L:317 | ^ | uvStart = uvStart < 0; + vsubc $v06, $v00, $v01.v ## L:339 | ***328 | vec16 uvStartNeg = VZERO - uvStart; + vand $v05, $v05, $v12.e3 ## L:344 | 329 | texOffsetTotal &= texMirrorMask.w; + vge $v29, $v11, $v05.h0 ## L:347 | ***333 | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStartNeg : uvStart; + vmrg $v01, $v06, $v01 ## L:347 | 334 | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? 
uvStartNeg : uvStart; + vlt $v04, $v04, $v15 ## L:350 | 335 | posEnd = min(posEnd, screenMax); + slv $v03, 0, 28, $s3 ## L:405 | ^ | store(uvDelta.xy, dmaDmemEnd, 0x0C, 16); + addiu $t2, $zero, 36 ## L:384 | 336 | cmdRdpRect = 0x24; + vge $v08, $v08, $v16 ## L:351 | ^ | posStart = max(posStart, screenMin); + vaddc $v01, $v01, $v05.v ## L:353 | *338 | uvStart += texOffsetTotal; + addiu $at, $zero, 3 ## L:418 | ^ | if(clipA == 0b0000'0011) { + mfc2 $k1, $v04.e5 ## L:1196 | 339 | outB = pos.Y; + mfc2 $k0, $v08.e0 ## L:1192 | 340 | u32 tmp = pos.x; + andi $k1, $k1, 4095 ## L:1197 | *342 | outB &= 0b1111'1111'1111; + vsubc $v06, $v12, $v01.v ## L:358 | ^ | uvStartNeg = texMirrorMask - uvStart; + mfc2 $s1, $v08.e1 ## L:1190 | 343 | outA = pos.y; + mfc2 $fp, $v08.e5 ## L:1196 | 344 | outB = pos.Y; + sll $k0, $k0, 12 ## L:1193 | 345 | tmp <<= 12; + andi $s1, $s1, 4095 ## L:1191 | 346 | outA &= 0b1111'1111'1111; + vaddc $v06, $v06, $v12.e2 ## L:359 | ^ | uvStartNeg += texMirrorMask.z; + andi $fp, $fp, 4095 ## L:1197 | 347 | outB &= 0b1111'1111'1111; + or $s1, $s1, $k0 ## L:1194 | 348 | outA |= tmp; + sw $s1, 4 + 16($s3) ## L:401 | 349 | store(posA, dmaDmemEnd, 0x04, 16); + vge $v29, $v11, $v05.h0 ## L:361 | ^ | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStart : uvStartNeg; + mfc2 $k0, $v08.e4 ## L:1198 | 350 | tmp = pos.X; + vmrg $v01, $v01, $v06 ## L:361 | ^ | uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? 
uvStart : uvStartNeg; + mfc2 $sp, $v04.e1 ## L:1190 | 351 | outA = pos.y; + ldv $v05, 0, 8, $s6 ## L:381 | 352 | vec16 color = load(ptrIn, 8).xyzw; + sll $k0, $k0, 12 ## L:1199 | 353 | tmp <<= 12; + vlt $v06, $v08, $v04 ## L:368 | ^ | vec16 extend = posStart < posEnd; + or $fp, $fp, $k0 ## L:1200 | 354 | outB |= tmp; + cfc2 $t4, $vcc ## L:369 | 355 | temp1 = get_vcc(); + mfc2 $k0, $v04.e0 ## L:1192 | 356 | u32 tmp = pos.x; + lpv $v08, 0, 16, $s6 ## L:413 | 357 | posStart = load_vec_s8(ptrIn, 16); + andi $sp, $sp, 4095 ## L:1191 | 358 | outA &= 0b1111'1111'1111; + sll $k0, $k0, 12 ## L:1193 | 359 | tmp <<= 12; + or $sp, $sp, $k0 ## L:1194 | 360 | outA |= tmp; + slv $v05, 0, 4, $s3 ## L:392 | 361 | store(color.xy, dmaDmemEnd, 4); + mfc2 $k0, $v04.e4 ## L:1198 | 362 | tmp = pos.X; + andi $a2, $t4, 3 ## L:371 | 363 | temp0 = temp1 & 0b0000'0011; + or $t1, $t1, $a2 ## L:372 | 364 | clipA |= temp0; + vmulf $v14, $v13, $v08.h3 ## L:414 | ^ | localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + slv $v01, 0, 24, $s3 ## L:404 | 365 | store(uvStart.xy, dmaDmemEnd, 0x08, 16); + andi $a2, $t4, 48 ## L:374 | 366 | temp0 = temp1 & 0b0011'0000; + sll $k0, $k0, 12 ## L:1199 | 367 | tmp <<= 12; + or $k1, $k1, $k0 ## L:1200 | 368 | outB |= tmp; + sw $sp, 0 + 16($s3) ## L:402 | 369 | @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + or $t5, $t5, $a2 ## L:375 | 370 | clipB |= temp0; + vmudm $v08, $v08, $v31.e7 ## L:415 | ^ | posStart >>= 8; + bne $t1, $at, LABEL_TPXCmd_DrawTextured_001E ## L:418 | 371 | if(clipA == 0b0000'0011) { + sb $t2, 0 + 16($s3) ## L:403 | *373 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + addiu $s3, $s3, 32 ## L:419 | 374 | dmaDmemEnd += 32; + LABEL_TPXCmd_DrawTextured_001E: + addiu $at, $zero, 48 ## L:429 | 375 | if(clipB == 0b0011'0000) { + bne $t5, $at, LABEL_TPXCmd_DrawTextured_001F ## L:429 | 376 | if(clipB == 0b0011'0000) { + addiu $s6, $s6, 16 ## L:425 | *378 | ptrIn += 16; + sw 
$k1, 0 + 16($s3) ## L:434 | 379 | @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + slv $v01, 8, 24, $s3 ## L:437 | 380 | store(uvStart.XY, dmaDmemEnd, 0x08, 16); + slv $v05, 4, 4, $s3 ## L:430 | 381 | store(color.zw, dmaDmemEnd, 4); + ssv $v02, 12, 12, $s3 ## L:431 | 382 | store(posCenter.Z, dmaDmemEnd, 0x04, 8); + sw $fp, 4 + 16($s3) ## L:433 | 383 | store(posB, dmaDmemEnd, 0x04, 16); + slv $v03, 8, 28, $s3 ## L:438 | 384 | store(uvDelta.XY, dmaDmemEnd, 0x0C, 16); + sb $t2, 0 + 16($s3) ## L:435 | 385 | @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); ## Barrier: 0x1 + addiu $s3, $s3, 32 ## L:440 | 386 | dmaDmemEnd += 32; + LABEL_TPXCmd_DrawTextured_001F: + vmudn $v10, $v28, $v08.h0 ## L:92 | ^ | out = mat0 * vec.xxxxXXXX; + sltu $at, $s3, $s5 ## L:447 | 387 | if(dmaDmemEnd >= dmaDmemFlush) { + vmadh $v09, $v27, $v08.h0 ## L:92 | ^ | out = mat0 * vec.xxxxXXXX; + vmadn $v10, $v26, $v08.h1 ## L:93 | 388 | out = mat1 +* vec.yyyyYYYY; + vmadh $v09, $v25, $v08.h1 ## L:93 | 389 | out = mat1 +* vec.yyyyYYYY; + vmadn $v10, $v24, $v08.h2 ## L:94 | 390 | out = mat2 +* vec.zzzzZZZZ; + vmadh $v09, $v23, $v08.h2 ## L:94 | 391 | out = mat2 +* vec.zzzzZZZZ; + vmadn $v10, $v22, $v30.e7 ## L:95 | 392 | out = mat3 +* 1; + bne $at, $zero, LABEL_TPXCmd_DrawTextured_0020 ## L:447 | ^ | if(dmaDmemEnd >= dmaDmemFlush) { + vmadh $v09, $v21, $v30.e7 ## L:95 | *394 | out = mat3 +* 1; + jal RDPQ_Send ## L:448 | 395 | RDPQ_Send(dmaDmem, dmaDmemEnd); ## Args: $s4, $s3 + nop ## L:448 | *397 | RDPQ_Send(dmaDmem, dmaDmemEnd); + or $s3, $zero, $s4 ## L:449 | 398 | dmaDmemEnd = dmaDmem; + LABEL_TPXCmd_DrawTextured_0020: + vch $v29, $v09, $v09.h3 ## L:452 | ^ | temp1 = clip(posClip, posClip.wwwwWWWW); + vcl $v29, $v10, $v10.h3 ## L:452 | 399 | temp1 = clip(posClip, posClip.wwwwWWWW); + bne $s6, $s7, LABEL_TPXCmd_DrawTextured_001C ## L:452 | ^ | temp1 = clip(posClip, posClip.wwwwWWWW); + cfc2 $t4, $vcc ## L:452 | *401 | temp1 = clip(posClip, posClip.wwwwWWWW); + 
LABEL_TPXCmd_DrawTextured_0016: + j RDPQ_Send ## L:1179 | 402 | goto RDPQ_Send; + ori $ra, $zero, %lo(RSPQ_Loop) ## L:1178 | *404 | RA = RSPQ_Loop; OVERLAY_CODE_END: diff --git a/src/t3d/rsp/rsp_tinypx.rspl b/src/t3d/rsp/rsp_tinypx.rspl index 223d6bc6..af3b7328 100644 --- a/src/t3d/rsp/rsp_tinypx.rspl +++ b/src/t3d/rsp/rsp_tinypx.rspl @@ -17,7 +17,8 @@ include "rdpq_macros.h" #define RDP_CMD_TEX_RECT_FLIP 0x25 // size of the 'T3DParticle' struct, containing 2 interleaved particles each -#define PARTICLE_INPUT_SIZE 16 +#define PARTICLE_INPUT_SIZE_S8 16 +#define PARTICLE_INPUT_SIZE_S16 24 // max particles, this must be a multiple of 2 #define PARTICLE_MAX_COUNT 344 @@ -30,6 +31,7 @@ include "rdpq_macros.h" #define RDP_POS_MASK 0b1111'1111'1111 + state { // external libdragon labels @@ -65,6 +67,15 @@ temp_state { #include "inc/math.rspl" +#define LOOP_NAME 8Bit +#include "tpxLoops.rspl" +#undef LOOP_NAME + +#define LOOP_NAME 16Bit +#define LOOP_16BIT 1 +#include "tpxLoops.rspl" +#undef LOOP_NAME + function RDPQ_Send(u16<$s4> dmemStart, u16<$s3> dmaDmemEnd); command<0> TPXCmd_SyncT3D(u32 rdramMatrix, u32 rdramScreen, u16 wNorm) @@ -149,132 +160,14 @@ command<1> TPXCmd_DrawColor(s16 dataSize, s32 rdramAddr) u32 posA, posB, posEndA, posEndB; vec32 posClip; - - // de-phase parts of the loop, this part is also at the end of the loop - vec16 posStart = load_vec_s8(ptrIn, 0x00); - localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; - posStart >>= 8; - posStart.w = 1; - posStart.W = 1; - const vec16 vecOne = 1; u32<$a2> temp0; - // point to clip space - mulMat4Vec8(mat0, mat1, mat2, mat3, posStart, posClip); - temp1 = clip(posClip, posClip.wwwwWWWW); - - // Iterate over all points, transform + clip, save back those that need to be drawn - // the transformed amount might be smaller and shifted due to that - loop { - // only clip-check against Z to prevent overflow, X/Y is clamped in screen-space later - - clipA = temp1 & 0b0000'0100'0000'0100; - clipB = temp1 & 
0b0100'0000'0100'0000; - - vec32 invW; - posClip *= normScaleW:ufract; - - invW.w = invert_half(posClip).w; - invW.W = invert_half(posClip).W; - - // store a particle base-size in W to only do one perspective division - posClip:sint.w = load(BASE_SIZE).x; - posClip:sint.W = load(BASE_SIZE).x; - posClip:sfract.w = 0; - posClip:sfract.W = 0; - - // perspective division - posClip *= invW.wwwwWWWW; - undef invW; - - // scale particle size by perspective - localPartSize:sfract *= posClip:sint.wwwwWWWW; - - // transform to screen-space, this is the center of the particles and its depth-value - vec32 posScreen = posClip * screenSize; - vec16 posCenter = screenOffset:sint +* vecOne; - - // extend to both sides for start/end point... - vec16 posEnd = vecOne +* localPartSize:sint; - posStart = posCenter - localPartSize:sint; - - // ... and clamp to the edges of the screen - posEnd = min(posEnd, screenMax); - posStart = max(posStart, screenMin); - - encodeRectPos(posA, posB, posStart); - encodeRectPos(posEndA, posEndB, posEnd); - - // now check if it's completely outside the screen or has a zero-size - vec16 extend = posStart < posEnd; - temp1 = get_vcc(); - - temp0 = temp1 & 0b0000'0011; // only check X/Y - clipA |= temp0; - - temp0 = temp1 & 0b0011'0000; - clipB |= temp0; - - // load color and prepare RPD command IDs - vec16 color = load(ptrIn, 0x08).xyzw; - cmdRdpRect = RDP_CMD_RECT; - - // Save the rectangles now. 
Each one consists of 3 commands: color, depth, rect - // The first one is always saved here to allow better reordering, - // however both will only submit it by advancing 'dmaDmemEnd' - - // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | - // "Prim Color": |0x3A | - | LOD || color (RGBA) | - store(color.xy, dmaDmemEnd, 4); - - // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | - // "Prim Depth": |0x2E | - | - | - || depth | delta-Z | - store(posCenter.z, dmaDmemEnd, 0x04, 8); - - // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | - // "Fill Rect": |0x36 | X0/Y0 (10.2) || - | X1/Y1 (10.2) | - store(posA, dmaDmemEnd, 0x04, 16); - @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); - @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); - - // load particle for next iteration - ptrIn += PARTICLE_INPUT_SIZE; - posStart = load_vec_s8(ptrIn, 0x00, PARTICLE_INPUT_SIZE); - localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; - posStart >>= 8; - - posStart.w = 1; - - if(clipA == 0b0000'0011) { - dmaDmemEnd += 24; - } - - posStart.W = 1; - // Second rectangle: - if(clipB == 0b0011'0000) { - store(color.zw, dmaDmemEnd, 4); - store(posCenter.Z, dmaDmemEnd, 0x04, 8); - - store(posB, dmaDmemEnd, 0x04, 16); - @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); - @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); - - dmaDmemEnd += 24; - } - - mulMat4Vec8(mat0, mat1, mat2, mat3, posStart, posClip); - - // When the RDP buffer in DMEM is full, submit and DMA them out. - // If anything is left, a last call after the loop will submit the rest. - if(dmaDmemEnd >= dmaDmemFlush) { - RDPQ_Send(dmaDmem, dmaDmemEnd); - dmaDmemEnd = dmaDmem; - } - - temp1 = clip(posClip, posClip.wwwwWWWW); - - } while (ptrIn != ptrInEnd) + if(rdramAddr < 0) { + mainLoop_color16Bit(); + } else { + mainLoop_color8Bit(); + } // submit the rest of the buffer (if any) RA = RSPQ_Loop; // @TODO: add RSPL auto-opt. 
for this (needs RA assign) @@ -391,189 +284,11 @@ command<4> TPXCmd_DrawTextured(s16 dataSize, s32 rdramAddr) u32 posA, posB, posEndA, posEndB; vec32 posClip; - // de-phase parts of the loop, this part is also at the end of the loop - vec16 posStart = load_vec_s8(ptrIn, 0x00); - localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; - posStart >>= 8; - posStart.w = 1; - posStart.W = 1; - - const vec16 vecOne = 1; - u32<$a2> temp0; - - // point to clip space - mulMat4Vec8(mat0, mat1, mat2, mat3, posStart, posClip); - temp1 = clip(posClip, posClip.wwwwWWWW); - - // Iterate over all points, transform + clip, save back those that need to be drawn - // the transformed amount might be smaller and shifted due to that - loop { - // only clip-check against Z to prevent overflow, X/Y is clamped in screen-space later - - clipA = temp1 & 0b0000'0100'0000'0100; - clipB = temp1 & 0b0100'0000'0100'0000; - - vec32 invW; - posClip *= normScaleW:ufract; - - invW.w = invert_half(posClip).w; - invW.W = invert_half(posClip).W; - - // store a particle base-size in W to only do one perspective division - posClip:sint.w = load(BASE_SIZE).x; - posClip:sint.W = load(BASE_SIZE).x; - posClip:sfract.w = 0; - posClip:sfract.W = 0; - - // perspective division - posClip *= invW.wwwwWWWW; - - // scale particle size by perspective - localPartSize:sfract *= posClip:sint.wwwwWWWW; - - // transform to screen-space, this is the center of the particles and its depth-value - vec32 posScreen = posClip * screenSize; - vec16 posCenter = screenOffset:sint +* vecOne; - - undef posScreen; - - // extend to both sides for start/end point - vec16 posEnd = vecOne +* localPartSize:sint; - posStart = posCenter - localPartSize:sint; - - // calculate UV delta, this is the increment in texture-coords per screen-space pixel - // even if clipped, this doesn't need further adjustment - localPartSize *= 2; - invW.x = invert_half(localPartSize).x; - invW.y = invert_half(localPartSize).y; - invW.X = 
invert_half(localPartSize).X; - invW.Y = invert_half(localPartSize).Y; - vec16 uvDelta = invW:sint; - - // offset of the UV, only actually used if it is clipped on the upper or left side - // since rect-pos can't be negative, we clamp it and need to adjust UVs instead - vec16 uvStart = posStart * invW:sint; - uvStart >>= 7; - uvStart = uvStart < 0; - - // local UV offset, stored in alpha channel of color, this is added to the global - temp0:s8 = load(ptrIn, 0x0B); - temp1:s8 = load(ptrIn, 0x0F); - temp0 += texOffset; temp0 <<= 3; - temp1 += texOffset; temp1 <<= 3; - - undef invW; - - // Repeating & mirroring of UVs for half-rotation effect. - // E.g.: given a texture (64x16) animating a half-rotation in 4 frames, it will go through 8 steps: - // x-axis goes from 0-112 in steps of 16 repeating the 4 frames two times, - // y-axis stays 0 until half-way, then goes to 16 for the other half. - // The texture needs to be mirrored, which causes it to mirror on both axis after half the frames. - // (Note that clipping which shifts UVs needs to be taken into account here) - { - vec16 uvStartNeg = VZERO - uvStart; - - vec16 texOffsetTotal = 0; - texOffsetTotal.x = temp0:s16; - texOffsetTotal.X = temp1:s16; - texOffsetTotal &= texMirrorMask.w; // mask to stay within out tile counts - - // this check here is inverted since we would need to negate in the case of clipping beforehand too - uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStartNeg : uvStart; - - // clamp Pos to the edges of the screen - posEnd = min(posEnd, screenMax); - posStart = max(posStart, screenMin); - - uvStart += texOffsetTotal; - - // shift range to middle to invert, only used if we are in the second half (the mirrored one) - // E.g.: with 4 frames: (4,5,6,7) becomes (7,6,5,4) - // this is needed since mirroring inverts the indices of an animation - uvStartNeg = texMirrorMask - uvStart; - uvStartNeg += texMirrorMask.z; - - uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? 
uvStart : uvStartNeg; - } - - encodeRectPos(posA, posB, posStart); - encodeRectPos(posEndA, posEndB, posEnd); - - // now check if it's completely outside the screen or has a zero-size - vec16 extend = posStart < posEnd; - temp1 = get_vcc(); - - temp0 = temp1 & 0b0000'0011; // only check X/Y - clipA |= temp0; - - temp0 = temp1 & 0b0011'0000; - clipB |= temp0; - - // load color and prepare RPD command IDs - vec16 color = load(ptrIn, 0x08).xyzw; - cmdRdpRect = RDP_CMD_TEX_RECT; - - // Save the rectangles now. Each one consists of 3 commands: color, depth, rect - // The first one is always saved here to allow better reordering, - // however both will only submit it by advancing 'dmaDmemEnd' - - // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | - // "Prim Color": |0x3A | - | LOD || color (RGBA) | - store(color.xy, dmaDmemEnd, 4); - - // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | - // "Prim Depth": |0x2E | - | - | - || depth | delta-Z | - store(posCenter.z, dmaDmemEnd, 0x04, 8); - - // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | - // "Tex Rect": |0x24 | X0/Y0 (10.2) || - | X1/Y1 (10.2) | - // | S (5.10) | T (5.10) || Ds (5.10) | Dt (5.10) | - store(posA, dmaDmemEnd, 0x04, 16); - @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); - @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); - store(uvStart.xy, dmaDmemEnd, 0x08, 16); - store(uvDelta.xy, dmaDmemEnd, 0x0C, 16); - - // load particle for next iteration - ptrIn += PARTICLE_INPUT_SIZE; - posStart = load_vec_s8(ptrIn, 0x00, PARTICLE_INPUT_SIZE); - localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; - posStart >>= 8; - - posStart.w = 1; - - if(clipA == 0b0000'0011) { - dmaDmemEnd += 32; - } - - posStart.W = 1; - // Second rectangle: - if(clipB == 0b0011'0000) { - store(color.zw, dmaDmemEnd, 4); - store(posCenter.Z, dmaDmemEnd, 0x04, 8); - - store(posB, dmaDmemEnd, 0x04, 16); - @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); - @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); - 
- store(uvStart.XY, dmaDmemEnd, 0x08, 16); - store(uvDelta.XY, dmaDmemEnd, 0x0C, 16); - - dmaDmemEnd += 32; - } - - mulMat4Vec8(mat0, mat1, mat2, mat3, posStart, posClip); - - // When the RDP buffer in DMEM is full, submit and DMA them out. - // If anything is left, a last call after the loop will submit the rest. - if(dmaDmemEnd >= dmaDmemFlush) { - RDPQ_Send(dmaDmem, dmaDmemEnd); - dmaDmemEnd = dmaDmem; - } - - temp1 = clip(posClip, posClip.wwwwWWWW); - - } while (ptrIn != ptrInEnd) + if(rdramAddr < 0) { + mainLoop_tex16Bit(); + } else { + mainLoop_tex8Bit(); + } // submit the rest of the buffer (if any) RA = RSPQ_Loop; // @TODO: add RSPL auto-opt. for this (needs RA assign) @@ -601,6 +316,7 @@ macro encodeRectPos(u32 outA, u32 outB, vec16 pos) outB |= tmp; } + /** * Loads the current scissor area from the shared 'RDPQ_SCISSOR_RECT' setting * @param screenMin minimum (sets .xy & .XY) diff --git a/src/t3d/rsp/tpxLoops.rspl b/src/t3d/rsp/tpxLoops.rspl new file mode 100644 index 00000000..a8afa26d --- /dev/null +++ b/src/t3d/rsp/tpxLoops.rspl @@ -0,0 +1,348 @@ +macro mainLoop_color${LOOP_NAME}() +{ + // de-phase parts of the loop, this part is also at the end of the loop +#ifdef LOOP_16BIT + vec16 posStart = load(ptrIn, 0x00); + localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; +#else + vec16 posStart = load_vec_s8(ptrIn, 0x00); + localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + posStart >>= 8; +#endif + // point to clip space + mulMat4Vec3(mat0, mat1, mat2, mat3, posStart, posClip); + temp1 = clip(posClip, posClip.wwwwWWWW); + + // Iterate over all points, transform + clip, save back those that need to be drawn + // the transformed amount might be smaller and shifted due to that + loop { + // only clip-check against Z to prevent overflow, X/Y is clamped in screen-space later + + clipA = temp1 & 0b0000'0100'0000'0100; + clipB = temp1 & 0b0100'0000'0100'0000; + + vec32 invW; + posClip *= normScaleW:ufract; + + invW.w = 
invert_half(posClip).w; + invW.W = invert_half(posClip).W; + + // store a particle base-size in W to only do one perspective division + posClip:sint.w = load(BASE_SIZE).x; + posClip:sint.W = load(BASE_SIZE).x; + posClip:sfract.w = 0; + posClip:sfract.W = 0; + + // perspective division + posClip *= invW.wwwwWWWW; + undef invW; + + // scale particle size by perspective + localPartSize:sfract *= posClip:sint.wwwwWWWW; + + // transform to screen-space, this is the center of the particles and its depth-value + vec32 posScreen = posClip * screenSize; + vec16 posCenter = screenOffset:sint +* vecOne; + + // extend to both sides for start/end point... + vec16 posEnd = vecOne +* localPartSize:sint; + posStart = posCenter - localPartSize:sint; + + // ... and clamp to the edges of the screen + posEnd = min(posEnd, screenMax); + posStart = max(posStart, screenMin); + + encodeRectPos(posA, posB, posStart); + encodeRectPos(posEndA, posEndB, posEnd); + + // now check if it's completely outside the screen or has a zero-size + vec16 extend = posStart < posEnd; + temp1 = get_vcc(); + + temp0 = temp1 & 0b0000'0011; // only check X/Y + clipA |= temp0; + + temp0 = temp1 & 0b0011'0000; + clipB |= temp0; + + // load color and prepare RPD command IDs + #ifdef LOOP_16BIT + vec16 color = load(ptrIn, 16).xyzw; + #else + vec16 color = load(ptrIn, 8).xyzw; + #endif + cmdRdpRect = RDP_CMD_RECT; + + // Save the rectangles now. 
Each one consists of 3 commands: color, depth, rect + // The first one is always saved here to allow better reordering, + // however both will only submit it by advancing 'dmaDmemEnd' + + // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | + // "Prim Color": |0x3A | - | LOD || color (RGBA) | + store(color.xy, dmaDmemEnd, 4); + + // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | + // "Prim Depth": |0x2E | - | - | - || depth | delta-Z | + store(posCenter.z, dmaDmemEnd, 0x04, 8); + + // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | + // "Fill Rect": |0x36 | X0/Y0 (10.2) || - | X1/Y1 (10.2) | + store(posA, dmaDmemEnd, 0x04, 16); + @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); + @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); + + // load particle for next iteration + + #ifdef LOOP_16BIT + posStart.xyzw = load(ptrIn, PARTICLE_INPUT_SIZE_S16).xyzw; + posStart.XYZW = load(ptrIn, PARTICLE_INPUT_SIZE_S16).XYZW; + localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + #else + posStart = load_vec_s8(ptrIn, PARTICLE_INPUT_SIZE_S8); + localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + posStart >>= 8; + #endif + + if(clipA == 0b0000'0011) { + dmaDmemEnd += 24; + } + + #ifdef LOOP_16BIT + ptrIn += PARTICLE_INPUT_SIZE_S16; + #else + ptrIn += PARTICLE_INPUT_SIZE_S8; + #endif + + // Second rectangle: + if(clipB == 0b0011'0000) { + store(color.zw, dmaDmemEnd, 4); + store(posCenter.Z, dmaDmemEnd, 0x04, 8); + + store(posB, dmaDmemEnd, 0x04, 16); + @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); + @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); + + dmaDmemEnd += 24; + } + + mulMat4Vec3(mat0, mat1, mat2, mat3, posStart, posClip); + + // When the RDP buffer in DMEM is full, submit and DMA them out. + // If anything is left, a last call after the loop will submit the rest. 
+ if(dmaDmemEnd >= dmaDmemFlush) { + RDPQ_Send(dmaDmem, dmaDmemEnd); + dmaDmemEnd = dmaDmem; + } + + temp1 = clip(posClip, posClip.wwwwWWWW); + + } while (ptrIn != ptrInEnd) +} + +macro mainLoop_tex${LOOP_NAME}() +{ + // de-phase parts of the loop, this part is also at the end of the loop +#ifdef LOOP_16BIT + vec16 posStart = load(ptrIn, 0x00); + localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; +#else + vec16 posStart = load_vec_s8(ptrIn, 0x00); + localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + posStart >>= 8; +#endif + + const vec16 vecOne = 1; + u32<$a2> temp0; + + // point to clip space + mulMat4Vec3(mat0, mat1, mat2, mat3, posStart, posClip); + temp1 = clip(posClip, posClip.wwwwWWWW); + + // Iterate over all points, transform + clip, save back those that need to be drawn + // the transformed amount might be smaller and shifted due to that + loop { + // only clip-check against Z to prevent overflow, X/Y is clamped in screen-space later + + clipA = temp1 & 0b0000'0100'0000'0100; + clipB = temp1 & 0b0100'0000'0100'0000; + + vec32 invW; + posClip *= normScaleW:ufract; + + invW.w = invert_half(posClip).w; + invW.W = invert_half(posClip).W; + + // store a particle base-size in W to only do one perspective division + posClip:sint.w = load(BASE_SIZE).x; + posClip:sint.W = load(BASE_SIZE).x; + posClip:sfract.w = 0; + posClip:sfract.W = 0; + + // perspective division + posClip *= invW.wwwwWWWW; + + // scale particle size by perspective + localPartSize:sfract *= posClip:sint.wwwwWWWW; + + // transform to screen-space, this is the center of the particles and its depth-value + vec32 posScreen = posClip * screenSize; + vec16 posCenter = screenOffset:sint +* vecOne; + + undef posScreen; + + // extend to both sides for start/end point + vec16 posEnd = vecOne +* localPartSize:sint; + posStart = posCenter - localPartSize:sint; + + // calculate UV delta, this is the increment in texture-coords per screen-space pixel + // even if clipped, this 
doesn't need further adjustment + localPartSize *= 2; + invW.x = invert_half(localPartSize).x; + invW.y = invert_half(localPartSize).y; + invW.X = invert_half(localPartSize).X; + invW.Y = invert_half(localPartSize).Y; + vec16 uvDelta = invW:sint; + + // offset of the UV, only actually used if it is clipped on the upper or left side + // since rect-pos can't be negative, we clamp it and need to adjust UVs instead + vec16 uvStart = posStart * invW:sint; + uvStart >>= 7; + uvStart = uvStart < 0; + + // local UV offset, stored in alpha channel of color, this is added to the global + #ifdef LOOP_16BIT + temp0:s8 = load(ptrIn, 7); + temp1:s8 = load(ptrIn, 15); + #else + temp0:s8 = load(ptrIn, 0x0B); + temp1:s8 = load(ptrIn, 0x0F); + #endif + temp0 += texOffset; temp0 <<= 3; + temp1 += texOffset; temp1 <<= 3; + + undef invW; + + // Repeating & mirroring of UVs for half-rotation effect. + // E.g.: given a texture (64x16) animating a half-rotation in 4 frames, it will go through 8 steps: + // x-axis goes from 0-112 in steps of 16 repeating the 4 frames two times, + // y-axis stays 0 until half-way, then goes to 16 for the other half. + // The texture needs to be mirrored, which causes it to mirror on both axis after half the frames. + // (Note that clipping which shifts UVs needs to be taken into account here) + { + vec16 uvStartNeg = VZERO - uvStart; + + vec16 texOffsetTotal = 0; + texOffsetTotal.x = temp0:s16; + texOffsetTotal.X = temp1:s16; + texOffsetTotal &= texMirrorMask.w; // mask to stay within out tile counts + + // this check here is inverted since we would need to negate in the case of clipping beforehand too + uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? 
uvStartNeg : uvStart; + + // clamp Pos to the edges of the screen + posEnd = min(posEnd, screenMax); + posStart = max(posStart, screenMin); + + uvStart += texOffsetTotal; + + // shift range to middle to invert, only used if we are in the second half (the mirrored one) + // E.g.: with 4 frames: (4,5,6,7) becomes (7,6,5,4) + // this is needed since mirroring inverts the indices of an animation + uvStartNeg = texMirrorMask - uvStart; + uvStartNeg += texMirrorMask.z; + + uvStart = texMirrorCompare >= texOffsetTotal.xxxxXXXX ? uvStart : uvStartNeg; + } + + encodeRectPos(posA, posB, posStart); + encodeRectPos(posEndA, posEndB, posEnd); + + // now check if it's completely outside the screen or has a zero-size + vec16 extend = posStart < posEnd; + temp1 = get_vcc(); + + temp0 = temp1 & 0b0000'0011; // only check X/Y + clipA |= temp0; + + temp0 = temp1 & 0b0011'0000; + clipB |= temp0; + + // load color and prepare RPD command IDs + #ifdef LOOP_16BIT + vec16 color = load(ptrIn, 16).xyzw; + #else + vec16 color = load(ptrIn, 8).xyzw; + #endif + + cmdRdpRect = RDP_CMD_TEX_RECT; + + // Save the rectangles now. 
Each one consists of 3 commands: color, depth, rect + // The first one is always saved here to allow better reordering, + // however both will only submit it by advancing 'dmaDmemEnd' + + // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | + // "Prim Color": |0x3A | - | LOD || color (RGBA) | + store(color.xy, dmaDmemEnd, 4); + + // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | + // "Prim Depth": |0x2E | - | - | - || depth | delta-Z | + store(posCenter.z, dmaDmemEnd, 0x04, 8); + + // Offset: | 0 | 1 | 2 | 3 || 4 | 5 | 6 | 7 | + // "Tex Rect": |0x24 | X0/Y0 (10.2) || - | X1/Y1 (10.2) | + // | S (5.10) | T (5.10) || Ds (5.10) | Dt (5.10) | + store(posA, dmaDmemEnd, 0x04, 16); + @Barrier("pos-cmd") store(posEndA, dmaDmemEnd, 0x00, 16); + @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); + store(uvStart.xy, dmaDmemEnd, 0x08, 16); + store(uvDelta.xy, dmaDmemEnd, 0x0C, 16); + + // load particle for next iteration + #ifdef LOOP_16BIT + posStart.xyzw = load(ptrIn, PARTICLE_INPUT_SIZE_S16).xyzw; + posStart.XYZW = load(ptrIn, PARTICLE_INPUT_SIZE_S16).XYZW; + localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + #else + posStart = load_vec_s8(ptrIn, PARTICLE_INPUT_SIZE_S8); + localPartSize = globalPartSize:sfract * posStart:sfract.wwwwWWWW; + posStart >>= 8; + #endif + + if(clipA == 0b0000'0011) { + dmaDmemEnd += 32; + } + + #ifdef LOOP_16BIT + ptrIn += PARTICLE_INPUT_SIZE_S16; + #else + ptrIn += PARTICLE_INPUT_SIZE_S8; + #endif + + // Second rectangle: + if(clipB == 0b0011'0000) { + store(color.zw, dmaDmemEnd, 4); + store(posCenter.Z, dmaDmemEnd, 0x04, 8); + + store(posB, dmaDmemEnd, 0x04, 16); + @Barrier("pos-cmd") store(posEndB, dmaDmemEnd, 0x00, 16); + @Barrier("pos-cmd") store(cmdRdpRect, dmaDmemEnd, 0x00, 16); + + store(uvStart.XY, dmaDmemEnd, 0x08, 16); + store(uvDelta.XY, dmaDmemEnd, 0x0C, 16); + + dmaDmemEnd += 32; + } + + mulMat4Vec3(mat0, mat1, mat2, mat3, posStart, posClip); + + // When the RDP buffer in DMEM is full, submit and DMA them out. 
+ // If anything is left, a last call after the loop will submit the rest. + if(dmaDmemEnd >= dmaDmemFlush) { + RDPQ_Send(dmaDmem, dmaDmemEnd); + dmaDmemEnd = dmaDmem; + } + + temp1 = clip(posClip, posClip.wwwwWWWW); + + } while (ptrIn != ptrInEnd) +} \ No newline at end of file diff --git a/src/t3d/t3d.h b/src/t3d/t3d.h index eb256e7c..c2ddbde0 100644 --- a/src/t3d/t3d.h +++ b/src/t3d/t3d.h @@ -50,7 +50,7 @@ typedef struct { /* 0x1C */ int16_t stB[2]; // UV fixed point 10.5 (pixel coords) } __attribute__((aligned(8))) T3DVertPacked; -_Static_assert(sizeof(T3DVertPacked) == 0x20, "T3DVertPacked has wrong size"); +static_assert(sizeof(T3DVertPacked) == 0x20, "T3DVertPacked has wrong size"); enum T3DDrawFlags { T3D_FLAG_DEPTH = 1 << 0, diff --git a/src/t3d/t3danim.h b/src/t3d/t3danim.h index d34ea74d..ffa42cac 100644 --- a/src/t3d/t3danim.h +++ b/src/t3d/t3danim.h @@ -122,7 +122,7 @@ void t3d_anim_set_time(T3DAnim* anim, float time); * @param anim animation to get time for * @return current time in seconds */ -static float t3d_anim_get_time(const T3DAnim* anim) { +inline static float t3d_anim_get_time(const T3DAnim* anim) { return anim->time; } @@ -131,7 +131,7 @@ static float t3d_anim_get_time(const T3DAnim* anim) { * @param anim animation to get length for * @return length in seconds */ -static float t3d_anim_get_length(const T3DAnim* anim) { +inline static float t3d_anim_get_length(const T3DAnim* anim) { return anim->animRef->duration; } diff --git a/src/t3d/tpx.c b/src/t3d/tpx.c index fc24598b..8c361a2b 100644 --- a/src/t3d/tpx.c +++ b/src/t3d/tpx.c @@ -12,8 +12,9 @@ extern rsp_ucode_t rsp_tiny3d; DEFINE_RSP_UCODE(rsp_tinypx); uint32_t TPX_RSP_ID = 0; -#define SWAP_U32(a, b) {uint32_t tmp = a; a = b; b = tmp;} -#define MAX_PARTICLES_COLOR 344 +#define SWAP_VALUE(a, b) {auto tmp = a; a = b; b = tmp;} +#define MAX_PARTICLES_S8 344 +#define MAX_PARTICLES_S16 228 static T3DMat4FP *matrixStack = NULL; @@ -76,29 +77,54 @@ void tpx_state_set_tex_params(int16_t offsetX, 
uint16_t mirrorPoint) tpx_dmem_set_u32(RSP_TPX_TEX_OFFSET, val); } -inline static void tpx_particle_draw_generic(TPXParticle *particles, uint32_t count, uint32_t rspCmd) +inline static void tpx_particle_draw_generic_s8(TPXParticleS8 *particles, uint32_t count, uint32_t rspCmd) { assert((count & 1) == 0); - for(uint32_t i = 0; i < count; i += MAX_PARTICLES_COLOR) { + for(uint32_t i = 0; i < count; i += MAX_PARTICLES_S8) { /* one rdpq_write per batch of at most MAX_PARTICLES_S8 particles */ uint32_t batchSize = (count - i); - if(batchSize > MAX_PARTICLES_COLOR)batchSize = MAX_PARTICLES_COLOR; + if(batchSize > MAX_PARTICLES_S8)batchSize = MAX_PARTICLES_S8; - uint32_t loadSize = sizeof(TPXParticle) * batchSize / 2; + uint32_t loadSize = sizeof(TPXParticleS8) * batchSize / 2; /* byte size of this batch; one struct packs two particles */ rdpq_write(-1, TPX_RSP_ID, rspCmd, - loadSize, (uint32_t)UncachedAddr(particles) + loadSize, ((uint32_t)(particles) & 0xFFFFFF) ); - particles += MAX_PARTICLES_COLOR / 2; + particles += MAX_PARTICLES_S8 / 2; /* pointer advances in structs, i.e. particle pairs */ } } -void tpx_particle_draw(TPXParticle *particles, uint32_t count) { - tpx_particle_draw_generic(particles, count, TPX_CMD_DRAW_COLOR); +inline static void tpx_particle_draw_generic_s16(TPXParticleS16 *particles, uint32_t count, uint32_t rspCmd) +{ /* same batching loop as the S8 variant, sized for TPXParticleS16 */ + assert((count & 1) == 0); /* particles are stored in pairs, so the count must be even */ + + for(uint32_t i = 0; i < count; i += MAX_PARTICLES_S16) { + uint32_t batchSize = (count - i); + if(batchSize > MAX_PARTICLES_S16)batchSize = MAX_PARTICLES_S16; /* last batch may be shorter */ + + uint32_t loadSize = sizeof(TPXParticleS16) * batchSize / 2; + rdpq_write(-1, TPX_RSP_ID, rspCmd, /* NOTE(review): bit 31 of the address word presumably marks the S16 layout for the ucode; confirm */ + loadSize, ((uint32_t)(particles) & 0xFFFFFF) | 0x8000'0000 + ); + + particles += MAX_PARTICLES_S16 / 2; + } } -void tpx_particle_draw_tex(TPXParticle *particles, uint32_t count) { - tpx_particle_draw_generic(particles, count, TPX_CMD_DRAW_TEXTURE); +void tpx_particle_draw_s8(TPXParticleS8 *particles, uint32_t count) { + tpx_particle_draw_generic_s8(particles, count, TPX_CMD_DRAW_COLOR); +} + +void tpx_particle_draw_s16(TPXParticleS16 *particles, uint32_t count) { + tpx_particle_draw_generic_s16(particles, count, 
TPX_CMD_DRAW_COLOR); +} + +void tpx_particle_draw_tex_s8(TPXParticleS8 *particles, uint32_t count) { + tpx_particle_draw_generic_s8(particles, count, TPX_CMD_DRAW_TEXTURE); +} + +void tpx_particle_draw_tex_s16(TPXParticleS16 *particles, uint32_t count) { + tpx_particle_draw_generic_s16(particles, count, TPX_CMD_DRAW_TEXTURE); } inline static void tpx_matrix_stack(void *mat, int32_t stackAdvance, bool doMultiply, bool onlyStackMove) { @@ -127,18 +153,29 @@ void tpx_matrix_push_pos(int count) { tpx_matrix_stack(NULL, stackAdvance, false, true); } -void tpx_buffer_swap(TPXParticle pt[], uint32_t idxA, uint32_t idxB) { +void tpx_buffer_s8_swap(TPXParticleS8 pt[], uint32_t idxA, uint32_t idxB) { uint32_t *dataA = (uint32_t*)&pt[idxA/2]; uint32_t *dataB = (uint32_t*)&pt[idxB/2]; dataA += idxA & 1; dataB += idxB & 1; - SWAP_U32(dataA[0], dataB[0]); - SWAP_U32(dataA[2], dataB[2]); + SWAP_VALUE(dataA[0], dataB[0]); /* first 32-bit word of the particle; presumably position + size, confirm against the struct */ + SWAP_VALUE(dataA[2], dataB[2]); /* two words further on: the color word of the same particle */ } -void tpx_buffer_copy(TPXParticle *pt, uint32_t idxDst, uint32_t idxSrc) { +void tpx_buffer_s16_swap(TPXParticleS16 pt[], uint32_t idxA, uint32_t idxB) +{ /* swaps both halves of a particle: the 8-byte pos/size/texOffset group and the 4-byte color */ + auto val0_a = (uint64_t*)tpx_buffer_s16_get_pos(pt, idxA); /* pos[3] + size + texOffset are 8 contiguous bytes in TPXParticleS16 */ + auto val0_b = (uint64_t*)tpx_buffer_s16_get_pos(pt, idxB); + SWAP_VALUE(*val0_a, *val0_b); + + auto val1_a = (uint32_t*)tpx_buffer_s16_get_rgba(pt, idxA); /* RGBA handled as one 32-bit value */ + auto val1_b = (uint32_t*)tpx_buffer_s16_get_rgba(pt, idxB); + SWAP_VALUE(*val1_a, *val1_b); +} + +void tpx_buffer_s8_copy(TPXParticleS8 *pt, uint32_t idxDst, uint32_t idxSrc) { uint32_t *dataDst = (uint32_t*)&pt[idxDst/2]; uint32_t *dataSrc = (uint32_t*)&pt[idxSrc/2]; @@ -149,6 +186,17 @@ void tpx_buffer_copy(TPXParticle *pt, uint32_t idxDst, uint32_t idxSrc) { dataDst[2] = dataSrc[2]; } +void tpx_buffer_s16_copy(TPXParticleS16 pt[], uint32_t idxDst, uint32_t idxSrc) +{ /* copies the 8-byte pos/size/texOffset group, then the 4-byte color */ + auto val0_dst = (uint64_t*)tpx_buffer_s16_get_pos(pt, idxDst); + auto val0_src = (uint64_t*)tpx_buffer_s16_get_pos(pt, idxSrc); + *val0_dst = *val0_src; + + auto val1_dst = 
(uint32_t*)tpx_buffer_s16_get_rgba(pt, idxDst); + auto val1_src = (uint32_t*)tpx_buffer_s16_get_rgba(pt, idxSrc); + *val1_dst = *val1_src; +} + void tpx_destroy() { if(matrixStack) diff --git a/src/t3d/tpx.h b/src/t3d/tpx.h index 1efd95fa..48f063c3 100644 --- a/src/t3d/tpx.h +++ b/src/t3d/tpx.h @@ -49,9 +49,27 @@ typedef struct { int8_t sizeB; uint8_t colorA[4]; uint8_t colorB[4]; -} __attribute__((packed, aligned(16))) TPXParticle; +} __attribute__((packed, aligned(16))) TPXParticleS8; -_Static_assert(sizeof(TPXParticle) == 16, "TPXParticle size mismatch"); +static_assert(sizeof(TPXParticleS8) == 16, "TPXParticleS8 size mismatch"); + +/** + * @deprecated Use 'TPXParticleS8' instead. + */ +[[deprecated("Use 'TPXParticleS8' instead")]] typedef TPXParticleS8 TPXParticle; + +typedef struct { /* one struct holds a pair of particles (A and B) with 16bit positions */ + int16_t posA[3]; + int8_t sizeA; + uint8_t texOffsetA; + int16_t posB[3]; + int8_t sizeB; + uint8_t texOffsetB; + uint8_t colorA[4]; + uint8_t colorB[4]; +} __attribute__((packed, aligned(8))) TPXParticleS16; + +static_assert(sizeof(TPXParticleS16) == 24, "TPXParticleS16 size mismatch"); /** * @brief Initializes the tinyPX library @@ -103,25 +121,65 @@ void tpx_state_set_base_size(uint16_t baseSize); void tpx_state_set_tex_params(int16_t offsetX, uint16_t mirrorPoint); /** - * Draws a given amount of particles. + * Draws a given amount of particles (8bit position precision). * In contrast to triangles in t3d, this works in a single command. * So load, transform and draw happens in one go. * @param particles pointer to the particle data * @param count number of particles to draw */ -void tpx_particle_draw(TPXParticle *particles, uint32_t count); +void tpx_particle_draw_s8(TPXParticleS8 *particles, uint32_t count); + +[[deprecated("Use 'tpx_particle_draw_s8' instead")]] +inline static void tpx_particle_draw(TPXParticleS8 *particles, uint32_t count) { + tpx_particle_draw_s8(particles, count); /* legacy name, forwards unchanged */ +} + +/** + * Draws a given amount of particles (16bit position precision). 
+ * 16bit precision gives you larger range but comes with slightly more memory and runtime cost. + * Whenever possible use the 8bit version instead. + * It is most useful if you need to cover large ranges, e.g. when using it for billboards in a scene. + * + * In contrast to triangles in t3d, this works in a single command. + * So load, transform and draw happens in one go. + * @param particles pointer to the particle data + * @param count number of particles to draw + */ +void tpx_particle_draw_s16(TPXParticleS16 *particles, uint32_t count); /** * Draws a given amount of particles with a texture. * In contrast to triangles in t3d, this works in a single command. * So load, transform and draw happens in one go. + * * Note: this expects that you already setup textures. * It will also always use TILE0 for the rect-commands. + * The alpha channel of the color acts as a texture offset. * * @param particles pointer to the particle data * @param count number of particles to draw */ -void tpx_particle_draw_tex(TPXParticle *particles, uint32_t count); +void tpx_particle_draw_tex_s8(TPXParticleS8 *particles, uint32_t count); + +[[deprecated("Use 'tpx_particle_draw_tex_s8' instead")]] +inline static void tpx_particle_draw_tex(TPXParticleS8 *particles, uint32_t count) { + tpx_particle_draw_tex_s8(particles, count); /* legacy name, forwards unchanged */ +} + +/** + * Draws a given amount of particles with a texture (16bit position precision). + * 16bit precision gives you larger range but comes with slightly more memory and runtime cost. + * Whenever possible use the 8bit version instead. + * It is most useful if you need to cover large ranges, e.g. when using it for billboards in a scene. + * + * Note: this expects that you already setup textures. + * It will also always use TILE0 for the rect-commands. + * A per-particle texture offset can be set in 'texOffsetA'/'texOffsetB'. 
+ * + * @param particles pointer to the particle data + * @param count number of particles to draw (must be even) + */ +void tpx_particle_draw_tex_s16(TPXParticleS16 *particles, uint32_t count); /** * Directly loads a matrix, overwriting the current stack position. @@ -165,44 +223,128 @@ void tpx_matrix_push_pos(int count); * @param vert particle buffer * @param idx particle index */ -static inline int8_t* tpx_buffer_get_pos(TPXParticle pt[], int idx) { +static inline int8_t* tpx_buffer_s8_get_pos(TPXParticleS8 pt[], int idx) { return (idx & 1) ? pt[idx/2].posB : pt[idx/2].posA; } +[[deprecated("Use 'tpx_buffer_s8_get_pos' instead")]] +static inline int8_t* tpx_buffer_get_pos(TPXParticleS8 pt[], int idx) { /* legacy name, forwards unchanged */ + return tpx_buffer_s8_get_pos(pt, idx); +} + /** * Returns the pointer to the size of a particle in a buffer * @param pt particle buffer * @param idx particle index */ -static inline int8_t* tpx_buffer_get_size(TPXParticle pt[], int idx) { +static inline int8_t* tpx_buffer_s8_get_size(TPXParticleS8 pt[], int idx) { return (idx & 1) ? &pt[idx/2].sizeB : &pt[idx/2].sizeA; } +[[deprecated("Use 'tpx_buffer_s8_get_size' instead")]] +static inline int8_t* tpx_buffer_get_size(TPXParticleS8 pt[], int idx) { /* legacy name, forwards unchanged */ + return tpx_buffer_s8_get_size(pt, idx); +} + /** * Returns the pointer to the color (as a u32) of a particle in a buffer * @param pt particle buffer * @param idx particle index */ -static inline uint32_t* tpx_buffer_get_color(TPXParticle pt[], int idx) { +static inline uint32_t* tpx_buffer_s8_get_color(TPXParticleS8 pt[], int idx) { return (idx & 1) ? 
(uint32_t*)&pt[idx/2].colorB : (uint32_t*)&pt[idx/2].colorA; } +[[deprecated("Use 'tpx_buffer_s8_get_color' instead")]] +static inline uint32_t* tpx_buffer_get_color(TPXParticleS8 pt[], int idx) { + return tpx_buffer_s8_get_color(pt, idx); +} + /** * Returns the pointer to the color (as a u8[4]) of a particle in a buffer * @param pt particle buffer * @param idx particle index */ -static inline uint8_t* tpx_buffer_get_rgba(TPXParticle pt[], int idx) { +static inline uint8_t* tpx_buffer_s8_get_rgba(TPXParticleS8 pt[], int idx) { return (idx & 1) ? pt[idx/2].colorB : pt[idx/2].colorA; } +[[deprecated("Use 'tpx_buffer_s8_get_rgba' instead")]] +static inline uint8_t* tpx_buffer_get_rgba(TPXParticleS8 pt[], int idx) { + return tpx_buffer_s8_get_rgba(pt, idx); +} + +/** + * Returns the pointer to the position (as a s16[3]) of a particle in a 16bit buffer + * @param pt particle buffer + * @param idx particle index + */ +static inline int16_t* tpx_buffer_s16_get_pos(TPXParticleS16 pt[], int idx) { + return (idx & 1) ? pt[idx/2].posB : pt[idx/2].posA; +} + +/** + * Returns the pointer to the size of a particle in a 16bit buffer + * @param pt particle buffer + * @param idx particle index + */ +static inline int8_t* tpx_buffer_s16_get_size(TPXParticleS16 pt[], int idx) { + return (idx & 1) ? &pt[idx/2].sizeB : &pt[idx/2].sizeA; +} + +/** + * Returns the pointer to the color (as a u8[4]) of a particle in a 16bit buffer + * @param pt particle buffer + * @param idx particle index + */ +static inline uint8_t* tpx_buffer_s16_get_rgba(TPXParticleS16 pt[], int idx) { + return (idx & 1) ? pt[idx/2].colorB : pt[idx/2].colorA; +} + +/** + * Returns the pointer to the texture offset in the buffer. + * This is only present in the 16bit buffer, in the 8bit version this is stored in the alpha channel. + * @param pt particle buffer + * @param idx particle index +*/ +static inline uint8_t* tpx_buffer_s16_get_tex_offset(TPXParticleS16 pt[], int idx) { + return (idx & 1) ? 
&pt[idx/2].texOffsetB : &pt[idx/2].texOffsetA; /* odd index selects the B particle of the pair, like every other accessor */ +} + +/** + * Swaps two particles in a buffer (8bit variant) + * @param pt buffer to swap particles in + * @param idxA index of the first particle + * @param idxB index of the second particle + */ +void tpx_buffer_s8_swap(TPXParticleS8 pt[], uint32_t idxA, uint32_t idxB); + +[[deprecated("Use 'tpx_buffer_s8_swap' instead")]] +static inline void tpx_buffer_swap(TPXParticleS8 pt[], uint32_t idxA, uint32_t idxB) { + tpx_buffer_s8_swap(pt, idxA, idxB); +} + /** * Swaps two particles in a buffer * @param pt buffer to swap particles in * @param idxA index of the first particle * @param idxB index of the second particle */ -void tpx_buffer_swap(TPXParticle pt[], uint32_t idxA, uint32_t idxB); +void tpx_buffer_s16_swap(TPXParticleS16 pt[], uint32_t idxA, uint32_t idxB); + +/** + * Copies a particle into another place in a buffer (8bit variant) + * This will overwrite the destination particle and keep the source particle unchanged. + * @param pt buffer to copy particles in + * @param idxDst destination index + * @param idxSrc source index + */ +void tpx_buffer_s8_copy(TPXParticleS8 pt[], uint32_t idxDst, uint32_t idxSrc); + +[[deprecated("Use 'tpx_buffer_s8_copy' instead")]] +static inline void tpx_buffer_copy(TPXParticleS8 pt[], uint32_t idxDst, uint32_t idxSrc) { + tpx_buffer_s8_copy(pt, idxDst, idxSrc); +} /** * Copies a particle into another place in a buffer @@ -211,7 +353,7 @@ void tpx_buffer_swap(TPXParticle pt[], uint32_t idxA, uint32_t idxB); * @param idxDst destination index * @param idxSrc source index */ -void tpx_buffer_copy(TPXParticle pt[], uint32_t idxDst, uint32_t idxSrc); +void tpx_buffer_s16_copy(TPXParticleS16 pt[], uint32_t idxDst, uint32_t idxSrc); /** * Destroys the tinyPX library and frees all resources