From 64926223504cf03b5872a8545154bb6b32ae6473 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20D=C3=B6singer?= Date: Tue, 12 May 2009 20:11:50 +0200 Subject: [PATCH] wined3d: sincos for vertex shaders. SCS is unfortunately a fragment program only instruction. If we have the NV extensions we can use SIN and COS. Otherwise we have to approximate sine and cosine with a taylor series. Luckily we're provided with the necessary constants by the application. --- dlls/d3d9/tests/visual.c | 79 +++++++++++++++++++++++++ dlls/wined3d/arb_program_shader.c | 95 +++++++++++++++++++++++++++++-- 2 files changed, 169 insertions(+), 5 deletions(-) diff --git a/dlls/d3d9/tests/visual.c b/dlls/d3d9/tests/visual.c index b6f2147987e..07a3531d3df 100644 --- a/dlls/d3d9/tests/visual.c +++ b/dlls/d3d9/tests/visual.c @@ -10111,6 +10111,84 @@ static void alphatest_test(IDirect3DDevice9 *device) { ok(hr == D3D_OK, "IDirect3DDevice9_SetPixelShader failed with 0x%08x\n", hr); } +/* Plots 0.85 * sin(pi * x) in green and 0.85 * cos(pi * x) in red as point lists via the vs_2_0 sincos instruction. */ +static void sincos_test(IDirect3DDevice9 *device) { + const DWORD sin_shader_code[] = { + 0xfffe0200, /* vs_2_0 */ + 0x0200001f, 0x80000000, 0x900f0000, /* dcl_position v0 */ + 0x05000051, 0xa00f0002, 0x40490fdb, 0x3f800000, 0x00000000, 0x3f59999a, /* def c2, 3.14159, 1, 0, 0.85 */ + 0x03000005, 0x80010001, 0x90000000, 0xa0000002, /* mul r1.x, v0.x, c2.x */ + 0x04000025, 0x80020000, 0x80000001, 0xa0e40000, 0xa0e40001, /* sincos r0.y, r1.x, c0, c1 */ + 0x02000001, 0xc00d0000, 0x90e40000, /* mov oPos.xzw, v0 */ + 0x03000005, 0xc0020000, 0x80550000, 0xa0ff0002, /* mul oPos.y, r0.y, c2.w */ + 0x02000001, 0xd00f0000, 0xa0a60002, /* mov oD0, c2.zyzz */ + 0x0000ffff /* end */ + }; + const DWORD cos_shader_code[] = { + 0xfffe0200, /* vs_2_0 */ + 0x0200001f, 0x80000000, 0x900f0000, /* dcl_position v0 */ + 0x05000051, 0xa00f0002, 0x40490fdb, 0x3f800000, 0x00000000, 0x3f59999a, /* def c2, 3.14159, 1, 0, 0.85 */ + 0x03000005, 0x80010001, 0x90000000, 0xa0000002, /* mul r1.x, v0.x, c2.x */ + 0x04000025, 0x80010000, 0x80000001, 
0xa0e40000, 0xa0e40001, /* sincos r0.x, r1.x, c0, c1 */ + 0x02000001, 0xc00d0000, 0x90e40000, /* mov oPos.xzw, v0 */ + 0x03000005, 0xc0020000, 0x80000000, 0xa0ff0002, /* mul oPos.y, r0.x, c2.w */ + 0x02000001, 0xd00f0000, 0xa0a90002, /* mov oD0, c2.yzzz */ + 0x0000ffff /* end */ + }; + IDirect3DVertexShader9 *sin_shader, *cos_shader; + HRESULT hr; + struct { + float x, y, z; + } data[1280]; + unsigned int i; + float sincosc1[4] = {D3DSINCOSCONST1}; + float sincosc2[4] = {D3DSINCOSCONST2}; + + hr = IDirect3DDevice9_Clear(device, 0, NULL, D3DCLEAR_TARGET | D3DCLEAR_ZBUFFER, 0x00000000, 1.0f, 0); + ok(hr == D3D_OK, "IDirect3DDevice9_Clear failed with 0x%08x\n", hr); + + hr = IDirect3DDevice9_CreateVertexShader(device, sin_shader_code, &sin_shader); + ok(hr == D3D_OK, "IDirect3DDevice9_CreateVertexShader failed with 0x%08x\n", hr); + hr = IDirect3DDevice9_CreateVertexShader(device, cos_shader_code, &cos_shader); + ok(hr == D3D_OK, "IDirect3DDevice9_CreateVertexShader failed with 0x%08x\n", hr); + hr = IDirect3DDevice9_SetFVF(device, D3DFVF_XYZ); + ok(hr == D3D_OK, "IDirect3DDevice9_SetFVF failed with 0x%08x\n", hr); + hr = IDirect3DDevice9_SetVertexShaderConstantF(device, 0, sincosc1, 1); + ok(hr == D3D_OK, "IDirect3DDevice9_SetVertexShaderConstantF failed with 0x%08x\n", hr); + hr = IDirect3DDevice9_SetVertexShaderConstantF(device, 1, sincosc2, 1); + ok(hr == D3D_OK, "IDirect3DDevice9_SetVertexShaderConstantF failed with 0x%08x\n", hr); + + /* Generate a point from -1 to 1 every 0.5 pixels */ + for(i = 0; i < 1280; i++) { + data[i].x = (-640.0 + i) / 640.0; + data[i].y = 0.0; + data[i].z = 0.1; + } + + hr = IDirect3DDevice9_BeginScene(device); + if(SUCCEEDED(hr)) { + hr = IDirect3DDevice9_SetVertexShader(device, sin_shader); + ok(hr == D3D_OK, "IDirect3DDevice9_SetVertexShader failed with 0x%08x\n", hr); + hr = IDirect3DDevice9_DrawPrimitiveUP(device, D3DPT_POINTLIST, 1280, data, sizeof(*data)); + ok(hr == D3D_OK, "IDirect3DDevice9_DrawPrimitiveUP failed with 0x%08x\n", hr); + + hr = 
IDirect3DDevice9_SetVertexShader(device, cos_shader); + ok(hr == D3D_OK, "IDirect3DDevice9_SetVertexShader failed with 0x%08x\n", hr); + hr = IDirect3DDevice9_DrawPrimitiveUP(device, D3DPT_POINTLIST, 1280, data, sizeof(*data)); + ok(hr == D3D_OK, "IDirect3DDevice9_DrawPrimitiveUP failed with 0x%08x\n", hr); + + hr = IDirect3DDevice9_EndScene(device); + ok(hr == D3D_OK, "IDirect3DDevice9_EndScene failed with 0x%08x\n", hr); + } + hr = IDirect3DDevice9_Present(device, NULL, NULL, NULL, NULL); + ok(hr == D3D_OK, "IDirect3DDevice9_Present failed with 0x%08x\n", hr); + /* TODO: Find a way to properly validate the lines. Precision issues make this a kinda nasty task */ + + IDirect3DDevice9_SetVertexShader(device, NULL); + IDirect3DVertexShader9_Release(sin_shader); + IDirect3DVertexShader9_Release(cos_shader); +} + START_TEST(visual) { IDirect3DDevice9 *device_ptr; @@ -10220,6 +10298,7 @@ START_TEST(visual) if (caps.VertexShaderVersion >= D3DVS_VERSION(2, 0)) { test_mova(device_ptr); + sincos_test(device_ptr); if (caps.VertexShaderVersion >= D3DVS_VERSION(3, 0)) { test_vshader_input(device_ptr); test_vshader_float16(device_ptr); diff --git a/dlls/wined3d/arb_program_shader.c b/dlls/wined3d/arb_program_shader.c index 1a10355f6e4..4ace13b5a86 100644 --- a/dlls/wined3d/arb_program_shader.c +++ b/dlls/wined3d/arb_program_shader.c @@ -1725,13 +1725,98 @@ static void shader_hw_sincos(const struct wined3d_shader_instruction *ins) * can't use map2gl */ SHADER_BUFFER *buffer = ins->ctx->buffer; + struct shader_arb_ctx_priv *priv = ins->ctx->backend_data; + const struct wined3d_shader_dst_param *dst = &ins->dst[0]; char dst_name[50]; - char src_name[50]; + char src_name0[50], src_name1[50], src_name2[50]; + BOOL is_color; - shader_arb_get_dst_param(ins, &ins->dst[0], dst_name); - shader_arb_get_src_param(ins, &ins->src[0], 0, src_name); - shader_addline(buffer, "SCS%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, - src_name); + shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0); + 
if(shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)) { + shader_arb_get_dst_param(ins, &ins->dst[0], dst_name); + shader_addline(buffer, "SCS%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, + src_name0); + } else if(priv->target_version >= NV2) { + shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color); + + /* Sincos writemask must be .x, .y or .xy; the cosine is written to .x, the sine to .y */ + if(dst->write_mask & WINED3DSP_WRITEMASK_0) + shader_addline(buffer, "COS%s %s.x, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0); + if(dst->write_mask & WINED3DSP_WRITEMASK_1) + shader_addline(buffer, "SIN%s %s.y, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0); + } else { + /* Approximate sine and cosine with a Taylor series, as per math textbook. The application passes 8 + * helper constants (D3DSINCOSCONST1 and D3DSINCOSCONST2) in src1 and src2. + * + * sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ... + * cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + ... + * + * The constants we get are: + * + * +1 +1, -1 -1 +1 +1 -1 -1 + * ---- , ---- , ---- , ----- , ----- , ----- , ------ + * 1!*2 2!*4 3!*8 4!*16 5!*32 6!*64 7!*128 + * + * If used with x^2, x^3, x^4 etc they calculate sin(x/2) and cos(x/2): + * + * (x/2)^2 = x^2 / 4 + * (x/2)^3 = x^3 / 8 + * (x/2)^4 = x^4 / 16 + * (x/2)^5 = x^5 / 32 + * etc + * + * To get the final result: + * sin(x) = 2 * sin(x/2) * cos(x/2) + * cos(x) = cos(x/2)^2 - sin(x/2)^2 + * (from sin(x+y) and cos(x+y) rules) + * + * As per MSDN, dst.z is undefined after the operation, and so is + * dst.x and dst.y if they're masked out by the writemask. I.e. + * sincos dst.y, src1, c0, c1 + * returns the sine in dst.y. dst.x and dst.z are undefined, dst.w is not touched. The assembler + * vsa.exe also stops with an error if the dest register is the same register as the source + * register. This means we can use dest.xyz as temporary storage. 
The assembler vsa.exe output also + * indicates that sincos consumes 8 instruction slots in vs_2_0 (and, strangely, in vs_3_0). + */ + shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1); + shader_arb_get_src_param(ins, &ins->src[2], 2, src_name2); + shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color); + + shader_addline(buffer, "MUL %s.x, %s, %s;\n", dst_name, src_name0, src_name0); /* x ^ 2 */ + shader_addline(buffer, "MUL TA.y, %s.x, %s;\n", dst_name, src_name0); /* x ^ 3 */ + shader_addline(buffer, "MUL %s.y, TA.y, %s;\n", dst_name, src_name0); /* x ^ 4 */ + shader_addline(buffer, "MUL TA.z, %s.y, %s;\n", dst_name, src_name0); /* x ^ 5 */ + shader_addline(buffer, "MUL %s.z, TA.z, %s;\n", dst_name, src_name0); /* x ^ 6 */ + shader_addline(buffer, "MUL TA.w, %s.z, %s;\n", dst_name, src_name0); /* x ^ 7 */ + + /* sin(x/2) + * + * Unfortunately we don't get the constants in a DP4-capable form. Is there a way to + * properly merge that with MULs in the code above? + * The swizzles .yz and .xw however fit into the .yzxw swizzle added to ps_2_0. Maybe + * we can merge the sine and cosine MAD rows to calculate them together. 
+ */ + shader_addline(buffer, "MUL TA.x, %s, %s.w;\n", src_name0, src_name2); /* x^1, +1/(1!*2) */ + shader_addline(buffer, "MAD TA.x, TA.y, %s.x, TA.x;\n", src_name2); /* -1/(3!*8) */ + shader_addline(buffer, "MAD TA.x, TA.z, %s.w, TA.x;\n", src_name1); /* +1/(5!*32) */ + shader_addline(buffer, "MAD TA.x, TA.w, %s.x, TA.x;\n", src_name1); /* -1/(7!*128) */ + + /* cos(x/2) */ + shader_addline(buffer, "MAD TA.y, %s.x, %s.y, %s.z;\n", dst_name, src_name2, src_name2); /* -1/(2!*4), +1.0 */ + shader_addline(buffer, "MAD TA.y, %s.y, %s.z, TA.y;\n", dst_name, src_name1); /* +1/(4!*16) */ + shader_addline(buffer, "MAD TA.y, %s.z, %s.y, TA.y;\n", dst_name, src_name1); /* -1/(6!*64) */ + + if(dst->write_mask & WINED3DSP_WRITEMASK_0) { + /* cos(x) = cos(x/2)^2 - sin(x/2)^2 */ + shader_addline(buffer, "MUL TA.z, TA.y, TA.y;\n"); + shader_addline(buffer, "MAD %s.x, -TA.x, TA.x, TA.z;\n", dst_name); + } + if(dst->write_mask & WINED3DSP_WRITEMASK_1) { + /* sin(x) = 2 * sin(x/2) * cos(x/2) */ + shader_addline(buffer, "MUL %s.y, TA.x, TA.y;\n", dst_name); + shader_addline(buffer, "ADD %s.y, %s.y, %s.y;\n", dst_name, dst_name, dst_name); + } + } } /* GL locking is done by the caller */