From 64926223504cf03b5872a8545154bb6b32ae6473 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20D=C3=B6singer?= <stefan@codeweavers.com>
Date: Tue, 12 May 2009 20:11:50 +0200
Subject: [PATCH] wined3d: sincos for vertex shaders.

SCS is unfortunately a fragment-program-only instruction. If we have the NV
extensions we can use SIN and COS. Otherwise we have to approximate sine and
cosine with a Taylor series. Luckily we're provided with the necessary
constants by the application.
---
 dlls/d3d9/tests/visual.c          | 77 +++++++++++++++++++++++++
 dlls/wined3d/arb_program_shader.c | 95 +++++++++++++++++++++++++++++--
 2 files changed, 167 insertions(+), 5 deletions(-)

diff --git a/dlls/d3d9/tests/visual.c b/dlls/d3d9/tests/visual.c
index b6f2147987e..07a3531d3df 100644
--- a/dlls/d3d9/tests/visual.c
+++ b/dlls/d3d9/tests/visual.c
@@ -10111,6 +10111,82 @@ static void alphatest_test(IDirect3DDevice9 *device) {
     ok(hr == D3D_OK, "IDirect3DDevice9_SetPixelShader failed with 0x%08x\n", hr);
 }
 
+static void sincos_test(IDirect3DDevice9 *device) { /* Draws sin and cos curves with the vs_2_0 sincos instruction */
+    const DWORD sin_shader_code[] = {
+        0xfffe0200,                                                                 /* vs_2_0                       */
+        0x0200001f, 0x80000000, 0x900f0000,                                         /* dcl_position v0              */
+        0x05000051, 0xa00f0002, 0x40490fdb, 0x3f800000, 0x00000000, 0x3f59999a,     /* def c2, 3.14159, 1, 0, 0.85  */
+        0x03000005, 0x80010001, 0x90000000, 0xa0000002,                             /* mul r1.x, v0.x, c2.x         */
+        0x04000025, 0x80020000, 0x80000001, 0xa0e40000, 0xa0e40001,                 /* sincos r0.y, r1.x, c0, c1    */
+        0x02000001, 0xc00d0000, 0x90e40000,                                         /* mov oPos.xzw, v0             */
+        0x03000005, 0xc0020000, 0x80550000, 0xa0ff0002,                             /* mul oPos.y, r0.y, c2.w       */
+        0x02000001, 0xd00f0000, 0xa0a60002,                                         /* mov oD0, c2.zyzz             */
+        0x0000ffff                                                                  /* end                          */
+    };
+    const DWORD cos_shader_code[] = {
+        0xfffe0200,                                                                 /* vs_2_0                       */
+        0x0200001f, 0x80000000, 0x900f0000,                                         /* dcl_position v0              */
+        0x05000051, 0xa00f0002, 0x40490fdb, 0x3f800000, 0x00000000, 0x3f59999a,     /* def c2, 3.14159, 1, 0, 0.85  */
+        0x03000005, 0x80010001, 0x90000000, 0xa0000002,                             /* mul r1.x, v0.x, c2.x         */
+        0x04000025, 0x80010000, 0x80000001, 0xa0e40000, 0xa0e40001,                 /* sincos r0.x, r1.x, c0, c1    */
+        0x02000001, 0xc00d0000, 0x90e40000,                                         /* mov oPos.xzw, v0             */
+        0x03000005, 0xc0020000, 0x80000000, 0xa0ff0002,                             /* mul oPos.y, r0.x, c2.w       */
+        0x02000001, 0xd00f0000, 0xa0a90002,                                         /* mov oD0, c2.yzzz             */
+        0x0000ffff                                                                  /* end                          */
+    };
+    IDirect3DVertexShader9 *sin_shader, *cos_shader;
+    HRESULT hr;
+    struct {
+        float x, y, z;
+    } data[1280];
+    unsigned int i;
+    float sincosc1[4] = {D3DSINCOSCONST1}; /* sincos helper constants, uploaded to c0 below */
+    float sincosc2[4] = {D3DSINCOSCONST2}; /* sincos helper constants, uploaded to c1 below */
+
+    hr = IDirect3DDevice9_Clear(device, 0, NULL, D3DCLEAR_TARGET | D3DCLEAR_ZBUFFER, 0x00000000, 1.0f, 0);
+    ok(hr == D3D_OK, "IDirect3DDevice9_Clear failed with 0x%08x\n", hr);
+
+    hr = IDirect3DDevice9_CreateVertexShader(device, sin_shader_code, &sin_shader);
+    ok(hr == D3D_OK, "IDirect3DDevice9_CreateVertexShader failed with 0x%08x\n", hr);
+    hr = IDirect3DDevice9_CreateVertexShader(device, cos_shader_code, &cos_shader);
+    ok(hr == D3D_OK, "IDirect3DDevice9_CreateVertexShader failed with 0x%08x\n", hr);
+    hr = IDirect3DDevice9_SetFVF(device, D3DFVF_XYZ);
+    ok(hr == D3D_OK, "IDirect3DDevice9_SetFVF failed with 0x%08x\n", hr);
+    hr = IDirect3DDevice9_SetVertexShaderConstantF(device, 0, sincosc1, 1);
+    ok(hr == D3D_OK, "IDirect3DDevice9_SetVertexShaderConstantF failed with 0x%08x\n", hr);
+    hr = IDirect3DDevice9_SetVertexShaderConstantF(device, 1, sincosc2, 1);
+    ok(hr == D3D_OK, "IDirect3DDevice9_SetVertexShaderConstantF failed with 0x%08x\n", hr);
+
+    /* Generate a point from -1 to 1 every 0.5 pixels */
+    for(i = 0; i < 1280; i++) {
+        data[i].x = (-640.0 + i) / 640.0;
+        data[i].y = 0.0; /* the shaders overwrite oPos.y with 0.85 * sin / cos of pi * x */
+        data[i].z = 0.1;
+    }
+
+    hr = IDirect3DDevice9_BeginScene(device);
+    if(SUCCEEDED(hr)) {
+        hr = IDirect3DDevice9_SetVertexShader(device, sin_shader);
+        ok(hr == D3D_OK, "IDirect3DDevice9_SetVertexShader failed with 0x%08x\n", hr);
+        hr = IDirect3DDevice9_DrawPrimitiveUP(device, D3DPT_POINTLIST, 1280, data, sizeof(*data));
+        ok(hr == D3D_OK, "IDirect3DDevice9_DrawPrimitiveUP failed with 0x%08x\n", hr);
+
+        hr = IDirect3DDevice9_SetVertexShader(device, cos_shader);
+        ok(hr == D3D_OK, "IDirect3DDevice9_SetVertexShader failed with 0x%08x\n", hr);
+        hr = IDirect3DDevice9_DrawPrimitiveUP(device, D3DPT_POINTLIST, 1280, data, sizeof(*data));
+        ok(hr == D3D_OK, "IDirect3DDevice9_DrawPrimitiveUP failed with 0x%08x\n", hr);
+
+        hr = IDirect3DDevice9_EndScene(device);
+        ok(hr == D3D_OK, "IDirect3DDevice9_EndScene failed with 0x%08x\n", hr);
+    }
+    hr = IDirect3DDevice9_Present(device, NULL, NULL, NULL, NULL);
+    ok(hr == D3D_OK, "IDirect3DDevice9_Present failed with 0x%08x\n", hr);
+    /* TODO: Find a way to properly validate the lines. Precision issues make this a kinda nasty task */
+    IDirect3DDevice9_SetVertexShader(device, NULL);
+    IDirect3DVertexShader9_Release(sin_shader);
+    IDirect3DVertexShader9_Release(cos_shader);
+}
+
 START_TEST(visual)
 {
     IDirect3DDevice9 *device_ptr;
@@ -10220,6 +10296,7 @@ START_TEST(visual)
     if (caps.VertexShaderVersion >= D3DVS_VERSION(2, 0))
     {
         test_mova(device_ptr);
+        sincos_test(device_ptr);
         if (caps.VertexShaderVersion >= D3DVS_VERSION(3, 0)) {
             test_vshader_input(device_ptr);
             test_vshader_float16(device_ptr);
diff --git a/dlls/wined3d/arb_program_shader.c b/dlls/wined3d/arb_program_shader.c
index 1a10355f6e4..4ace13b5a86 100644
--- a/dlls/wined3d/arb_program_shader.c
+++ b/dlls/wined3d/arb_program_shader.c
@@ -1725,13 +1725,98 @@ static void shader_hw_sincos(const struct wined3d_shader_instruction *ins)
      * can't use map2gl
      */
     SHADER_BUFFER *buffer = ins->ctx->buffer;
+    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
+    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
     char dst_name[50];
-    char src_name[50];
+    char src_name0[50], src_name1[50], src_name2[50];
+    BOOL is_color;
 
-    shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
-    shader_arb_get_src_param(ins, &ins->src[0], 0, src_name);
-    shader_addline(buffer, "SCS%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name,
-                   src_name);
+    shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
+    if(shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)) {
+        shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
+        shader_addline(buffer, "SCS%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name,
+                       src_name0);
+    } else if(priv->target_version >= NV2) {
+        shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);
+
+        /* Sincos writemask must be .x, .y or .xy */
+        if(dst->write_mask & WINED3DSP_WRITEMASK_0)
+            shader_addline(buffer, "COS%s %s.x, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
+        if(dst->write_mask & WINED3DSP_WRITEMASK_1)
+            shader_addline(buffer, "SIN%s %s.y, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
+    } else {
+        /* Approximate sine and cosine with a Taylor series, as per math textbook. The application passes 8
+         * helper constants (D3DSINCOSCONST1 and D3DSINCOSCONST2) in src1 and src2.
+         *
+         * sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ...
+         * cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + ...
+         *
+         * The constants we get are:
+         *
+         *  +1,  +1      -1     -1     +1      +1      -1       -1
+         *      ---- ,  ---- , ---- , ----- , ----- , ----- , ------
+         *      1!*2    2!*4   3!*8   4!*16   5!*32   6!*64   7!*128
+         *
+         * If used with x^2, x^3, x^4 etc they calculate sin(x/2) and cos(x/2):
+         *
+         * (x/2)^2 = x^2 / 4
+         * (x/2)^3 = x^3 / 8
+         * (x/2)^4 = x^4 / 16
+         * (x/2)^5 = x^5 / 32
+         * etc
+         *
+         * To get the final result:
+         * sin(x) = 2 * sin(x/2) * cos(x/2)
+         * cos(x) = cos(x/2)^2 - sin(x/2)^2
+         * (from sin(x+y) and cos(x+y) rules)
+         *
+         * As per MSDN, dst.z is undefined after the operation, and so is
+         * dst.x and dst.y if they're masked out by the writemask. Ie
+         * sincos dst.y, src1, c0, c1
+         * returns the sine in dst.y. dst.x and dst.z are undefined, dst.w is not touched. The assembler
+         * vsa.exe also stops with an error if the dest register is the same register as the source
+         * register. This means we can use dest.xyz as temporary storage. The assembler vsa.exe output also
+         * indicates that sincos consumes 8 instruction slots in vs_2_0(and, strangely, in vs_3_0).
+         */
+        shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
+        shader_arb_get_src_param(ins, &ins->src[2], 2, src_name2);
+        shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);
+
+        shader_addline(buffer, "MUL %s.x, %s, %s;\n", dst_name, src_name0, src_name0);  /* x ^ 2 */
+        shader_addline(buffer, "MUL TA.y, %s.x, %s;\n", dst_name, src_name0);           /* x ^ 3 */
+        shader_addline(buffer, "MUL %s.y, TA.y, %s;\n", dst_name, src_name0);           /* x ^ 4 */
+        shader_addline(buffer, "MUL TA.z, %s.y, %s;\n", dst_name, src_name0);           /* x ^ 5 */
+        shader_addline(buffer, "MUL %s.z, TA.z, %s;\n", dst_name, src_name0);           /* x ^ 6 */
+        shader_addline(buffer, "MUL TA.w, %s.z, %s;\n", dst_name, src_name0);           /* x ^ 7 */
+
+        /* sin(x/2)
+         *
+         * Unfortunately we don't get the constants in a DP4-capable form. Is there a way to
+         * properly merge that with MULs in the code above?
+         * The swizzles .yz and xw however fit into the .yzxw swizzle added to ps_2_0. Maybe
+         * we can merge the sine and cosine MAD rows to calculate them together.
+         */
+        shader_addline(buffer, "MUL TA.x, %s, %s.w;\n", src_name0, src_name2); /* x^1, +1/(1!*2) */
+        shader_addline(buffer, "MAD TA.x, TA.y, %s.x, TA.x;\n", src_name2); /* -1/(3!*8) */
+        shader_addline(buffer, "MAD TA.x, TA.z, %s.w, TA.x;\n", src_name1); /* +1/(5!*32) */
+        shader_addline(buffer, "MAD TA.x, TA.w, %s.x, TA.x;\n", src_name1); /* -1/(7!*128) */
+
+        /* cos(x/2) */
+        shader_addline(buffer, "MAD TA.y, %s.x, %s.y, %s.z;\n", dst_name, src_name2, src_name2); /* -1/(2!*4), +1.0 */
+        shader_addline(buffer, "MAD TA.y, %s.y, %s.z, TA.y;\n", dst_name, src_name1); /* +1/(4!*16) */
+        shader_addline(buffer, "MAD TA.y, %s.z, %s.y, TA.y;\n", dst_name, src_name1); /* -1/(6!*64) */
+
+        if(dst->write_mask & WINED3DSP_WRITEMASK_0) {
+            /* cos x */
+            shader_addline(buffer, "MUL TA.z, TA.y, TA.y;\n");
+            shader_addline(buffer, "MAD %s.x, -TA.x, TA.x, TA.z;\n", dst_name);
+        }
+        if(dst->write_mask & WINED3DSP_WRITEMASK_1) {
+            /* sin x */
+            shader_addline(buffer, "MUL %s.y, TA.x, TA.y;\n", dst_name);
+            shader_addline(buffer, "ADD %s.y, %s.y, %s.y;\n", dst_name, dst_name, dst_name);
+        }
+    }
 }
 
 /* GL locking is done by the caller */