From ab89486e96819f741f220677023f464e1459bcab Mon Sep 17 00:00:00 2001 From: Paul Gofman Date: Fri, 12 Feb 2021 20:29:14 +0300 Subject: [PATCH] vcomp: Implement C2VectParallel(). Used by Ancient Cities. Signed-off-by: Paul Gofman Signed-off-by: Alexandre Julliard --- dlls/vcomp/main.c | 158 ++++++++++++++++++++++++++++++++++++ dlls/vcomp110/vcomp110.spec | 2 +- dlls/vcomp120/vcomp120.spec | 2 +- dlls/vcomp140/vcomp140.spec | 2 +- 4 files changed, 161 insertions(+), 3 deletions(-) diff --git a/dlls/vcomp/main.c b/dlls/vcomp/main.c index 11dfb78030b..fa0b84300f1 100644 --- a/dlls/vcomp/main.c +++ b/dlls/vcomp/main.c @@ -33,6 +33,8 @@ WINE_DEFAULT_DEBUG_CHANNEL(vcomp); +#define MAX_VECT_PARALLEL_CALLBACK_ARGS 128 + typedef CRITICAL_SECTION *omp_lock_t; typedef CRITICAL_SECTION *omp_nest_lock_t; @@ -122,6 +124,14 @@ static void **ptr_from_va_list(__ms_va_list valist) return *(void ***)&valist; } +static void copy_va_list_data(void **args, __ms_va_list valist, int args_count) +{ + unsigned int i; + + for (i = 0; i < args_count; ++i) + args[i] = va_arg(valist, void *); +} + #if defined(__i386__) extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args); @@ -1665,6 +1675,154 @@ void CDECL _vcomp_leave_critsect(CRITICAL_SECTION *critsect) LeaveCriticalSection(critsect); } +static unsigned int get_step_count(int start, int end, int range_offset, int step) +{ + int range = end - start + step - range_offset; + + if (step < 0) + return (unsigned)-range / -step; + else + return (unsigned)range / step; +} + +static void CDECL c2vectparallel_wrapper(int start, int end, int step, int end_included, BOOL dynamic_distribution, + int volatile *dynamic_start, void *function, int nargs, __ms_va_list valist) +{ + void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS]; + unsigned int step_count, steps_per_call, remainder; + int thread_count = omp_get_num_threads(); + int curr_start, curr_end, range_offset; + int thread = _vcomp_get_thread_num(); + int step_sign; + + copy_va_list_data(&wrapper_args[2], valist, nargs - 2); + + step_sign = step > 0 ? 1 : -1; + range_offset = step_sign * !end_included; + + if (dynamic_distribution) + { + int next_start, new_start, end_value; + + start = *dynamic_start; + end_value = end + !!end_included * step; + while (start != end_value) + { + step_count = get_step_count(start, end, range_offset, step); + + curr_end = start + (step_count + thread_count - 1) / thread_count * step + + range_offset; + + if ((curr_end - end) * step_sign > 0) + { + next_start = end_value; + curr_end = end; + } + else + { + next_start = curr_end - range_offset; + curr_end -= step; + } + + if ((new_start = InterlockedCompareExchange(dynamic_start, next_start, start)) != start) + { + start = new_start; + continue; + } + + wrapper_args[0] = (void *)(ULONG_PTR)start; + wrapper_args[1] = (void *)(ULONG_PTR)curr_end; + _vcomp_fork_call_wrapper(function, nargs, wrapper_args); + start = *dynamic_start; + } + return; + } + + step_count = get_step_count(start, end, range_offset, step); + + /* According to the tests native vcomp still makes extra calls + * with empty range from excessive threads under certain conditions + * for unclear reason. */ + if (thread >= step_count && (end_included || (step != 1 && step != -1))) + return; + + steps_per_call = step_count / thread_count; + remainder = step_count % thread_count; + + if (thread < remainder) + { + curr_start = thread * (steps_per_call + 1); + curr_end = curr_start + steps_per_call + 1; + } + else if (thread < step_count) + { + curr_start = remainder + steps_per_call * thread; + curr_end = curr_start + steps_per_call; + } + else + { + curr_start = curr_end = 0; + } + + curr_start = start + curr_start * step; + curr_end = start + (curr_end - 1) * step + range_offset; + + wrapper_args[0] = (void *)(ULONG_PTR)curr_start; + wrapper_args[1] = (void *)(ULONG_PTR)curr_end; + _vcomp_fork_call_wrapper(function, nargs, wrapper_args); +} + +void CDECL C2VectParallel(int start, int end, int step, BOOL end_included, int thread_count, + BOOL dynamic_distribution, void *function, int nargs, ...) +{ + struct vcomp_thread_data *thread_data; + int volatile dynamic_start; + int prev_thread_count; + __ms_va_list valist; + + TRACE("start %d, end %d, step %d, end_included %d, thread_count %d, dynamic_distribution %#x," + " function %p, nargs %d.\n", start, end, step, end_included, thread_count, + dynamic_distribution, function, nargs); + + if (nargs > MAX_VECT_PARALLEL_CALLBACK_ARGS) + { + FIXME("Number of arguments %u exceeds supported maximum %u" + " (not calling the loop code, expect problems).\n", + nargs, MAX_VECT_PARALLEL_CALLBACK_ARGS); + return; + } + + __ms_va_start(valist, nargs); + + /* This expression can result in integer overflow. According to the tests, + * native vcomp runs the function as a single thread both for empty range + * and (end - start) not fitting the integer range. */ + if ((step > 0 && end < start) || (step < 0 && end > start) + || (end - start) / step < 2 || thread_count < 0) + { + void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS]; + + wrapper_args[0] = (void *)(ULONG_PTR)start; + wrapper_args[1] = (void *)(ULONG_PTR)end; + copy_va_list_data(&wrapper_args[2], valist, nargs - 2); + _vcomp_fork_call_wrapper(function, nargs, wrapper_args); + __ms_va_end(valist); + return; + } + + thread_data = vcomp_init_thread_data(); + prev_thread_count = thread_data->fork_threads; + thread_data->fork_threads = thread_count; + + dynamic_start = start; + + _vcomp_fork(TRUE, 9, c2vectparallel_wrapper, start, end, step, end_included, dynamic_distribution, + &dynamic_start, function, nargs, valist); + + thread_data->fork_threads = prev_thread_count; + __ms_va_end(valist); +} + BOOL WINAPI DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved) { TRACE("(%p, %d, %p)\n", instance, reason, reserved); diff --git a/dlls/vcomp110/vcomp110.spec b/dlls/vcomp110/vcomp110.spec index 22a94823c5f..fb4d626c07c 100644 --- a/dlls/vcomp110/vcomp110.spec +++ b/dlls/vcomp110/vcomp110.spec @@ -1,4 +1,4 @@ -@ stub C2VectParallel +@ varargs C2VectParallel(long long long long long long ptr long) @ cdecl _vcomp_atomic_add_i1(ptr long) @ cdecl _vcomp_atomic_add_i2(ptr long) @ cdecl _vcomp_atomic_add_i4(ptr long) diff --git a/dlls/vcomp120/vcomp120.spec b/dlls/vcomp120/vcomp120.spec index 22a94823c5f..fb4d626c07c 100644 --- a/dlls/vcomp120/vcomp120.spec +++ b/dlls/vcomp120/vcomp120.spec @@ -1,4 +1,4 @@ -@ stub C2VectParallel +@ varargs C2VectParallel(long long long long long long ptr long) @ cdecl _vcomp_atomic_add_i1(ptr long) @ cdecl _vcomp_atomic_add_i2(ptr long) @ cdecl _vcomp_atomic_add_i4(ptr long) diff --git a/dlls/vcomp140/vcomp140.spec b/dlls/vcomp140/vcomp140.spec index 22a94823c5f..fb4d626c07c 100644 --- a/dlls/vcomp140/vcomp140.spec +++ b/dlls/vcomp140/vcomp140.spec @@ -1,4 +1,4 @@ -@ stub C2VectParallel +@ varargs C2VectParallel(long long long long long long ptr long) @ cdecl _vcomp_atomic_add_i1(ptr long) @ cdecl _vcomp_atomic_add_i2(ptr long) @ cdecl _vcomp_atomic_add_i4(ptr long)