vcomp: Implement C2VectParallel().

Used by Ancient Cities.

Signed-off-by: Paul Gofman <pgofman@codeweavers.com>
Signed-off-by: Alexandre Julliard <julliard@winehq.org>
This commit is contained in:
Paul Gofman 2021-02-12 20:29:14 +03:00 committed by Alexandre Julliard
parent 5366f67ef0
commit ab89486e96
4 changed files with 161 additions and 3 deletions

View File

@ -33,6 +33,8 @@
WINE_DEFAULT_DEBUG_CHANNEL(vcomp);
#define MAX_VECT_PARALLEL_CALLBACK_ARGS 128
typedef CRITICAL_SECTION *omp_lock_t;
typedef CRITICAL_SECTION *omp_nest_lock_t;
@ -122,6 +124,14 @@ static void **ptr_from_va_list(__ms_va_list valist)
return *(void ***)&valist;
}
static void copy_va_list_data(void **args, __ms_va_list valist, int args_count)
{
unsigned int i;
for (i = 0; i < args_count; ++i)
args[i] = va_arg(valist, void *);
}
#if defined(__i386__)
extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
@ -1665,6 +1675,154 @@ void CDECL _vcomp_leave_critsect(CRITICAL_SECTION *critsect)
LeaveCriticalSection(critsect);
}
static unsigned int get_step_count(int start, int end, int range_offset, int step)
{
int range = end - start + step - range_offset;
if (step < 0)
return (unsigned)-range / -step;
else
return (unsigned)range / step;
}
static void CDECL c2vectparallel_wrapper(int start, int end, int step, int end_included, BOOL dynamic_distribution,
int volatile *dynamic_start, void *function, int nargs, __ms_va_list valist)
{
void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS];
unsigned int step_count, steps_per_call, remainder;
int thread_count = omp_get_num_threads();
int curr_start, curr_end, range_offset;
int thread = _vcomp_get_thread_num();
int step_sign;
copy_va_list_data(&wrapper_args[2], valist, nargs - 2);
step_sign = step > 0 ? 1 : -1;
range_offset = step_sign * !end_included;
if (dynamic_distribution)
{
int next_start, new_start, end_value;
start = *dynamic_start;
end_value = end + !!end_included * step;
while (start != end_value)
{
step_count = get_step_count(start, end, range_offset, step);
curr_end = start + (step_count + thread_count - 1) / thread_count * step
+ range_offset;
if ((curr_end - end) * step_sign > 0)
{
next_start = end_value;
curr_end = end;
}
else
{
next_start = curr_end - range_offset;
curr_end -= step;
}
if ((new_start = InterlockedCompareExchange(dynamic_start, next_start, start)) != start)
{
start = new_start;
continue;
}
wrapper_args[0] = (void *)(ULONG_PTR)start;
wrapper_args[1] = (void *)(ULONG_PTR)curr_end;
_vcomp_fork_call_wrapper(function, nargs, wrapper_args);
start = *dynamic_start;
}
return;
}
step_count = get_step_count(start, end, range_offset, step);
/* According to the tests native vcomp still makes extra calls
* with empty range from excessive threads under certain conditions
* for unclear reason. */
if (thread >= step_count && (end_included || (step != 1 && step != -1)))
return;
steps_per_call = step_count / thread_count;
remainder = step_count % thread_count;
if (thread < remainder)
{
curr_start = thread * (steps_per_call + 1);
curr_end = curr_start + steps_per_call + 1;
}
else if (thread < step_count)
{
curr_start = remainder + steps_per_call * thread;
curr_end = curr_start + steps_per_call;
}
else
{
curr_start = curr_end = 0;
}
curr_start = start + curr_start * step;
curr_end = start + (curr_end - 1) * step + range_offset;
wrapper_args[0] = (void *)(ULONG_PTR)curr_start;
wrapper_args[1] = (void *)(ULONG_PTR)curr_end;
_vcomp_fork_call_wrapper(function, nargs, wrapper_args);
}
void CDECL C2VectParallel(int start, int end, int step, BOOL end_included, int thread_count,
BOOL dynamic_distribution, void *function, int nargs, ...)
{
struct vcomp_thread_data *thread_data;
int volatile dynamic_start;
int prev_thread_count;
__ms_va_list valist;
TRACE("start %d, end %d, step %d, end_included %d, thread_count %d, dynamic_distribution %#x,"
" function %p, nargs %d.\n", start, end, step, end_included, thread_count,
dynamic_distribution, function, nargs);
if (nargs > MAX_VECT_PARALLEL_CALLBACK_ARGS)
{
FIXME("Number of arguments %u exceeds supported maximum %u"
" (not calling the loop code, expect problems).\n",
nargs, MAX_VECT_PARALLEL_CALLBACK_ARGS);
return;
}
__ms_va_start(valist, nargs);
/* This expression can result in integer overflow. According to the tests,
* native vcomp runs the function as a single thread both for empty range
* and (end - start) not fitting the integer range. */
if ((step > 0 && end < start) || (step < 0 && end > start)
|| (end - start) / step < 2 || thread_count < 0)
{
void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS];
wrapper_args[0] = (void *)(ULONG_PTR)start;
wrapper_args[1] = (void *)(ULONG_PTR)end;
copy_va_list_data(&wrapper_args[2], valist, nargs - 2);
_vcomp_fork_call_wrapper(function, nargs, wrapper_args);
__ms_va_end(valist);
return;
}
thread_data = vcomp_init_thread_data();
prev_thread_count = thread_data->fork_threads;
thread_data->fork_threads = thread_count;
dynamic_start = start;
_vcomp_fork(TRUE, 9, c2vectparallel_wrapper, start, end, step, end_included, dynamic_distribution,
&dynamic_start, function, nargs, valist);
thread_data->fork_threads = prev_thread_count;
__ms_va_end(valist);
}
BOOL WINAPI DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved)
{
TRACE("(%p, %d, %p)\n", instance, reason, reserved);

View File

@ -1,4 +1,4 @@
@ stub C2VectParallel
@ varargs C2VectParallel(long long long long long long ptr long)
@ cdecl _vcomp_atomic_add_i1(ptr long)
@ cdecl _vcomp_atomic_add_i2(ptr long)
@ cdecl _vcomp_atomic_add_i4(ptr long)

View File

@ -1,4 +1,4 @@
@ stub C2VectParallel
@ varargs C2VectParallel(long long long long long long ptr long)
@ cdecl _vcomp_atomic_add_i1(ptr long)
@ cdecl _vcomp_atomic_add_i2(ptr long)
@ cdecl _vcomp_atomic_add_i4(ptr long)

View File

@ -1,4 +1,4 @@
@ stub C2VectParallel
@ varargs C2VectParallel(long long long long long long ptr long)
@ cdecl _vcomp_atomic_add_i1(ptr long)
@ cdecl _vcomp_atomic_add_i2(ptr long)
@ cdecl _vcomp_atomic_add_i4(ptr long)