1098 lines
31 KiB
C
1098 lines
31 KiB
C
/*
|
|
*
|
|
* vcomp implementation
|
|
*
|
|
* Copyright 2011 Austin English
|
|
* Copyright 2012 Dan Kegel
|
|
* Copyright 2015 Sebastian Lackner
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this library; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
|
|
*/
|
|
|
|
#include "config.h"
|
|
#include "wine/port.h"
|
|
|
|
#include <stdarg.h>
|
|
#include <assert.h>
|
|
|
|
#include "windef.h"
|
|
#include "winbase.h"
|
|
#include "wine/debug.h"
|
|
#include "wine/list.h"
|
|
|
|
WINE_DEFAULT_DEBUG_CHANNEL(vcomp);
|
|
|
|
/* Worker threads that finished their team's work and are waiting to be reused;
 * workers append themselves in _vcomp_fork_worker and _vcomp_fork takes from the head. */
static struct list vcomp_idle_threads = LIST_INIT(vcomp_idle_threads);
/* TLS slot holding the calling thread's struct vcomp_thread_data pointer. */
static DWORD vcomp_context_tls = TLS_OUT_OF_INDEXES;
/* Module handle of this DLL, recorded in DllMain; used to reference/unreference the
 * module around worker threads. */
static HMODULE vcomp_module;
/* Value reported by omp_get_max_threads(); set to the processor count in DllMain. */
static int vcomp_max_threads;
/* Default team size for _vcomp_fork; set in DllMain and by omp_set_num_threads(). */
static int vcomp_num_threads;
/* Whether a fork issued from inside a parallel region may create a new team;
 * toggled by omp_set_nested(). */
static BOOL vcomp_nested_fork = FALSE;
|
|
|
|
/* Global lock of this module: it is held around every access to team/task state and
 * to the idle-thread list, and is the lock paired with the condition variables. */
static RTL_CRITICAL_SECTION vcomp_section;
/* Statically initialised debug record for vcomp_section; the last field carries a
 * human-readable name for the lock. */
static RTL_CRITICAL_SECTION_DEBUG critsect_debug =
{
    0, 0, &vcomp_section,
    { &critsect_debug.ProcessLocksList, &critsect_debug.ProcessLocksList },
      0, 0, { (DWORD_PTR)(__FILE__ ": vcomp_section") }
};
static RTL_CRITICAL_SECTION vcomp_section = { &critsect_debug, -1, 0, 0, 0, 0 };
|
|
|
|
/* Values of the 'flags' argument of _vcomp_for_dynamic_init().  The schedule type
 * (flags with the INCREMENT bit masked off) is compared by equality, so STATIC,
 * CHUNKED and GUIDED are distinct values rather than combinable bits. */
#define VCOMP_DYNAMIC_FLAGS_STATIC 0x01
#define VCOMP_DYNAMIC_FLAGS_CHUNKED 0x02
#define VCOMP_DYNAMIC_FLAGS_GUIDED 0x03
/* Set when the loop counts upwards (iterations computed as last - first);
 * clear when it counts downwards. */
#define VCOMP_DYNAMIC_FLAGS_INCREMENT 0x40
|
|
|
|
/* Per-thread OpenMP context, reachable through the vcomp_context_tls TLS slot. */
struct vcomp_thread_data
{
    struct vcomp_team_data *team;       /* team the thread currently works for, NULL when none */
    struct vcomp_task_data *task;       /* work-sharing state shared with the rest of the team */
    int thread_num;                     /* index inside the team; 0 for the forking thread */
    BOOL parallel;                      /* value reported by omp_in_parallel() */
    int fork_threads;                   /* team size requested for the next fork, 0 = use default */

    /* only used for concurrent tasks */
    struct list entry;                  /* link in vcomp_idle_threads or in a fork's worker list */
    CONDITION_VARIABLE cond;            /* a worker sleeps on this until it is given a team */

    /* single */
    unsigned int single;                /* number of single constructs this thread has reached */

    /* section */
    unsigned int section;               /* number of sections constructs this thread has reached */

    /* dynamic */
    unsigned int dynamic;               /* number of dynamic loops this thread has reached */
    unsigned int dynamic_type;          /* schedule of the current loop (VCOMP_DYNAMIC_FLAGS_*), 0 = none */
    unsigned int dynamic_begin;         /* first index of this thread's range (static schedule only) */
    unsigned int dynamic_end;           /* last index of this thread's range (static schedule only) */
};
|
|
|
|
/* State shared by all threads of one parallel region; lives on the stack of the
 * thread that called _vcomp_fork.  Counters are protected by vcomp_section. */
struct vcomp_team_data
{
    CONDITION_VARIABLE cond;            /* signalled on barrier release and when the last thread finishes */
    int num_threads;                    /* number of threads in the team, including the forking thread */
    int finished_threads;               /* number of threads that returned from the wrapper */

    /* callback arguments */
    int nargs;                          /* number of variadic arguments passed to _vcomp_fork */
    void *wrapper;                      /* function every team member calls */
    __ms_va_list valist;                /* the variadic arguments, handed to _vcomp_fork_call_wrapper */

    /* barrier */
    unsigned int barrier;               /* incremented each time the barrier is released */
    int barrier_count;                  /* threads that have arrived at the current barrier */
};
|
|
|
|
/* Work-sharing state shared by a team.  Each counter records the most recent
 * construct of its kind that some thread has claimed; threads compare it against
 * their own per-thread counter.  Protected by vcomp_section. */
struct vcomp_task_data
{
    /* single */
    unsigned int single;                /* latest single construct that was claimed */

    /* section */
    unsigned int section;               /* latest sections construct that was initialised */
    int num_sections;                   /* section count passed to _vcomp_sections_init */
    int section_index;                  /* next section index to hand out */

    /* dynamic */
    unsigned int dynamic;               /* latest dynamic loop that was initialised */
    unsigned int dynamic_first;         /* first index that has not been handed out yet */
    unsigned int dynamic_last;          /* final index of the whole loop */
    unsigned int dynamic_iterations;    /* iterations still to be handed out */
    int dynamic_step;                   /* signed stride between iterations */
    unsigned int dynamic_chunksize;     /* chunk size passed to _vcomp_for_dynamic_init */
};
|
|
|
|
/*
 * _vcomp_fork_call_wrapper(wrapper, nargs, args)
 *
 * Calls 'wrapper' with 'nargs' pointer-sized arguments read from the argument
 * list 'args'.  Hand-written per architecture because the number of arguments
 * is only known at run time.
 */
#if defined(__i386__)

extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, __ms_va_list args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   /* prologue: set up a frame and save %esi/%edi, which the copy below uses */
                   "pushl %ebp\n\t"
                   __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t")
                   __ASM_CFI(".cfi_rel_offset %ebp,0\n\t")
                   "movl %esp,%ebp\n\t"
                   __ASM_CFI(".cfi_def_cfa_register %ebp\n\t")
                   "pushl %esi\n\t"
                   __ASM_CFI(".cfi_rel_offset %esi,-4\n\t")
                   "pushl %edi\n\t"
                   __ASM_CFI(".cfi_rel_offset %edi,-8\n\t")
                   /* %edx = nargs * 4 bytes; skip the copy entirely when nargs is 0 */
                   "movl 12(%ebp),%edx\n\t"
                   "movl %esp,%edi\n\t"
                   "shll $2,%edx\n\t"
                   "jz 1f\n\t"
                   /* reserve the argument area and round the stack pointer down to 16 bytes */
                   "subl %edx,%edi\n\t"
                   "andl $~15,%edi\n\t"
                   "movl %edi,%esp\n\t"
                   /* copy nargs dwords from args (16(%ebp)) to the new stack top */
                   "movl 12(%ebp),%ecx\n\t"
                   "movl 16(%ebp),%esi\n\t"
                   "cld\n\t"
                   "rep; movsl\n"
                   /* call wrapper (8(%ebp)), then drop the argument area */
                   "1:\tcall *8(%ebp)\n\t"
                   "leal -8(%ebp),%esp\n\t"
                   "popl %edi\n\t"
                   __ASM_CFI(".cfi_same_value %edi\n\t")
                   "popl %esi\n\t"
                   __ASM_CFI(".cfi_same_value %esi\n\t")
                   "popl %ebp\n\t"
                   __ASM_CFI(".cfi_def_cfa %esp,4\n\t")
                   __ASM_CFI(".cfi_same_value %ebp\n\t")
                   "ret" )

#elif defined(__x86_64__)

extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, __ms_va_list args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   /* prologue: set up a frame and save %rsi/%rdi, which the copy below uses */
                   "pushq %rbp\n\t"
                   __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t")
                   __ASM_CFI(".cfi_rel_offset %rbp,0\n\t")
                   "movq %rsp,%rbp\n\t"
                   __ASM_CFI(".cfi_def_cfa_register %rbp\n\t")
                   "pushq %rsi\n\t"
                   __ASM_CFI(".cfi_rel_offset %rsi,-8\n\t")
                   "pushq %rdi\n\t"
                   __ASM_CFI(".cfi_rel_offset %rdi,-16\n\t")
                   /* %rax = wrapper; %rcx = max(4, nargs) slots, so four slots always exist */
                   "movq %rcx,%rax\n\t"
                   "movq $4,%rcx\n\t"
                   "cmp %rcx,%rdx\n\t"
                   "cmovgq %rdx,%rcx\n\t"
                   /* reserve %rcx * 8 bytes and round the stack pointer down to 16 bytes */
                   "leaq 0(,%rcx,8),%rdx\n\t"
                   "subq %rdx,%rsp\n\t"
                   "andq $~15,%rsp\n\t"
                   /* copy %rcx qwords from args (%r8) to the new stack top */
                   "movq %rsp,%rdi\n\t"
                   "movq %r8,%rsi\n\t"
                   "rep; movsq\n\t"
                   /* load the first four slots into the argument registers and call */
                   "movq 0(%rsp),%rcx\n\t"
                   "movq 8(%rsp),%rdx\n\t"
                   "movq 16(%rsp),%r8\n\t"
                   "movq 24(%rsp),%r9\n\t"
                   "callq *%rax\n\t"
                   /* drop the argument area and restore the saved registers */
                   "leaq -16(%rbp),%rsp\n\t"
                   "popq %rdi\n\t"
                   __ASM_CFI(".cfi_same_value %rdi\n\t")
                   "popq %rsi\n\t"
                   __ASM_CFI(".cfi_same_value %rsi\n\t")
                   __ASM_CFI(".cfi_def_cfa_register %rsp\n\t")
                   "popq %rbp\n\t"
                   __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
                   __ASM_CFI(".cfi_same_value %rbp\n\t")
                   "ret")

#elif defined(__arm__)

extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, __ms_va_list args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   ".arm\n\t"
                   /* r4 = wrapper, r5 = stack pointer to restore after the call */
                   "push {r4, r5, LR}\n\t"
                   "mov r4, r0\n\t"
                   "mov r5, SP\n\t"
                   /* r3 = nargs * 4 bytes; nothing to copy when nargs is 0 */
                   "lsl r3, r1, #2\n\t"
                   "cmp r3, #0\n\t"
                   "beq 5f\n\t"
                   /* reserve the argument area, plus 4 extra bytes when nargs is even
                    * (presumably for stack alignment - TODO confirm) */
                   "sub SP, SP, r3\n\t"
                   "tst r1, #1\n\t"
                   "subeq SP, SP, #4\n\t"
                   /* copy the arguments word by word, from the last one down to the first */
                   "1:\tsub r3, r3, #4\n\t"
                   "ldr r0, [r2, r3]\n\t"
                   "str r0, [SP, r3]\n\t"
                   "cmp r3, #0\n\t"
                   "bgt 1b\n\t"
                   /* pop up to the first four arguments into r0-r3; the rest stay on the stack */
                   "cmp r1, #1\n\t"
                   "bgt 2f\n\t"
                   "pop {r0}\n\t"
                   "b 5f\n\t"
                   "2:\tcmp r1, #2\n\t"
                   "bgt 3f\n\t"
                   "pop {r0-r1}\n\t"
                   "b 5f\n\t"
                   "3:\tcmp r1, #3\n\t"
                   "bgt 4f\n\t"
                   "pop {r0-r2}\n\t"
                   "b 5f\n\t"
                   "4:\tpop {r0-r3}\n\t"
                   /* call wrapper, restore the stack pointer and return */
                   "5:\tblx r4\n\t"
                   "mov SP, r5\n\t"
                   "pop {r4, r5, PC}" )

#else

/* Fallback for architectures without an implementation: only logs an error,
 * the wrapper is never called. */
static void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, __ms_va_list args)
{
    ERR("Not implemented for this architecture\n");
}

#endif
|
|
|
|
static inline struct vcomp_thread_data *vcomp_get_thread_data(void)
|
|
{
|
|
return (struct vcomp_thread_data *)TlsGetValue(vcomp_context_tls);
|
|
}
|
|
|
|
static inline void vcomp_set_thread_data(struct vcomp_thread_data *thread_data)
|
|
{
|
|
TlsSetValue(vcomp_context_tls, thread_data);
|
|
}
|
|
|
|
static struct vcomp_thread_data *vcomp_init_thread_data(void)
|
|
{
|
|
struct vcomp_thread_data *thread_data = vcomp_get_thread_data();
|
|
struct
|
|
{
|
|
struct vcomp_thread_data thread;
|
|
struct vcomp_task_data task;
|
|
} *data;
|
|
|
|
if (thread_data) return thread_data;
|
|
if (!(data = HeapAlloc(GetProcessHeap(), 0, sizeof(*data))))
|
|
{
|
|
ERR("could not create thread data\n");
|
|
ExitProcess(1);
|
|
}
|
|
|
|
data->task.single = 0;
|
|
data->task.section = 0;
|
|
data->task.dynamic = 0;
|
|
|
|
thread_data = &data->thread;
|
|
thread_data->team = NULL;
|
|
thread_data->task = &data->task;
|
|
thread_data->thread_num = 0;
|
|
thread_data->parallel = FALSE;
|
|
thread_data->fork_threads = 0;
|
|
thread_data->single = 1;
|
|
thread_data->section = 1;
|
|
thread_data->dynamic = 1;
|
|
thread_data->dynamic_type = 0;
|
|
|
|
vcomp_set_thread_data(thread_data);
|
|
return thread_data;
|
|
}
|
|
|
|
static void vcomp_free_thread_data(void)
|
|
{
|
|
struct vcomp_thread_data *thread_data = vcomp_get_thread_data();
|
|
if (!thread_data) return;
|
|
|
|
HeapFree(GetProcessHeap(), 0, thread_data);
|
|
vcomp_set_thread_data(NULL);
|
|
}
|
|
|
|
/* Atomically perform *dest += val. */
void CDECL _vcomp_atomic_add_i4(int *dest, int val)
{
    int addend = val;

    interlocked_xchg_add(dest, addend);
}
|
|
|
|
/* Atomically perform *dest &= val (compare-and-swap retry loop). */
void CDECL _vcomp_atomic_and_i4(int *dest, int val)
{
    int seen;

    for (;;)
    {
        seen = *dest;
        if (interlocked_cmpxchg(dest, seen & val, seen) == seen)
            return;
    }
}
|
|
|
|
/* Atomically perform *dest /= val (signed; compare-and-swap retry loop). */
void CDECL _vcomp_atomic_div_i4(int *dest, int val)
{
    int seen;

    for (;;)
    {
        seen = *dest;
        if (interlocked_cmpxchg(dest, seen / val, seen) == seen)
            return;
    }
}
|
|
|
|
/* Atomically perform *dest /= val (unsigned; compare-and-swap retry loop). */
void CDECL _vcomp_atomic_div_ui4(unsigned int *dest, unsigned int val)
{
    unsigned int seen;

    for (;;)
    {
        seen = *dest;
        if (interlocked_cmpxchg((int *)dest, seen / val, seen) == seen)
            return;
    }
}
|
|
|
|
/*
 * Atomically perform *dest *= val (compare-and-swap retry loop).
 *
 * The product is computed in unsigned arithmetic: a signed multiplication that
 * overflows is undefined behaviour in C, whereas the unsigned product wraps and
 * yields the same low 32 bits.
 */
void CDECL _vcomp_atomic_mul_i4(int *dest, int val)
{
    int old, product;

    do
    {
        old = *dest;
        product = (int)((unsigned int)old * (unsigned int)val);
    }
    while (interlocked_cmpxchg(dest, product, old) != old);
}
|
|
|
|
/* Atomically perform *dest |= val (compare-and-swap retry loop). */
void CDECL _vcomp_atomic_or_i4(int *dest, int val)
{
    int seen;

    for (;;)
    {
        seen = *dest;
        if (interlocked_cmpxchg(dest, seen | val, seen) == seen)
            return;
    }
}
|
|
|
|
/*
 * Atomically perform *dest <<= val (compare-and-swap retry loop).
 *
 * The shift is done on the unsigned representation because left-shifting a
 * negative int is undefined behaviour in C; the resulting bit pattern is the
 * same.  'val' is still expected to be in the range 0..31.
 */
void CDECL _vcomp_atomic_shl_i4(int *dest, int val)
{
    int old, shifted;

    do
    {
        old = *dest;
        shifted = (int)((unsigned int)old << val);
    }
    while (interlocked_cmpxchg(dest, shifted, old) != old);
}
|
|
|
|
/* Atomically perform *dest >>= val on a signed value (compare-and-swap retry loop). */
void CDECL _vcomp_atomic_shr_i4(int *dest, int val)
{
    int seen;

    for (;;)
    {
        seen = *dest;
        if (interlocked_cmpxchg(dest, seen >> val, seen) == seen)
            return;
    }
}
|
|
|
|
/* Atomically perform *dest >>= val on an unsigned value (compare-and-swap retry loop). */
void CDECL _vcomp_atomic_shr_ui4(unsigned int *dest, unsigned int val)
{
    unsigned int seen;

    for (;;)
    {
        seen = *dest;
        if (interlocked_cmpxchg((int *)dest, seen >> val, seen) == seen)
            return;
    }
}
|
|
|
|
/*
 * Atomically perform *dest -= val.
 *
 * Implemented as an atomic add of the negated value.  The negation is done in
 * unsigned arithmetic because -val is undefined behaviour for val == INT_MIN;
 * the unsigned form wraps and gives the same bit pattern.
 */
void CDECL _vcomp_atomic_sub_i4(int *dest, int val)
{
    int negated = (int)(0U - (unsigned int)val);

    interlocked_xchg_add(dest, negated);
}
|
|
|
|
/* Atomically perform *dest ^= val (compare-and-swap retry loop). */
void CDECL _vcomp_atomic_xor_i4(int *dest, int val)
{
    int seen;

    for (;;)
    {
        seen = *dest;
        if (interlocked_cmpxchg(dest, seen ^ val, seen) == seen)
            return;
    }
}
|
|
|
|
/*
 * Atomically perform *dest += val on a float.
 * The float is read and swapped as its 32-bit integer image; the loop retries
 * until no other thread changed *dest in between.
 */
void CDECL _vcomp_atomic_add_r4(float *dest, float val)
{
    int expected, desired;

    for (;;)
    {
        expected = *(int *)dest;
        *(float *)&desired = *(float *)&expected + val;
        if (interlocked_cmpxchg((int *)dest, desired, expected) == expected)
            return;
    }
}
|
|
|
|
/*
 * Atomically perform *dest /= val on a float.
 * The float is read and swapped as its 32-bit integer image; the loop retries
 * until no other thread changed *dest in between.
 */
void CDECL _vcomp_atomic_div_r4(float *dest, float val)
{
    int expected, desired;

    for (;;)
    {
        expected = *(int *)dest;
        *(float *)&desired = *(float *)&expected / val;
        if (interlocked_cmpxchg((int *)dest, desired, expected) == expected)
            return;
    }
}
|
|
|
|
/*
 * Atomically perform *dest *= val on a float.
 * The float is read and swapped as its 32-bit integer image; the loop retries
 * until no other thread changed *dest in between.
 */
void CDECL _vcomp_atomic_mul_r4(float *dest, float val)
{
    int expected, desired;

    for (;;)
    {
        expected = *(int *)dest;
        *(float *)&desired = *(float *)&expected * val;
        if (interlocked_cmpxchg((int *)dest, desired, expected) == expected)
            return;
    }
}
|
|
|
|
/*
 * Atomically perform *dest -= val on a float.
 * The float is read and swapped as its 32-bit integer image; the loop retries
 * until no other thread changed *dest in between.
 */
void CDECL _vcomp_atomic_sub_r4(float *dest, float val)
{
    int expected, desired;

    for (;;)
    {
        expected = *(int *)dest;
        *(float *)&desired = *(float *)&expected - val;
        if (interlocked_cmpxchg((int *)dest, desired, expected) == expected)
            return;
    }
}
|
|
|
|
/*
 * Atomically perform *dest += val on a double.
 * The double is read and swapped as its 64-bit integer image; the loop retries
 * until no other thread changed *dest in between.
 */
void CDECL _vcomp_atomic_add_r8(double *dest, double val)
{
    LONG64 expected, desired;

    for (;;)
    {
        expected = *(LONG64 *)dest;
        *(double *)&desired = *(double *)&expected + val;
        if (interlocked_cmpxchg64((LONG64 *)dest, desired, expected) == expected)
            return;
    }
}
|
|
|
|
/*
 * Atomically perform *dest /= val on a double.
 * The double is read and swapped as its 64-bit integer image; the loop retries
 * until no other thread changed *dest in between.
 */
void CDECL _vcomp_atomic_div_r8(double *dest, double val)
{
    LONG64 expected, desired;

    for (;;)
    {
        expected = *(LONG64 *)dest;
        *(double *)&desired = *(double *)&expected / val;
        if (interlocked_cmpxchg64((LONG64 *)dest, desired, expected) == expected)
            return;
    }
}
|
|
|
|
/*
 * Atomically perform *dest *= val on a double.
 * The double is read and swapped as its 64-bit integer image; the loop retries
 * until no other thread changed *dest in between.
 */
void CDECL _vcomp_atomic_mul_r8(double *dest, double val)
{
    LONG64 expected, desired;

    for (;;)
    {
        expected = *(LONG64 *)dest;
        *(double *)&desired = *(double *)&expected * val;
        if (interlocked_cmpxchg64((LONG64 *)dest, desired, expected) == expected)
            return;
    }
}
|
|
|
|
/*
 * Atomically perform *dest -= val on a double.
 * The double is read and swapped as its 64-bit integer image; the loop retries
 * until no other thread changed *dest in between.
 */
void CDECL _vcomp_atomic_sub_r8(double *dest, double val)
{
    LONG64 expected, desired;

    for (;;)
    {
        expected = *(LONG64 *)dest;
        *(double *)&desired = *(double *)&expected - val;
        if (interlocked_cmpxchg64((LONG64 *)dest, desired, expected) == expected)
            return;
    }
}
|
|
|
|
/* Stub: dynamic adjustment of the thread count is not implemented, always reports 0. */
int CDECL omp_get_dynamic(void)
{
    const int dynamic_enabled = 0;

    TRACE("stub\n");
    return dynamic_enabled;
}
|
|
|
|
/* Return the module-wide maximum thread count (vcomp_max_threads). */
int CDECL omp_get_max_threads(void)
{
    int limit;

    TRACE("()\n");
    limit = vcomp_max_threads;
    return limit;
}
|
|
|
|
/*
 * Return non-zero when nested parallelism is enabled, i.e. the value last set
 * through omp_set_nested().
 *
 * The trace message used to read "stub" although the function is fully
 * implemented; it now uses the same "()" form as the other implemented getters
 * so that debug logs do not flag it as missing functionality.
 */
int CDECL omp_get_nested(void)
{
    TRACE("()\n");
    return vcomp_nested_fork;
}
|
|
|
|
/* Stub: always reports a single processor. */
int CDECL omp_get_num_procs(void)
{
    const int processors = 1;

    TRACE("stub\n");
    return processors;
}
|
|
|
|
/* Return the size of the calling thread's team, or 1 when it is not in a team. */
int CDECL omp_get_num_threads(void)
{
    struct vcomp_team_data *team = vcomp_init_thread_data()->team;

    TRACE("()\n");

    if (!team)
        return 1;
    return team->num_threads;
}
|
|
|
|
/* Return the calling thread's index inside its team (0 outside a team). */
int CDECL omp_get_thread_num(void)
{
    struct vcomp_thread_data *self;

    TRACE("()\n");
    self = vcomp_init_thread_data();
    return self->thread_num;
}
|
|
|
|
/* Time in seconds since "some time in the past" */
|
|
double CDECL omp_get_wtime(void)
|
|
{
|
|
return GetTickCount() / 1000.0;
|
|
}
|
|
|
|
/* Stub: the request is only traced; dynamic thread adjustment is not implemented. */
void CDECL omp_set_dynamic(int val)
{
    TRACE("(%d): stub\n", val);
    return;
}
|
|
|
|
/* Enable (non-zero) or disable (zero) nested parallel regions. */
void CDECL omp_set_nested(int nested)
{
    TRACE("(%d)\n", nested);

    if (nested != 0)
        vcomp_nested_fork = TRUE;
    else
        vcomp_nested_fork = FALSE;
}
|
|
|
|
/* Set the default team size for later forks; values below 1 are ignored. */
void CDECL omp_set_num_threads(int num_threads)
{
    TRACE("(%d)\n", num_threads);

    if (num_threads < 1)
        return;

    vcomp_num_threads = num_threads;
}
|
|
|
|
/*
 * Block until every thread of the calling thread's team has reached the barrier.
 * Returns immediately when the thread is not part of a team.
 *
 * The last thread to arrive bumps the team's barrier generation, resets the
 * arrival counter and wakes the waiters; the others sleep until they see the
 * generation change.
 */
void CDECL _vcomp_barrier(void)
{
    struct vcomp_team_data *team = vcomp_init_thread_data()->team;

    TRACE("()\n");

    if (team == NULL)
        return;

    EnterCriticalSection(&vcomp_section);
    team->barrier_count++;
    if (team->barrier_count < team->num_threads)
    {
        /* not the last one: wait for the generation we arrived in to end */
        unsigned int generation = team->barrier;
        while (team->barrier == generation)
            SleepConditionVariableCS(&team->cond, &vcomp_section, INFINITE);
    }
    else
    {
        /* last arrival: open the barrier for everybody */
        team->barrier++;
        team->barrier_count = 0;
        WakeAllConditionVariable(&team->cond);
    }
    LeaveCriticalSection(&vcomp_section);
}
|
|
|
|
/* Request a team size for the calling thread's next fork; values below 1 are ignored. */
void CDECL _vcomp_set_num_threads(int num_threads)
{
    TRACE("(%d)\n", num_threads);

    if (num_threads < 1)
        return;

    vcomp_init_thread_data()->fork_threads = num_threads;
}
|
|
|
|
/* Return non-zero only for the thread whose thread_num is 0. */
int CDECL _vcomp_master_begin(void)
{
    int index;

    TRACE("()\n");
    index = vcomp_init_thread_data()->thread_num;
    return index == 0;
}
|
|
|
|
/* End of a master construct: only traces the call, no state to update. */
void CDECL _vcomp_master_end(void)
{
    TRACE("()\n");
    return;
}
|
|
|
|
/*
 * Enter a single construct.  Returns TRUE for the first thread of the team that
 * reaches this construct and FALSE for all others.  'flags' is only traced.
 *
 * Every thread counts the single constructs it has reached; the first thread
 * whose count gets ahead of the shared task counter claims the construct.
 */
int CDECL _vcomp_single_begin(int flags)
{
    struct vcomp_thread_data *self = vcomp_init_thread_data();
    struct vcomp_task_data *task = self->task;
    int claimed;

    TRACE("(%x): semi-stub\n", flags);

    EnterCriticalSection(&vcomp_section);
    self->single++;
    /* signed difference keeps the comparison valid across counter wrap-around */
    claimed = ((int)(self->single - task->single) > 0) ? TRUE : FALSE;
    if (claimed)
        task->single = self->single;
    LeaveCriticalSection(&vcomp_section);

    return claimed;
}
|
|
|
|
/* End of a single construct: only traces the call, no state to update. */
void CDECL _vcomp_single_end(void)
{
    TRACE("()\n");
    return;
}
|
|
|
|
/*
 * Begin a sections construct with 'n' sections.  The first thread of the team
 * to arrive publishes the section count and resets the shared section index;
 * later arrivals only advance their own counter.
 */
void CDECL _vcomp_sections_init(int n)
{
    struct vcomp_thread_data *self = vcomp_init_thread_data();
    struct vcomp_task_data *task = self->task;
    int is_first;

    TRACE("(%d)\n", n);

    EnterCriticalSection(&vcomp_section);
    self->section++;
    is_first = (int)(self->section - task->section) > 0;
    if (is_first)
    {
        task->section_index = 0;
        task->num_sections  = n;
        task->section       = self->section;
    }
    LeaveCriticalSection(&vcomp_section);
}
|
|
|
|
/*
 * Hand out the next section index of the current sections construct.
 * Returns -1 when all sections were handed out, or when the shared state
 * belongs to a different sections construct than the one this thread is in.
 */
int CDECL _vcomp_sections_next(void)
{
    struct vcomp_thread_data *self = vcomp_init_thread_data();
    struct vcomp_task_data *task = self->task;
    int index = -1;

    TRACE("()\n");

    EnterCriticalSection(&vcomp_section);
    if (self->section == task->section)
    {
        if (task->section_index != task->num_sections)
            index = task->section_index++;
    }
    LeaveCriticalSection(&vcomp_section);

    return index;
}
|
|
|
|
/*
 * Compute the calling thread's share of a statically scheduled loop.
 *
 * first, last  inclusive loop bounds
 * step         positive stride; the loop counts up when 'increment' is set,
 *              down otherwise
 * begin, end   receive the inclusive bounds of this thread's sub-range
 *
 * With a single thread the whole range is returned unchanged.  For step <= 0 an
 * empty range is returned (begin 0, end -1 or 1 depending on direction).
 * Otherwise the iterations are split into contiguous ranges, the first
 * (iterations % threads) threads getting one extra iteration; a thread without
 * any iteration gets a range whose end lies one step before its begin.
 */
void CDECL _vcomp_for_static_simple_init(unsigned int first, unsigned int last, int step,
                                         BOOL increment, unsigned int *begin, unsigned int *end)
{
    struct vcomp_thread_data *self = vcomp_init_thread_data();
    struct vcomp_team_data *team = self->team;
    int team_size = team ? team->num_threads : 1;
    int my_index = self->thread_num;
    unsigned int total, share, leftover;

    TRACE("(%u, %u, %d, %u, %p, %p)\n", first, last, step, increment, begin, end);

    if (team_size == 1)
    {
        *begin = first;
        *end   = last;
        return;
    }

    if (step <= 0)
    {
        *begin = 0;
        *end   = increment ? -1 : 1;
        return;
    }

    if (!increment)
    {
        total = 1 + (first - last) / step;
        step = -step;   /* from here on 'step' is the signed stride */
    }
    else
        total = 1 + (last - first) / step;

    share    = total / team_size;
    leftover = total - share * team_size;

    if (my_index >= leftover)
    {
        if (!share)
        {
            /* nothing left for this thread: return an empty range */
            *begin = first;
            *end   = first - step;
            return;
        }
        /* skip the extra iterations given to the lower-numbered threads */
        first += leftover * step;
    }
    else
        share++;

    *begin = first + share * my_index * step;
    *end   = *begin + (share - 1) * step;
}
|
|
|
|
/*
 * Compute the calling thread's chunks of a statically scheduled, chunked loop.
 *
 * first, last  inclusive loop bounds; the direction is derived from their order
 * step         positive stride
 * chunksize    iterations per chunk (values below 1 are treated as 1)
 * loops        receives the number of chunks this thread has to run
 * begin, end   receive the inclusive bounds of the thread's first chunk
 * next         receives the offset from one of its chunks to the next
 * lastchunk    receives the start of the final chunk of the whole loop
 *
 * For step <= 0 only *loops is written (0).  When first == last only thread 0
 * gets the single iteration; other threads only get *loops = 0.
 */
void CDECL _vcomp_for_static_init(int first, int last, int step, int chunksize, unsigned int *loops,
                                  int *begin, int *end, int *next, int *lastchunk)
{
    struct vcomp_thread_data *self = vcomp_init_thread_data();
    struct vcomp_team_data *team = self->team;
    int team_size = team ? team->num_threads : 1;
    int my_index = self->thread_num;
    unsigned int total, chunk_count, share, leftover;

    TRACE("(%d, %d, %d, %d, %p, %p, %p, %p, %p)\n",
          first, last, step, chunksize, loops, begin, end, next, lastchunk);

    /* single thread and no per-iteration chunking: run everything as one chunk */
    if (team_size == 1 && chunksize != 1)
    {
        *loops      = 1;
        *begin      = first;
        *end        = last;
        *next       = 0;
        *lastchunk  = first;
        return;
    }

    /* exactly one iteration: it goes to thread 0 */
    if (first == last)
    {
        if (my_index)
        {
            *loops = 0;
            return;
        }
        *loops      = 1;
        *begin      = first;
        *end        = last;
        *next       = 0;
        *lastchunk  = first;
        return;
    }

    if (step <= 0)
    {
        *loops = 0;
        return;
    }

    if (first >= last)
    {
        total = 1 + (first - last) / step;
        step = -step;   /* from here on 'step' is the signed stride */
    }
    else
        total = 1 + (last - first) / step;

    if (chunksize < 1)
        chunksize = 1;

    /* 64-bit intermediate so that rounding up cannot overflow */
    chunk_count = ((DWORD64)total + chunksize - 1) / chunksize;
    share       = chunk_count / team_size;
    leftover    = chunk_count - share * team_size;

    *loops      = share + (my_index < leftover);
    *begin      = first + my_index * chunksize * step;
    *end        = *begin + (chunksize - 1) * step;
    *next       = chunksize * team_size * step;
    *lastchunk  = first + (chunk_count - 1) * chunksize * step;
}
|
|
|
|
/* End of a statically scheduled loop: only traces the call, no state to update. */
void CDECL _vcomp_for_static_end(void)
{
    TRACE("()\n");
    return;
}
|
|
|
|
/*
 * Start a dynamically scheduled loop.
 *
 * flags        schedule type (VCOMP_DYNAMIC_FLAGS_STATIC/CHUNKED/GUIDED),
 *              optionally combined with VCOMP_DYNAMIC_FLAGS_INCREMENT for a
 *              loop that counts upwards
 * first, last  inclusive loop bounds
 * step         positive stride; for step <= 0 the loop is marked as empty
 * chunksize    chunk size for the chunked/guided schedules
 *
 * For the static schedule the thread's range is computed here and stored in its
 * own context.  For the other schedules the first thread of the team to arrive
 * publishes the loop description in the shared task data.  Unknown schedule
 * types are reported and handled as guided.  The ranges are fetched with
 * _vcomp_for_dynamic_next().
 */
void CDECL _vcomp_for_dynamic_init(unsigned int flags, unsigned int first, unsigned int last,
                                   int step, unsigned int chunksize)
{
    struct vcomp_thread_data *self = vcomp_init_thread_data();
    struct vcomp_team_data *team = self->team;
    struct vcomp_task_data *task = self->task;
    int team_size = team ? team->num_threads : 1;
    int my_index = self->thread_num;
    unsigned int schedule = flags & ~VCOMP_DYNAMIC_FLAGS_INCREMENT;
    unsigned int total, share, leftover;

    TRACE("(%u, %u, %u, %d, %u)\n", flags, first, last, step, chunksize);

    if (step <= 0)
    {
        self->dynamic_type = 0;
        return;
    }

    if (!(flags & VCOMP_DYNAMIC_FLAGS_INCREMENT))
    {
        total = 1 + (first - last) / step;
        step = -step;   /* from here on 'step' is the signed stride */
    }
    else
        total = 1 + (last - first) / step;

    if (schedule != VCOMP_DYNAMIC_FLAGS_STATIC)
    {
        if (schedule != VCOMP_DYNAMIC_FLAGS_CHUNKED &&
            schedule != VCOMP_DYNAMIC_FLAGS_GUIDED)
        {
            FIXME("unsupported flags %u\n", flags);
            schedule = VCOMP_DYNAMIC_FLAGS_GUIDED;
        }

        EnterCriticalSection(&vcomp_section);
        self->dynamic++;
        self->dynamic_type = schedule;
        /* only the first thread reaching this loop publishes its description */
        if ((int)(self->dynamic - task->dynamic) > 0)
        {
            task->dynamic            = self->dynamic;
            task->dynamic_first      = first;
            task->dynamic_last       = last;
            task->dynamic_iterations = total;
            task->dynamic_step       = step;
            task->dynamic_chunksize  = chunksize;
        }
        LeaveCriticalSection(&vcomp_section);
        return;
    }

    /* static schedule: contiguous ranges, the first 'leftover' threads get one more */
    share    = total / team_size;
    leftover = total - share * team_size;

    if (my_index >= leftover)
    {
        if (!share)
        {
            self->dynamic_type = 0;
            return;
        }
        first += leftover * step;
    }
    else
        share++;

    self->dynamic_type  = VCOMP_DYNAMIC_FLAGS_STATIC;
    self->dynamic_begin = first + share * my_index * step;
    self->dynamic_end   = self->dynamic_begin + (share - 1) * step;
}
|
|
|
|
/*
 * Fetch the next range of a loop started with _vcomp_for_dynamic_init().
 *
 * begin, end   receive the inclusive bounds of the range to execute
 *
 * Returns non-zero when a range was stored, 0 when nothing is left for the
 * calling thread.  The static schedule yields its precomputed range exactly
 * once.  Chunked takes up to 'chunksize' iterations from the shared pool;
 * guided takes remaining/threads (rounded up) while more than
 * threads * chunksize iterations remain, and behaves like chunked afterwards.
 * The range that empties the pool ends exactly at the loop's last index.
 */
int CDECL _vcomp_for_dynamic_next(unsigned int *begin, unsigned int *end)
{
    struct vcomp_thread_data *self = vcomp_init_thread_data();
    struct vcomp_task_data *task = self->task;
    struct vcomp_team_data *team = self->team;
    int team_size = team ? team->num_threads : 1;
    unsigned int taken = 0;

    TRACE("(%p, %p)\n", begin, end);

    switch (self->dynamic_type)
    {
    case VCOMP_DYNAMIC_FLAGS_STATIC:
        *begin = self->dynamic_begin;
        *end   = self->dynamic_end;
        self->dynamic_type = 0;     /* the range is handed out only once */
        return 1;

    case VCOMP_DYNAMIC_FLAGS_CHUNKED:
    case VCOMP_DYNAMIC_FLAGS_GUIDED:
        EnterCriticalSection(&vcomp_section);
        if (self->dynamic == task->dynamic &&
            task->dynamic_iterations != 0)
        {
            taken = min(task->dynamic_iterations, task->dynamic_chunksize);
            if (self->dynamic_type == VCOMP_DYNAMIC_FLAGS_GUIDED &&
                task->dynamic_iterations > team_size * task->dynamic_chunksize)
            {
                taken = (task->dynamic_iterations + team_size - 1) / team_size;
            }
            *begin = task->dynamic_first;
            *end   = task->dynamic_first + (taken - 1) * task->dynamic_step;
            task->dynamic_iterations -= taken;
            task->dynamic_first      += taken * task->dynamic_step;
            if (!task->dynamic_iterations)
                *end = task->dynamic_last;
        }
        LeaveCriticalSection(&vcomp_section);
        return taken != 0;

    default:
        return 0;
    }
}
|
|
|
|
/* Return the 'parallel' flag of the calling thread's context. */
int CDECL omp_in_parallel(void)
{
    struct vcomp_thread_data *self;

    TRACE("()\n");
    self = vcomp_init_thread_data();
    return self->parallel;
}
|
|
|
|
/*
 * Thread procedure of a pooled worker.  'param' is the worker's heap-allocated
 * struct vcomp_thread_data, already assigned to a team by _vcomp_fork.
 *
 * The worker runs the team's wrapper, moves itself to the idle list, reports
 * completion to the team and then sleeps on its own condition variable until it
 * is assigned to another team.  If a 5 second wait times out without a new
 * team, the worker unlinks itself, frees its context and exits, dropping one
 * reference on the module.
 */
static DWORD WINAPI _vcomp_fork_worker(void *param)
{
    struct vcomp_thread_data *self = param;
    vcomp_set_thread_data(self);

    TRACE("starting worker thread for %p\n", self);

    EnterCriticalSection(&vcomp_section);
    while (1)
    {
        struct vcomp_team_data *team = self->team;
        if (team)
        {
            /* run the user code without holding the global lock */
            LeaveCriticalSection(&vcomp_section);
            _vcomp_fork_call_wrapper(team->wrapper, team->nargs, team->valist);
            EnterCriticalSection(&vcomp_section);

            /* become idle first, then tell the team that this thread is done */
            self->team = NULL;
            list_remove(&self->entry);
            list_add_tail(&vcomp_idle_threads, &self->entry);
            if (++team->finished_threads >= team->num_threads)
                WakeAllConditionVariable(&team->cond);
        }

        if (SleepConditionVariableCS(&self->cond, &vcomp_section, 5000))
            continue;
        if (GetLastError() != ERROR_TIMEOUT)
            continue;
        if (self->team)
            continue;   /* a team was assigned just as the wait timed out */
        break;
    }
    list_remove(&self->entry);
    LeaveCriticalSection(&vcomp_section);

    TRACE("terminating worker thread for %p\n", self);

    /* clear the TLS slot so that the context is not freed a second time */
    HeapFree(GetProcessHeap(), 0, self);
    vcomp_set_thread_data(NULL);
    FreeLibraryAndExitThread(vcomp_module, 0);
    return 0;
}
|
|
|
|
void WINAPIV _vcomp_fork(BOOL ifval, int nargs, void *wrapper, ...)
|
|
{
|
|
struct vcomp_thread_data *prev_thread_data = vcomp_init_thread_data();
|
|
struct vcomp_thread_data thread_data;
|
|
struct vcomp_team_data team_data;
|
|
struct vcomp_task_data task_data;
|
|
int num_threads;
|
|
|
|
TRACE("(%d, %d, %p, ...)\n", ifval, nargs, wrapper);
|
|
|
|
if (prev_thread_data->parallel && !vcomp_nested_fork)
|
|
ifval = FALSE;
|
|
|
|
if (!ifval)
|
|
num_threads = 1;
|
|
else if (prev_thread_data->fork_threads)
|
|
num_threads = prev_thread_data->fork_threads;
|
|
else
|
|
num_threads = vcomp_num_threads;
|
|
|
|
InitializeConditionVariable(&team_data.cond);
|
|
team_data.num_threads = 1;
|
|
team_data.finished_threads = 0;
|
|
team_data.nargs = nargs;
|
|
team_data.wrapper = wrapper;
|
|
__ms_va_start(team_data.valist, wrapper);
|
|
team_data.barrier = 0;
|
|
team_data.barrier_count = 0;
|
|
|
|
task_data.single = 0;
|
|
task_data.section = 0;
|
|
task_data.dynamic = 0;
|
|
|
|
thread_data.team = &team_data;
|
|
thread_data.task = &task_data;
|
|
thread_data.thread_num = 0;
|
|
thread_data.parallel = ifval || prev_thread_data->parallel;
|
|
thread_data.fork_threads = 0;
|
|
thread_data.single = 1;
|
|
thread_data.section = 1;
|
|
thread_data.dynamic = 1;
|
|
thread_data.dynamic_type = 0;
|
|
list_init(&thread_data.entry);
|
|
InitializeConditionVariable(&thread_data.cond);
|
|
|
|
if (num_threads > 1)
|
|
{
|
|
struct list *ptr;
|
|
EnterCriticalSection(&vcomp_section);
|
|
|
|
/* reuse existing threads (if any) */
|
|
while (team_data.num_threads < num_threads && (ptr = list_head(&vcomp_idle_threads)))
|
|
{
|
|
struct vcomp_thread_data *data = LIST_ENTRY(ptr, struct vcomp_thread_data, entry);
|
|
data->team = &team_data;
|
|
data->task = &task_data;
|
|
data->thread_num = team_data.num_threads++;
|
|
data->parallel = thread_data.parallel;
|
|
data->fork_threads = 0;
|
|
data->single = 1;
|
|
data->section = 1;
|
|
data->dynamic = 1;
|
|
data->dynamic_type = 0;
|
|
list_remove(&data->entry);
|
|
list_add_tail(&thread_data.entry, &data->entry);
|
|
WakeAllConditionVariable(&data->cond);
|
|
}
|
|
|
|
/* spawn additional threads */
|
|
while (team_data.num_threads < num_threads)
|
|
{
|
|
struct vcomp_thread_data *data;
|
|
HMODULE module;
|
|
HANDLE thread;
|
|
|
|
data = HeapAlloc(GetProcessHeap(), 0, sizeof(*data));
|
|
if (!data) break;
|
|
|
|
data->team = &team_data;
|
|
data->task = &task_data;
|
|
data->thread_num = team_data.num_threads;
|
|
data->parallel = thread_data.parallel;
|
|
data->fork_threads = 0;
|
|
data->single = 1;
|
|
data->section = 1;
|
|
data->dynamic = 1;
|
|
data->dynamic_type = 0;
|
|
InitializeConditionVariable(&data->cond);
|
|
|
|
thread = CreateThread(NULL, 0, _vcomp_fork_worker, data, 0, NULL);
|
|
if (!thread)
|
|
{
|
|
HeapFree(GetProcessHeap(), 0, data);
|
|
break;
|
|
}
|
|
|
|
GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS,
|
|
(const WCHAR *)vcomp_module, &module);
|
|
team_data.num_threads++;
|
|
list_add_tail(&thread_data.entry, &data->entry);
|
|
CloseHandle(thread);
|
|
}
|
|
|
|
LeaveCriticalSection(&vcomp_section);
|
|
}
|
|
|
|
vcomp_set_thread_data(&thread_data);
|
|
_vcomp_fork_call_wrapper(team_data.wrapper, team_data.nargs, team_data.valist);
|
|
vcomp_set_thread_data(prev_thread_data);
|
|
prev_thread_data->fork_threads = 0;
|
|
|
|
if (team_data.num_threads > 1)
|
|
{
|
|
EnterCriticalSection(&vcomp_section);
|
|
|
|
team_data.finished_threads++;
|
|
while (team_data.finished_threads < team_data.num_threads)
|
|
SleepConditionVariableCS(&team_data.cond, &vcomp_section, INFINITE);
|
|
|
|
LeaveCriticalSection(&vcomp_section);
|
|
assert(list_empty(&thread_data.entry));
|
|
}
|
|
|
|
__ms_va_end(team_data.valist);
|
|
}
|
|
|
|
/*
 * Enter the critical section stored in '*critsect', creating it on first use.
 *
 * A new critical section is published with an atomic compare-and-swap against
 * NULL; a thread that loses that race destroys its own copy and uses the
 * winner's.  The process is terminated if the allocation fails.
 */
void CDECL _vcomp_enter_critsect(CRITICAL_SECTION **critsect)
{
    TRACE("(%p)\n", critsect);

    if (*critsect == NULL)
    {
        CRITICAL_SECTION *fresh = HeapAlloc(GetProcessHeap(), 0, sizeof(*fresh));

        if (fresh == NULL)
        {
            ERR("could not allocate critical section\n");
            ExitProcess(1);
        }

        InitializeCriticalSection(fresh);
        fresh->DebugInfo->Spare[0] = (DWORD_PTR)(__FILE__ ": critsect");

        if (interlocked_cmpxchg_ptr((void **)critsect, fresh, NULL) != NULL)
        {
            /* someone beat us to it */
            fresh->DebugInfo->Spare[0] = 0;
            DeleteCriticalSection(fresh);
            HeapFree(GetProcessHeap(), 0, fresh);
        }
    }

    EnterCriticalSection(*critsect);
}
|
|
|
|
/* Leave a critical section that was entered with _vcomp_enter_critsect(). */
void CDECL _vcomp_leave_critsect(CRITICAL_SECTION *critsect)
{
    CRITICAL_SECTION *held = critsect;

    TRACE("(%p)\n", held);
    LeaveCriticalSection(held);
}
|
|
|
|
/*
 * DLL entry point.
 *
 * DLL_WINE_PREATTACH   returns FALSE so that a native version is preferred
 * DLL_PROCESS_ATTACH   allocates the TLS slot (fails the load if that is not
 *                      possible) and initialises the thread limits from the
 *                      processor count
 * DLL_PROCESS_DETACH   frees the calling thread's context and the TLS slot,
 *                      unless 'reserved' is set
 * DLL_THREAD_DETACH    frees the exiting thread's context
 */
BOOL WINAPI DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved)
{
    TRACE("(%p, %d, %p)\n", instance, reason, reserved);

    if (reason == DLL_WINE_PREATTACH)
        return FALSE;    /* prefer native version */

    if (reason == DLL_PROCESS_ATTACH)
    {
        SYSTEM_INFO sysinfo;

        vcomp_context_tls = TlsAlloc();
        if (vcomp_context_tls == TLS_OUT_OF_INDEXES)
        {
            ERR("Failed to allocate TLS index\n");
            return FALSE;
        }

        GetSystemInfo(&sysinfo);
        vcomp_module      = instance;
        vcomp_max_threads = sysinfo.dwNumberOfProcessors;
        vcomp_num_threads = sysinfo.dwNumberOfProcessors;
    }
    else if (reason == DLL_PROCESS_DETACH)
    {
        /* nothing is released when 'reserved' is non-NULL */
        if (!reserved && vcomp_context_tls != TLS_OUT_OF_INDEXES)
        {
            vcomp_free_thread_data();
            TlsFree(vcomp_context_tls);
        }
    }
    else if (reason == DLL_THREAD_DETACH)
    {
        vcomp_free_thread_data();
    }

    return TRUE;
}
|