mirror of
https://github.com/gcc-mirror/gcc.git
synced 2026-05-06 14:59:39 +02:00
amdgcn: Enable OpenACC worker partitioning for AMD GCN
gcc/ * config/gcn/gcn.c (gcn_init_builtins): Override decls for BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START, BUILT_IN_GOACC_SINGLE_COPY_END and BUILT_IN_GOACC_BARRIER. (gcn_goacc_validate_dims): Turn on worker partitioning unconditionally. (gcn_fork_join): Update comment. * config/gcn/gcn.opt (flag_worker_partitioning): Remove. (macc_experimental_workers): Remove unused option. libgomp/ * plugin/plugin-gcn.c (gcn_exec): Change default number of workers to 16. * testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c [acc_device_radeon]: Update. * testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c [ACC_DEVICE_TYPE_radeon]: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c [acc_device_radeon]: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c [ACC_DEVICE_TYPE_radeon]: Likewise. * testsuite/libgomp.oacc-fortran/optional-reduction.f90: XFAIL for 'openacc_radeon_accel_selected' and '-O0'. * testsuite/libgomp.oacc-fortran/reduction-7.f90: Likewise. Co-Authored-By: Kwok Cheung Yeung <kcy@codesourcery.com> Co-Authored-By: Thomas Schwinge <thomas@codesourcery.com>
This commit is contained in:
committed by
Thomas Schwinge
parent
e2a58ed6dc
commit
c408512e1f
@@ -3712,8 +3712,6 @@ gcn_init_builtins (void)
|
||||
TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
|
||||
}
|
||||
|
||||
/* FIXME: remove the ifdef once OpenACC support is merged upstream. */
|
||||
#ifdef BUILT_IN_GOACC_SINGLE_START
|
||||
/* These builtins need to take/return an LDS pointer: override the generic
|
||||
versions here. */
|
||||
|
||||
@@ -3730,7 +3728,6 @@ gcn_init_builtins (void)
|
||||
|
||||
set_builtin_decl (BUILT_IN_GOACC_BARRIER,
|
||||
gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Implement TARGET_INIT_LIBFUNCS. */
|
||||
@@ -5019,11 +5016,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
|
||||
unsigned /*used*/)
|
||||
{
|
||||
bool changed = false;
|
||||
|
||||
/* FIXME: remove -facc-experimental-workers when they're ready. */
|
||||
int max_workers = flag_worker_partitioning ? 16 : 1;
|
||||
|
||||
gcc_assert (!flag_worker_partitioning);
|
||||
const int max_workers = 16;
|
||||
|
||||
/* The vector size must appear to be 64, to the user, unless this is a
|
||||
SEQ routine. The real, internal value is always 1, which means use
|
||||
@@ -5060,8 +5053,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
|
||||
{
|
||||
dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
|
||||
if (dims[GOMP_DIM_WORKER] < 0)
|
||||
dims[GOMP_DIM_WORKER] = (flag_worker_partitioning
|
||||
? GCN_DEFAULT_WORKERS : 1);
|
||||
dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
|
||||
if (dims[GOMP_DIM_GANG] < 0)
|
||||
dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
|
||||
changed = true;
|
||||
@@ -5126,8 +5118,7 @@ static bool
|
||||
gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
|
||||
bool ARG_UNUSED (is_fork))
|
||||
{
|
||||
/* GCN does not use the fork/join concept invented for NVPTX.
|
||||
Instead we use standard autovectorization. */
|
||||
/* GCN does not need to expand fork/join markers at the RTL level. */
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -62,11 +62,6 @@ bool flag_bypass_init_error = false
|
||||
mbypass-init-error
|
||||
Target RejectNegative Var(flag_bypass_init_error)
|
||||
|
||||
bool flag_worker_partitioning = false
|
||||
|
||||
macc-experimental-workers
|
||||
Target Var(flag_worker_partitioning) Init(0)
|
||||
|
||||
int stack_size_opt = -1
|
||||
|
||||
mstack-size=
|
||||
|
||||
@@ -3038,8 +3038,7 @@ gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
|
||||
64 gangs matches a typical Fiji device. */
|
||||
|
||||
if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */
|
||||
/* NOTE: Until support for middle-end worker partitioning is merged, force 'num_workers (1)'. */
|
||||
if (/*TODO dims[1] == 0*/ true) dims[1] = 1; /* Workers. */
|
||||
if (dims[1] == 0) dims[1] = 16; /* Workers. */
|
||||
|
||||
/* The incoming dimensions are expressed in terms of gangs, workers, and
|
||||
vectors. The HSA dimensions are expressed in terms of "work-items",
|
||||
|
||||
@@ -93,9 +93,6 @@ static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *e
|
||||
}
|
||||
if (num_workers < 1)
|
||||
assert (event_info->launch_event.num_workers >= 1);
|
||||
/* GCN currently enforces 'num_workers (1)'. */
|
||||
else if (acc_device_type == acc_device_radeon)
|
||||
assert (event_info->launch_event.num_workers == 1);
|
||||
else
|
||||
{
|
||||
#ifdef __OPTIMIZE__
|
||||
|
||||
@@ -132,9 +132,7 @@ int main ()
|
||||
/* AMD GCN uses the autovectorizer for the vector dimension: the use
|
||||
of a function call in vector-partitioned code in this test is not
|
||||
currently supported. */
|
||||
/* AMD GCN does not currently support multiple workers. This should be
|
||||
set to 16 when that changes. */
|
||||
return test_1 (16, 1, 1);
|
||||
return test_1 (16, 16, 1);
|
||||
#else
|
||||
return test_1 (16, 16, 32);
|
||||
#endif
|
||||
|
||||
@@ -261,9 +261,8 @@ int main ()
|
||||
}
|
||||
else if (acc_on_device (acc_device_radeon))
|
||||
{
|
||||
/* The GCC GCN back end is limited to num_workers (16).
|
||||
Temporarily set this to 1 until multiple workers are permitted. */
|
||||
workers_actual = 1; // 16;
|
||||
/* The GCC GCN back end is limited to num_workers (16). */
|
||||
workers_actual = 16;
|
||||
}
|
||||
else
|
||||
__builtin_abort ();
|
||||
@@ -313,9 +312,8 @@ int main ()
|
||||
}
|
||||
else if (acc_on_device (acc_device_radeon))
|
||||
{
|
||||
/* The GCC GCN back end is limited to num_workers (16).
|
||||
Temporarily set this to 1 until multiple workers are permitted. */
|
||||
workers_actual = 1; // 16;
|
||||
/* The GCC GCN back end is limited to num_workers (16). */
|
||||
workers_actual = 16;
|
||||
}
|
||||
else
|
||||
__builtin_abort ();
|
||||
@@ -465,8 +463,6 @@ int main ()
|
||||
}
|
||||
else if (acc_on_device (acc_device_radeon))
|
||||
{
|
||||
/* Temporary setting, until multiple workers are permitted. */
|
||||
workers_actual = 1;
|
||||
/* See above comments about GCN vectors_actual. */
|
||||
vectors_actual = 1;
|
||||
}
|
||||
|
||||
@@ -5,12 +5,13 @@
|
||||
#include <openacc.h>
|
||||
#include <gomp-constants.h>
|
||||
|
||||
#define NUM_WORKERS 16
|
||||
#ifdef ACC_DEVICE_TYPE_radeon
|
||||
/* Temporarily set this to 1 until multiple workers are permitted. */
|
||||
#define NUM_WORKERS 1
|
||||
/* AMD GCN uses the autovectorizer for the vector dimension: the use
|
||||
of a function call in vector-partitioned code in this test is not
|
||||
currently supported. */
|
||||
#define NUM_VECTORS 1
|
||||
#else
|
||||
#define NUM_WORKERS 16
|
||||
#define NUM_VECTORS 32
|
||||
#endif
|
||||
#define WIDTH 64
|
||||
|
||||
@@ -4,6 +4,9 @@
|
||||
|
||||
! { dg-do run }
|
||||
|
||||
!TODO
|
||||
! { dg-xfail-run-if TODO { openacc_radeon_accel_selected && { ! __OPTIMIZE__ } } }
|
||||
|
||||
program optional_reduction
|
||||
implicit none
|
||||
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
! { dg-do run }
|
||||
|
||||
!TODO
|
||||
! { dg-xfail-run-if TODO { openacc_radeon_accel_selected && { ! __OPTIMIZE__ } } }
|
||||
|
||||
! subroutine reduction with private and firstprivate variables
|
||||
|
||||
program reduction
|
||||
|
||||
Reference in New Issue
Block a user