diff --git a/libgomp/alloc_cache.h b/libgomp/alloc_cache.h
new file mode 100644
index 00000000000..782569c1fae
--- /dev/null
+++ b/libgomp/alloc_cache.h
@@ -0,0 +1,144 @@
+/* A simple allocation cache.
+ Copyright (C) 2026 Free Software Foundation, Inc.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _ALLOC_CACHE_H
+#define _ALLOC_CACHE_H
+
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+/* A single cached allocation. ALLOCATION and SIZE are immutable after
+ creation; NEXT is immutable once the node has been published in a cache.
+ LOCK marks the node as taken while held. */
+struct alloc_cache_node
+{
+ /* When taken, someone is using this node, and we can't. */
+ pthread_mutex_t lock;
+ /* The cached allocation and its usable size in bytes. */
+ void *allocation;
+ size_t size;
+
+ /* Next node in the cache's singly-linked stack. */
+ struct alloc_cache_node *next;
+};
+
+/* A cache of allocations, kept as a lock-free stack of nodes. Nodes are
+ pushed with a CAS loop and never removed while the cache is in use. */
+struct alloc_cache
+{
+ /* Top of the stack of cached allocations. */
+ _Atomic (struct alloc_cache_node *) head;
+
+ /* Could be made better by breaking it up into buckets eventually. Our
+ current allocation pattern is such that most accesses are likely to only
+ ever use the smallest practical allocation, so there isn't much gain in
+ implementing buckets currently.
+
+ Currently, as it is used, this cache will likely be of size O(T) where T
+ is the max number of concurrently executing kernels during the lifetime of
+ the process. I suspect this value is low, so even with a single bucket,
+ it is likely fast enough to search through. */
+};
+
+/* Prepare CACHE for use, initializing it as empty. Must be called before
+ CACHE becomes visible to any other thread (atomic_init performs no
+ synchronization). */
+static inline void
+init_alloc_cache (struct alloc_cache *cache)
+{
+ atomic_init (&cache->head, NULL);
+}
+
+/* Search through CACHE, looking for a non-taken node large enough to fit
+ DESIRED_SIZE bytes. On success, the returned node is taken on behalf of
+ the caller; release it with release_alloc_cache_node. Returns NULL if no
+ such node is currently available. */
+static inline struct alloc_cache_node *
+alloc_cache_try_find (struct alloc_cache *cache, size_t desired_size)
+{
+ /* The acquire load synchronizes with the release CAS in
+ alloc_cache_add_taken_node, making the fields of newly pushed nodes
+ visible here; a relaxed load would be a data race on those fields. */
+ for (struct alloc_cache_node *node =
+ atomic_load_explicit (&cache->head, memory_order_acquire);
+ node;
+ node = node->next)
+ {
+ if (node->size < desired_size)
+ continue;
+
+ int ret;
+ if ((ret = pthread_mutex_trylock (&node->lock)) == EBUSY)
+ continue;
+ assert (ret == 0);
+
+ /* It worked! We found a node that's large enough and free. */
+ return node;
+ }
+
+ return NULL;
+}
+
+/* Add a new node for allocation ALLOCATION of SIZE bytes into CACHE. The
+ new node is taken (locked) on return. Returns NULL if the node itself
+ could not be allocated; ALLOCATION is untouched in that case and remains
+ owned by the caller. */
+static inline struct alloc_cache_node *
+alloc_cache_add_taken_node (struct alloc_cache *cache,
+ void *allocation,
+ size_t size)
+{
+ struct alloc_cache_node *new_node = malloc (sizeof (*new_node));
+
+ if (!new_node)
+ return NULL;
+
+ *new_node = (struct alloc_cache_node) {
+ .allocation = allocation,
+ .size = size,
+ .next = NULL
+ };
+ /* PTHREAD_MUTEX_INITIALIZER is only specified for statically allocated
+ mutexes; this node lives on the heap, so initialize dynamically. */
+ pthread_mutex_init (&new_node->lock, NULL);
+ pthread_mutex_lock (&new_node->lock);
+
+ /* Place it on the top of the stack. The release on the successful CAS
+ publishes the node's fields to readers that acquire-load the head. */
+ struct alloc_cache_node *top = (atomic_load_explicit
+ (&cache->head, memory_order_acquire));
+
+ do new_node->next = top;
+ while (!atomic_compare_exchange_weak_explicit
+ (&cache->head, &top, new_node,
+ memory_order_acq_rel, memory_order_acquire));
+
+ return new_node;
+}
+
+/* Allow NODE to be used by other users of its cache. The caller must
+ currently hold NODE (i.e. it was returned by alloc_cache_try_find or
+ alloc_cache_add_taken_node and not yet released). */
+static inline void
+release_alloc_cache_node (struct alloc_cache_node *node)
+{
+ pthread_mutex_unlock (&node->lock);
+}
+
+/* Destroy NODE. Caller is responsible for cleaning up the allocation inside
+ of NODE, and for making sure that it is not part of any cache that is going
+ to be used in the future. NODE must not be taken: destroying a locked
+ mutex is undefined behavior. */
+static inline void
+destroy_alloc_cache_node (struct alloc_cache_node *node)
+{
+ pthread_mutex_destroy (&node->lock);
+ free (node);
+}
+
+
+#endif /* _ALLOC_CACHE_H */
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 3f0577fa82e..32f573f1b7f 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -41,6 +41,7 @@
#include
#include
#include
+#include "alloc_cache.h"
#define _LIBGOMP_PLUGIN_INCLUDE 1
#include "libgomp-plugin.h"
#undef _LIBGOMP_PLUGIN_INCLUDE
@@ -281,8 +282,9 @@ struct kernel_dispatch
struct agent_info *agent;
/* Pointer to a command queue associated with a kernel dispatch agent. */
void *queue;
- /* Pointer to a memory space used for kernel arguments passing. */
- void *kernarg_address;
+ /* Pointer to a memory space used for kernel arguments passing, wrapped in a
+ node from the agent kernel argument cache. */
+ struct alloc_cache_node *kernarg_cache_node;
/* Kernel object. */
uint64_t object;
/* Synchronization signal used for dispatch synchronization. */
@@ -472,6 +474,10 @@ struct agent_info
/* The HSA memory region from which to allocate kernel arguments. */
hsa_region_t kernarg_region;
+ /* A stack of allocations in kernarg_region of (sizeof (struct kernargs))
+ size each, used for ammortizing kernel argument allocation cost. */
+ struct alloc_cache kernarg_cache;
+
/* The HSA memory region from which to allocate device data. */
hsa_region_t data_region;
@@ -1082,7 +1088,7 @@ dump_executable_symbols (hsa_executable_t executable)
static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
- struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
+ struct kernargs *kernargs = dispatch->kernarg_cache_node->allocation;
fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
@@ -2004,6 +2010,34 @@ alloc_by_agent (struct agent_info *agent, size_t size)
return ptr;
}
+/* Get a cached kernargs allocation of at least SIZE bytes from AGENT,
+ returning an existing one if any are available. Returns an
+ alloc_cache_node whose value is this allocation; the node is taken and
+ must be handed back with release_alloc_cache_node. Fatal on allocation
+ failure. */
+
+static struct alloc_cache_node *
+alloc_kernargs_on_agent (struct agent_info *agent, size_t size)
+{
+ struct alloc_cache_node *ka_node = (alloc_cache_try_find
+ (&agent->kernarg_cache, size));
+
+ /* The cache was empty. */
+ if (!ka_node)
+ {
+ void *ka_addr;
+ /* Allocate SIZE bytes, matching the size recorded in the cache node
+ below; a fixed sizeof (struct kernargs) would under-allocate if a
+ caller ever asked for more. */
+ hsa_status_t status = hsa_fns.hsa_memory_allocate_fn
+ (agent->kernarg_region, size, &ka_addr);
+ if (status != HSA_STATUS_SUCCESS)
+ hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
+
+ ka_node = alloc_cache_add_taken_node (&agent->kernarg_cache,
+ ka_addr,
+ size);
+ if (!ka_node)
+ GOMP_PLUGIN_fatal ("Could not allocate cache node for kernel arguments");
+ }
+
+ return ka_node;
+}
+
/* Create kernel dispatch data structure for given KERNEL, along with
the necessary device signals and memory allocations. */
@@ -2054,12 +2088,10 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
return NULL;
}
- status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
- sizeof (struct kernargs),
- &shadow->kernarg_address);
- if (status != HSA_STATUS_SUCCESS)
- hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
- struct kernargs *kernargs = shadow->kernarg_address;
+ /* Get an allocation, if possible from the cache. */
+ shadow->kernarg_cache_node = (alloc_kernargs_on_agent
+ (agent, sizeof (struct kernargs)));
+ struct kernargs *kernargs = shadow->kernarg_cache_node->allocation;
/* Zero-initialize the output_data (minimum needed). */
kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
@@ -2158,13 +2190,13 @@ release_kernel_dispatch (struct kernel_dispatch *shadow)
{
GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
- struct kernargs *kernargs = shadow->kernarg_address;
+ struct kernargs *kernargs = shadow->kernarg_cache_node->allocation;
void *addr = (void *)kernargs->abi.arena_ptr;
if (!addr)
addr = (void *)kernargs->abi.stack_ptr;
release_ephemeral_memories (shadow->agent, addr);
- hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
+ release_alloc_cache_node (shadow->kernarg_cache_node);
hsa_signal_t s;
s.handle = shadow->signal;
@@ -2406,12 +2438,13 @@ run_kernel (struct kernel_info *kernel, void *vars,
packet->private_segment_size = shadow->private_segment_size;
packet->group_segment_size = shadow->group_segment_size;
packet->kernel_object = shadow->object;
- packet->kernarg_address = shadow->kernarg_address;
+ struct kernargs *kernargs = (packet->kernarg_address
+ = shadow->kernarg_cache_node->allocation);
hsa_signal_t s;
s.handle = shadow->signal;
packet->completion_signal = s;
hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
- memcpy (shadow->kernarg_address, &vars, sizeof (vars));
+ memcpy (kernargs, &vars, sizeof (vars));
GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
@@ -2437,11 +2470,10 @@ run_kernel (struct kernel_info *kernel, void *vars,
1000 * 1000,
HSA_WAIT_STATE_BLOCKED) != 0)
{
- console_output (kernel, shadow->kernarg_address, false);
+ console_output (kernel, kernargs, false);
}
- console_output (kernel, shadow->kernarg_address, true);
+ console_output (kernel, kernargs, true);
- struct kernargs *kernargs = shadow->kernarg_address;
unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
release_kernel_dispatch (shadow);
@@ -3766,6 +3798,9 @@ GOMP_OFFLOAD_init_device (int n)
GCN_DEBUG ("Selected device data memory region:\n");
dump_hsa_region (agent->data_region, NULL);
+ /* Prepare kernargs cache. */
+ init_alloc_cache (&agent->kernarg_cache);
+
GCN_DEBUG ("GCN agent %d initialized\n", n);
agent->initialized = true;
@@ -4183,6 +4218,17 @@ GOMP_OFFLOAD_fini_device (int n)
if (status != HSA_STATUS_SUCCESS)
return hsa_error ("Error destroying command queue", status);
+ /* Clean up kernargs cache. */
+ struct alloc_cache_node *node = agent->kernarg_cache.head;
+ while (node)
+ {
+ hsa_fns.hsa_memory_free_fn (node->allocation);
+
+ struct alloc_cache_node *curr_node = node;
+ node = curr_node->next;
+ destroy_alloc_cache_node (curr_node);
+ }
+
if (pthread_mutex_destroy (&agent->prog_mutex))
{
GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
diff --git a/libgomp/testsuite/libgomp.c/alloc_cache-1.c b/libgomp/testsuite/libgomp.c/alloc_cache-1.c
new file mode 100644
index 00000000000..b71368cba85
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc_cache-1.c
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* Unit-test the alloc cache DS. */
+#include <assert.h>
+#include <stdint.h>
+
+#include "alloc_cache.h"
+
+int
+main ()
+{
+ struct alloc_cache cache;
+
+ init_alloc_cache (&cache);
+
+ /* Empty cache. Should return NULL. */
+ assert (alloc_cache_try_find (&cache, 16) == NULL);
+ assert (alloc_cache_try_find (&cache, 0) == NULL);
+
+ /* Populating it a bit. */
+ {
+ for (int i = 0; i < 5; i++)
+ {
+ uintptr_t x = 1 << i;
+ __auto_type n = alloc_cache_add_taken_node (&cache, (void *) x, x);
+ assert (n);
+ assert (n->allocation == (void *)x);
+ release_alloc_cache_node (n);
+ }
+ }
+
+ /* Taking five things, each of size 1, should return the whole cache. */
+ {
+ struct alloc_cache_node *n[5];
+ uint32_t gotten_nodes = 0;
+ for (int i = 0; i < 5; i++)
+ {
+ __auto_type node = n[i] = alloc_cache_try_find (&cache, 1);
+ /* Check for NULL before dereferencing; a failed lookup must not
+ crash the test with a null dereference. */
+ assert (node != NULL);
+ uintptr_t x = (uintptr_t) node->allocation;
+ gotten_nodes |= x;
+ assert (x == 1
+ || x == 2
+ || x == 4
+ || x == 8
+ || x == 16);
+ }
+ assert (gotten_nodes == 0b11111);
+
+ /* ... and the cache should remain empty. */
+ assert (alloc_cache_try_find (&cache, 0) == NULL);
+
+ for (int i = 0; i < 5; i++)
+ release_alloc_cache_node (n[i]);
+ }
+
+ /* Taking 16 twice should fail the second time. */
+ {
+ __auto_type n = alloc_cache_try_find (&cache, 16);
+ assert (n != NULL && ((uintptr_t) n->allocation) == 16);
+ assert (alloc_cache_try_find (&cache, 16) == NULL);
+ release_alloc_cache_node (n);
+ }
+}