diff --git a/libgomp/alloc_cache.h b/libgomp/alloc_cache.h
new file mode 100644
index 00000000000..782569c1fae
--- /dev/null
+++ b/libgomp/alloc_cache.h
@@ -0,0 +1,144 @@
+/* A simple allocation cache.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _ALLOC_CACHE_H
+#define _ALLOC_CACHE_H
+
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+/* A single cached allocation.  All fields immutable.  */
+struct alloc_cache_node
+{
+  /* When taken, someone is using this node, and we can't.  */
+  pthread_mutex_t lock;
+  void *allocation;
+  size_t size;
+
+  struct alloc_cache_node *next;
+};
+
+struct alloc_cache
+{
+  _Atomic (struct alloc_cache_node *) head;
+
+  /* Could be made better by breaking it up into buckets eventually.  Our
+     current allocation pattern is such that most accesses are likely to only
+     ever use the smallest practical allocation, so there isn't much gain in
+     implementing buckets currently.
+
+     Currently, as it is used, this cache will likely be of size O(T) where T
+     is the max number of concurrently executing kernels during the lifetime of
+     the process.  I suspect this value is low, so even with a single bucket,
+     it is likely fast enough to search through.  */
+};
+
+/* Prepare CACHE for use, initializing it as empty.  */
+static inline void
+init_alloc_cache (struct alloc_cache *cache)
+{
+  atomic_init (&cache->head, NULL);
+}
+
+/* Search through CACHE, looking for a non-taken node large enough to fit
+   DESIRED_SIZE bytes.  Returns NULL if no such node exists.  */
+static inline struct alloc_cache_node *
+alloc_cache_try_find (struct alloc_cache *cache, size_t desired_size)
+{
+  for (struct alloc_cache_node *node =
+	 atomic_load_explicit (&cache->head, memory_order_acquire);
+       node;
+       node = node->next)
+    {
+      if (node->size < desired_size)
+	continue;
+
+      int ret;
+      if ((ret = pthread_mutex_trylock (&node->lock)) == EBUSY)
+	continue;
+      assert (ret == 0);
+
+      /* It worked!  We found a node that's large enough and free.  */
+      return node;
+    }
+
+  return NULL;
+}
+
+/* Add a new node for allocation ALLOCATION of SIZE bytes into the cache.  The
+   new node is acquired on return.  */
+static inline struct alloc_cache_node *
+alloc_cache_add_taken_node (struct alloc_cache *cache,
+			    void *allocation,
+			    size_t size)
+{
+  struct alloc_cache_node *new_node = malloc (sizeof (*new_node));
+
+  if (!new_node)
+    return NULL;
+
+  *new_node = (struct alloc_cache_node) {
+    .allocation = allocation,
+    .size = size,
+    .next = NULL
+  };
+  pthread_mutex_init (&new_node->lock, NULL);
+  pthread_mutex_lock (&new_node->lock);
+
+  /* Place it on the top of the stack.  */
+  struct alloc_cache_node *top = (atomic_load_explicit
+				  (&cache->head, memory_order_acquire));
+
+  do new_node->next = top;
+  while (!atomic_compare_exchange_weak_explicit
+	 (&cache->head, &top, new_node,
+	  memory_order_acq_rel, memory_order_acquire));
+
+  return new_node;
+}
+
+/* Allow NODE to be used by other users of its cache.  */
+static inline void
+release_alloc_cache_node (struct alloc_cache_node *node)
+{
+  pthread_mutex_unlock (&node->lock);
+}
+
+/* Destroy NODE.  Caller is responsible for cleaning up the allocation inside
+   of NODE, and for making sure that it is not part of any cache that is going
+   to be used in the future.  */
+static inline void
+destroy_alloc_cache_node (struct alloc_cache_node *node)
+{
+  pthread_mutex_destroy (&node->lock);
+  free (node);
+}
+
+
+#endif /* _ALLOC_CACHE_H */
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 3f0577fa82e..32f573f1b7f 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -41,6 +41,7 @@
 #include <hsa_ext_amd.h>
 #include <dlfcn.h>
 #include <signal.h>
+#include "alloc_cache.h"
 #define _LIBGOMP_PLUGIN_INCLUDE 1
 #include "libgomp-plugin.h"
 #undef _LIBGOMP_PLUGIN_INCLUDE
@@ -281,8 +282,9 @@ struct kernel_dispatch
   struct agent_info *agent;
   /* Pointer to a command queue associated with a kernel dispatch agent.  */
   void *queue;
-  /* Pointer to a memory space used for kernel arguments passing.  */
-  void *kernarg_address;
+  /* Pointer to a memory space used for kernel arguments passing, wrapped in a
+     node from the agent kernel argument cache.  */
+  struct alloc_cache_node *kernarg_cache_node;
   /* Kernel object.  */
   uint64_t object;
   /* Synchronization signal used for dispatch synchronization.  */
@@ -472,6 +474,10 @@ struct agent_info
   /* The HSA memory region from which to allocate kernel arguments.  */
   hsa_region_t kernarg_region;
 
+  /* A stack of allocations in kernarg_region of (sizeof (struct kernargs))
+     size each, used for amortizing kernel argument allocation cost.  */
+  struct alloc_cache kernarg_cache;
+
   /* The HSA memory region from which to allocate device data.  */
   hsa_region_t data_region;
 
@@ -1082,7 +1088,7 @@ dump_executable_symbols (hsa_executable_t executable)
 static void
 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
 {
-  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
+  struct kernargs *kernargs = dispatch->kernarg_cache_node->allocation;
 
   fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
   fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
@@ -2004,6 +2010,34 @@ alloc_by_agent (struct agent_info *agent, size_t size)
   return ptr;
 }
 
+/* Get a cached kernargs from AGENT, returning an existing one if any are
+   available.  Returns an alloc_cache_node whose value is this allocation.  */
+
+static struct alloc_cache_node *
+alloc_kernargs_on_agent (struct agent_info *agent, size_t size)
+{
+  struct alloc_cache_node *ka_node = (alloc_cache_try_find
+				      (&agent->kernarg_cache, size));
+
+  /* The cache was empty.  */
+  if (!ka_node)
+    {
+      void *ka_addr;
+      hsa_status_t status = hsa_fns.hsa_memory_allocate_fn
+	(agent->kernarg_region, size, &ka_addr);
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
+
+      ka_node = alloc_cache_add_taken_node (&agent->kernarg_cache,
+					    ka_addr,
+					    size);
+      if (!ka_node)
+	GOMP_PLUGIN_fatal ("Could not allocate cache node for kernel arguments");
+    }
+
+  return ka_node;
+}
+
 /* Create kernel dispatch data structure for given KERNEL, along with
    the necessary device signals and memory allocations.  */
 
@@ -2054,12 +2088,10 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
       return NULL;
     }
 
-  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
-					   sizeof (struct kernargs),
-					   &shadow->kernarg_address);
-  if (status != HSA_STATUS_SUCCESS)
-    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
-  struct kernargs *kernargs = shadow->kernarg_address;
+  /* Get an allocation, if possible from the cache.  */
+  shadow->kernarg_cache_node = (alloc_kernargs_on_agent
+				(agent, sizeof (struct kernargs)));
+  struct kernargs *kernargs = shadow->kernarg_cache_node->allocation;
 
   /* Zero-initialize the output_data (minimum needed).  */
   kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
@@ -2158,13 +2190,13 @@ release_kernel_dispatch (struct kernel_dispatch *shadow)
 {
   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
 
-  struct kernargs *kernargs = shadow->kernarg_address;
+  struct kernargs *kernargs = shadow->kernarg_cache_node->allocation;
   void *addr = (void *)kernargs->abi.arena_ptr;
   if (!addr)
     addr = (void *)kernargs->abi.stack_ptr;
   release_ephemeral_memories (shadow->agent, addr);
 
-  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
+  release_alloc_cache_node (shadow->kernarg_cache_node);
 
   hsa_signal_t s;
   s.handle = shadow->signal;
@@ -2406,12 +2438,13 @@ run_kernel (struct kernel_info *kernel, void *vars,
   packet->private_segment_size = shadow->private_segment_size;
   packet->group_segment_size = shadow->group_segment_size;
   packet->kernel_object = shadow->object;
-  packet->kernarg_address = shadow->kernarg_address;
+  struct kernargs *kernargs = (packet->kernarg_address
+			       = shadow->kernarg_cache_node->allocation);
   hsa_signal_t s;
   s.handle = shadow->signal;
   packet->completion_signal = s;
   hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
-  memcpy (shadow->kernarg_address, &vars, sizeof (vars));
+  memcpy (kernargs, &vars, sizeof (vars));
 
   GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
 
@@ -2437,11 +2470,10 @@ run_kernel (struct kernel_info *kernel, void *vars,
 				 1000 * 1000, HSA_WAIT_STATE_BLOCKED) != 0)
     {
-      console_output (kernel, shadow->kernarg_address, false);
+      console_output (kernel, kernargs, false);
     }
-  console_output (kernel, shadow->kernarg_address, true);
+  console_output (kernel, kernargs, true);
 
-  struct kernargs *kernargs = shadow->kernarg_address;
   unsigned int return_value =
     (unsigned int)kernargs->output_data.return_value;
 
   release_kernel_dispatch (shadow);
@@ -3766,6 +3798,9 @@ GOMP_OFFLOAD_init_device (int n)
   GCN_DEBUG ("Selected device data memory region:\n");
   dump_hsa_region (agent->data_region, NULL);
 
+  /* Prepare kernargs cache.  */
+  init_alloc_cache (&agent->kernarg_cache);
+
   GCN_DEBUG ("GCN agent %d initialized\n", n);
 
   agent->initialized = true;
@@ -4183,6 +4218,17 @@ GOMP_OFFLOAD_fini_device (int n)
   if (status != HSA_STATUS_SUCCESS)
     return hsa_error ("Error destroying command queue", status);
 
+  /* Clean up kernargs cache.  */
+  struct alloc_cache_node *node = agent->kernarg_cache.head;
+  while (node)
+    {
+      hsa_fns.hsa_memory_free_fn (node->allocation);
+
+      struct alloc_cache_node *curr_node = node;
+      node = curr_node->next;
+      destroy_alloc_cache_node (curr_node);
+    }
+
   if (pthread_mutex_destroy (&agent->prog_mutex))
     {
       GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
diff --git a/libgomp/testsuite/libgomp.c/alloc_cache-1.c b/libgomp/testsuite/libgomp.c/alloc_cache-1.c
new file mode 100644
index 00000000000..b71368cba85
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc_cache-1.c
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* Unit-test the alloc cache DS.  */
+#include <assert.h>
+#include <stdint.h>
+
+#include "alloc_cache.h"
+
+int
+main ()
+{
+  struct alloc_cache cache;
+
+  init_alloc_cache (&cache);
+
+  /* Empty cache.  Should return NULL.  */
+  assert (alloc_cache_try_find (&cache, 16) == NULL);
+  assert (alloc_cache_try_find (&cache, 0) == NULL);
+
+  /* Populating it a bit.  */
+  {
+    for (int i = 0; i < 5; i++)
+      {
+	uintptr_t x = 1 << i;
+	__auto_type n = alloc_cache_add_taken_node (&cache, (void *) x, x);
+	assert (n);
+	assert (n->allocation == (void *)x);
+	release_alloc_cache_node (n);
+      }
+  }
+
+  /* Taking five things, each of size 1, should return the whole cache.  */
+  {
+    struct alloc_cache_node *n[5];
+    uint32_t gotten_nodes = 0;
+    for (int i = 0; i < 5; i++)
+      {
+	__auto_type node = n[i] = alloc_cache_try_find (&cache, 1);
+	uintptr_t x = (uintptr_t) node->allocation;
+	gotten_nodes |= x;
+	assert (x == 1
+		|| x == 2
+		|| x == 4
+		|| x == 8
+		|| x == 16);
+      }
+    assert (gotten_nodes == 0b11111);
+
+    /* ... and the cache should remain empty.  */
+    assert (alloc_cache_try_find (&cache, 0) == NULL);
+
+    for (int i = 0; i < 5; i++)
+      release_alloc_cache_node (n[i]);
+  }
+
+  /* Taking 16 twice should fail the second time.  */
+  {
+    __auto_type n = alloc_cache_try_find (&cache, 16);
+    assert (n != NULL && ((uintptr_t) n->allocation) == 16);
+    assert (alloc_cache_try_find (&cache, 16) == NULL);
+    release_alloc_cache_node (n);
+  }
+}