diff --git a/libgomp/alloc_cache.h b/libgomp/alloc_cache.h
new file mode 100644
index 00000000000..782569c1fae
--- /dev/null
+++ b/libgomp/alloc_cache.h
@@ -0,0 +1,144 @@
+/* A simple allocation cache.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _ALLOC_CACHE_H
+#define _ALLOC_CACHE_H
+
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+/* A single cached allocation.  All fields immutable.  */
+struct alloc_cache_node
+{
+  /* When taken, someone is using this node, and we can't.  */
+  pthread_mutex_t lock;
+  void *allocation;
+  size_t size;
+
+  struct alloc_cache_node *next;
+};
+
+struct alloc_cache
+{
+  _Atomic (struct alloc_cache_node *) head;
+
+  /* Could be made better by breaking it up into buckets eventually.  Our
+     current allocation pattern is such that most accesses are likely to only
+     ever use the smallest practical allocation, so there isn't much gain in
+     implementing buckets currently.
+
+     Currently, as it is used, this cache will likely be of size O(T) where T
+     is the max number of concurrently executing kernels during the lifetime of
+     the process.  I suspect this value is low, so even with a single bucket,
+     it is likely fast enough to search through.  */
+};
+
+/* Prepare CACHE for use, initializing it as empty.  */
+static inline void
+init_alloc_cache (struct alloc_cache *cache)
+{
+  atomic_init (&cache->head, NULL);
+}
+
+/* Search through CACHE, looking for a non-taken node large enough to fit
+   DESIRED_SIZE bytes.  Returns NULL if no such node exists.  */
+static inline struct alloc_cache_node *
+alloc_cache_try_find (struct alloc_cache *cache, size_t desired_size)
+{
+  for (struct alloc_cache_node *node =
+	 atomic_load_explicit (&cache->head, memory_order_acquire);
+       node;
+       node = node->next)
+    {
+      if (node->size < desired_size)
+	continue;
+
+      int ret;
+      if ((ret = pthread_mutex_trylock (&node->lock)) == EBUSY)
+	continue;
+      assert (ret == 0);
+
+      /* It worked!  We found a node that's large enough and free.  */
+      return node;
+    }
+
+  return NULL;
+}
+
+/* Add a new node for allocation ALLOCATION of SIZE bytes into the cache.  The
+   new node is acquired on return.  */
+static inline struct alloc_cache_node *
+alloc_cache_add_taken_node (struct alloc_cache *cache,
+			    void *allocation,
+			    size_t size)
+{
+  struct alloc_cache_node *new_node = malloc (sizeof (*new_node));
+
+  if (!new_node)
+    return NULL;
+
+  *new_node = (struct alloc_cache_node) {
+    .allocation = allocation,
+    .size = size,
+    .next = NULL
+  };
+  pthread_mutex_init (&new_node->lock, NULL);
+  pthread_mutex_lock (&new_node->lock);
+
+  /* Place it on the top of the stack.  */
+  struct alloc_cache_node *top = (atomic_load_explicit
+				  (&cache->head, memory_order_acquire));
+
+  do new_node->next = top;
+  while (!atomic_compare_exchange_weak_explicit
+	 (&cache->head, &top, new_node,
+	  memory_order_acq_rel, memory_order_acquire));
+
+  return new_node;
+}
+
+/* Allow NODE to be used by other users of its cache.  */
+static inline void
+release_alloc_cache_node (struct alloc_cache_node *node)
+{
+  pthread_mutex_unlock (&node->lock);
+}
+
+/* Destroy NODE.  Caller is responsible for cleaning up the allocation inside
+   of NODE, and for making sure that it is not part of any cache that is going
+   to be used in the future.  */
+static inline void
+destroy_alloc_cache_node (struct alloc_cache_node *node)
+{
+  pthread_mutex_destroy (&node->lock);
+  free (node);
+}
+
+
+#endif /* _ALLOC_CACHE_H */
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 3f0577fa82e..32f573f1b7f 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -41,6 +41,7 @@
 #include <hsa_ext_amd.h>
 #include <dlfcn.h>
 #include <signal.h>
+#include "alloc_cache.h"
 #define _LIBGOMP_PLUGIN_INCLUDE 1
 #include "libgomp-plugin.h"
 #undef _LIBGOMP_PLUGIN_INCLUDE
@@ -281,8 +282,9 @@ struct kernel_dispatch
   struct agent_info *agent;
   /* Pointer to a command queue associated with a kernel dispatch agent.  */
   void *queue;
-  /* Pointer to a memory space used for kernel arguments passing.  */
-  void *kernarg_address;
+  /* Pointer to a memory space used for kernel arguments passing, wrapped in a
+     node from the agent kernel argument cache.  */
+  struct alloc_cache_node *kernarg_cache_node;
   /* Kernel object.  */
   uint64_t object;
   /* Synchronization signal used for dispatch synchronization.  */
@@ -472,6 +474,10 @@ struct agent_info
   /* The HSA memory region from which to allocate kernel arguments.  */
   hsa_region_t kernarg_region;
 
+  /* A stack of allocations in kernarg_region of (sizeof (struct kernargs))
+     size each, used for amortizing kernel argument allocation cost.  */
+  struct alloc_cache kernarg_cache;
+
   /* The HSA memory region from which to allocate device data.  */
   hsa_region_t data_region;
 
@@ -1082,7 +1088,7 @@ dump_executable_symbols (hsa_executable_t executable)
 static void
 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
 {
-  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
+  struct kernargs *kernargs = dispatch->kernarg_cache_node->allocation;
 
   fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
   fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
@@ -2004,6 +2010,34 @@ alloc_by_agent (struct agent_info *agent, size_t size)
   return ptr;
 }
 
+/* Get a cached kernargs from AGENT, returning an existing one if any are
+   available.  Returns an alloc_cache_node whose value is this allocation.  */
+
+static struct alloc_cache_node *
+alloc_kernargs_on_agent (struct agent_info *agent, size_t size)
+{
+  struct alloc_cache_node *ka_node = (alloc_cache_try_find
+				      (&agent->kernarg_cache, size));
+
+  /* The cache was empty.  */
+  if (!ka_node)
+    {
+      void *ka_addr;
+      hsa_status_t status = hsa_fns.hsa_memory_allocate_fn
+	(agent->kernarg_region, size, &ka_addr);
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
+
+      ka_node = alloc_cache_add_taken_node (&agent->kernarg_cache,
+					    ka_addr,
+					    size);
+      if (!ka_node)
+	GOMP_PLUGIN_fatal ("Could not allocate cache node for kernel arguments");
+    }
+
+  return ka_node;
+}
+
 /* Create kernel dispatch data structure for given KERNEL, along with
    the necessary device signals and memory allocations.  */
 
@@ -2054,12 +2088,10 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
       return NULL;
     }
 
-  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
-					   sizeof (struct kernargs),
-					   &shadow->kernarg_address);
-  if (status != HSA_STATUS_SUCCESS)
-    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
-  struct kernargs *kernargs = shadow->kernarg_address;
+  /* Get an allocation, if possible from the cache.  */
+  shadow->kernarg_cache_node = (alloc_kernargs_on_agent
+				(agent, sizeof (struct kernargs)));
+  struct kernargs *kernargs = shadow->kernarg_cache_node->allocation;
 
   /* Zero-initialize the output_data (minimum needed).  */
   kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
@@ -2158,13 +2190,13 @@ release_kernel_dispatch (struct kernel_dispatch *shadow)
 {
   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
 
-  struct kernargs *kernargs = shadow->kernarg_address;
+  struct kernargs *kernargs = shadow->kernarg_cache_node->allocation;
   void *addr = (void *)kernargs->abi.arena_ptr;
   if (!addr)
     addr = (void *)kernargs->abi.stack_ptr;
   release_ephemeral_memories (shadow->agent, addr);
 
-  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
+  release_alloc_cache_node (shadow->kernarg_cache_node);
 
   hsa_signal_t s;
   s.handle = shadow->signal;
@@ -2406,12 +2438,13 @@ run_kernel (struct kernel_info *kernel, void *vars,
   packet->private_segment_size = shadow->private_segment_size;
   packet->group_segment_size = shadow->group_segment_size;
   packet->kernel_object = shadow->object;
-  packet->kernarg_address = shadow->kernarg_address;
+  struct kernargs *kernargs = (packet->kernarg_address
+			       = shadow->kernarg_cache_node->allocation);
   hsa_signal_t s;
   s.handle = shadow->signal;
   packet->completion_signal = s;
   hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
-  memcpy (shadow->kernarg_address, &vars, sizeof (vars));
+  memcpy (kernargs, &vars, sizeof (vars));
 
   GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
 
@@ -2437,11 +2470,10 @@ run_kernel (struct kernel_info *kernel, void *vars,
 				 1000 * 1000, HSA_WAIT_STATE_BLOCKED) != 0)
     {
-      console_output (kernel, shadow->kernarg_address, false);
+      console_output (kernel, kernargs, false);
     }
-  console_output (kernel, shadow->kernarg_address, true);
+  console_output (kernel, kernargs, true);
 
-  struct kernargs *kernargs = shadow->kernarg_address;
   unsigned int return_value =
     (unsigned int)kernargs->output_data.return_value;
 
   release_kernel_dispatch (shadow);
@@ -3766,6 +3798,9 @@ GOMP_OFFLOAD_init_device (int n)
   GCN_DEBUG ("Selected device data memory region:\n");
   dump_hsa_region (agent->data_region, NULL);
 
+  /* Prepare kernargs cache.  */
+  init_alloc_cache (&agent->kernarg_cache);
+
   GCN_DEBUG ("GCN agent %d initialized\n", n);
 
   agent->initialized = true;
@@ -4183,6 +4218,17 @@ GOMP_OFFLOAD_fini_device (int n)
   if (status != HSA_STATUS_SUCCESS)
     return hsa_error ("Error destroying command queue", status);
 
+  /* Clean up kernargs cache.  */
+  struct alloc_cache_node *node = agent->kernarg_cache.head;
+  while (node)
+    {
+      hsa_fns.hsa_memory_free_fn (node->allocation);
+
+      struct alloc_cache_node *curr_node = node;
+      node = curr_node->next;
+      destroy_alloc_cache_node (curr_node);
+    }
+
   if (pthread_mutex_destroy (&agent->prog_mutex))
     {
       GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
diff --git a/libgomp/testsuite/libgomp.c/alloc_cache-1.c b/libgomp/testsuite/libgomp.c/alloc_cache-1.c
new file mode 100644
index 00000000000..b71368cba85
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc_cache-1.c
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* Unit-test the alloc cache DS.  */
+#include <assert.h>
+#include <stdint.h>
+
+#include "alloc_cache.h"
+
+int
+main ()
+{
+  struct alloc_cache cache;
+
+  init_alloc_cache (&cache);
+
+  /* Empty cache.  Should return NULL.  */
+  assert (alloc_cache_try_find (&cache, 16) == NULL);
+  assert (alloc_cache_try_find (&cache, 0) == NULL);
+
+  /* Populating it a bit.  */
+  {
+    for (int i = 0; i < 5; i++)
+      {
+	uintptr_t x = 1 << i;
+	__auto_type n = alloc_cache_add_taken_node (&cache, (void *) x, x);
+	assert (n);
+	assert (n->allocation == (void *)x);
+	release_alloc_cache_node (n);
+      }
+  }
+
+  /* Taking five things, each of size 1, should return the whole cache.  */
+  {
+    struct alloc_cache_node *n[5];
+    uint32_t gotten_nodes = 0;
+    for (int i = 0; i < 5; i++)
+      {
+	__auto_type node = n[i] = alloc_cache_try_find (&cache, 1);
+	uintptr_t x = (uintptr_t) node->allocation;
+	gotten_nodes |= x;
+	assert (x == 1
+		|| x == 2
+		|| x == 4
+		|| x == 8
+		|| x == 16);
+      }
+    assert (gotten_nodes == 0b11111);
+
+    /* ... and the cache should remain empty.  */
+    assert (alloc_cache_try_find (&cache, 0) == NULL);
+
+    for (int i = 0; i < 5; i++)
+      release_alloc_cache_node (n[i]);
+  }
+
+  /* Taking 16 twice should fail the second time.  */
+  {
+    __auto_type n = alloc_cache_try_find (&cache, 16);
+    assert (n != NULL && ((uintptr_t) n->allocation) == 16);
+    assert (alloc_cache_try_find (&cache, 16) == NULL);
+    release_alloc_cache_node (n);
+  }
+}