diff --git a/libctru/source/internal.h b/libctru/source/internal.h index a3ab481..0d2d477 100644 --- a/libctru/source/internal.h +++ b/libctru/source/internal.h @@ -55,10 +55,6 @@ static inline ThreadVars* getThreadVars(void) void initThreadVars(struct Thread_tag *thread); -static inline size_t getThreadLocalStartOffset(size_t tls_tp) { - size_t align = 8; - if (__tdata_align > align) align = __tdata_align; - // ARM ELF TLS ABI mandates an 8-byte header, so we include an extra 8 bytes - // then add padding to align the .tdata properly - return (8 + (size_t)tls_tp + (__tdata_align - 1)) & ~(__tdata_align - 1); +static inline size_t alignTo(const size_t base, const size_t align) { + return (base + (align - 1)) & ~(align - 1); } diff --git a/libctru/source/system/syscalls.c b/libctru/source/system/syscalls.c index 393de1b..8921bb3 100644 --- a/libctru/source/system/syscalls.c +++ b/libctru/source/system/syscalls.c @@ -164,7 +164,7 @@ void initThreadVars(struct Thread_tag *thread) tv->thread_ptr = thread; #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" - tv->tls_tp = (thread != NULL ? (u8*)thread->stacktop : __tls_start); + tv->tls_tp = (thread != NULL ? (u8*)thread->stacktop : __tls_start) - 8; // Arm ELF TLS ABI mandates an 8-byte header #pragma GCC diagnostic pop tv->srv_blocking_policy = false; @@ -181,7 +181,7 @@ void __system_initSyscalls(void) // Initialize thread vars for the main thread initThreadVars(NULL); u32 tls_size = __tdata_lma_end - __tdata_lma; - size_t tdata_start = getThreadLocalStartOffset((size_t)__tls_start); + size_t tdata_start = alignTo((size_t)__tls_start, __tdata_align); if (tls_size) memcpy((void*)tdata_start, __tdata_lma, tls_size); } diff --git a/libctru/source/thread.c b/libctru/source/thread.c index b71b549..e052fd5 100644 --- a/libctru/source/thread.c +++ b/libctru/source/thread.c @@ -19,31 +19,42 @@ static void _thread_begin(void* arg) Thread threadCreate(ThreadFunc entrypoint, void* arg, size_t stack_size, int prio, int core_id, bool detached) { - size_t stackoffset = (sizeof(struct Thread_tag) + 7) & ~7; - size_t allocsize = getThreadLocalStartOffset(stackoffset + stack_size); + // The stack must be 8-aligned at minimum. + size_t align = __tdata_align > 8 ? __tdata_align : 8; + + size_t stackoffset = alignTo(sizeof(struct Thread_tag), align); + size_t allocsize = alignTo(stackoffset + stack_size, align); + size_t tlssize = __tls_end-__tls_start; size_t tlsloadsize = __tdata_lma_end-__tdata_lma; - size_t tbsssize = tlssize-tlsloadsize; + size_t tbsssize = tlssize - tlsloadsize; + + // memalign seems to have an implicit requirement that (size % align) == 0. + // Without this, it seems to return NULL whenever (align > 8). + size_t size = alignTo(allocsize + tlssize, align); // Guard against overflow if (allocsize < stackoffset) return NULL; - if ((allocsize-stackoffset) < stack_size) return NULL; - if ((allocsize+tlssize) < allocsize) return NULL; + if ((allocsize - stackoffset) < stack_size) return NULL; + if (size < allocsize) return NULL; - Thread t = (Thread)memalign(__tdata_align, allocsize + tlssize); + Thread t = (Thread)memalign(align, size); if (!t) return NULL; t->ep = entrypoint; t->arg = arg; t->detached = detached; t->finished = false; - t->stacktop = (u8*)t + stackoffset + stack_size; + t->stacktop = (u8*)t + allocsize; + + // ThreadVars.tls_tp must be aligned correctly, so we bump tdata_start to + // ensure that after subtracting 8 bytes for the TLS header, it will be aligned. + size_t tdata_start = 8 + alignTo((size_t)t->stacktop - 8, align); - void* tdata_start = (void*)getThreadLocalStartOffset((size_t)t->stacktop); if (tlsloadsize) - memcpy(tdata_start, __tdata_lma, tlsloadsize); + memcpy((void*)tdata_start, __tdata_lma, tlsloadsize); if (tbsssize) - memset(tdata_start + tlsloadsize, 0, tbsssize); + memset((void*)tdata_start + tlsloadsize, 0, tbsssize); // Set up child thread's reent struct, inheriting standard file handles _REENT_INIT_PTR(&t->reent);