From 46b44370f430c1de1da835903b96c77a55ff1d63 Mon Sep 17 00:00:00 2001 From: DJ Delorie Date: Fri, 29 Apr 2016 14:58:34 -0400 Subject: changes to per-thread cache algorithms Core algorithm changes: * Per-thread cache is refilled from existing fastbins and smallbins instead of always needing a bigger chunk. * Caches are linked, and cache is cleaned up when the thread exits (incomplete for now, needed framework for chunk scanner). * Fixes to mutex placement - needed to sync chunk headers across threads. Enabling the per-thread cache (tcache) gives about a 20-30% speedup at a 20-30% memory cost (due to fragmentation). Still working on that :-) Debugging helpers (temporary): * __malloc_scan_chunks() calls back to the app for each chunk in each heap. * _m_printf() helper for "safe" printing within malloc * Lots of calls to the above, commented out, in case you need them. * trace_run scans leftover chunks too. --- malloc/trace_run.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 116 insertions(+), 7 deletions(-) (limited to 'malloc/trace_run.c') diff --git a/malloc/trace_run.c b/malloc/trace_run.c index 986c3dd093..52d548738e 100644 --- a/malloc/trace_run.c +++ b/malloc/trace_run.c @@ -9,6 +9,8 @@ #include #include +#include "malloc.h" + /* These must stay in sync with trace2dat */ #define C_NOP 0 #define C_DONE 1 @@ -49,6 +51,15 @@ static int64_t diff_timeval (struct timeval e, struct timeval s) return usec; } +#if 1 +#define Q1 +#define Q2 +#else +pthread_mutex_t genmutex = PTHREAD_MUTEX_INITIALIZER; +#define Q1 pthread_mutex_lock(&genmutex) +#define Q2 pthread_mutex_unlock(&genmutex) +#endif + pthread_mutex_t cmutex = PTHREAD_MUTEX_INITIALIZER; #define NCBUF 10 static char cbuf[NCBUF][30]; @@ -95,9 +106,15 @@ int64_t calloc_time = 0, calloc_count = 0; int64_t realloc_time = 0, realloc_count = 0; int64_t free_time = 0, free_count = 0; +pthread_mutex_t stop_mutex = PTHREAD_MUTEX_INITIALIZER; +int threads_done = 0; + //#define dprintf printf #define dprintf(...) 1 +//#define mprintf printf +#define mprintf(...) 1 + #define myabort() my_abort_2(me, __LINE__) my_abort_2 (pthread_t me, int line) { @@ -110,10 +127,20 @@ wmem (volatile void *ptr, int count) { char *p = (char *)ptr; int i; + size_t sz; + + if (!p) + return; + + // sz = *((size_t *)ptr-1) & ~7; + // fprintf(stderr, "wmem: %p size %x csize %x\n", ptr, + // count, sz); + // if (sz < 4*sizeof(size_t)) + // abort(); for (i=0; i n_ptrs) myabort(); stime = rdtsc_s(); + Q1; + if (ptrs[p2]) + free ((void *)ptrs[p2]); ptrs[p2] = malloc (sz); + mprintf("%p = malloc(%lx)\n", ptrs[p2], sz); + Q2; my_malloc_time += rdtsc_e() - stime; my_malloc_count ++; wmem(ptrs[p2], sz); @@ -185,7 +221,12 @@ thread_common (void *my_data_v) if (p2 > n_ptrs) myabort(); stime = rdtsc_s(); + Q1; + if (ptrs[p2]) + free ((void *)ptrs[p2]); ptrs[p2] = calloc (sz, 1); + mprintf("%p = calloc(%lx)\n", ptrs[p2], sz); + Q2; my_calloc_time += rdtsc_e() - stime; my_calloc_count ++; wmem(ptrs[p2], sz); @@ -196,22 +237,33 @@ thread_common (void *my_data_v) p1 = get_int (&cp); sz = get_int (&cp); dprintf("op %d:%d %d = REALLOC %d %d\n", (int)me, cp-data, p2, p1, sz); + if (p1 > n_ptrs) + myabort(); if (p2 > n_ptrs) myabort(); stime = rdtsc_s(); + Q1; + tmp = ptrs[p1]; ptrs[p2] = realloc ((void *)ptrs[p1], sz); + mprintf("%p = relloc(%p,%lx)\n", ptrs[p2], tmp,sz); + Q2; my_realloc_time += rdtsc_e() - stime; my_realloc_count ++; wmem(ptrs[p2], sz); + if (p1 != p2) + ptrs[p1] = 0; break; case C_FREE: p1 = get_int (&cp); - if (p2 > n_ptrs) + if (p1 > n_ptrs) myabort(); dprintf("op %d:%d FREE %d\n", (int)me, cp-data, p1); stime = rdtsc_s(); + Q1; + mprintf("free(%p)\n", ptrs[p1]); free ((void *)ptrs[p1]); + Q2; my_free_time += rdtsc_e() - stime; my_free_count ++; ptrs[p1] = 0; @@ -276,6 +328,25 @@ my_malloc (char *msg, int size, unsigned char **cp, size_t *psz, size_t count) return rv; } +static const char * const scan_names[] = { + "UNUSED", + "ARENA", + "HEAP", + "CHUNK_USED", + "CHUNK_FREE", + "FASTBIN_FREE", + "UNSORTED", + "TOP", + "TCACHE", + "USED" +}; + +void +malloc_scan_callback (void *ptr, size_t length, int type) +{ + printf("%s: ptr %p length %llx\n", scan_names[type], ptr, length); +} + #define MY_ALLOC(T, psz) \ (typeof (T)) my_malloc (#T, sizeof(*T), &cp, psz, 0) #define MY_ALLOCN(T, count) \ @@ -317,6 +388,8 @@ main(int argc, char **argv) for (i=0; i