diff options
| author | DJ Delorie <dj@delorie.com> | 2016-04-29 14:58:34 -0400 |
|---|---|---|
| committer | DJ Delorie <dj@delorie.com> | 2016-04-29 15:01:16 -0400 |
| commit | 46b44370f430c1de1da835903b96c77a55ff1d63 (patch) | |
| tree | 6d29bcc00e85789d7829f7a8ff1c3dbde0895660 /malloc/trace_run.c | |
| parent | 5bd0885c9ddf14ddb959742fcb0d4b2368d7b44f (diff) | |
| download | glibc-46b44370f430c1de1da835903b96c77a55ff1d63.tar.xz glibc-46b44370f430c1de1da835903b96c77a55ff1d63.zip | |
changes to per-thread cache algorithms
Core algorithm changes:
* Per-thread cache is refilled from existing fastbins and smallbins
instead of always needing a bigger chunk.
* Caches are linked, and cache is cleaned up when the thread exits
(incomplete for now, needed framework for chunk scanner).
* Fixes to mutex placement - needed to sync chunk headers across
threads.
Enabling the per-thread cache (tcache) gives about a 20-30% speedup at
a 20-30% memory cost (due to fragmentation). Still working on that :-)
Debugging helpers (temporary):
* __malloc_scan_chunks() calls back to the app for each chunk in each
heap.
* _m_printf() helper for "safe" printing within malloc
* Lots of calls to the above, commented out, in case you need them.
* trace_run scans leftover chunks too.
Diffstat (limited to 'malloc/trace_run.c')
| -rw-r--r-- | malloc/trace_run.c | 123 |
1 files changed, 116 insertions, 7 deletions
diff --git a/malloc/trace_run.c b/malloc/trace_run.c index 986c3dd093..52d548738e 100644 --- a/malloc/trace_run.c +++ b/malloc/trace_run.c @@ -9,6 +9,8 @@ #include <sys/resource.h> #include <fcntl.h> +#include "malloc.h" + /* These must stay in sync with trace2dat */ #define C_NOP 0 #define C_DONE 1 @@ -49,6 +51,15 @@ static int64_t diff_timeval (struct timeval e, struct timeval s) return usec; } +#if 1 +#define Q1 +#define Q2 +#else +pthread_mutex_t genmutex = PTHREAD_MUTEX_INITIALIZER; +#define Q1 pthread_mutex_lock(&genmutex) +#define Q2 pthread_mutex_unlock(&genmutex) +#endif + pthread_mutex_t cmutex = PTHREAD_MUTEX_INITIALIZER; #define NCBUF 10 static char cbuf[NCBUF][30]; @@ -95,9 +106,15 @@ int64_t calloc_time = 0, calloc_count = 0; int64_t realloc_time = 0, realloc_count = 0; int64_t free_time = 0, free_count = 0; +pthread_mutex_t stop_mutex = PTHREAD_MUTEX_INITIALIZER; +int threads_done = 0; + //#define dprintf printf #define dprintf(...) 1 +//#define mprintf printf +#define mprintf(...) 1 + #define myabort() my_abort_2(me, __LINE__) my_abort_2 (pthread_t me, int line) { @@ -110,10 +127,20 @@ wmem (volatile void *ptr, int count) { char *p = (char *)ptr; int i; + size_t sz; + + if (!p) + return; + + // sz = *((size_t *)ptr-1) & ~7; + // fprintf(stderr, "wmem: %p size %x csize %x\n", ptr, + // count, sz); + // if (sz < 4*sizeof(size_t)) + // abort(); for (i=0; i<count; i+=8) - p[i] = 0; + p[i] = 0x11; } -#define wmem(a,b) +#define xwmem(a,b) static size_t get_int (unsigned char **ptr) { @@ -140,6 +167,7 @@ thread_common (void *my_data_v) int64_t my_realloc_time = 0, my_realloc_count = 0; int64_t my_free_time = 0, my_free_count = 0; int64_t stime; + volatile void *tmp; while (1) { @@ -162,7 +190,10 @@ thread_common (void *my_data_v) calloc_count += my_calloc_count; realloc_count += my_realloc_count; free_count += my_free_count; + threads_done ++; pthread_mutex_unlock (&stat_mutex); + pthread_mutex_lock(&stop_mutex); + pthread_mutex_unlock(&stop_mutex); return NULL; case C_MALLOC: @@ -172,7 +203,12 @@ thread_common (void *my_data_v) if (p2 > n_ptrs) myabort(); stime = rdtsc_s(); + Q1; + if (ptrs[p2]) + free ((void *)ptrs[p2]); ptrs[p2] = malloc (sz); + mprintf("%p = malloc(%lx)\n", ptrs[p2], sz); + Q2; my_malloc_time += rdtsc_e() - stime; my_malloc_count ++; wmem(ptrs[p2], sz); @@ -185,7 +221,12 @@ thread_common (void *my_data_v) if (p2 > n_ptrs) myabort(); stime = rdtsc_s(); + Q1; + if (ptrs[p2]) + free ((void *)ptrs[p2]); ptrs[p2] = calloc (sz, 1); + mprintf("%p = calloc(%lx)\n", ptrs[p2], sz); + Q2; my_calloc_time += rdtsc_e() - stime; my_calloc_count ++; wmem(ptrs[p2], sz); @@ -196,22 +237,33 @@ thread_common (void *my_data_v) p1 = get_int (&cp); sz = get_int (&cp); dprintf("op %d:%d %d = REALLOC %d %d\n", (int)me, cp-data, p2, p1, sz); + if (p1 > n_ptrs) + myabort(); if (p2 > n_ptrs) myabort(); stime = rdtsc_s(); + Q1; + tmp = ptrs[p1]; ptrs[p2] = realloc ((void *)ptrs[p1], sz); + mprintf("%p = relloc(%p,%lx)\n", ptrs[p2], tmp,sz); + Q2; my_realloc_time += rdtsc_e() - stime; my_realloc_count ++; wmem(ptrs[p2], sz); + if (p1 != p2) + ptrs[p1] = 0; break; case C_FREE: p1 = get_int (&cp); - if (p2 > n_ptrs) + if (p1 > n_ptrs) myabort(); dprintf("op %d:%d FREE %d\n", (int)me, cp-data, p1); stime = rdtsc_s(); + Q1; + mprintf("free(%p)\n", ptrs[p1]); free ((void *)ptrs[p1]); + Q2; my_free_time += rdtsc_e() - stime; my_free_count ++; ptrs[p1] = 0; @@ -276,6 +328,25 @@ my_malloc (char *msg, int size, unsigned char **cp, size_t *psz, size_t count) return rv; } +static const char * const scan_names[] = { + "UNUSED", + "ARENA", + "HEAP", + "CHUNK_USED", + "CHUNK_FREE", + "FASTBIN_FREE", + "UNSORTED", + "TOP", + "TCACHE", + "USED" +}; + +void +malloc_scan_callback (void *ptr, size_t length, int type) +{ + printf("%s: ptr %p length %llx\n", scan_names[type], ptr, length); +} + #define MY_ALLOC(T, psz) \ (typeof (T)) my_malloc (#T, sizeof(*T), &cp, psz, 0) #define MY_ALLOCN(T, count) \ @@ -317,6 +388,8 @@ main(int argc, char **argv) for (i=0; i<n_data; i+=512) asm volatile ("# forced read %0" :: "r" (data[i])); + pthread_mutex_lock(&stop_mutex); + cp = data; while (cp) { @@ -353,11 +426,12 @@ main(int argc, char **argv) thread_idx ++; break; case C_DONE: - for (i=0; i<thread_idx; i++) + do { - dprintf("Joining thread %lld\n", (long)thread_ids[i]); - pthread_join (thread_ids[i], NULL); - } + pthread_mutex_lock (&stat_mutex); + i = threads_done; + pthread_mutex_unlock (&stat_mutex); + } while (i < thread_idx); cp = NULL; break; } @@ -388,5 +462,40 @@ main(int argc, char **argv) printf("Avg free time: %8s in %10s calls\n", comma(free_time/free_count), comma(free_count)); printf("Total call time: %s cycles\n", comma(malloc_time+calloc_time+realloc_time+free_time)); printf("\n"); + +#if 0 + /* Free any still-held chunks of memory. */ + for (idx=0; idx<n_ptrs; idx++) + if (ptrs[idx]) + { + free((void *)ptrs[idx]); + ptrs[idx] = 0; + } +#endif + + /* This will fail (crash) for system glibc but that's OK. */ + __malloc_scan_chunks(malloc_scan_callback); + + malloc_info (0, stdout); + +#if 1 + /* ...or report them as used. */ + for (idx=0; idx<n_ptrs; idx++) + if (ptrs[idx]) + { + char *p = (char *)ptrs[idx] - 2*sizeof(size_t); + size_t *sp = (size_t *)p; + size_t size = sp[1] & ~7; + malloc_scan_callback (sp, size, 9); + } +#endif + + /* Now that we've scanned all the per-thread caches, it's safe to + let them exit and clean up. */ + pthread_mutex_unlock(&stop_mutex); + + for (i=0; i<thread_idx; i++) + pthread_join (thread_ids[i], NULL); + return 0; } |
