Monday, December 15, 2008

cache line bouncing

Usually we don't realize how expensive is cacheline bouncing in parallel systems. Following is a simple example to evaluate the bouncing cost.

A multi-threaded application uses shared data in some of its thread:
struct shared_data_struct {
unsigned int data1;
unsigned int data2;
};
Suppose data1 is used only by thread1 and data2 is used by thread2. A natural way to optimize it is to pack data together in order to reduce the size of the application, thus maximizing the amount of memory that fits into the cache.

Unfortunately, this inevitably leads to poor performance: if both threads write to their assigned memory location the cache line must be always in exclusive state in the L1 data cache (L1D) of each core/processors and this generates a big cache coherency overhead.

For example, in the Intel Core 2 processor the cacheline size is 64 bytes (this can be retrieved using the command `getconf LEVEL1_DCACHE_LINESIZE` from the shell), so in the example above both data1 and data2 share the same L1D cacheline, though apparently they're using different independent memory locations.

We can measure the cost of the cacheline bounces using oprofile and a simple example. In cache-parallel.c (see the code below) we have the same `struct shared_data_struct` with an optional pad, depending on DISTINCT_CACHE_LINES symbol.

Let's see what happens without the pad, commenting the #define DISTINCT_CACHE_LINES:
Configure oprofile to account L1D cache misses:
$ sudo opcontrol --setup --event=L1D_PEND_MISS:500

Start oprofile:
$ sudo opcontrol -s
Using 2.6+ OProfile kernel interface.
Reading module info.
Using log file /var/lib/oprofile/samples/oprofiled.log
Daemon started.
Profiler running.

Run cache-parallel _without_ the pad in shared_data_struct:
$ time ./cache-parallel
...
real 0m29.274s
user 0m47.540s
sys 0m0.121s

Stop oprofile:
$ sudo opcontrol -h
Stopping profiling.
Killing daemon.

And see the results:
$ opannotate --source ./cache-parallel | grep data[12]++
1752 0.7326 : sd->data1++;
237090 99.1361 : sd->data2++;
^^^^^^
|
A lot of misses here!
If we add the pad (defining DISTINCT_CACHE_LINES):
$ time ./cache-parallel-with-pad
...
real 0m11.330s
user 0m20.686s
sys 0m0.037s

$ opannotate --source ./cache-parallel-with-pad | grep data[12]++
49 43.7500 : sd->data1++;
24 21.4286 : sd->data2++;
^^^^^^
|
Cache misses are dramatically reduced now!
A speedup of 29.274 / 11.330 = 2.583, in other words the cacheline bouncing effect produced, in this case, a slowdown of ~260%!!!

Following the source code of the cache-parallel example:
/*
* cache-parallel.c
*
* build with:
* gcc -DCACHELINE_SIZE=$(getconf LEVEL1_DCACHE_LINESIZE) -lpthread -Wall \
* -g -ocache-parallel cache-parallel.c
*/

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <sched.h>
#include <unistd.h>
#include <pthread.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/syscall.h>

#define unlikely(expr) __builtin_expect(!!(expr), 0)
#define likely(expr) __builtin_expect(!!(expr), 1)

#define __cacheline_aligned__ __attribute__((__aligned__(CACHELINE_SIZE)))

#define LOOPS_MAX 2000000000
#define STACK_SIZE 4096

/*
* From GETTID(2):
*
* Glibc does not provide a wrapper for this system call; call it using
* syscall(2).
*
*/
static inline pid_t gettid(void)
{
return syscall(SYS_gettid);
}

/* XXX: comment this to see the effect of the cache line bouncing */
#define DISTINCT_CACHE_LINES
struct shared_data_struct {
unsigned int data1;
#ifdef DISTINCT_CACHE_LINES
unsigned char pad[CACHELINE_SIZE - sizeof(unsigned int)];
#endif
unsigned int data2;
};

static struct shared_data_struct shared_data __cacheline_aligned__;

static void dump_schedstats(pid_t pid, pid_t tid)
{
char buffer[256];
char filename[64];
FILE *f;

snprintf(filename, sizeof(filename),
"/proc/%d/task/%d/status", pid, tid);
f = fopen(filename, "r");
if (unlikely(f == NULL)) {
perror("could not read scheduler statistics");
exit(1);
}
while (fgets(buffer, sizeof(buffer), f))
fprintf(stdout, "[%d:%d] %s", pid, tid, buffer);
fclose(f);
}

static void *inc_first(void *arg)
{
struct shared_data_struct *sd = (struct shared_data_struct *)arg;
pid_t pid = getpid(), tid = gettid();
cpu_set_t cmask;
register long i;

/* set affinity */
CPU_ZERO(&cmask);
CPU_SET(0, &cmask);
if (unlikely(sched_setaffinity(pid, sizeof(cmask), &cmask) < 0)) {
perror("could not set cpu affinity for the child.");
exit(1);
}
/* periodically increment first member of shared struct */
for (i = 0; i < LOOPS_MAX; i++)
sd->data1++;
dump_schedstats(pid, tid);

return NULL;
}

static void *inc_second(void *arg)
{
struct shared_data_struct *sd = (struct shared_data_struct *)arg;
pid_t pid = getpid(), tid = gettid();
cpu_set_t cmask;
register long i;

/* set affinity */
CPU_ZERO(&cmask);
CPU_SET(1, &cmask);
if (unlikely(sched_setaffinity(0, sizeof(cmask), &cmask) < 0)) {
perror("could not set cpu affinity for current process.");
exit(1);
}
/* periodically increment second member of shared struct */
for (i = 0; i < LOOPS_MAX; i++)
sd->data2++;
dump_schedstats(pid, tid);

return NULL;
}

int main(int argc, char **argv)
{
void *child_stack;
pthread_t child_thr;

/* allocate memory for other process to execute in */
if (unlikely((child_stack = malloc(STACK_SIZE)) == NULL)) {
perror("cannot allocate stack for child");
exit(1);
}

/* create the child */
if (unlikely(pthread_create(&child_thr, NULL,
&inc_second, &shared_data) < 0)) {
perror("pthread_create failed");
exit(1);
}
inc_first((void *)&shared_data);
pthread_join(child_thr, NULL);

return 0;
}

1 comment:

captain said...

Great explanation, thank you. Your source code is truncated by your blogger layout in a couple of places where you have long lines, unfortunately.

Example:
#define __cacheline_aligned__ __attribute__((__aligned__(CACHELI