/* syncdemo - a program to demonstrate the performance and validity of different
 * synchronization methods as well as some timing properties.
 *
 * The task to be done is very simple: a single global integer is to be incremented
 * by multiple threads. All of this happens in a very high-concurrency environment. Note that
 * the test is unfair to mechanisms like spinlocks, because there is almost only wait
 * time and hardly any real processing time between the waits. However, the test provides
 * some good insight into atomic instructions vs. other synchronization methods.
 * It also shows that garbling variables by not doing proper synchronization is
 * highly likely. For best results, this program should be executed on a
 * multiprocessor machine (on a uniprocessor, it will probably not display the
 * problems caused by missing synchronization).
 *
 * Note: partitioned processing mode ("part") means that all computation is first done
 *       locally (each thread increments its own counter) and the partial results are
 *       then combined into the global total under proper synchronization (a mutex).
 *       This mode serves as the baseline for uninterrupted processing.
 *
 * compile with $ gcc -O1 -o syncdemo syncdemo.c -lpthread
 *
 * Alternatively, you may use -O0, but not a higher level. Note that
 * in neither case does the gcc code generator produce code that is really
 * suitable for comparing the "part" and "none" modes. If you absolutely need
 * to do that, you need to use inline assembly. However, the results should
 * be fairly OK when consistently using either -O0 or -O1. If you see a big loss
 * of performance when you compare "none" and "part", be sure to run
 * "none" with -t1 and watch the results! In any case, looking at the generated
 * assembly code is vital to interpret results correctly. A review of the generated
 * assembly done on 2010-05-05 indicates that -O0 is probably the best choice. Note that we
 * use the volatile qualifier in one spot. It is used because it results in the
 * most comparable result for our gcc 4.4.3, not really to invoke volatile semantics.
 *
 * use "gcc -g -Wa,-ahl=syncdemo.s syncdemo.c -lpthread" to obtain a mixed code/assembly listing.
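 *
 * Example invocations (illustrative only; see usage() below for the full option list):
 *   ./syncdemo -A -i5            run all sync types, 5 iterations each
 *   ./syncdemo -s cas -t4 -C     a single CAS test with 4 threads, CSV output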
 *
 * This program REQUIRES linux. With slight modification, it may run on Solaris.
 * Note that gcc on Sparc does NOT offer atomic instruction support!
 *
 * Copyright (C) 2010 by Rainer Gerhards <rgerhards@hq.adiscon.com>
 * Released under the GNU GPLv3.
 *
 * Inspired by (retrieved 2010-04-13)
 * http://www.alexonlinux.com/multithreaded-simple-data-type-access-and-atomic-variables
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <semaphore.h>
#include <stdlib.h>
#include <string.h>	/* for strcmp() in option parsing */
#include <linux/unistd.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <errno.h>
#include <getopt.h>


typedef enum { part, none, atomic, cas, spinlock, mutex, semaphore } syncType_t;
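/* table of all sync types; iterated over when -A (test all sync types) is given */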
static syncType_t syncTypes[] = { part, none, atomic, cas, spinlock, mutex, semaphore };

/* config settings */
static int bCPUAffinity = 0;
static int procs = 0; /* number of processors */
static int numthrds = 0; /* if zero, use one thread per processor */
static unsigned goal = 50000000; /* 50 million */
static int bCSV = 0; /* generate CSV output? */
static int numIterations = 1; /* number of iterations */
static int dummyLoad = 0;     /* number of dummy load iterations to generate */
syncType_t syncType;
static int bAllSyncTypes = 0;

static int global_int = 0;	/* our global counter */
static unsigned thrd_WorkToDo;	/* number of computations each thread must do */
static volatile int bStartRun = 0; /* flag that tells the worker threads when to start */

static struct timeval tvStart, tvEnd; /* used for timing one testing iteration */

/* statistics counters (runtimes in milliseconds) */
static long long totalRuntime;
static unsigned minRuntime = 999999999;
static unsigned maxRuntime = 0;

/* sync objects (if needed) */
static pthread_mutex_t mut;
static pthread_spinlock_t spin;
static sem_t sem;

static char*
getSyncMethName(syncType_t st)
{
	switch(st) {
	case part     : return "partition";
	case none     : return "none";
	case atomic   : return "atomic op";
	case spinlock : return "spin lock";
	case mutex    : return "mutex";
	case semaphore: return "semaphore";
	case cas      : return "cas";
	}
	return "unknown";	/* not reached for valid enum values; keeps the compiler's return check happy */
}


static pid_t
gettid(void)
{
	return syscall( __NR_gettid );
}


void *workerThread( void *arg )
{
	unsigned i;
	int j;
	volatile int partval = 0; /* use volatile so that gcc generates code similar to global var */
	int oldval, newval; /* for CAS sync mode */
	int thrd_num = (int)(long)arg;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(thrd_num % procs, &set);
	if(syncType == part) {
		partval = 0;
	}

	/* if enabled, try to pin the thread to a fixed CPU (the one that corresponds to the
	 * thread ID). This may reduce the effects of thread migration and make runs more
	 * comparable.
	 */
	if(bCPUAffinity) {
		if (sched_setaffinity( gettid(), sizeof( cpu_set_t ), &set )) {
			perror( "sched_setaffinity" );
			return NULL;
		}
	}

	/* busy-wait for the "go" signal; bStartRun is volatile, so it is re-read on every pass */
	while(bStartRun == 0)
		/*WAIT!*/;

	for (i = 0; i < thrd_WorkToDo; i++) {
		switch(syncType) {
		case part:
			///* one needs to use inline assembly to get this right... */
			//asm("addl	$1, global_int(%rip)");
			partval++;
			break;
		case none:
			global_int++;
			break;
		case atomic:
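			/* gcc __sync builtin: a single atomic read-modify-write on global_int */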
			__sync_fetch_and_add(&global_int,1);
			break;
		case cas:
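			/* optimistic retry loop: snapshot the counter, compute the new value, and
			 * retry if another thread modified global_int in the meantime
			 */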
			do {
				oldval = global_int;
				newval = oldval + 1;
			} while(!__sync_bool_compare_and_swap(&global_int, oldval, newval));
			break;
		case mutex:
			pthread_mutex_lock(&mut);
			global_int++;
			pthread_mutex_unlock(&mut);
			break;
		case spinlock:
			pthread_spin_lock(&spin);
			global_int++;
			pthread_spin_unlock(&spin);
			break;
		case semaphore:
			sem_wait(&sem);
			global_int++;
			sem_post(&sem);
			break;
		}

		/* we now generate "dummy load" if instructed to do so. The idea is that
		 * we do some other work, as in real life, so that we have a better
		 * ratio of sync vs. actual work to do.
		 */
		for(j = 0 ; j < dummyLoad ; ++j) {
			/* be careful: since j is not volatile, the compiler may optimize this loop away! */;
		}
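		/* a minimal sketch of one way to keep the dummy load alive under optimization
		 * (not enabled here; "sink" is a hypothetical local variable):
		 *     for(j = 0 ; j < dummyLoad ; ++j) { volatile int sink = j; (void) sink; }
		 */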
	}

	if(syncType == part) {	/* partitioned mode: fold the thread-local result into the global counter once */
		pthread_mutex_lock(&mut);
		global_int += partval;
		pthread_mutex_unlock(&mut);
	}

	return NULL;
}


static void beginTiming(void)
{
	if(!(bCSV || bAllSyncTypes)) {
		printf("Test Parameters:\n");
		printf("\tNumber of Cores.........: %d\n", procs);
		printf("\tNumber of Threads.......: %d\n", numthrds);
		printf("\tSet Affinity............: %s\n", bCPUAffinity ? "yes" : "no");
		printf("\tCount to................: %u\n", goal);
		printf("\tWork for each Thread....: %u\n", thrd_WorkToDo);
		printf("\tDummy Load Counter......: %d\n", dummyLoad);
		printf("\tSync Method used........: %s\n", getSyncMethName(syncType));
	}
	gettimeofday(&tvStart, NULL);
}


static void endTiming(void)
{
	unsigned delta;
	long sec, usec;
	long runtime;

	gettimeofday(&tvEnd, NULL);
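	/* normalize the end time: borrow one second if the microsecond part would go negative */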
	if(tvStart.tv_usec > tvEnd.tv_usec) {
		tvEnd.tv_sec--;
		tvEnd.tv_usec += 1000000;
	}

	sec = tvEnd.tv_sec - tvStart.tv_sec;
	usec = tvEnd.tv_usec - tvStart.tv_usec;

	delta = thrd_WorkToDo * numthrds - global_int; /* lost increments; non-zero means the counter was garbled */
	if(!bAllSyncTypes) {
		if(bCSV) {
			printf("%s,%d,%d,%d,%u,%u,%ld.%06ld\n",
				getSyncMethName(syncType), procs, numthrds, bCPUAffinity, goal, delta, sec, usec);
		} else {
			printf("measured (system time) runtime is %ld.%06ld seconds\n", sec, usec);
			if(delta == 0) {
				printf("Computation was done correctly.\n");
			} else {
				printf("Computation INCORRECT,\n"
				       "\texpected %9u\n"
				       "\treal     %9u\n"
				       "\toff by   %9u\n",
					thrd_WorkToDo * numthrds,
					global_int,
					delta);
			}
		}
	}

	runtime = sec * 1000 + (usec / 1000);
	totalRuntime += runtime;
	if(runtime < minRuntime)
		minRuntime = runtime;
	if(runtime > maxRuntime)
		maxRuntime = runtime;
}


static void
usage(void)
{
	fprintf(stderr, "Usage: syncdemo [-a] [-A] [-C] [-c<num>] [-d<num>] [-i<num>] [-t<num>] [-s<type>]\n");
	fprintf(stderr, "\t-a        set CPU affinity\n");
	fprintf(stderr, "\t-i        number of iterations\n");
	fprintf(stderr, "\t-c<num>   count to <num>\n");
	fprintf(stderr, "\t-d<num>   dummy load, <num> iterations\n");
	fprintf(stderr, "\t-t<num>   number of threads to use\n");
	fprintf(stderr, "\t-s<type>  sync-type to use (part, none, atomic, cas, spin, mutex, semaphore)\n");
	fprintf(stderr, "\t-C        generate CSV output\n");
	fprintf(stderr, "\t-A        test ALL sync types\n");
	exit(2);
}


/* carry out the actual test (one iteration)
 */
static void
singleTest(void)
{
	int i;
	pthread_t *thrs;

	global_int = 0;
	bStartRun = 0;

	thrs = malloc(sizeof(pthread_t) * numthrds);
	if (thrs == NULL) {
		perror( "malloc" );
		exit(1);
	}

	thrd_WorkToDo = goal / numthrds;
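	/* note: the division above truncates, so the threads may do slightly fewer than
	 * "goal" increments in total; endTiming() checks against thrd_WorkToDo * numthrds,
	 * which keeps the correctness check consistent.
	 */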

	for (i = 0; i < numthrds; i++) {
		if(pthread_create( &thrs[i], NULL, workerThread, (void *)(long)i )) {
			perror( "pthread_create" );
			numthrds = i; /* join (and account for) only the threads that were actually created */
			break;
		}
	}

	beginTiming();
	bStartRun = 1; /* start the threads (they are busy-waiting so far!) */

	for (i = 0; i < numthrds; i++)
		pthread_join( thrs[i], NULL );

	endTiming();

	free( thrs );

}


/* display an unsigned ms runtime count as string. Note that the
 * string is inside a dynamically allocated buffer, which the caller
 * must free to prevent a memory leak.
 */
char *
dispRuntime(unsigned rt)
{
	char *fmtbuf;

	fmtbuf = malloc(32 * sizeof(char));
	snprintf(fmtbuf, 32, "%u.%03u",
		 rt / 1000, rt % 1000);
	return(fmtbuf);
}


static void
doTest(syncType_t st)
{
	int i;

	syncType = st;
	totalRuntime = 0;
	minRuntime = 999999999;
	maxRuntime = 0;
	for(i = 0 ; i < numIterations ; ++i) {
		//printf("starting iteration %d\n", i);
		singleTest();
	}

	/* we have a memory leak due to calling dispRuntime(), but we don't
	 * care as we terminate immediately afterwards.
	 */
	printf("%-10s: total runtime %6ld.%03u, avg %s, min %s, max %s\n",
	       getSyncMethName(st),
	       (long)totalRuntime/1000, (unsigned)(totalRuntime % 1000),
	       dispRuntime((unsigned) (totalRuntime / numIterations)),
	       dispRuntime(minRuntime),
	       dispRuntime(maxRuntime));
}


int
main(int argc, char *argv[])
{
	int i;
	int opt;

	while((opt = getopt(argc, argv, "ac:d:i:t:s:CA")) != -1) {
		switch((char)opt) {
		case 'A':
			bAllSyncTypes = 1;
			break;
		case 'a':
			bCPUAffinity = 1;
			break;
		case 'c':
			goal = (unsigned) atol(optarg);
			break;
		case 'd':
			dummyLoad = atoi(optarg);
			break;
		case 'i':
			numIterations = atoi(optarg);
			break;
		case 't':
			numthrds = atoi(optarg);
			break;
		case 'C':
			bCSV = 1;
			break;
		case 's':
			if(!strcmp(optarg, "none"))
				syncType = none;
			else if(!strcmp(optarg, "part"))
				syncType = part;
			else if(!strcmp(optarg, "atomic"))
				syncType = atomic;
			else if(!strcmp(optarg, "cas"))
				syncType = cas;
			else if(!strcmp(optarg, "mutex")) {
				syncType = mutex;	/* sync objects themselves are initialized after option parsing */
			} else if(!strcmp(optarg, "spin")) {
				syncType = spinlock;
			} else if(!strcmp(optarg, "semaphore")) {
				syncType = semaphore;
			} else {
				fprintf(stderr, "error: invalid sync mode '%s'\n", optarg);
				usage();
			}
			break;
		default:
			usage();
			break;
		}
	}

	/* for simplicity, we initialize all sync helpers, whether or not we actually need them */
	pthread_mutex_init(&mut, NULL);
	pthread_spin_init(&spin, PTHREAD_PROCESS_PRIVATE);
	sem_init(&sem, 0, 1);

	/* determine the number of online CPUs */
	procs = (int)sysconf(_SC_NPROCESSORS_ONLN);
	if(procs < 0) {
		perror("sysconf");
		return -1;
	}

	if(numthrds < 1) {
		numthrds = procs;
	}

	if(bAllSyncTypes) {
		for(i = 0 ; i < sizeof(syncTypes) / sizeof(syncType_t) ; ++i) {
			doTest(syncTypes[i]);
		}
		printf("Done running tests, result based on:\n");
		printf("\tNumber of Cores.........: %d\n", procs);
		printf("\tNumber of Threads.......: %d\n", numthrds);
		printf("\tSet CPU Affinity........: %s\n", bCPUAffinity ? "yes" : "no");
		printf("\tCount to................: %u\n", goal);
		printf("\tWork for each Thread....: %u\n", thrd_WorkToDo);
		printf("\tDummy Load Counter......: %d\n", dummyLoad);
		printf("\tIterations..............: %d\n", numIterations);
	} else {
		doTest(syncType);
	}

	return 0;
}