path: root/tools/syncdemo.c
blob: b4b75cdcb4e005ccd868a9ed35ea07272f84791b (plain)
/* syncdemo - a program to demonstrate the performance and validity of different
 * synchronization methods as well as some timing properties.
 *
 * The task to be done is very simple: a single global integer is to be incremented
 * by multiple threads, in a very high-concurrency environment. Note that
 * the test is unfair to mechanisms like spinlocks, because there is almost only wait
 * time and no real processing time between the waits. However, the test provides
 * some good insight into atomic instructions vs. other synchronisation methods.
 * It also shows that, without proper synchronisation, garbled variables are
 * highly likely. For best results, this program should be executed on a
 * multiprocessor machine (on a uniprocessor, it will probably not display the
 * problems caused by missing synchronisation).
 *
 * compile with $ gcc -O0 -o syncdemo syncdemo.c -lpthread
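 *
 * Example run (after building as above):
 *   $ ./syncdemo -t4 -s atomic -c10000000 -i5
 * uses 4 threads to count to 10 million via atomic instructions, repeated
 * for 5 iterations.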
 *
 * This program REQUIRES linux. With slight modification, it may run on Solaris.
 * Note that gcc on Sparc does NOT offer atomic instruction support!
 *
 * Copyright (C) 2010 by Rainer Gerhards <rgerhards@hq.adiscon.com>
 * Released under the GNU GPLv3.
 *
 * Inspired by (retrieved 2010-04-13)
 * http://www.alexonlinux.com/multithreaded-simple-data-type-access-and-atomic-variables
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>	/* for strcmp() */
#include <linux/unistd.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <errno.h>
#include <getopt.h>

/* config settings */
static int bCPUAffinity = 0;
static int procs = 0; /* number of processors */
static int numthrds = 0; /* if zero, use one thread per processor */
static unsigned goal = 50000000; /* 50 million */
static int bCSV = 0; /* generate CSV output? */
static int numIterations = 1; /* number of iterations */
static int dummyLoad = 0;     /* number of dummy load iterations to generate */
static enum { none, atomic, cas, mutex, spinlock } syncType;

static int global_int = 0;	/* our global counter */
static unsigned thrd_WorkToDo;	/* number of computations each thread must do */
static volatile int bStartRun = 0; /* flag to indicate when threads should start; volatile so the busy-wait re-reads it */

static struct timeval tvStart, tvEnd; /* used for timing one testing iteration */

/* statistic counters */
static long long totalRuntime;

/* sync objects (if needed) */
static pthread_mutex_t mut;
static pthread_spinlock_t spin;

static char*
getSyncMethName()
{
	switch(syncType) {
	case none    : return "none";
	case atomic  : return "atomic instruction";
	case mutex   : return "mutex";
	case spinlock: return "spin lock";
	case cas     : return "cas";
	}
	return "?"; /* not reachable with a valid syncType, but keeps the compiler happy */
}


static pid_t
gettid()
{
	return syscall( __NR_gettid );
}


void *workerThread( void *arg )
{
	int i, j;
	int oldval, newval; /* for CAS sync mode */
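	/* the thread index is passed in via the void* argument */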
	int thrd_num = (int)(long)arg;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(thrd_num % procs, &set);

	/* if enabled, try to put the thread on a fixed CPU (the one that corresponds to the
	 * thread ID). This may improve performance and makes the runs more reproducible.
	 */
	if(bCPUAffinity) {
		if (sched_setaffinity( gettid(), sizeof( cpu_set_t ), &set )) {
			perror( "sched_setaffinity" );
			return NULL;
		}
	}

	/* wait for "go" */
	while(bStartRun == 0)
		/*WAIT!*/;

	for (i = 0; i < thrd_WorkToDo; i++) {
		switch(syncType) {
		case none:
			global_int++;
			break;
		case atomic:
			__sync_fetch_and_add(&global_int,1);
			break;
		case cas:
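			/* optimistic retry loop: re-read the value, compute the increment,
			 * and retry until the compare-and-swap succeeds without interference.
			 */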
			do {
				oldval = global_int;
				newval = oldval + 1;
			} while(!__sync_bool_compare_and_swap(&global_int, oldval, newval));
			break;
		case mutex:
			pthread_mutex_lock(&mut);
			global_int++;
			pthread_mutex_unlock(&mut);
			break;
		case spinlock:
			pthread_spin_lock(&spin);
			global_int++;
			pthread_spin_unlock(&spin);
			break;
		}

		/* we now generate "dummy load" if instructed to do so. The idea is that
		 * we do some other work, as in real life, so that we have a better
		 * ratio of sync vs. actual work to do.
		 */
		for(j = 0 ; j < dummyLoad ; ++j) {
			/* be careful: compiler may optimize loop out! */;
		}
	}

	return NULL;
}


static void beginTiming(void)
{
	if(!bCSV) {
		printf("Test Parameters:\n");
		printf("\tNumber of Cores.........: %d\n", procs);
		printf("\tNumber of Threads.......: %d\n", numthrds);
		printf("\tSet Affinity............: %s\n", bCPUAffinity ? "yes" : "no");
		printf("\tCount to................: %u\n", goal);
		printf("\tWork for each Thread....: %u\n", thrd_WorkToDo);
		printf("\tDummy Load Counter......: %d\n", dummyLoad);
		printf("\tSync Method used........: %s\n", getSyncMethName());
	}
	gettimeofday(&tvStart, NULL);
}


static void endTiming(void)
{
	unsigned delta;
	long sec, usec;

	gettimeofday(&tvEnd, NULL);
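	/* normalize: borrow one second so the microsecond difference stays non-negative */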
	if(tvStart.tv_usec > tvEnd.tv_usec) {
		tvEnd.tv_sec--;
		tvEnd.tv_usec += 1000000;
	}

	sec = tvEnd.tv_sec - tvStart.tv_sec;
	usec = tvEnd.tv_usec - tvStart.tv_usec;

	delta = thrd_WorkToDo * numthrds - global_int;
	if(bCSV) {
		printf("%s,%d,%d,%d,%u,%u,%ld.%ld\n",
			getSyncMethName(), procs, numthrds, bCPUAffinity, goal, delta, sec, usec);
	} else {
		printf("measured (sytem time) runtime is %ld.%ld seconds\n", sec, usec);
		if(delta == 0) {
			printf("Computation was done correctly.\n");
		} else {
			printf("Computation INCORRECT,\n"
			       "\texpected %9u\n"
			       "\treal     %9u\n"
			       "\toff by   %9u\n",
				thrd_WorkToDo * numthrds,
				global_int,
				delta);
		}
	}

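	/* accumulate total runtime in milliseconds */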
	totalRuntime += sec * 1000 + (usec / 1000);
}


static void
usage(void)
{
	fprintf(stderr, "Usage: syncdemo -a -c<num> -t<num>\n");
	fprintf(stderr, "\t-a        set CPU affinity\n");
	fprintf(stderr, "\t-c<num>   count to <num>\n");
	fprintf(stderr, "\t-d<num>   dummy load, <num> iterations\n");
	fprintf(stderr, "\t-t<num>   number of threads to use\n");
	fprintf(stderr, "\t-s<type>  sync-type to use (none, atomic, mutex, spin)\n");
	fprintf(stderr, "\t-C        generate CVS output\n");
	fprintf(stderr, "\t-I        number of iterations\n");
	exit(2);
}


/* carry out the actual test (one iteration)
 */
static void
singleTest(void)
{
	int i;
	pthread_t *thrs;

	global_int = 0;
	bStartRun = 0;

	thrs = malloc(sizeof(pthread_t) * numthrds);
	if (thrs == NULL) {
		perror( "malloc" );
		exit(1);
	}

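	/* integer division: the effective goal is thrd_WorkToDo * numthrds, which is
	 * also the value endTiming() checks the result against.
	 */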
	thrd_WorkToDo = goal / numthrds;

	for (i = 0; i < numthrds; i++) {
		if(pthread_create( &thrs[i], NULL, workerThread, (void *)(long)i )) {
			perror( "pthread_create" );
			numthrds = i; /* join only the threads that were actually created */
			break;
		}
	}

	beginTiming();
	bStartRun = 1; /* start the threads (they are busy-waiting so far!) */

	for (i = 0; i < numthrds; i++)
		pthread_join( thrs[i], NULL );

	endTiming();

	free( thrs );

}


int
main(int argc, char *argv[])
{
	int i;
	int opt;

	while((opt = getopt(argc, argv, "ac:d:i:t:s:C")) != -1) {
		switch((char)opt) {
		case 'a':
			bCPUAffinity = 1;
			break;
		case 'c':
			goal = (unsigned) atol(optarg);
			break;
		case 'd':
			dummyLoad = atoi(optarg);
			break;
		case 'i':
			numIterations = atoi(optarg);
			break;
		case 't':
			numthrds = atoi(optarg);
			break;
		case 'C':
			bCSV = 1;
			break;
		case 's':
			if(!strcmp(optarg, "none"))
				syncType = none;
			else if(!strcmp(optarg, "atomic"))
				syncType = atomic;
			else if(!strcmp(optarg, "cas"))
				syncType = cas;
			else if(!strcmp(optarg, "mutex")) {
				syncType = mutex;
				pthread_mutex_init(&mut, NULL);
			} else if(!strcmp(optarg, "spin")) {
				syncType = spinlock;
				pthread_spin_init(&spin, PTHREAD_PROCESS_PRIVATE);
			} else {
				fprintf(stderr, "error: invalid sync mode '%s'\n", optarg);
				usage();
			}
			break;
		default:usage();
			break;
		}
	}

	/* Getting number of CPUs */
	procs = (int)sysconf(_SC_NPROCESSORS_ONLN);
	if (procs < 0) {
		perror( "sysconf" );
		return -1;
	}

	if(numthrds < 1) {
		numthrds = procs;
	}

	totalRuntime = 0;
	for(i = 0 ; i < numIterations ; ++i) {
		singleTest();
	}

	printf("total runtime %ld, avg %ld\n", totalRuntime, totalRuntime / numIterations);
	return 0;
}