/* FILE: bug_src.c COMPILE: gcc -pthread -std=c99 -Wall -Wextra -Werror -m64 -O3 -mtune=znver2 -ggdb \ -o good bug_src.c -lrt Change '-o good' to '-o bad' and add -DBROKEN to generate a failing test and also remember to set the architecture for your machine. */ #include #include #include #include #include #include #if !defined T1LOCK_PAUSE_THRESH #define T1LOCK_PAUSE_THRESH 10 #endif typedef unsigned long long u64; typedef unsigned int u32; typedef unsigned short u16; #define expect_false(e) __builtin_expect((e), 0) #define expect_true(e) __builtin_expect((e), 1) #define if_un(e) if (expect_false(e)) #define if_l(e) if (expect_true(e)) #define die(fmt, arg...) { fprintf(stdout, "%p:%s: " fmt, \ (void *) pthread_self(), __func__, ## arg); abort(); } #define ASM_CPU_PAUSE __asm__ volatile ("pause \n\t" ::: ) typedef struct { u64 count; u64 cpu; } tsc_timestamp; #define ASM_CPU_RDTSCP(value, cpu) __asm__ volatile ( \ "rdtscp \n\t" \ "shl $0x20, %%rdx \n\t" \ "or %%rax, %%rdx \n\t" \ "movq %%rdx, %[v] \n\t" \ "movq %%rcx, %[c] \n\t" \ : [v] "=g" (value), \ [c] "=g" (cpu) \ : \ : "%rax", "%rcx", \ "%rdx", "memory", "cc") #define BEGIN_TSC_TIMING \ { \ tsc_timestamp ts1; \ tsc_timestamp ts2; \ ASM_CPU_RDTSCP(ts1.count, ts1.cpu); \ #define END_TSC_TIMING(result) \ ASM_CPU_RDTSCP(ts2.count, ts2.cpu); \ if(ts1.cpu != ts2.cpu) \ result = 1; \ else \ result = ts2.count - ts1.count; \ } #define ASM_ATOMIC_CMPXCHGW(ptr, old, new, result) __asm__ volatile ( \ "xor %[r], %[r] \n\t" \ "movw %[o], %%ax \n\t" \ "lock cmpxchgw %[n], (%[p]) \n\t" \ "jne 0f \n\t" \ "xor $1, %[r] \n\t" \ "0: \n\t" \ : [r] "=&r" (result), \ [o] "+r" (old) \ : [p] "r" (ptr), \ [n] "r" (new) \ : "cc", "memory", "%ax") typedef struct { u16 ticket; u16 queue; } t1lock; #define T1LOCK_INITIALIZER (t1lock) { \ .ticket = 1, \ .queue = 0 \ } __inline__ void t1lock_release(t1lock * oo) { oo->ticket++; return; } __attribute__((noinline)) void t1lock_relax(u32 nr_spin) { if_l (nr_spin < T1LOCK_PAUSE_THRESH) { ASM_CPU_PAUSE; return; } if_l (nr_spin < 65536) { sched_yield(); return; } if_un (nr_spin == 65536) fprintf(stdout, "Thread may be deadlocked.\n"); sleep(1); return; } inline bool t1lock_cmpxchg(t1lock * oo, u16 old, u16 new) { u32 result; ASM_ATOMIC_CMPXCHGW(&oo->queue, old, new, result); return result; } u32 t1lock_acquire(t1lock * oo) { t1lock old, new; u32 nr_spin = 0; u16 ticket; spin: nr_spin++; old = new = *((volatile t1lock *) oo); if_un ((old.queue + 1) == (old.ticket - 65400)) { t1lock_relax(nr_spin); goto spin; } ticket = ++new.queue; if_un (!t1lock_cmpxchg(oo, old.queue, new.queue)) { t1lock_relax(nr_spin); goto spin; } wait: if (ticket != ((volatile t1lock *) oo)->ticket) { t1lock_relax(nr_spin); nr_spin++; goto wait; } return nr_spin; } u64 timestamp(void) /* microseconds */ { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (ts.tv_sec * 1000000000 + ts.tv_nsec) / 1000; } typedef struct { t1lock lock; u64 value; } spr_t; spr_t obj = { .lock = T1LOCK_INITIALIZER, .value = 0 }; volatile u64 temp; /* Unprotected scribble variable */ volatile int start_flag = 0; typedef struct { u32 cycles; u32 nr_spin; } sample_t; void * wr_thread(void * data) { u64 T; sample_t *samples; u32 nr_rounds, i, nr_spin; nr_rounds = (u64) data; samples = malloc(nr_rounds * sizeof(sample_t)); if (!samples) die("Unable to allocate sample buffer - %m"); while (!start_flag) ASM_CPU_PAUSE; for (i = 0; i < nr_rounds; i++) { BEGIN_TSC_TIMING; nr_spin = t1lock_acquire(&obj.lock); #if defined BROKEN temp = ++obj.value; #else ++obj.value; #endif t1lock_release(&obj.lock); END_TSC_TIMING(T); if (T <= 28) /* rdtscp is usually about 28 cycles */ T = 1; else T -= 28; samples[i].cycles = T; samples[i].nr_spin = nr_spin; } return (void *) samples; } void process_thread_result(sample_t * samples, u32 nr_samples) { u64 max = 0; u64 avg = 0; double nr_spin = 0.0; u32 max_nr_spin = 0; u32 i; if (!nr_samples || !samples) die("Usage."); for (i = 0; i < nr_samples; i++) { if (max < samples[i].cycles) max = samples[i].cycles; if (max_nr_spin < samples[i].nr_spin) max_nr_spin = samples[i].nr_spin; avg += samples[i].cycles; nr_spin += samples[i].nr_spin; } avg /= nr_samples; nr_spin /= nr_samples; fprintf(stdout, "%p: avg: %-4qu max: %-4qu nr_spin_avg: %2.4f max_nr_spin: %u\n", samples, avg, max, nr_spin, max_nr_spin); free(samples); return; } int main(int argc, char ** argv) { u64 start = 0, stop = 0; u32 nr_threads = 0; u32 nr_samples = 0; u32 i; pthread_t * threads; sample_t ** results; if (argc != 3) { usage: printf("Usage: %s nr_threads nr_iter\n", argv[0]); printf(" nr_threads <= 1000\n"); exit(1); } nr_threads = strtoul(argv[1], NULL, 0); if ((nr_threads > 1000) || !nr_threads) goto usage; nr_samples = strtoul(argv[2], NULL, 0); if (!nr_samples) goto usage; if (!(threads = malloc(sizeof(pthread_t) * nr_threads))) die("Unable to malloc thread array - %m"); if (!(results = malloc(sizeof(sample_t *) * nr_threads))) die("Unable to malloc results array - %m"); for (i = 0; i < nr_threads; i++) { if (pthread_create(&threads[i], NULL, wr_thread, ((void *) (u64) nr_samples)) == -1) die("Unable to create thread = %m"); } fprintf(stdout, "Launched %u threads.\n", nr_threads); start = timestamp(); start_flag = 1; for (i = 0; i < nr_threads; i++) pthread_join(threads[i], (void *) &results[i]); stop = timestamp(); for (i = 0; i < nr_threads; i++) process_thread_result(results[i], nr_samples); printf("t2lock_test.c:\nElapsed time: %2.4f s\n", ((double) (stop - start)) / 1000000.0); printf("Avg transaction latency: %1.3fns\n", (((double) (stop - start)) * 1000.0) / ((double) (nr_samples * nr_threads))); if (obj.value != (nr_samples * nr_threads)) die("TEST FAILED: obj.value = %qu; (nr_samples * nr_threads) = %u\n", obj.value, (nr_threads * nr_samples)); free(threads); free(results); exit(0); }