
Signed-off-by: Dhiru Kholia <dhiru.kholia@gmail.com>

# Please enter the commit message for your changes. Lines starting
# with '#' will be ignored, and an empty message aborts the commit.
# On branch master
#
# Initial commit
#
# Changes to be committed:
#   (use "git rm --cached <file>..." to unstage)
#
#	new file:   common-opencl.c
#	new file:   common-opencl.h
#	new file:   md5.c
#	new file:   md5_kernel.cl
#
# Untracked files:
#   (use "git add <file>..." to include in what will be committed)
#
#	common-opencl.c~
#	common-opencl.h~
#	md5
#	md5.c~
#	test.c~
diff --git a/common-opencl.c b/common-opencl.c
new file mode 100644
index 0000000..0cb8353
--- /dev/null
+++ b/common-opencl.c
@@ -0,0 +1,554 @@
+/* Common OpenCL functions go in this file */
+
+#include "common-opencl.h"
+#include <assert.h>
+#include <string.h>
+#define LOG_SIZE 1024*16
+
+static char opencl_log[LOG_SIZE];
+static char *kernel_source;
+static int kernel_loaded;
+
+void handle_clerror(cl_int cl_error, const char *message, const char *file,
+    int line)
+{
+	if (cl_error != CL_SUCCESS) {
+		fprintf(stderr,
+		    "OpenCL error (%s) in file (%s) at line (%d) - (%s)\n",
+		    get_error_name(cl_error), file, line, message);
+		exit(EXIT_FAILURE);
+	}
+}
+
+static void read_kernel_source(char *kernel_filename)
+{
+	char *kernel_path = kernel_filename;
+	FILE *fp = fopen(kernel_path, "r");
+	size_t source_size, read_size;
+
+	if (!fp)
+		HANDLE_CLERROR(!CL_SUCCESS, "Source kernel not found!");
+	fseek(fp, 0, SEEK_END);
+	source_size = ftell(fp);
+	fseek(fp, 0, SEEK_SET);
+	if (kernel_source != NULL)
+		free(kernel_source);
+	kernel_source = calloc(source_size + 1, 1);
+	read_size = fread(kernel_source, sizeof(char), source_size, fp);
+	if (read_size != source_size)
+		fprintf(stderr,
+		    "Error reading source: expected %zu, got %zu bytes.\n",
+		    source_size, read_size);
+	fclose(fp);
+	kernel_loaded = 1;
+}
+
+static void dev_init(unsigned int dev_id, unsigned int platform_id)
+{
+	cl_platform_id platform[MAX_PLATFORMS];
+	cl_uint num_platforms, device_num;
+	cl_context_properties properties[3];
+	assert(dev_id < MAXGPUS);
+	///Find CPU's
+	HANDLE_CLERROR(clGetPlatformIDs(MAX_PLATFORMS, platform,
+		&num_platforms), "No OpenCL platform found");
+	HANDLE_CLERROR(clGetPlatformInfo(platform[platform_id],
+		CL_PLATFORM_NAME, sizeof(opencl_log), opencl_log, NULL),
+	    "Error querying PLATFORM_NAME");
+	HANDLE_CLERROR(clGetDeviceIDs(platform[platform_id],
+		CL_DEVICE_TYPE_ALL, MAXGPUS, devices, &device_num),
+	    "No OpenCL device of that type exist");
+	fprintf(stderr, "OpenCL platform %d: %s, %d device(s).\n", platform_id,
+	    opencl_log, device_num);
+
+	properties[0] = CL_CONTEXT_PLATFORM;
+	properties[1] = (cl_context_properties) platform[platform_id];
+	properties[2] = 0;
+	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_NAME,
+		sizeof(opencl_log), opencl_log, NULL),
+	    "Error querying DEVICE_NAME");
+	fprintf(stderr, "Using device %d: %s\n", dev_id, opencl_log);
+	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id],
+		CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_group_size),
+		&max_group_size, NULL), "Error querying MAX_WORK_GROUP_SIZE");
+	///Setup context
+	context[dev_id] =
+	    clCreateContext(properties, 1, &devices[dev_id], NULL, NULL,
+	    &ret_code);
+	HANDLE_CLERROR(ret_code, "Error creating context");
+	queue[dev_id] =
+	    clCreateCommandQueue(context[dev_id], devices[dev_id], 0,
+	    &ret_code);
+	HANDLE_CLERROR(ret_code, "Error creating command queue");
+}
+
+static char *include_source(char *pathname, int dev_id)
+{
+	static char include[PATH_BUFFER_SIZE];
+
+	//fprintf(stderr, "Options used: %s\n", include);
+	return include;
+}
+
+static void build_kernel(int dev_id)
+{
+	cl_int build_code;
+	const char *srcptr[] = { kernel_source };
+	assert(kernel_loaded);
+	program[dev_id] =
+	    clCreateProgramWithSource(context[dev_id], 1, srcptr, NULL,
+	    &ret_code);
+	HANDLE_CLERROR(ret_code, "Error while creating program");
+
+	build_code = clBuildProgram(program[dev_id], 0, NULL,
+	    include_source("$JOHN/", dev_id), NULL, NULL);
+
+	HANDLE_CLERROR(clGetProgramBuildInfo(program[dev_id], devices[dev_id],
+		CL_PROGRAM_BUILD_LOG, sizeof(opencl_log), (void *) opencl_log,
+		NULL), "Error while getting build info");
+
+	///Report build errors and warnings
+	if (build_code != CL_SUCCESS)
+		fprintf(stderr, "Compilation log: %s\n", opencl_log);
+#ifdef REPORT_OPENCL_WARNINGS
+	else if (strlen(opencl_log) > 1)	// Nvidia may return a single '\n' which is not that interesting
+		fprintf(stderr, "Compilation log: %s\n", opencl_log);
+#endif
+#if 0
+	FILE *file;
+	size_t source_size;
+	char *source;
+
+	HANDLE_CLERROR(clGetProgramInfo(program[dev_id],
+		CL_PROGRAM_BINARY_SIZES,
+		sizeof(size_t), &source_size, NULL), "error");
+	fprintf(stderr, "source size %zu\n", source_size);
+	source = malloc(source_size);
+
+	HANDLE_CLERROR(clGetProgramInfo(program[dev_id],
+		CL_PROGRAM_BINARIES, sizeof(char *), &source, NULL), "error");
+
+	file = fopen("program.bin", "w");
+	if (file == NULL)
+		fprintf(stderr, "Error opening binary file\n");
+	else if (fwrite(source, source_size, 1, file) != 1)
+		fprintf(stderr, "error writing binary\n");
+	fclose(file);
+	free(source);
+#endif
+}
+
+void opencl_get_dev_info(unsigned int dev_id)
+{
+	cl_device_type device;
+
+	device = get_device_type(dev_id);
+
+	if (device == CL_DEVICE_TYPE_CPU)
+		device_info[dev_id] = CPU;
+	else if (device == CL_DEVICE_TYPE_GPU)
+		device_info[dev_id] = GPU;
+	else if (device == CL_DEVICE_TYPE_ACCELERATOR)
+		device_info[dev_id] = ACCELERATOR;
+
+	device_info[dev_id] += get_vendor_id(dev_id);
+	device_info[dev_id] += get_processor_family(dev_id);
+}
+
+void opencl_init_dev(unsigned int dev_id, unsigned int platform_id)
+{
+	dev_init(dev_id, platform_id);
+	opencl_get_dev_info(dev_id);
+}
+
+void opencl_build_kernel(char *kernel_filename, unsigned int dev_id)
+{
+	read_kernel_source(kernel_filename);
+	build_kernel(dev_id);
+}
+
+void opencl_init(char *kernel_filename, unsigned int dev_id,
+    unsigned int platform_id)
+{
+	kernel_loaded=0;
+	opencl_init_dev(dev_id, platform_id);
+	opencl_build_kernel(kernel_filename, dev_id);
+}
+
+cl_device_type get_device_type(int dev_id)
+{
+	cl_device_type type;
+	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_TYPE,
+		sizeof(cl_device_type), &type, NULL),
+	    "Error querying CL_DEVICE_TYPE");
+
+	return type;
+}
+
+cl_ulong get_local_memory_size(int dev_id)
+{
+	cl_ulong size;
+	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id],
+		CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, NULL),
+	    "Error querying CL_DEVICE_LOCAL_MEM_SIZE");
+
+	return size;
+}
+
+size_t get_max_work_group_size(int dev_id)
+{
+	size_t max_group_size;
+
+	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id],
+		CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_group_size),
+		&max_group_size, NULL),
+	    "Error querying CL_DEVICE_MAX_WORK_GROUP_SIZE");
+
+	return max_group_size;
+}
+
+size_t get_current_work_group_size(int dev_id, cl_kernel crypt_kernel)
+{
+	size_t max_group_size;
+
+	HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[dev_id],
+		CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_group_size),
+		&max_group_size, NULL),
+	    "Error querying clGetKernelWorkGroupInfo");
+
+	return max_group_size;
+}
+
+cl_uint get_max_compute_units(int dev_id)
+{
+	cl_uint size;
+	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id],
+		CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &size, NULL),
+	    "Error querying CL_DEVICE_MAX_COMPUTE_UNITS");
+
+	return size;
+}
+
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+void get_compute_capability(int dev_id, unsigned int *major,
+    unsigned int *minor)
+{
+	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id],
+		CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+		sizeof(cl_uint), major, NULL),
+	    "Error querying CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV");
+	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id],
+		CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+		sizeof(cl_uint), minor, NULL),
+	    "Error querying CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV");
+}
+#endif
+
+cl_uint get_processors_count(int dev_id)
+{
+	cl_uint core_count = get_max_compute_units(dev_id);
+
+	cores_per_MP[dev_id] = 0;
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+	if (gpu_nvidia(device_info[dev_id])) {
+		unsigned int major = 0, minor = 0;
+
+		get_compute_capability(dev_id, &major, &minor);
+		if (major == 1)
+			core_count *= (cores_per_MP[dev_id] = 8);
+		else if (major == 2 && minor == 0)
+			core_count *= (cores_per_MP[dev_id] = 32);	//2.0
+		else if (major == 2 && minor >= 1)
+			core_count *= (cores_per_MP[dev_id] = 48);	//2.1
+		else if (major == 3)
+			core_count *= (cores_per_MP[dev_id] = 192);	//3.0
+	} else
+#endif
+	if (gpu_amd(device_info[dev_id])) {
+		core_count *= (cores_per_MP[dev_id] = (16 *	//16 thread proc * 5 SP
+			((amd_gcn(device_info[dev_id]) ||
+				amd_vliw4(device_info[dev_id])) ? 4 : 5)));
+	} else if (gpu(device_info[dev_id]))	//Any other GPU
+		core_count *= (cores_per_MP[dev_id] = 8);
+
+	return core_count;
+}
+
+cl_uint get_processor_family(int dev_id)
+{
+	char dname[MAX_OCLINFO_STRING_LEN];
+
+	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_NAME,
+		sizeof(dname), dname, NULL), "Error querying CL_DEVICE_NAME");
+
+	if gpu (device_info[dev_id]) {
+
+		if (gpu_amd(device_info[dev_id]) && (strstr(dname, "Cedar") ||
+			strstr(dname, "Redwood") ||
+			strstr(dname, "Juniper") ||
+			strstr(dname, "Cypress") ||
+			strstr(dname, "Hemlock") ||
+			strstr(dname, "Caicos") ||
+			strstr(dname, "Turks") ||
+			strstr(dname, "Barts") ||
+			strstr(dname, "Cayman") ||
+			strstr(dname, "Antilles") ||
+			strstr(dname, "Wrestler") ||
+			strstr(dname, "Zacate") ||
+			strstr(dname, "WinterPark") ||
+			strstr(dname, "BeaverCreek"))) {
+
+			if (strstr(dname, "Cayman") ||
+			    strstr(dname, "Antilles"))
+				return AMD_VLIW4;
+			else
+				return AMD_VLIW5;
+
+		} else
+			return AMD_GCN + AMD_VLIW5;
+		}
+	return UNKNOWN;
+}
+
+int get_vendor_id(int dev_id)
+{
+	char dname[MAX_OCLINFO_STRING_LEN];
+
+	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_VENDOR,
+		sizeof(dname), dname, NULL),
+	    "Error querying CL_DEVICE_VENDOR");
+
+	if (strstr(dname, "NVIDIA") != NULL)
+		return NVIDIA;
+
+	if (strstr(dname, "Advanced Micro") != NULL ||
+	    strstr(dname, "AMD") != NULL || strstr(dname, "ATI") != NULL)
+		return AMD;
+
+	return UNKNOWN;
+}
+
+int get_device_version(int dev_id)
+{
+	char dname[MAX_OCLINFO_STRING_LEN];
+
+        clGetDeviceInfo(devices[dev_id], CL_DEVICE_VERSION,
+                MAX_OCLINFO_STRING_LEN, dname, NULL);
+
+        if (strstr(dname, "1.0"))
+                return 100;
+        if (strstr(dname, "1.1"))
+                return 110;
+        if (strstr(dname, "1.2"))
+                return 120;
+
+        return UNKNOWN;
+}
+
+char *get_error_name(cl_int cl_error)
+{
+	static char *err_1[] =
+	    { "CL_SUCCESS", "CL_DEVICE_NOT_FOUND", "CL_DEVICE_NOT_AVAILABLE",
+		"CL_COMPILER_NOT_AVAILABLE",
+		"CL_MEM_OBJECT_ALLOCATION_FAILURE", "CL_OUT_OF_RESOURCES",
+		"CL_OUT_OF_HOST_MEMORY",
+		"CL_PROFILING_INFO_NOT_AVAILABLE", "CL_MEM_COPY_OVERLAP",
+		"CL_IMAGE_FORMAT_MISMATCH",
+		"CL_IMAGE_FORMAT_NOT_SUPPORTED", "CL_BUILD_PROGRAM_FAILURE",
+		"CL_MAP_FAILURE"
+	};
+	static char *err_invalid[] = {
+		"CL_INVALID_VALUE", "CL_INVALID_DEVICE_TYPE",
+		"CL_INVALID_PLATFORM", "CL_INVALID_DEVICE",
+		"CL_INVALID_CONTEXT", "CL_INVALID_QUEUE_PROPERTIES",
+		"CL_INVALID_COMMAND_QUEUE", "CL_INVALID_HOST_PTR",
+		"CL_INVALID_MEM_OBJECT", "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
+		"CL_INVALID_IMAGE_SIZE", "CL_INVALID_SAMPLER",
+		"CL_INVALID_BINARY", "CL_INVALID_BUILD_OPTIONS",
+		"CL_INVALID_PROGRAM", "CL_INVALID_PROGRAM_EXECUTABLE",
+		"CL_INVALID_KERNEL_NAME", "CL_INVALID_KERNEL_DEFINITION",
+		"CL_INVALID_KERNEL", "CL_INVALID_ARG_INDEX",
+		"CL_INVALID_ARG_VALUE", "CL_INVALID_ARG_SIZE",
+		"CL_INVALID_KERNEL_ARGS", "CL_INVALID_WORK_DIMENSION",
+		"CL_INVALID_WORK_GROUP_SIZE", "CL_INVALID_WORK_ITEM_SIZE",
+		"CL_INVALID_GLOBAL_OFFSET", "CL_INVALID_EVENT_WAIT_LIST",
+		"CL_INVALID_EVENT", "CL_INVALID_OPERATION",
+		"CL_INVALID_GL_OBJECT", "CL_INVALID_BUFFER_SIZE",
+		"CL_INVALID_MIP_LEVEL", "CL_INVALID_GLOBAL_WORK_SIZE"
+	};
+
+	if (cl_error <= 0 && cl_error >= -12) {
+		cl_error = -cl_error;
+		return err_1[cl_error];
+	}
+	if (cl_error <= -30 && cl_error >= -63) {
+		cl_error = -cl_error;
+		return err_invalid[cl_error - 30];
+	}
+
+	return "UNKNOWN ERROR :(";
+}
+
+char *megastring(unsigned long long value)
+{
+	static char outbuf[16];
+
+	if (value >= 10000000000ULL)
+		sprintf(outbuf, "%llu GB", value >> 30);
+	else if (value >= 10000000ULL)
+		sprintf(outbuf, "%llu MB", value >> 20);
+	else if (value >= 10000ULL)
+		sprintf(outbuf, "%llu KB", value >> 10);
+	else
+		sprintf(outbuf, "%llu bytes", value);
+
+	return outbuf;
+}
+
+void listOpenCLdevices(void)
+{
+	char dname[MAX_OCLINFO_STRING_LEN];
+	cl_uint num_platforms, num_devices, entries;
+	cl_ulong long_entries;
+	int i, d;
+	cl_int err;
+	size_t p_size;
+
+	/* Obtain list of platforms available */
+	err = clGetPlatformIDs(MAX_PLATFORMS, platform, &num_platforms);
+	if (err != CL_SUCCESS) {
+		fprintf(stderr,
+		    "Error: Failure in clGetPlatformIDs, error code=%d \n",
+		    err);
+		return;
+	}
+	//printf("%d platforms found\n", num_platforms);
+
+	for (i = 0; i < num_platforms; i++) {
+		/* Obtain information about platform */
+		clGetPlatformInfo(platform[i], CL_PLATFORM_NAME,
+		    MAX_OCLINFO_STRING_LEN, dname, NULL);
+		printf("Platform #%d name: %s\n", i, dname);
+		clGetPlatformInfo(platform[i], CL_PLATFORM_VERSION,
+		    MAX_OCLINFO_STRING_LEN, dname, NULL);
+		printf("Platform version: %s\n", dname);
+
+		/* Obtain list of devices available on platform */
+		clGetDeviceIDs(platform[i], CL_DEVICE_TYPE_ALL, MAXGPUS,
+		    devices, &num_devices);
+		if (!num_devices)
+			printf("%d devices found\n", num_devices);
+
+		/* Query devices for information */
+		for (d = 0; d < num_devices; ++d) {
+			cl_device_local_mem_type memtype;
+			cl_bool boolean;
+
+			clGetDeviceInfo(devices[d], CL_DEVICE_NAME,
+			    MAX_OCLINFO_STRING_LEN, dname, NULL);
+			printf("\tDevice #%d name:\t\t%s\n", d, dname);
+			clGetDeviceInfo(devices[d], CL_DEVICE_VENDOR,
+			    MAX_OCLINFO_STRING_LEN, dname, NULL);
+			printf("\tDevice vendor:\t\t%s\n", dname);
+			clGetDeviceInfo(devices[d], CL_DEVICE_TYPE,
+			    sizeof(cl_ulong), &long_entries, NULL);
+			printf("\tDevice type:\t\t");
+			if (long_entries & CL_DEVICE_TYPE_CPU)
+				printf("CPU ");
+			if (long_entries & CL_DEVICE_TYPE_GPU)
+				printf("GPU ");
+			if (long_entries & CL_DEVICE_TYPE_ACCELERATOR)
+				printf("Accelerator ");
+			if (long_entries & CL_DEVICE_TYPE_DEFAULT)
+				printf("Default ");
+			if (long_entries & ~(CL_DEVICE_TYPE_DEFAULT |
+				CL_DEVICE_TYPE_ACCELERATOR | CL_DEVICE_TYPE_GPU
+				| CL_DEVICE_TYPE_CPU))
+				printf("Unknown ");
+			clGetDeviceInfo(devices[d], CL_DEVICE_ENDIAN_LITTLE,
+			    sizeof(cl_bool), &boolean, NULL);
+			printf("(%s)\n", boolean == CL_TRUE ? "LE" : "BE");
+			clGetDeviceInfo(devices[d], CL_DEVICE_VERSION,
+			    MAX_OCLINFO_STRING_LEN, dname, NULL);
+			printf("\tDevice version:\t\t%s\n", dname);
+			clGetDeviceInfo(devices[d], CL_DRIVER_VERSION,
+			    MAX_OCLINFO_STRING_LEN, dname, NULL);
+			printf("\tDriver version:\t\t%s\n", dname);
+			clGetDeviceInfo(devices[d], CL_DEVICE_GLOBAL_MEM_SIZE,
+			    sizeof(cl_ulong), &long_entries, NULL);
+			clGetDeviceInfo(devices[d],
+			    CL_DEVICE_ERROR_CORRECTION_SUPPORT,
+			    sizeof(cl_bool), &boolean, NULL);
+			printf("\tGlobal Memory:\t\t%s%s\n",
+			    megastring((unsigned long long) long_entries),
+			    boolean == CL_TRUE ? " (ECC)" : "");
+			clGetDeviceInfo(devices[d],
+			    CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(cl_ulong),
+			    &long_entries, NULL);
+			printf("\tGlobal Memory Cache:\t%s\n",
+			    megastring((unsigned long long) long_entries));
+			clGetDeviceInfo(devices[d], CL_DEVICE_LOCAL_MEM_SIZE,
+			    sizeof(cl_ulong), &long_entries, NULL);
+			clGetDeviceInfo(devices[d], CL_DEVICE_LOCAL_MEM_TYPE,
+			    sizeof(cl_device_local_mem_type), &memtype, NULL);
+			printf("\tLocal Memory:\t\t%s (%s)\n",
+			    megastring((unsigned long long) long_entries),
+			    memtype == CL_LOCAL ? "Local" : "Global");
+			clGetDeviceInfo(devices[d],
+			    CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_ulong),
+			    &long_entries, NULL);
+			printf("\tMax clock (MHz) :\t%llu\n",
+			    (unsigned long long) long_entries);
+			clGetDeviceInfo(devices[d],
+			    CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),
+			    &p_size, NULL);
+			printf("\tMax Work Group Size:\t%d\n", (int) p_size);
+			clGetDeviceInfo(devices[d],
+			    CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+			    &entries, NULL);
+			printf("\tParallel compute cores:\t%d\n", entries);
+
+			opencl_get_dev_info(d);
+			long_entries = get_processors_count(d);
+			if (cores_per_MP[d])
+				printf
+				    ("\tStream processors:\t%llu  (%d x %d)\n",
+				    (unsigned long long)long_entries, entries,
+				     cores_per_MP[d]);
+
+#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV
+			if (gpu_nvidia(device_info[d])) {
+				unsigned int major = 0, minor = 0;
+
+				clGetDeviceInfo(devices[d],
+				    CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint),
+				    &long_entries, NULL);
+				printf("\tWarp size:\t\t%llu\n",
+				       (unsigned long long)long_entries);
+
+				clGetDeviceInfo(devices[d],
+				    CL_DEVICE_REGISTERS_PER_BLOCK_NV,
+				    sizeof(cl_uint), &long_entries, NULL);
+				printf("\tMax. GPRs/work-group:\t%llu\n",
+				    (unsigned long long)long_entries);
+
+				get_compute_capability(d, &major, &minor);
+				printf
+				    ("\tCompute capability:\t%u.%u (sm_%u%u)\n",
+				    major, minor, major, minor);
+
+				clGetDeviceInfo(devices[d],
+				    CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV,
+				    sizeof(cl_bool), &boolean, NULL);
+				printf("\tKernel exec. timeout:\t%s\n",
+				    boolean ? "yes" : "no");
+			}
+#endif
+			puts("");
+		}
+	}
+	return;
+}
+
+#undef LOG_SIZE
+#undef SRC_SIZE
diff --git a/common-opencl.h b/common-opencl.h
new file mode 100644
index 0000000..77717ad
--- /dev/null
+++ b/common-opencl.h
@@ -0,0 +1,86 @@
+#ifndef _COMMON_OPENCL_H
+#define _COMMON_OPENCL_H
+
+#include <stdio.h>
+
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+/* Should there be an alternative cl_ext.h here? */
+#else
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#endif
+
+#define PATH_BUFFER_SIZE 4096
+#define MAXGPUS	8
+#define MAX_PLATFORMS	8
+#define SUBSECTION_OPENCL	":OpenCL"
+#define MAX_OCLINFO_STRING_LEN	64
+
+/* Comment if you do not want to see OpenCL warnings during kernel compilation */
+//#define REPORT_OPENCL_WARNINGS
+
+/* Common OpenCL variables */
+int gpu_id, platform_id;
+cl_platform_id platform[MAX_PLATFORMS];
+cl_device_id devices[MAXGPUS];
+cl_context context[MAXGPUS];
+cl_program program[MAXGPUS];
+cl_command_queue queue[MAXGPUS];
+cl_int ret_code;
+cl_kernel crypt_kernel;
+cl_event profilingEvent;
+size_t local_work_size;
+size_t global_work_size;
+size_t max_group_size;
+
+int device_info[MAXGPUS];
+int cores_per_MP[MAXGPUS];
+
+cl_int oclGetDevCap(cl_device_id device, cl_int *iComputeCapMajor, cl_int *iComputeCapMinor);
+
+void opencl_init_dev(unsigned int dev_id, unsigned int platform_id);
+void opencl_init(char *kernel_filename, unsigned int dev_id,
+                 unsigned int platform_id);
+void opencl_build_kernel(char *kernel_filename, unsigned int dev_id);
+
+cl_device_type get_device_type(int dev_id);
+cl_ulong get_local_memory_size(int dev_id);
+size_t get_max_work_group_size(int dev_id);
+size_t get_current_work_group_size(int dev_id, cl_kernel crypt_kernel);
+cl_uint get_max_compute_units(int dev_id);
+cl_uint get_processors_count(int dev_id);
+cl_uint get_processor_family(int dev_id);
+int get_vendor_id(int dev_id);
+int get_device_version(int dev_id);
+
+#define UNKNOWN                 0
+#define CPU                     1
+#define GPU                     2
+#define ACCELERATOR             4
+#define AMD                     64
+#define NVIDIA                  128
+#define INTEL                   256
+#define AMD_GCN                 1024
+#define AMD_VLIW4               2048
+#define AMD_VLIW5               4096
+
+#define cpu(n)                  ((n & CPU) == (CPU))
+#define gpu(n)                  ((n & GPU) == (GPU))
+#define gpu_amd(n)              ((n & AMD) && gpu(n))
+#define gpu_amd_64(n)           (0)
+#define gpu_nvidia(n)           ((n & NVIDIA) && gpu(n))
+#define gpu_intel(n)            ((n & INTEL) && gpu(n))
+#define cpu_amd(n)              ((n & AMD) && cpu(n))
+#define amd_gcn(n)              ((n & AMD_GCN) && gpu_amd(n))
+#define amd_vliw4(n)            ((n & AMD_VLIW4) && gpu_amd(n))
+#define amd_vliw5(n)            ((n & AMD_VLIW5) && gpu_amd(n))
+
+char *get_error_name(cl_int cl_error);
+
+void handle_clerror(cl_int cl_error, const char *message, const char *file, int line);
+
+#define HANDLE_CLERROR(cl_error, message) (handle_clerror(cl_error,message,__FILE__,__LINE__))
+
+
+#endif
diff --git a/md5.c b/md5.c
new file mode 100644
index 0000000..9c4cd32
--- /dev/null
+++ b/md5.c
@@ -0,0 +1,126 @@
+/* gcc -Wall md5.c common-opencl.c -o md5 -lOpenCL */
+#include <string.h>
+#include <stdio.h>
+#include "common-opencl.h"
+
+#define PLAINTEXT_LENGTH    31
+#define FORMAT_LABEL        "raw-md5-opencl"
+#define FORMAT_NAME         "Raw MD5"
+#define ALGORITHM_NAME      "OpenCL"
+#define BENCHMARK_COMMENT   ""
+#define BENCHMARK_LENGTH    -1
+#define CIPHERTEXT_LENGTH   32
+#define BINARY_SIZE         16
+#define SALT_SIZE           0
+
+cl_command_queue queue_prof;
+cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys,
+    data_info;
+static cl_uint *partial_hashes;
+static char *saved_plain;
+
+#define MIN_KEYS_PER_CRYPT      2048
+#define MAX_KEYS_PER_CRYPT      MD5_NUM_KEYS
+static unsigned int datai[2];
+
+static void print_hex(unsigned char *str, int len)
+{
+	int i;
+	for (i = 0; i < len; ++i)
+		printf("%02x", str[i]);
+	printf("\n");
+}
+
+static void create_clobj(int kpc)
+{
+	pinned_saved_keys =
+	    clCreateBuffer(context[gpu_id],
+	    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+	    (PLAINTEXT_LENGTH + 1) * kpc, NULL, &ret_code);
+	HANDLE_CLERROR(ret_code,
+	    "Error creating page-locked memory pinned_saved_keys");
+
+	saved_plain =
+	    (char *) clEnqueueMapBuffer(queue[gpu_id], pinned_saved_keys,
+	    CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0,
+	    (PLAINTEXT_LENGTH + 1) * kpc, 0, NULL, NULL, &ret_code);
+	HANDLE_CLERROR(ret_code,
+	    "Error mapping page-locked memory saved_plain");
+
+	pinned_partial_hashes = clCreateBuffer(context[gpu_id],
+	    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 16 * kpc, NULL,
+	    &ret_code);
+	HANDLE_CLERROR(ret_code,
+	    "Error creating page-locked memory pinned_partial_hashes");
+
+	partial_hashes = (cl_uint *) clEnqueueMapBuffer(queue[gpu_id],
+	    pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, 4 * kpc, 0, NULL,
+	    NULL, &ret_code);
+	HANDLE_CLERROR(ret_code,
+	    "Error mapping page-locked memory partial_hashes");
+
+	// create and set arguments
+	buffer_keys = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY,
+	    (PLAINTEXT_LENGTH + 1) * kpc, NULL, &ret_code);
+	HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_keys");
+
+	buffer_out = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY,
+	    BINARY_SIZE * kpc, NULL, &ret_code);
+	HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_out");
+
+	data_info =
+	    clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY,
+	    sizeof(unsigned int) * 2, NULL, &ret_code);
+	HANDLE_CLERROR(ret_code, "Error creating data_info out argument");
+
+	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 0, sizeof(data_info),
+		(void *) &data_info), "Error setting argument 0");
+	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof(buffer_keys),
+		(void *) &buffer_keys), "Error setting argument 1");
+	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof(buffer_out),
+		(void *) &buffer_out), "Error setting argument 2");
+
+	datai[0] = PLAINTEXT_LENGTH;
+	datai[1] = kpc;
+}
+
+int main()
+{
+	int kpc = 2048 * 1024;
+	unsigned char *p;
+	global_work_size = kpc;
+	local_work_size = 32;;
+
+	opencl_init("md5_kernel.cl", gpu_id, platform_id);
+	crypt_kernel = clCreateKernel(program[gpu_id], "md5", &ret_code);
+	create_clobj(kpc);
+
+	int i;
+	for (i = 0; i < kpc; i++) {
+		memcpy(&(saved_plain[i * (PLAINTEXT_LENGTH + 1)]), "openwall",
+		    PLAINTEXT_LENGTH + 1);
+	}
+
+	// copy keys to the device
+	HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], data_info, CL_TRUE,
+		0, sizeof(unsigned int) * 2, datai, 0, NULL, NULL),
+	    "failed in clEnqueueWriteBuffer data_info");
+	HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_keys,
+		CL_TRUE, 0, (PLAINTEXT_LENGTH + 1) * kpc, saved_plain, 0, NULL,
+		NULL), "failed in clEnqueueWriteBuffer buffer_keys");
+
+	HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1,
+		NULL, &global_work_size, &local_work_size, 0, NULL,
+		&profilingEvent), "failed in clEnqueueNDRangeKernel");
+	HANDLE_CLERROR(clFinish(queue[gpu_id]), "failed in clFinish");
+
+	// read back partial hashes
+	HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_out, CL_TRUE,
+		0, sizeof(cl_uint) * 4 * kpc, partial_hashes, 0, NULL, NULL),
+	    "failed in reading data back");
+
+	p = (unsigned char *) partial_hashes;
+	print_hex(p, 16);
+
+	return 0;
+}
diff --git a/md5_kernel.cl b/md5_kernel.cl
new file mode 100644
index 0000000..522df54
--- /dev/null
+++ b/md5_kernel.cl
@@ -0,0 +1,141 @@
+/* MD5 OpenCL kernel based on Solar Designer's MD5 algorithm implementation at:
+ * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
+ *
+ * This software is Copyright © 2010, Dhiru Kholia <dhiru.kholia at gmail.com>,
+ * and it is hereby released to the general public under the following terms:
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted.
+ *
+ * Useful References:
+ * 1. CUDA MD5 Hashing Experiments, http://majuric.org/software/cudamd5/
+ * 2. oclcrack, http://sghctoma.extra.hu/index.php?p=entry&id=11
+ * 3. http://people.eku.edu/styere/Encrypt/JS-MD5.html
+ * 4. http://en.wikipedia.org/wiki/MD5#Algorithm */
+
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : disable
+
+/* Macros for reading/writing chars from int32's (from rar_kernel.cl) */
+#define GETCHAR(buf, index) (((uchar*)(buf))[(index)])
+#define PUTCHAR(buf, index, val) (buf)[(index)>>2] = ((buf)[(index)>>2] & ~(0xffU << (((index) & 3) << 3))) + ((val) << (((index) & 3) << 3))
+
+/* The basic MD5 functions */
+#define F(x, y, z)			((z) ^ ((x) & ((y) ^ (z))))
+#define G(x, y, z)			((y) ^ ((z) & ((x) ^ (y))))
+#define H(x, y, z)			((x) ^ (y) ^ (z))
+#define I(x, y, z)			((y) ^ ((x) | ~(z)))
+
+/* The MD5 transformation for all four rounds. */
+#define STEP(f, a, b, c, d, x, t, s) \
+    (a) += f((b), (c), (d)) + (x) + (t); \
+    (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \
+    (a) += (b);
+
+#define GET(i) (key[(i)])
+
+/* OpenCL kernel entry point. Copy KEY_LENGTH bytes key to be hashed from
+ * global to local (thread) memory. Break the key into 16 32-bit (uint)
+ * words. MD5 hash of a key is 128 bit (uint4). */
+__kernel void md5(__global uint *data_info, __global const uint * keys, __global uint * hashes)
+{
+	int id = get_global_id(0);
+	uint key[16] = { 0 };
+	uint i;
+	uint num_keys = data_info[1];
+	uint KEY_LENGTH = data_info[0] + 1;
+
+	int base = id * (KEY_LENGTH / 4);
+
+	for (i = 0; i != (KEY_LENGTH / 4) && keys[base + i]; i++)
+		key[i] = keys[base + i];
+
+	/* padding code (borrowed from MD5_eq.c) */
+	char *p = (char *) key;
+	for (i = 0; i != 64 && p[i]; i++);
+
+        PUTCHAR(key, i, 0x80);
+        PUTCHAR(key, 56, i << 3);
+        PUTCHAR(key, 57, i >> 5);
+
+	uint a, b, c, d;
+	a = 0x67452301;
+	b = 0xefcdab89;
+	c = 0x98badcfe;
+	d = 0x10325476;
+
+	/* Round 1 */
+	STEP(F, a, b, c, d, GET(0), 0xd76aa478, 7)
+	STEP(F, d, a, b, c, GET(1), 0xe8c7b756, 12)
+	STEP(F, c, d, a, b, GET(2), 0x242070db, 17)
+	STEP(F, b, c, d, a, GET(3), 0xc1bdceee, 22)
+	STEP(F, a, b, c, d, GET(4), 0xf57c0faf, 7)
+	STEP(F, d, a, b, c, GET(5), 0x4787c62a, 12)
+	STEP(F, c, d, a, b, GET(6), 0xa8304613, 17)
+	STEP(F, b, c, d, a, GET(7), 0xfd469501, 22)
+	STEP(F, a, b, c, d, GET(8), 0x698098d8, 7)
+	STEP(F, d, a, b, c, GET(9), 0x8b44f7af, 12)
+	STEP(F, c, d, a, b, GET(10), 0xffff5bb1, 17)
+	STEP(F, b, c, d, a, GET(11), 0x895cd7be, 22)
+	STEP(F, a, b, c, d, GET(12), 0x6b901122, 7)
+	STEP(F, d, a, b, c, GET(13), 0xfd987193, 12)
+	STEP(F, c, d, a, b, GET(14), 0xa679438e, 17)
+	STEP(F, b, c, d, a, GET(15), 0x49b40821, 22)
+
+	/* Round 2 */
+	STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
+	STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
+	STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
+	STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
+	STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
+	STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
+	STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
+	STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
+	STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
+	STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
+	STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
+	STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
+	STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
+	STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
+	STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
+	STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)
+
+	/* Round 3 */
+	STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
+	STEP(H, d, a, b, c, GET(8), 0x8771f681, 11)
+	STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
+	STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23)
+	STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
+	STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11)
+	STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
+	STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23)
+	STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
+	STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11)
+	STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
+	STEP(H, b, c, d, a, GET(6), 0x04881d05, 23)
+	STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
+	STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11)
+	STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
+	STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23)
+
+	/* Round 4 */
+	STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
+	STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
+	STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
+	STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
+	STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
+	STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
+	STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
+	STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
+	STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
+	STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
+	STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
+	STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
+	STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
+	STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
+	STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
+	STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)
+
+	hashes[id * 4 + 0] = a + 0x67452301;
+	hashes[id * 4 + 1] = b + 0xefcdab89;
+	hashes[id * 4 + 2] = c + 0x98badcfe;
+	hashes[id * 4 + 3] = d + 0x10325476;
+}
