diff -urpN magnum-jumbo/src/opencl/phpass_kernel.cl magnum-jumbo_phpass_oldtest//src/opencl/phpass_kernel.cl --- magnum-jumbo/src/opencl/phpass_kernel.cl 2012-06-25 00:40:42.738451690 +0000 +++ magnum-jumbo_phpass_oldtest//src/opencl/phpass_kernel.cl 2012-06-25 00:50:52.014452056 +0000 @@ -1,65 +1,65 @@ /* -* This software is Copyright (c) 2011,2012 Lukas Odzioba +* This software is Copyright (c) 2011,2012 Lukas Odzioba * and it is hereby released to the general public under the following terms: * Redistribution and use in source and binary forms, with or without modification, are permitted. */ #define PLAINTEXT_LENGTH 15 -#define SALT_SIZE 8 -#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : disable - typedef struct { - //uint v[4]; //15 bytes password + last byte for length - uchar v[PLAINTEXT_LENGTH]; - uchar length; + unsigned char v[PLAINTEXT_LENGTH]; + unsigned char length; } phpass_password; typedef struct { - uint v[4]; + unsigned int v[4]; } phpass_hash; -#define ROTATE_LEFT(x, s) rotate(x,s) -//#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -//#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) - -#define F(x, y, z) bitselect((z), (y), (x)) -#define G(x, y, z) bitselect((y), (x), (z)) -#define H(x, y, z) ((x) ^ (y) ^ (z)) -#define I(x, y, z) ((y) ^ ((x) | (~z))) + +#define ROTATE_LEFT(x, s) ((x << s) | (x >> (32 - s))) +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) #define FF(a, b, c, d, x, s, ac) \ - {(a) += F ((b), (c), (d)) + (x) + (ac); \ - (a) = ROTATE_LEFT ((a), (s)) + (b); \ + {(a) += F ((b), (c), (d)) + (x) + (uint32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ } #define GG(a, b, c, d, x, s, ac) \ - {(a) += G ((b), (c), (d)) + (x) + (ac); \ - (a) = ROTATE_LEFT ((a), (s)) + (b); \ + {(a) += G ((b), (c), (d)) + (x) + (uint32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ } #define HH(a, b, c, d, x, s, ac) \ - {(a) += H ((b), (c), (d)) + (x) + (ac); \ - (a) = ROTATE_LEFT ((a), (s)) + (b); \ + {(a) += H ((b), (c), (d)) + (x) + (uint32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ } #define II(a, b, c, d, x, s, ac) \ - {(a) += I ((b), (c), (d)) + (x) + (ac); \ - (a) = ROTATE_LEFT ((a), (s)) + (b); \ + {(a) += I ((b), (c), (d)) + (x) + (uint32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ } -#define S11 7 -#define S12 12 -#define S13 17 -#define S14 22 -#define S21 5 -#define S22 9 -#define S23 14 -#define S24 20 -#define S31 4 -#define S32 11 -#define S33 16 -#define S34 23 -#define S41 6 -#define S42 10 -#define S43 15 -#define S44 21 +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 +#define uint32_t unsigned int +#define SALT_SIZE 8 #define AC1 0xd76aa477 #define AC2pCd 0xf8fa0bcc @@ -69,27 +69,20 @@ typedef struct { -inline void md5(uint4 len,__private uint4 * internal_ret,__private uint4 * x) +inline void cuda_md5(char len,__private uint32_t * internal_ret,__private uint32_t * x) { - uint4 x14 = len << 3; + x[len / 4] |= (((uint32_t) 0x80) << ((len & 0x3) << 3)); + uint32_t x14 = len << 3; - uint4 a; - uint4 b = 0xefcdab89; - uint4 c = 0x98badcfe; - uint4 d = 0x10325476; - - a = AC1 + x[0]; - a = ROTATE_LEFT(a, S11); - a += b; /* 1 */ - d = (c ^ (a & MASK1)) + x[1] + AC2pCd; - d = ROTATE_LEFT(d, S12); - d += a; /* 2 */ - c = F(d, a, b) + x[2] + AC3pCc; - c = ROTATE_LEFT(c, S13); - c += d; /* 3 */ - b = F(c, d, a) + x[3] + AC4pCb; - b = ROTATE_LEFT(b, S14); - b += c; + uint32_t a = 0x67452301; + uint32_t b = 0xefcdab89; + uint32_t c = 0x98badcfe; + uint32_t d = 0x10325476; + +FF(a, b, c, d, x[0], S11, 0xd76aa478); + FF(d, a, b, c, x[1], S12, 0xe8c7b756); + FF(c, d, a, b, x[2], S13, 0x242070db); + FF(b, c, d, a, x[3], S14, 0xc1bdceee); FF(a, b, c, d, x[4], S11, 0xf57c0faf); FF(d, a, b, c, x[5], S12, 0x4787c62a); FF(c, d, a, b, x[6], S13, 0xa8304613); @@ -160,132 +153,150 @@ inline void md5(uint4 len,__private uint internal_ret[3] = d + 0x10325476; } -inline void clear_ctx(__private uint4 * x) +inline void clear_ctx(__private uint32_t * x) { - uint4 zero = (uint4) (0, 0,0,0); - for (int i = 0; i < 8; i++) - x[i] = zero; -} -inline void clean_ctx(__private uint *x){ - for(int i=0;i<8;i++) - x[i]=0; + int i; + for (i = 0; i < 8; i++) + *x++ = 0; } + __kernel void phpass ( __global const phpass_password* data , __global phpass_hash* data_out , __global const char* setting ) { - uint4 x[8],length; - uint sx[8],i,idx = get_global_id(0); - uint count = 1 << setting[SALT_SIZE+3]; - + uint32_t x[8]; clear_ctx(x); + uint32_t a, b, c, d, x0, x1, x2, x3, x4, x5, x6, x7; - __global const uchar *password0=data[idx*4+0].v; - __global const uchar *password1=data[idx*4+1].v; - __global const uchar *password2=data[idx*4+2].v; - __global const uchar *password3=data[idx*4+3].v; - - - - length.s0=data[idx*4+0].length; - length.s1=data[idx*4+1].length; - length.s2=data[idx*4+2].length; - length.s3=data[idx*4+3].length; - - __private uint *buff2=(uint*)sx; - #define K1(q)\ - clean_ctx(sx);\ - buff2[0]=(setting[3]<<24)|setting[2]<<16|setting[1]<<8|setting[0];\ - buff2[1]=(setting[7]<<24)|setting[6]<<16|setting[5]<<8|setting[4];\ - buff2[2]=(password##q[3]<<24)|password##q[2]<<16|password##q[1]<<8|password##q[0];\ - buff2[3]=(password##q[7]<<24)|password##q[6]<<16|password##q[5]<<8|password##q[4];\ - buff2[4]=(password##q[11]<<24)|password##q[10]<<16|password##q[9]<<8|password##q[8];\ - buff2[5]=password##q[14]<<16|password##q[13]<<8|password##q[12];\ - for ( i = 0; i < 8; i++)\ - x[i].s##q=sx[i]; - K1(0); - K1(1); - K1(2); - K1(3); - #undef K1 - - -/* - #define K1(q)\ - clean_ctx(sx);\ - for (i = 0; i < 8; i++)\ - buff[i] = setting[i];\ - for (i = 8; i < 8 + length.s##q; i++)\ - buff[i] = password##q[i - 8];\ - for ( i = 0; i < 8; i++)\ - x[i].s##q=sx[i]; - K1(0); - K1(1); - K1(2); - K1(3); - #undef K1 -*/ - - uint4 len=length+(uint4)(8); - - x[len.s0 / 4].s0 |= (((uint) 0x80) << ((len.s0 & 0x3) << 3)); - x[len.s1 / 4].s1 |= (((uint) 0x80) << ((len.s1 & 0x3) << 3)); - x[len.s2 / 4].s2 |= (((uint) 0x80) << ((len.s2 & 0x3) << 3)); - x[len.s3 / 4].s3 |= (((uint) 0x80) << ((len.s3 & 0x3) << 3)); - - md5(len, x, x); - - - -/*#define K2(q)\ - clean_ctx(sx);\ - for(i=0;i +* This software is Copyright (c) 2011,2012 Lukas Odzioba * and it is hereby released to the general public under the following terms: * Redistribution and use in source and binary forms, with or without modification, are permitted. */ @@ -15,22 +15,24 @@ #define uint32_t unsigned int #define uint8_t unsigned char -#define FORMAT_LABEL "phpass-opencl" -#define FORMAT_NAME "phpass MD5" - -#define ALGORITHM_NAME "OpenCL" - -#define BENCHMARK_COMMENT " ($P$9 length 8)" -#define BENCHMARK_LENGTH -1 +#define PHPASS_TYPE "PORTABLE-MD5" #define PLAINTEXT_LENGTH 15 #define CIPHERTEXT_LENGTH 34 /// header = 3 | loopcnt = 1 | salt = 8 | ciphertext = 22 #define BINARY_SIZE 16 #define SALT_SIZE 8 -#define KEYS_PER_CRYPT 1024*9*4 +#define KEYS_PER_CRYPT 1024*9 #define MIN_KEYS_PER_CRYPT KEYS_PER_CRYPT #define MAX_KEYS_PER_CRYPT KEYS_PER_CRYPT +#define FORMAT_LABEL "phpass-opencl" +#define FORMAT_NAME "phpass MD5" + +#define ALGORITHM_NAME "OpenCL" + +#define BENCHMARK_COMMENT " ($P$9 length 8)" +#define BENCHMARK_LENGTH -1 + //#define _PHPASS_DEBUG @@ -43,8 +45,8 @@ typedef struct { uint32_t v[4]; ///128bits for hash } phpass_hash; -static phpass_password *inbuffer;//[MAX_KEYS_PER_CRYPT]; /** plaintext ciphertexts **/ -static phpass_hash *outbuffer;//[MAX_KEYS_PER_CRYPT]; /** calculated hashes **/ +static phpass_password inbuffer[MAX_KEYS_PER_CRYPT]; /** plaintext ciphertexts **/ +static phpass_hash outbuffer[MAX_KEYS_PER_CRYPT]; /** calculated hashes **/ static const char phpass_prefix[] = "$P$"; static char currentsalt[SALT_SIZE + 1]; @@ -57,7 +59,7 @@ static cl_mem mem_in, mem_out, mem_setti static size_t insize = sizeof(phpass_password) * KEYS_PER_CRYPT; static size_t outsize = sizeof(phpass_hash) * KEYS_PER_CRYPT; static size_t settingsize = sizeof(uint8_t) * SALT_SIZE + 4; -static size_t global_work_size = KEYS_PER_CRYPT/4; +static size_t global_work_size = KEYS_PER_CRYPT; static struct fmt_tests tests[] = { @@ -72,15 +74,15 @@ static struct fmt_tests tests[] = { {"$P$9sadli2.wzQIuzsR2nYVhUSlHNKgG/0", "john"}, {"$P$90000000000tbNYOc9TwXvLEI62rPt1", ""}, */ - /*{"$P$9saltstriAcRMGl.91RgbAD6WSq64z.", "a"}, + {"$P$9saltstriAcRMGl.91RgbAD6WSq64z.", "a"}, {"$P$9saltstriMljTzvdluiefEfDeGGQEl/", "ab"}, {"$P$9saltstrikCftjZCE7EY2Kg/pjbl8S.", "abc"}, {"$P$9saltstriV/GXRIRi9UVeMLMph9BxF0", "abcd"}, {"$P$9saltstri3JPgLni16rBZtI03oeqT.0", "abcde"}, {"$P$9saltstri0D3A6JyITCuY72ZoXdejV.", "abcdef"}, - {"$P$9saltstriXeNc.xV8N.K9cTs/XEn13.", "abcdefg"}, */ + {"$P$9saltstriXeNc.xV8N.K9cTs/XEn13.", "abcdefg"}, {"$P$9saltstrinwvfzVRP3u1gxG2gTLWqv.", "abcdefgh"}, - /* + {"$P$9saltstriSUQTD.yC2WigjF8RU0Q.Z.", "abcdefghi"}, {"$P$9saltstriWPpGLG.jwJkwGRwdKNEsg.", "abcdefghij"}, {"$P$9saltstrizjDEWUMXTlQHQ3/jhpR4C.", "abcdefghijk"}, @@ -88,7 +90,7 @@ static struct fmt_tests tests[] = { {"$P$9saltstriq7s97e2m7dXnTEx2mtPzx.", "abcdefghijklm"}, {"$P$9saltstriTWMzWKsEeiE7CKOVVU.rS0", "abcdefghijklmn"}, {"$P$9saltstriXt7EDPKtkyRVOqcqEW5UU.", "abcdefghijklmno"}, - */ + {NULL} }; @@ -99,8 +101,6 @@ static void release_all(void) HANDLE_CLERROR(clReleaseMemObject(mem_setting), "Release mem setting"); HANDLE_CLERROR(clReleaseMemObject(mem_out), "Release mem out"); HANDLE_CLERROR(clReleaseCommandQueue(queue[gpu_id]), "Release Queue"); - free(inbuffer); - free(outbuffer); } static void set_key(char *key, int index) @@ -121,21 +121,71 @@ static char *get_key(int index) return ret; } +static void find_best_workgroup() +{ + cl_event myEvent; + cl_ulong startTime, endTime, kernelExecTimeNs = CL_ULONG_MAX; + size_t my_work_group = 1; + cl_int ret_code; + int i; + size_t max_group_size; + clGetDeviceInfo(devices[gpu_id], CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(max_group_size), &max_group_size, NULL); + cl_command_queue queue_prof = + clCreateCommandQueue(context[gpu_id], devices[gpu_id], + CL_QUEUE_PROFILING_ENABLE, + &ret_code); + HANDLE_CLERROR(ret_code, "Error while creating command queue"); + local_work_size = 1; + /// Set keys + char *pass = "aaaaaaaa"; + for (i = 0; i < KEYS_PER_CRYPT; i++) { + set_key(pass, i); + } + ///Set salt + memcpy(currentsalt, "saltstri9", 9); + char setting[SALT_SIZE + 3 + 1] = { 0 }; + strcpy(setting, currentsalt); + strcpy(setting + SALT_SIZE, phpass_prefix); + setting[SALT_SIZE + 3] = atoi64[ARCH_INDEX(currentsalt[8])]; + + ///Copy data to GPU + HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, mem_in, CL_FALSE, 0, + insize, inbuffer, 0, NULL, NULL), "Copy data to gpu"); + HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, mem_setting, CL_FALSE, + 0, settingsize, setting, 0, NULL, NULL), + "Copy setting to gpu"); + + ///Find best local work size + for (my_work_group = 1; (int) my_work_group <= (int) max_group_size; + my_work_group *= 2) { + + HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, crypt_kernel, + 1, NULL, &global_work_size, &my_work_group, 0, NULL, + &myEvent), "Run kernel"); + + HANDLE_CLERROR(clFinish(queue_prof), "clFinish error"); + clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, + sizeof(cl_ulong), &startTime, NULL); + clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &endTime, NULL); + + if ((endTime - startTime) < kernelExecTimeNs) { + kernelExecTimeNs = endTime - startTime; + local_work_size = my_work_group; + } + //printf("%d time=%lld\n",(int) my_work_group, endTime-startTime); + } + printf("Optimal local work size = %d\n", (int) local_work_size); + clReleaseCommandQueue(queue_prof); +} + static void init(struct fmt_main *pFmt) { - atexit(release_all); + //atexit(release_all); opencl_init("$JOHN/phpass_kernel.cl", gpu_id,platform_id); /// Alocate memory - inbuffer = - (phpass_password *) calloc(MAX_KEYS_PER_CRYPT, - sizeof(phpass_password)); - assert(inbuffer != NULL); - outbuffer = - (phpass_hash *) calloc(MAX_KEYS_PER_CRYPT, - sizeof(phpass_hash)); - assert(inbuffer != NULL); - cl_int cl_error; mem_in = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, insize, NULL, @@ -153,14 +203,11 @@ static void init(struct fmt_main *pFmt) /// Setup kernel parameters crypt_kernel = clCreateKernel(program[gpu_id], "phpass", &cl_error); HANDLE_CLERROR(cl_error, "Error creating kernel"); - HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 0, sizeof(mem_in), - &mem_in), "Error while setting mem_in"); - HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof(mem_out), - &mem_out), "Error while setting mem_out"); - HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof(mem_setting), - &mem_setting), "Error while setting mem_setting"); + clSetKernelArg(crypt_kernel, 0, sizeof(mem_in), &mem_in); + clSetKernelArg(crypt_kernel, 1, sizeof(mem_out), &mem_out); + clSetKernelArg(crypt_kernel, 2, sizeof(mem_setting), &mem_setting); - opencl_find_best_workgroup(pFmt); + find_best_workgroup(); } static int valid(char *ciphertext, struct fmt_main *pFmt) @@ -252,7 +299,7 @@ static void crypt_all(int count) /// Run kernel HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, - NULL, &global_work_size, &local_work_size, 0, NULL, &profilingEvent), + NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "Run kernel"); HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish");