>From 2ccd9e1d08b7d8d7069c6fd2d6146b14013cf503 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Claudio=20Andr=C3=A9?= <claudio@claudioandre-desktop.(none)>
Date: Mon, 12 Mar 2012 16:25:48 -0300
Subject: [PATCH] OpenCL Crypt 512 (development)

---
 src/Makefile                     |    4 +-
 src/common-opencl.c              |   11 +-
 src/john.c                       |    2 +
 src/opencl/cryptsha512_kernel.cl |  315 ++++++++++++++++++++++
 src/opencl_cryptsha512.h         |   79 ++++++
 src/opencl_cryptsha512_fmt.c     |  548 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 957 insertions(+), 2 deletions(-)
 create mode 100644 src/opencl/cryptsha512_kernel.cl
 create mode 100644 src/opencl_cryptsha512.h
 create mode 100644 src/opencl_cryptsha512_fmt.c

diff --git a/src/Makefile b/src/Makefile
index ec51902..9448b59 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -119,7 +119,8 @@ JOHN_OBJS = \
 OCL_OBJS = \
 	common-opencl.o opencl_mysqlsha1_fmt.o \
 	opencl_cryptmd5_fmt.o opencl_phpass_fmt.o opencl_rawsha1_fmt.o \
-	opencl_nt_fmt.o opencl_rawmd5_fmt.o  opencl_nsldaps_fmt.o
+	opencl_nt_fmt.o opencl_rawmd5_fmt.o  opencl_nsldaps_fmt.o \
+        opencl_cryptsha512_fmt.o
 
 CUDA_OBJS = \
 	cuda_common.o \
@@ -318,6 +319,7 @@ linux-x86-64-opencl:
 		CFLAGS="$(CFLAGS) -I$(OCLROOT)/include -I$(OCLROOT)/include -DHAVE_CRYPT -DCL_VERSION_1_0 -DHAVE_DL" \
 		LDFLAGS="$(LDFLAGS) -L$(OCLROOT)/lib/x86_64 -L$(OCLROOT)/lib64 -lcrypt -lOpenCL -ldl"
 	$(CP) opencl/*.cl ../run/
+	$(CP) opencl_cryptsha512.h ../run/
 
 linux-x86-64-cuda:
 	$(LN) x86-64.h arch.h
diff --git a/src/common-opencl.c b/src/common-opencl.c
index 02f2d6e..96dfec2 100644
--- a/src/common-opencl.c
+++ b/src/common-opencl.c
@@ -83,6 +83,14 @@ static void dev_init(unsigned int dev_id, unsigned int platform_id)
 	HANDLE_CLERROR(ret_code, "Error creating command queue");
 }
 
+static char * include_source(char *pathname)
+{
+	static char include[PATH_BUFFER_SIZE];        
+        sprintf(include, "-I %s", path_expand(pathname));        
+        
+        return include;
+}
+
 
 static void build_kernel(int dev_id)
 {
@@ -94,7 +102,8 @@ static void build_kernel(int dev_id)
 	HANDLE_CLERROR(ret_code, "Error while creating program");
 
 	cl_int build_code;
-	build_code = clBuildProgram(program[dev_id], 0, NULL, "", NULL, NULL);
+	build_code = clBuildProgram(program[dev_id], 0, NULL, 
+                include_source("$JOHN/"), NULL, NULL);
 
 	HANDLE_CLERROR(clGetProgramBuildInfo(program[dev_id], devices[dev_id],
 		CL_PROGRAM_BUILD_LOG, sizeof(opencl_log), (void *) opencl_log,
diff --git a/src/john.c b/src/john.c
index c18c13c..bb59bd4 100644
--- a/src/john.c
+++ b/src/john.c
@@ -110,6 +110,7 @@ extern struct fmt_main fmt_opencl_rawSHA1;
 extern struct fmt_main fmt_opencl_cryptMD5;
 extern struct fmt_main fmt_opencl_phpass;
 extern struct fmt_main fmt_opencl_mysqlsha1;
+extern struct fmt_main fmt_opencl_cryptsha512;
 #endif 
 #ifdef HAVE_CUDA
 extern struct fmt_main fmt_cuda_cryptmd5;
@@ -223,6 +224,7 @@ static void john_register_all(void)
 	john_register_one(&fmt_opencl_cryptMD5);
 	john_register_one(&fmt_opencl_phpass);
 	john_register_one(&fmt_opencl_mysqlsha1);
+	john_register_one(&fmt_opencl_cryptsha512);
 #endif 
 
 #ifdef HAVE_CUDA
diff --git a/src/opencl/cryptsha512_kernel.cl b/src/opencl/cryptsha512_kernel.cl
new file mode 100644
index 0000000..95e1f12
--- /dev/null
+++ b/src/opencl/cryptsha512_kernel.cl
@@ -0,0 +1,315 @@
+/*
+* This software is Copyright (c) 2011 Lukas Odzioba <lukas dot odzioba at gmail dot com> 
+* and it is hereby released to the general public under the following terms:
+* Redistribution and use in source and binary forms, with or without modification, are permitted.
+*/
+
+#include "opencl_cryptsha512.h"
+//#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+__constant uint64_t k[] = {
+    0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, 0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL,
+    0x3956c25bf348b538UL, 0x59f111f1b605d019UL, 0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL,
+    0xd807aa98a3030242UL, 0x12835b0145706fbeUL, 0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL,
+    0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL, 0x9bdc06a725c71235UL, 0xc19bf174cf692694UL,
+    0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL, 0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL,
+    0x2de92c6f592b0275UL, 0x4a7484aa6ea6e483UL, 0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL,
+    0x983e5152ee66dfabUL, 0xa831c66d2db43210UL, 0xb00327c898fb213fUL, 0xbf597fc7beef0ee4UL,
+    0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL, 0x06ca6351e003826fUL, 0x142929670a0e6e70UL,
+    0x27b70a8546d22ffcUL, 0x2e1b21385c26c926UL, 0x4d2c6dfc5ac42aedUL, 0x53380d139d95b3dfUL,
+    0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL, 0x81c2c92e47edaee6UL, 0x92722c851482353bUL,
+    0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL, 0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL,
+    0xd192e819d6ef5218UL, 0xd69906245565a910UL, 0xf40e35855771202aUL, 0x106aa07032bbd1b8UL,
+    0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL, 0x2748774cdf8eeb99UL, 0x34b0bcb5e19b48a8UL,
+    0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL, 0x5b9cca4f7763e373UL, 0x682e6ff3d6b2b8a3UL,
+    0x748f82ee5defb2fcUL, 0x78a5636f43172f60UL, 0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL,
+    0x90befffa23631e28UL, 0xa4506cebde82bde9UL, 0xbef9a3f7b2c67915UL, 0xc67178f2e372532bUL,
+    0xca273eceea26619cUL, 0xd186b8c721c0c207UL, 0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL,
+    0x06f067aa72176fbaUL, 0x0a637dc5a2c898a6UL, 0x113f9804bef90daeUL, 0x1b710b35131c471bUL,
+    0x28db77f523047d84UL, 0x32caab7b40c72493UL, 0x3c9ebe0a15c9bebcUL, 0x431d67c49c100d4cUL,
+    0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL, 0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL,
+};
+
+void init_ctx(sha512_ctx * ctx) {
+    ctx->H[0] = 0x6a09e667f3bcc908UL;
+    ctx->H[1] = 0xbb67ae8584caa73bUL;
+    ctx->H[2] = 0x3c6ef372fe94f82bUL;
+    ctx->H[3] = 0xa54ff53a5f1d36f1UL;
+    ctx->H[4] = 0x510e527fade682d1UL;
+    ctx->H[5] = 0x9b05688c2b3e6c1fUL;
+    ctx->H[6] = 0x1f83d9abfb41bd6bUL;
+    ctx->H[7] = 0x5be0cd19137e2179UL;
+    ctx->total = 0;
+    ctx->buflen = 0;
+}
+
+void memcpy_1(uint8_t * dest, const uint8_t * src, const size_t n) {
+    for (int i = 0; i < n; i++)
+        dest[i] = src[i];
+}
+
+void memcpy (uint8_t * dest, buffer_64 * src, const size_t n) {
+    for (int i = 0; i < n; i++)
+        dest[i] = src->mem_08[i];
+}
+
+void insert_to_buffer(sha512_ctx * ctx, const uint8_t * string,
+                      const uint8_t len) {
+    uint8_t *d = ctx->buffer->mem_08 + ctx->buflen;  //Position ctx->buffer[buflen] (in char size)
+    memcpy_1(d, string, len);
+    ctx->buflen += len;
+}
+
+void sha512_block(sha512_ctx * ctx) {
+    int i;
+    uint64_t a = ctx->H[0];
+    uint64_t b = ctx->H[1];
+    uint64_t c = ctx->H[2];
+    uint64_t d = ctx->H[3];
+    uint64_t e = ctx->H[4];
+    uint64_t f = ctx->H[5];
+    uint64_t g = ctx->H[6];
+    uint64_t h = ctx->H[7];
+
+    uint64_t w[16];
+
+    uint64_t *data = ctx->buffer->mem_64;  //The same as buffer[0]
+    //#pragma unroll 16
+    for (i = 0; i < 16; i++)
+        w[i] = SWAP64(data[i]);
+
+    uint64_t t1, t2;
+    //#pragma unroll 16
+    for (i = 0; i < 16; i++) {
+        t1 = k[i] + w[i] + h + Sigma1(e) + Ch(e, f, g);
+        t2 = Maj(a, b, c) + Sigma0(a);
+
+        h = g;
+        g = f;
+        f = e;
+        e = d + t1;
+        d = c;
+        c = b;
+        b = a;
+        a = t1 + t2;
+    }
+
+
+    for (i = 16; i < 80; i++) {
+        w[i & 15] = sigma1(w[(i - 2) & 15]) + sigma0(w[(i - 15) & 15]) + w[(i - 16) & 15] + w[(i - 7) & 15];
+        t1 = k[i] + w[i & 15] + h + Sigma1(e) + Ch(e, f, g);
+        t2 = Maj(a, b, c) + Sigma0(a);
+
+        h = g;
+        g = f;
+        f = e;
+        e = d + t1;
+        d = c;
+        c = b;
+        b = a;
+        a = t1 + t2;
+    }
+    /* Put checksum in context given as argument. */
+    ctx->H[0] += a;
+    ctx->H[1] += b;
+    ctx->H[2] += c;
+    ctx->H[3] += d;
+    ctx->H[4] += e;
+    ctx->H[5] += f;
+    ctx->H[6] += g;
+    ctx->H[7] += h;
+}
+
+void ctx_append_1(sha512_ctx * ctx) {
+    uint32_t length = ctx->buflen;
+    int i = 127 - length;
+    uint8_t *d = ctx->buffer->mem_08 + length;
+    *d++ = 0x80;
+
+    while (i--) {
+        d[i] = 0;
+    }
+
+/* TODO: 
+     while(  length%4!=0)
+160     {  *d  =0;
+161     i--;
+162     }
+163     x=(uint32_t*)d;
+164     while(i>0)
+165     {  i-=4;
+166         *x  =0;
+167     }
+*/
+}
+
+void ctx_add_length(sha512_ctx * ctx) {
+    uint64_t *blocks = ctx->buffer->mem_64;
+    blocks[15] = SWAP64((uint64_t) (ctx->total * 8));
+}
+
+void finish_ctx(sha512_ctx * ctx) {
+    ctx_append_1(ctx);
+    ctx_add_length(ctx);
+    ctx->buflen = 0;
+}
+
+void ctx_update(sha512_ctx * ctx, uint8_t *string, uint8_t len) {
+    ctx->total += len;
+    uint8_t startpos = ctx->buflen;
+    uint8_t partsize;
+    if (startpos + len <= 128) {
+        partsize = len;
+    } else
+        partsize = 128 - startpos;
+
+    insert_to_buffer(ctx, string, partsize);
+    if (ctx->buflen == 128) {
+        uint8_t offset = 128 - startpos;
+        sha512_block(ctx);
+        ctx->buflen = 0;
+        insert_to_buffer(ctx, (string + offset), len - offset);
+    }
+}
+
+void clear_ctx_buffer(sha512_ctx * ctx) {
+
+    uint32_t *w = ctx->buffer->mem_32;
+    //#pragma unroll 30
+    for (int i = 0; i < 30; i++) //TODO: why 30? Not 32?
+        w[i] = 0;
+
+    ctx->buflen = 0;
+}
+
+void sha512_digest(sha512_ctx * ctx, uint64_t * result) {
+    uint8_t i;
+    if (ctx->buflen <= 111) { //data+0x80+datasize fits in one 1024bit block
+        finish_ctx(ctx);
+        sha512_block(ctx);
+    } else {
+        uint8_t moved = 1;
+        if (ctx->buflen < 128) { //data and 0x80 fits in one block
+            ctx_append_1(ctx);
+            moved = 0;
+        }
+        sha512_block(ctx);
+        clear_ctx_buffer(ctx);
+        if (moved)
+            ctx->buffer->mem_08[0] = 0x80; //append 1,the rest is already clean
+        ctx_add_length(ctx);
+        sha512_block(ctx);
+    }
+    //#pragma unroll 8
+    for (i = 0; i < 8; i++)
+        result[i] = SWAP64(ctx->H[i]);
+}
+
+void sha512crypt(uint8_t *pass, uint8_t passlength,
+                 crypt_sha512_salt cuda_salt, 
+                 __global crypt_sha512_hash * output) {
+
+    buffer_64 alt_result[8], temp_result[8];
+    int i;
+    sha512_ctx ctx;
+    init_ctx(&ctx);
+
+    ctx_update(&ctx, pass, passlength);
+    ctx_update(&ctx, cuda_salt.salt, cuda_salt.saltlen);
+    ctx_update(&ctx, pass, passlength);
+
+    sha512_digest(&ctx, alt_result->mem_64);
+    init_ctx(&ctx);
+
+    ctx_update(&ctx, pass, passlength);
+    ctx_update(&ctx, cuda_salt.salt, cuda_salt.saltlen);
+    ctx_update(&ctx, alt_result->mem_08, passlength);
+
+    for (i = passlength; i > 0; i >>= 1) {
+        if ((i & 1) != 0)
+            ctx_update(&ctx, alt_result->mem_08, 64);
+        else
+            ctx_update(&ctx, pass, passlength);
+    }
+    sha512_digest(&ctx, alt_result->mem_64);
+    init_ctx(&ctx);
+
+    for (i = 0; i < passlength; i++)
+        ctx_update(&ctx, pass, passlength);
+
+    sha512_digest(&ctx, temp_result->mem_64);
+
+    uint8_t sp_sequence[16 + 4];
+    uint8_t *p_sequence = sp_sequence;
+    memcpy(p_sequence, temp_result, passlength);
+
+    init_ctx(&ctx);
+    
+    /* For every character in the password add the entire password.  */
+    for (i = 0; i < 16 + (alt_result->mem_08)[0]; i++)  //Analyse, TÁ CERTO?###
+        ctx_update(&ctx, cuda_salt.salt, cuda_salt.saltlen);
+
+    /* Finish the digest.  */
+    sha512_digest(&ctx, temp_result->mem_64);
+
+    uint8_t saltlength = cuda_salt.saltlen;
+
+    uint8_t ss_sequence[16 + 4];
+    uint8_t *s_sequence = ss_sequence;
+    memcpy(s_sequence, temp_result, saltlength);
+
+    /* Repeatedly run the collected hash value through SHA512 to
+       burn CPU cycles.  */
+    for (i = 0; i < cuda_salt.rounds; i++) {
+        init_ctx(&ctx);
+
+        if ((i & 1) != 0)
+            ctx_update(&ctx, p_sequence, passlength);
+        else
+            ctx_update(&ctx, alt_result->mem_08, 64);  
+
+        if ((i % 3) != 0)
+            ctx_update(&ctx, s_sequence, saltlength);
+
+        if ((i % 7) != 0)
+            ctx_update(&ctx, p_sequence, passlength);
+
+        if ((i & 1) != 0)
+            ctx_update(&ctx, alt_result->mem_08, 64);  
+        else
+            ctx_update(&ctx, p_sequence, passlength);
+
+        sha512_digest(&ctx, alt_result->mem_64);
+    }
+    //Send results to the host.
+    //#pragma unroll 8
+    for (i = 0; i < 8; i++)
+        output->v[i] = alt_result[i].mem_64[0];
+}
+
+__kernel void kernel_crypt(__constant crypt_sha512_salt * hsalt,
+                           __constant crypt_sha512_password * inbuffer,
+                           __global   crypt_sha512_hash * outbuffer) {
+
+    uint8_t pass[PLAINTEXT_LENGTH];
+    crypt_sha512_salt salt_data;
+
+    //Get the task to be done
+    uint32_t idx = get_global_id(0);
+
+    //Use fast memory.
+
+    //Get password information, put in faster memory.
+    for (int i = 0; i < inbuffer[idx].length; i++)
+        pass[i] = inbuffer[idx].v[i]; 
+    
+    //Get salt information, put in faster memory.
+    salt_data.saltlen = hsalt->saltlen;
+    salt_data.rounds = hsalt->rounds;
+
+    for (int i = 0; i < salt_data.saltlen; i++)
+	salt_data.salt[i] = hsalt->salt[i];
+
+    //Do the job
+    sha512crypt(pass, inbuffer[idx].length, salt_data, &outbuffer[idx]);
+}
diff --git a/src/opencl_cryptsha512.h b/src/opencl_cryptsha512.h
new file mode 100644
index 0000000..f8f1852
--- /dev/null
+++ b/src/opencl_cryptsha512.h
@@ -0,0 +1,79 @@
+/*
+* This software is Copyright (c) 2011 Lukas Odzioba <lukas dot odzioba at gmail dot com> 
+* and it is hereby released to the general public under the following terms:
+* Redistribution and use in source and binary forms, with or without modification, are permitted.
+*/
+#ifndef _CRYPTSHA512_H 
+#define _CRYPTSHA512_H
+
+//Type names definition. ///TODO: move to a new file and share this new file where needed.
+#define uint8_t  unsigned char
+#define uint16_t unsigned short
+#define uint32_t unsigned int
+#define uint64_t ulong  //Tip: unsigned long long int failed on compile (AMD).
+
+//Functions.
+#define MAX(x,y)                ((x) > (y) ? (x) : (y))
+#define MIN(x,y)                ((x) < (y) ? (x) : (y))
+
+#define ROUNDS_DEFAULT          5000
+#define ROUNDS_MIN              1000
+#define ROUNDS_MAX              999999999
+
+#define SALT_SIZE               16
+#define PLAINTEXT_LENGTH        16     
+#define KEYS_PER_CRYPT          1024*2048
+
+#define rol(x,n)                ((x << n) | (x >> (64-n)))
+#define ror(x,n)                ((x >> n) | (x << (64-n)))
+#define Ch(x,y,z)               ((x & y) ^ ( (~x) & z))
+#define Maj(x,y,z)              ((x & y) ^ (x & z) ^ (y & z))
+#define Sigma0(x)               ((ror(x,28)) ^ (ror(x,34)) ^ (ror(x,39)))
+#define Sigma1(x)               ((ror(x,14)) ^ (ror(x,18)) ^ (ror(x,41)))
+#define sigma0(x)               ((ror(x,1))  ^ (ror(x,8))  ^ (x>>7))
+#define sigma1(x)               ((ror(x,19)) ^ (ror(x,61)) ^ (x>>6))
+
+# define SWAP32(n) \
+    (((n) << 24) | (((n) & 0xff00) << 8) | (((n) >> 8) & 0xff00) | ((n) >> 24))
+
+# define SWAP64(n) \
+  (((n) << 56)					\
+   | (((n) & 0xff00) << 40)			\
+   | (((n) & 0xff0000) << 24)			\
+   | (((n) & 0xff000000) << 8)			\
+   | (((n) >> 8) & 0xff000000)			\
+   | (((n) >> 24) & 0xff0000)			\
+   | (((n) >> 40) & 0xff00)			\
+   | ((n) >> 56))
+
+//Data types.
+typedef union {
+    uint8_t  mem_08[8];
+    uint16_t mem_16[4];
+    uint32_t mem_32[2];
+    uint64_t mem_64[1];
+} buffer_64;
+
+typedef struct {
+	uint64_t  H[8];          //512 bits
+	uint32_t  total;
+	uint32_t  buflen;
+	buffer_64 buffer[16];	//1024bits
+} sha512_ctx;
+
+typedef struct {
+	uint32_t rounds;
+	uint8_t  saltlen;
+	uint8_t  salt[SALT_SIZE];
+} crypt_sha512_salt;
+
+typedef struct {
+	uint8_t length;
+	uint8_t v[PLAINTEXT_LENGTH];
+} crypt_sha512_password;
+
+typedef struct {
+	uint64_t v[8];		//512 bits
+} crypt_sha512_hash;
+
+#endif
\ No newline at end of file
diff --git a/src/opencl_cryptsha512_fmt.c b/src/opencl_cryptsha512_fmt.c
new file mode 100644
index 0000000..dfaa949
--- /dev/null
+++ b/src/opencl_cryptsha512_fmt.c
@@ -0,0 +1,548 @@
+/*
+ * Copyright (c) 2011 Samuele Giovanni Tonon
+ * samu at linuxasylum dot net
+ * This program comes with ABSOLUTELY NO WARRANTY; express or
+ * implied .
+ * This is free software, and you are welcome to redistribute it
+ * under certain conditions; as expressed here 
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ */
+
+#include <string.h>
+#include "common-opencl.h"  
+#include "opencl_cryptsha512.h"
+
+#define FORMAT_LABEL			"cryptsha512-opencl"
+#define FORMAT_NAME			"crypt SHA-512 OpenCL"
+
+#if ARCH_BITS >= 64
+#define ALGORITHM_NAME			"OpenSSL 64/" ARCH_BITS_STR
+#else
+#define ALGORITHM_NAME			"OpenSSL 32/" ARCH_BITS_STR
+#endif
+
+#define BENCHMARK_COMMENT		" rounds=5000"
+#define BENCHMARK_LENGTH		-1
+
+#define BINARY_SIZE                     (3+16+86)       ///TODO: Magic number?
+
+#define MIN_KEYS_PER_CRYPT		1024            
+#define MAX_KEYS_PER_CRYPT		KEYS_PER_CRYPT
+
+static crypt_sha512_password            *plaintext;     // plaintext ciphertexts
+static crypt_sha512_hash                *out_hashes;    // calculated hashes
+static crypt_sha512_salt                salt_data;
+
+cl_mem salt_info;       //Salt information.
+cl_mem buffer_in;       //Plaintext buffer.
+cl_mem buffer_out;      //Hash keys (output)
+cl_mem pinned_saved_keys, pinned_partial_hashes;
+
+cl_command_queue queue_prof;
+cl_kernel crypt_kernel;
+
+static size_t max_keys_per_crypt = KEYS_PER_CRYPT;
+
+static struct fmt_tests tests[] = {
+    {"$6$saltstring$svn8UoSVapNtMuq1ukKS4tPQd8iKwSMHWjl/O817G3uBnIFNjnQJuesI68u4OTLiBFdcbYEdFCoEOfaS35inz1", "Hello world!"},
+    {"$6$LKO/Ute40T3FNF95$6S/6T2YuOIHY0N3XpLKABJ3soYcXD9mB7uVbtEZDj/LNscVhZoZ9DEH.sBciDrMsHOWOoASbNLTypH/5X26gN0", "U*U*U*U*"},
+    {"$6$LKO/Ute40T3FNF95$wK80cNqkiAUzFuVGxW6eFe8J.fSVI65MD5yEm8EjYMaJuDrhwe5XXpHDJpwF/kY.afsUs1LlgQAaOapVNbggZ1", "U*U***U"},
+    {"$6$OmBOuxFYBZCYAadG$WCckkSZok9xhp4U1shIZEV7CCVwQUwMVea7L3A77th6SaE9jOPupEMJB.z0vIWCDiN9WLh2m9Oszrj5G.gt330", "*U*U*U*U"},
+    {"$6$ojWH1AiTee9x1peC$QVEnTvRVlPRhcLQCk/HnHaZmlGAAjCfrAN0FtOsOnUk5K5Bn/9eLHHiRzrTzaIKjW9NTLNIBUCtNVOowWS2mN.", ""},
+    {NULL}
+}; 
+
+/* ------- Create and destroy necessary objects ------- */
+static void create_clobj(int kpc) {           
+    pinned_saved_keys = clCreateBuffer(context[gpu_id], 
+            CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+            sizeof(crypt_sha512_password) * kpc, NULL, &ret_code);
+    HANDLE_CLERROR(ret_code, "Error creating page-locked memory pinned_saved_keys");
+
+    plaintext = (crypt_sha512_password *) clEnqueueMapBuffer(queue[gpu_id], 
+            pinned_saved_keys, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0,
+            sizeof(crypt_sha512_password) * kpc, 0, NULL, NULL, &ret_code);
+    HANDLE_CLERROR(ret_code, "Error mapping page-locked memory saved_plain");
+    
+    pinned_partial_hashes = clCreateBuffer(context[gpu_id],
+            CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 
+            sizeof(crypt_sha512_hash) * kpc, NULL, &ret_code);
+    HANDLE_CLERROR(ret_code, "Error creating page-locked memory pinned_partial_hashes");
+
+    out_hashes = (crypt_sha512_hash *) clEnqueueMapBuffer(queue[gpu_id],
+            pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, 
+            sizeof(crypt_sha512_hash) * kpc, 0, NULL, NULL, &ret_code);
+    HANDLE_CLERROR(ret_code, "Error mapping page-locked memory out_hashes");
+
+    // create arguments (buffers)
+    salt_info = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, 
+            sizeof(crypt_sha512_salt), NULL, &ret_code);
+    HANDLE_CLERROR(ret_code, "Error creating data_info out argument");
+    
+    buffer_in = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY,
+            sizeof(crypt_sha512_password) * kpc, NULL, &ret_code);
+    HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_keys");
+
+    buffer_out = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY,
+            sizeof(crypt_sha512_hash) * kpc, NULL, &ret_code);
+    HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_out");
+
+    //Set kernel arguments
+    HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 0, sizeof (cl_mem),
+            (void *) &salt_info), "Error setting argument 0");
+    HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof (cl_mem),
+            (void *) &buffer_in), "Error setting argument 1");
+    HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof (cl_mem),
+            (void *) &buffer_out), "Error setting argument 2");
+    
+    memset(plaintext, '\0', sizeof(crypt_sha512_password) * kpc);
+    salt_data.saltlen = 0;
+    salt_data.rounds = 0;
+    max_keys_per_crypt = kpc;
+}
+
+static void release_clobj(void) {
+    cl_int ret_code;
+
+    ret_code = clEnqueueUnmapMemObject(queue[gpu_id], pinned_partial_hashes,
+            out_hashes, 0, NULL, NULL);
+    HANDLE_CLERROR(ret_code, "Error Ummapping out_hashes");
+    
+    ret_code = clEnqueueUnmapMemObject(queue[gpu_id], pinned_saved_keys,
+            plaintext, 0, NULL, NULL);
+    HANDLE_CLERROR(ret_code, "Error Ummapping saved_plain");
+    
+    ret_code = clReleaseMemObject(salt_info);
+    HANDLE_CLERROR(ret_code, "Error Releasing data_info");
+    ret_code = clReleaseMemObject(buffer_in);
+    HANDLE_CLERROR(ret_code, "Error Releasing buffer_keys");
+    ret_code = clReleaseMemObject(buffer_out);
+    HANDLE_CLERROR(ret_code, "Error Releasing buffer_out");
+    
+    ret_code = clReleaseMemObject(pinned_saved_keys);
+    HANDLE_CLERROR(ret_code, "Error Releasing pinned_saved_keys");
+    
+    ret_code = clReleaseMemObject(pinned_partial_hashes);
+    HANDLE_CLERROR(ret_code, "Error Releasing pinned_partial_hashes");
+}
+
+/* ------- Key functions ------- */
+static void set_key(char *key, int index) {
+    int len = strlen(key);
+    plaintext[index].length = len;
+    memcpy(plaintext[index].v, key, len); 
+}
+
+static char *get_key(int index) {
+    static char ret[PLAINTEXT_LENGTH + 1];
+    memcpy(ret, plaintext[index].v, PLAINTEXT_LENGTH);
+    ret[plaintext[index].length] = '\0';
+    return ret;
+}
+
+/* ------- Try to find the best configuration ------- */
+/* --
+  This function could be used to calculated the best num
+  of keys per crypt for the given format
+-- */
+static void find_best_workgroup(void) {
+    cl_event myEvent;
+    cl_ulong startTime, endTime, kernelExecTimeNs = CL_ULONG_MAX;
+    size_t my_work_group = 1;
+    cl_int ret_code;
+    int i;
+    size_t max_group_size;
+
+    clGetDeviceInfo(devices[gpu_id], CL_DEVICE_MAX_WORK_GROUP_SIZE, 
+            sizeof (max_group_size), &max_group_size, NULL);
+    queue_prof = clCreateCommandQueue(context[gpu_id], devices[gpu_id], 
+            CL_QUEUE_PROFILING_ENABLE, &ret_code);
+    printf("Max Group Work Size %d ", (int) max_group_size);
+    local_work_size = 1;
+
+    // Set keys
+    for (i = 0; i < KEYS_PER_CRYPT; i++) {
+        set_key("aaabaabaaa", i);
+    }
+    clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_TRUE, 0,
+            sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL);
+    clEnqueueWriteBuffer(queue_prof, buffer_in, CL_TRUE, 0, 
+            sizeof (crypt_sha512_password) * KEYS_PER_CRYPT, plaintext, 0, NULL, NULL);
+
+    // Find minimum time
+    for (my_work_group = 1; (int) my_work_group <= (int) max_group_size; my_work_group *= 2) {
+        ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 
+                1, NULL, &max_keys_per_crypt, &my_work_group, 0, NULL, &myEvent);
+        clFinish(queue_prof);
+
+        if (ret_code != CL_SUCCESS) {
+            printf("Error %d\n", ret_code); ///Better commented by default.
+            break;
+        }
+        //Get profile information
+        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, 
+                sizeof (cl_ulong), &startTime, NULL);
+        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, 
+                sizeof (cl_ulong), &endTime, NULL);
+
+        if ((endTime - startTime) < kernelExecTimeNs) {
+            kernelExecTimeNs = endTime - startTime;
+            local_work_size = my_work_group;
+        }
+    }
+    printf("Optimal local work size %d\n", (int) local_work_size);
+    printf("(to avoid this test on next run do export LWS=%d)\n", (int) local_work_size);
+    clReleaseCommandQueue(queue_prof);
+}
+
+/* --
+  This function could be used to calculated the best num
+  of keys per crypt for the given format
+-- */
+static void find_best_kpc(void) {
+    int num;
+    cl_event myEvent;
+    cl_ulong startTime, endTime, tmpTime;
+    int kernelExecTimeNs = 6969;
+    cl_int ret_code;
+    int optimal_kpc = MIN_KEYS_PER_CRYPT;
+    int i;
+    cl_uint *tmpbuffer;
+
+    printf("Calculating best keys per crypt, this will take a while ");
+    
+    for (num = MAX_KEYS_PER_CRYPT; num > MIN_KEYS_PER_CRYPT; num -= 4096) {
+        release_clobj();
+        create_clobj(num);
+        advance_cursor();
+        queue_prof = clCreateCommandQueue(context[gpu_id], devices[gpu_id], 
+                CL_QUEUE_PROFILING_ENABLE, &ret_code);
+      
+        // Set keys
+        for (i = 0; i < num; i++) {
+            set_key("aaabaabaaa", i);
+        }
+        clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_FALSE, 0,
+                sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL);
+        clEnqueueWriteBuffer(queue_prof, buffer_in, CL_TRUE, 0, 
+                sizeof (crypt_sha512_password) * num, plaintext, 0, NULL, NULL);
+           
+        ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 
+                1, NULL, &max_keys_per_crypt, &local_work_size, 0, NULL, &myEvent);
+        clFinish(queue_prof);
+                
+        if (ret_code != CL_SUCCESS) {
+            printf("Error %d\n", ret_code);
+            continue;
+        }       
+        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, 
+                sizeof (cl_ulong), &startTime, NULL);
+        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, 
+                sizeof (cl_ulong), &endTime, NULL);
+        
+        tmpTime = endTime - startTime;
+        tmpbuffer = malloc(sizeof (cl_uint) * num);
+        
+        clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, 
+                sizeof (cl_uint) * num, tmpbuffer, 0, NULL, &myEvent);
+        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, 
+                sizeof (cl_ulong), &startTime, NULL);
+        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, 
+                sizeof (cl_ulong), &endTime, NULL);
+        tmpTime = tmpTime + (endTime - startTime);
+        
+        if (((int) (((float) (tmpTime) / num) * 10)) <= kernelExecTimeNs) {
+            kernelExecTimeNs = ((int) (((float) (tmpTime) / num) * 10));
+            optimal_kpc = num;
+        }
+        free(tmpbuffer);
+        clReleaseCommandQueue(queue_prof);
+    }
+    printf("Optimal keys per crypt %d\n", optimal_kpc);
+    printf("(to avoid this test on next run do \"export KPC=%d\")\n", optimal_kpc);
+
+    max_keys_per_crypt = optimal_kpc;
+    release_clobj();
+    create_clobj(optimal_kpc);
+}
+
+/* ------- Initialization  ------- */
+static void init(struct fmt_main *pFmt) {
+    char *kpc;
+    opencl_init("$JOHN/cryptsha512_kernel.cl", gpu_id, platform_id);
+
+    // create kernel to execute
+    crypt_kernel = clCreateKernel(program[gpu_id], "kernel_crypt", &ret_code);
+    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
+
+    if (((kpc = getenv("LWS")) == NULL) || (atoi(kpc) == 0)) {
+        create_clobj(KEYS_PER_CRYPT);
+        find_best_workgroup();
+        release_clobj();
+    } else {
+        local_work_size = atoi(kpc);
+    }
+    if ((kpc = getenv("KPC")) == NULL) {
+        max_keys_per_crypt = KEYS_PER_CRYPT;
+        create_clobj(KEYS_PER_CRYPT);
+    } else {
+        if (atoi(kpc) == 0) {
+            //user chose to die of boredom
+            max_keys_per_crypt = KEYS_PER_CRYPT;
+            create_clobj(KEYS_PER_CRYPT); 
+            find_best_kpc();
+        } else {
+            max_keys_per_crypt = atoi(kpc);
+            create_clobj(max_keys_per_crypt);
+        }
+    }
+    printf("Local work size (LWS) %d, Keys per crypt (KPC) %Zd\n", 
+            (int) local_work_size, max_keys_per_crypt);   
+    pFmt->params.max_keys_per_crypt = max_keys_per_crypt;
+}
+
+/* ------- Check if the ciphertext if a valid SHA-512 crypt ------- */
+static int valid(char *ciphertext, struct fmt_main *pFmt) {
+	uint32_t i, j;
+	int len = strlen(ciphertext);
+
+	if (strncmp(ciphertext, "$6$", 3) != 0)
+		return 0;
+	char *p = strrchr(ciphertext, '$');
+	if (p == NULL)
+		return 0;
+	for (i = p - ciphertext + 1; i < len; i++) {
+		int found = 0;
+		for (j = 0; j < 64; j++)
+			if (itoa64[j] == ARCH_INDEX(ciphertext[i]))
+				found = 1;
+		if (found == 0) {
+			puts("not found");
+			return 0;
+		}
+	}
+	if (len - (p - ciphertext + 1) != 86)
+		return 0;
+	return 1;
+}
+
+/* ------- Salt functions ------- */
+static void *get_salt(char *ciphertext) {
+    int end = 0, i, len = strlen(ciphertext);
+    for (i = len - 1; i >= 0; i--)
+        if (ciphertext[i] == '$') {
+            end = i;
+            break;
+        }
+
+    static unsigned char ret[50];
+    for (i = 0; i < end; i++)
+        ret[i] = ciphertext[i];
+    ret[end] = 0;
+    return (void *) ret;
+}
+
+static void set_salt(void *salt) {
+    unsigned char *s = salt;
+    int len = strlen(salt);
+    static char currentsalt[64];
+    memcpy(currentsalt, s, len + 1);
+    unsigned char offset = 0;
+    salt_data.rounds = ROUNDS_DEFAULT;
+
+    if (strncmp((char *) "$6$", (char *) currentsalt, 3) == 0)
+        offset += 3;
+
+    if (strncmp((char *) currentsalt + offset, (char *) "rounds=", 7) == 0) {
+        const char *num = currentsalt + offset + 7;
+        char *endp;
+        unsigned long int srounds = strtoul(num, &endp, 10);
+
+        if (*endp == '$') {
+            endp += 1;
+            salt_data.rounds =
+                    MAX(ROUNDS_MIN, MIN(srounds, ROUNDS_MAX));
+        }
+        offset = endp - currentsalt;
+    }
+    memcpy(salt_data.salt, currentsalt + offset, 16);
+    salt_data.saltlen = strlen((char *) salt_data.salt);
+}
+
+/* ------- To binary functions ------- */
+static int findb64(char c) {
+    int ret = ARCH_INDEX(atoi64[(uint8_t) c]);
+    return ret != 0x7f ? ret : 0;
+}
+
+static void magic(char *crypt, unsigned char *alt) {
+#define _24bit_from_b64(I,B2,B1,B0) \
+  {\
+      unsigned char c1=findb64(crypt[I+0]);\
+      unsigned char c2=findb64(crypt[I+1]);\
+      unsigned char c3=findb64(crypt[I+2]);\
+      unsigned char c4=findb64(crypt[I+3]);\
+      unsigned int w=c4<<18|c3<<12|c2<<6|c1;\
+      unsigned char b2=w&0xff;w>>=8;\
+      unsigned char b1=w&0xff;w>>=8;\
+      unsigned char b0=w&0xff;w>>=8;\
+      alt[B2]=b0;\
+      alt[B1]=b1;\
+      alt[B0]=b2;\
+  }
+    _24bit_from_b64(0, 0, 21, 42);
+    _24bit_from_b64(4, 22, 43, 1);
+    _24bit_from_b64(8, 44, 2, 23);
+    _24bit_from_b64(12, 3, 24, 45);
+    _24bit_from_b64(16, 25, 46, 4);
+    _24bit_from_b64(20, 47, 5, 26);
+    _24bit_from_b64(24, 6, 27, 48);
+    _24bit_from_b64(28, 28, 49, 7); 
+    _24bit_from_b64(32, 50, 8, 29);
+    _24bit_from_b64(36, 9, 30, 51);
+    _24bit_from_b64(40, 31, 52, 10);
+    _24bit_from_b64(44, 53, 11, 32);
+    _24bit_from_b64(48, 12, 33, 54);
+    _24bit_from_b64(52, 34, 55, 13);
+    _24bit_from_b64(56, 56, 14, 35);
+    _24bit_from_b64(60, 15, 36, 57);
+    _24bit_from_b64(64, 37, 58, 16);
+    _24bit_from_b64(68, 59, 17, 38);
+    _24bit_from_b64(72, 18, 39, 60);
+    _24bit_from_b64(76, 40, 61, 19);
+    _24bit_from_b64(80, 62, 20, 41);
+
+    uint32_t w = findb64(crypt[85]) << 6 | findb64(crypt[84]) << 0;
+    alt[63] = (w & 0xff);
+}
+
+static void * get_binary(char *ciphertext) {
+    static unsigned char b[BINARY_SIZE];
+    memset(b, 0, BINARY_SIZE);
+    char *p = strrchr(ciphertext, '$');
+    
+    if (p != NULL)
+        magic(p + 1, b);
+    return (void *) b;
+}
+
+/* ------- Compare functins ------- */
+static int cmp_all(void *binary, int count) {
+    uint32_t i;
+    uint64_t b = ((uint64_t *) binary)[0];
+
+    for (i = 0; i < count; i++)
+        if (b == out_hashes[i].v[0])
+            return 1;
+    return 0;
+}
+
+static int cmp_one(void *binary, int index) { 
+    int i;
+    uint64_t *t = (uint64_t *) binary;
+    
+    for (i = 0; i < 8; i++) {
+        if (t[i] != out_hashes[index].v[i])
+            return 0;
+    }
+    return 1;
+}
+
+static int cmp_exact(char *source, int count) { 
+    return 1;
+}
+
+/* ------- Crypt function ------- */
+static void crypt_all(int count) {
+    //Send data to the dispositive
+    HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_FALSE, 0,
+            sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL),
+            "failed in clEnqueueWriteBuffer data_info");
+    HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_in, CL_FALSE, 0,
+            sizeof(crypt_sha512_password) * max_keys_per_crypt, plaintext, 0, NULL, NULL),
+            "failed in clEnqueueWriteBuffer buffer_in");
+
+    //Enqueue the kernel
+    HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL,
+            &max_keys_per_crypt, &local_work_size, 0, NULL, NULL),
+            "failed in clEnqueueNDRangeKernel");
+
+    //Read back hashes
+    HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_out, CL_FALSE, 0,
+            sizeof (crypt_sha512_hash) * max_keys_per_crypt, out_hashes, 0, NULL, NULL),
+            "failed in reading data back");
+ 
+    //Do the work
+    HANDLE_CLERROR(clFinish(queue[gpu_id]), "failed in clFinish");
+}
+
+/* ------- Binary Hash functions group ------- */
+static int binary_hash_0(void * binary) { return *(ARCH_WORD_32 *) binary & 0xF; }
+static int binary_hash_1(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFF; }
+static int binary_hash_2(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFFF; }
+static int binary_hash_3(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFFFF; }
+static int binary_hash_4(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFFFFF; }
+static int binary_hash_5(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFFFFFF; }
+static int binary_hash_6(void * binary) { return *(ARCH_WORD_32 *) binary & 0x7FFFFFF; }
+
+//Get Hash functions group.
+static int get_hash_0(int index) { return out_hashes[index].v[0] & 0xF; }
+static int get_hash_1(int index) { return out_hashes[index].v[0] & 0xFF; }
+static int get_hash_2(int index) { return out_hashes[index].v[0] & 0xFFF; }
+static int get_hash_3(int index) { return out_hashes[index].v[0] & 0xFFFF; }
+static int get_hash_4(int index) { return out_hashes[index].v[0] & 0xFFFFF; }
+static int get_hash_5(int index) { return out_hashes[index].v[0] & 0xFFFFFF; }
+static int get_hash_6(int index) { return out_hashes[index].v[0] & 0x7FFFFFF; }
+
+/* ------- Format structure ------- */
+struct fmt_main fmt_opencl_cryptsha512 = {
+    {
+        FORMAT_LABEL,
+        FORMAT_NAME,
+        ALGORITHM_NAME,
+        BENCHMARK_COMMENT,
+        BENCHMARK_LENGTH,
+        PLAINTEXT_LENGTH,
+        BINARY_SIZE,
+        SALT_SIZE,
+        MIN_KEYS_PER_CRYPT,
+        MAX_KEYS_PER_CRYPT,
+        FMT_CASE | FMT_8_BIT,
+        tests
+    },
+    {
+        init,
+        fmt_default_prepare,
+        valid,
+        fmt_default_split,
+        get_binary,
+        get_salt,
+        {
+            binary_hash_0,
+            binary_hash_1,
+            binary_hash_2,
+            binary_hash_3,
+            binary_hash_4,
+            binary_hash_5,
+            binary_hash_6
+        },
+        fmt_default_salt_hash,
+        set_salt,
+        set_key,
+        get_key,
+        fmt_default_clear_keys,
+        crypt_all,
+        {
+            get_hash_0,
+            get_hash_1,
+            get_hash_2,
+            get_hash_3,
+            get_hash_4,
+            get_hash_5,
+            get_hash_6
+        },
+        cmp_all,
+        cmp_one,
+        cmp_exact
+    }
+};
-- 
1.7.5.4