john-dev - Re: [patch] sse/xop implementation of raw-sha512

Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <cc6e296990663cdca418e853a914e66b@smtp.hushmail.com>
Date: Mon, 15 Apr 2013 01:53:32 +0200
From: magnum <john.magnum@...hmail.com>
To: john-dev@...ts.openwall.com
Subject: Re: [patch] sse/xop implementation of raw-sha512

On 15 Apr, 2013, at 0:41 , magnum <john.magnum@...hmail.com> wrote:
> With the fixed version I do get a 2x speedup on Intel, and this despite it being only SSE2 (because Apple's silly OSX assembler can't handle AVX and stuff).
> 
> $ ../run/john -t -fo:raw-sha512*
> Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... DONE
> Raw:	2890K c/s real, 2890K c/s virtual
> 
> Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... DONE
> Raw:	1449K c/s real, 1449K c/s virtual
> 
> ...and just over 2x on Bull:
> 
> $ ../run/john -t -fo:raw-sha512*
> Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... DONE
> Raw:    3813K c/s real, 3778K c/s virtual
> 
> Benchmarking: Raw SHA-512 [64/64 OpenSSL]... DONE
> Raw:    1889K c/s real, 1889K c/s virtual

I added OMP support. Scales well on Intel, but worse on AMD:

magnum@...r-osx:src [bleeding-jumbo]$ for i in 1 4 8; do OMP_NUM_THREADS=$i ../run/john -t -fo:raw-sha512* ; done
Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... DONE
Raw:	2985K c/s real, 2985K c/s virtual

Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... DONE
Raw:	1499K c/s real, 1484K c/s virtual

All 2 formats passed self-tests!
Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... (4xOMP) DONE
Raw:	8011K c/s real, 2644K c/s virtual

Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... (4xOMP) DONE
Raw:	4014K c/s real, 1238K c/s virtual

All 2 formats passed self-tests!
Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... (8xOMP) DONE
Raw:	8093K c/s real, 1574K c/s virtual

Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... (8xOMP) DONE
Raw:	5079K c/s real, 843694 c/s virtual

All 2 formats passed self-tests!

The drop at 8x is because it's Hyper-Threading (HT) with only 4 real cores.


magnum@...l:src [bleeding-jumbo]$ for i in 1 4 8; do OMP_NUM_THREADS=$i ../run/john -t -fo:raw-sha512* ; done
Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... DONE
Raw:    3207K c/s real, 3207K c/s virtual

Benchmarking: Raw SHA-512 [64/64 OpenSSL]... DONE
Raw:    1822K c/s real, 1822K c/s virtual

All 2 formats passed self-tests!
Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... (4xOMP) DONE
Raw:    7012K c/s real, 1753K c/s virtual

Benchmarking: Raw SHA-512 [64/64 OpenSSL]... (4xOMP) DONE
Raw:    5215K c/s real, 1316K c/s virtual

All 2 formats passed self-tests!
Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... (8xOMP) DONE
Raw:    8077K c/s real, 1008K c/s virtual

Benchmarking: Raw SHA-512 [64/64 OpenSSL]... (8xOMP) DONE
Raw:    7072K c/s real, 891813 c/s virtual

All 2 formats passed self-tests!


Maybe it can be tweaked. I already use a lower OMP_SCALE for XOP but I haven't looked much into it.

This is already committed, but here's a diff -w (whitespace changes ignored, so the indentation below may look off):

diff --git a/src/rawSHA512_ng_fmt.c b/src/rawSHA512_ng_fmt.c
index 6bcb3da..c58ada5 100644
--- a/src/rawSHA512_ng_fmt.c
+++ b/src/rawSHA512_ng_fmt.c
@@ -7,6 +7,15 @@
  */
 
 
+#ifdef _OPENMP
+#include <omp.h>
+#if defined __XOP__
+#define OMP_SCALE                 1024 /* AMD */
+#else
+#define OMP_SCALE                 2048 /* Intel */
+#endif
+#endif
+
 #include "arch.h"
 #ifdef MMX_COEF
 
@@ -95,7 +104,7 @@
 #define GATHER(x,y,z)                                                     \
 {                                                                         \
     x = _mm_setzero_si128 ();                                             \
-    x = _mm_set_epi64x (y[1][z], y[0][z]);                                \
+    x = _mm_set_epi64x (y[index + 1][z], y[index][z]);                    \
 }
 
 #define S0(x)                                                             \
@@ -176,13 +185,25 @@ static struct fmt_tests tests[] = {
     {NULL}
 };
 
-#ifdef _MSC_VER
-__declspec(align(16)) static uint64_t saved_key[VWIDTH][80];
-__declspec(align(16)) static uint64_t crypt_key[ 8][VWIDTH];
-#else
-static uint64_t saved_key[VWIDTH][80] __attribute__ ((aligned(16)));
-static uint64_t crypt_key[ 8][VWIDTH] __attribute__ ((aligned(16)));
+static uint64_t (*saved_key)[80];
+static uint64_t *crypt_key[ 8];
+
+
+static void init(struct fmt_main *self)
+{
+    int i;
+#ifdef _OPENMP
+    int omp_t;
+
+    omp_t = omp_get_max_threads();
+    self->params.min_keys_per_crypt *= omp_t;
+    omp_t *= OMP_SCALE;
+    self->params.max_keys_per_crypt *= omp_t;
 #endif
+    saved_key = mem_calloc_tiny(sizeof(*saved_key) * self->params.max_keys_per_crypt, MEM_ALIGN_SIMD);
+    for (i = 0; i < 8; i++)
+           crypt_key[i] = mem_calloc_tiny(sizeof(uint64_t) * self->params.max_keys_per_crypt, MEM_ALIGN_SIMD);
+}
 
 
 static inline void alter_endianity_64 (void *_x, unsigned int size)
@@ -306,6 +327,16 @@ static int crypt_all (int *pcount, struct db_salt *salt)
 static void crypt_all (int count)
 #endif
 {
+#if FMT_MAIN_VERSION > 10
+    int count = *pcount;
+#endif
+    int index = 0;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+    for (index = 0; index < count; index += 2)
+#endif
+    {
     int i;
 
     __m128i a, b, c, d, e, f, g, h;
@@ -419,17 +450,18 @@ static void crypt_all (int count)
     g = _mm_add_epi64 (g, _mm_set1_epi64x (0x1f83d9abfb41bd6b));
     h = _mm_add_epi64 (h, _mm_set1_epi64x (0x5be0cd19137e2179));
 
-    _mm_store_si128 ((__m128i *) crypt_key[0], a);
-    _mm_store_si128 ((__m128i *) crypt_key[1], b);
-    _mm_store_si128 ((__m128i *) crypt_key[2], c);
-    _mm_store_si128 ((__m128i *) crypt_key[3], d);
-    _mm_store_si128 ((__m128i *) crypt_key[4], e);
-    _mm_store_si128 ((__m128i *) crypt_key[5], f);
-    _mm_store_si128 ((__m128i *) crypt_key[6], g);
-    _mm_store_si128 ((__m128i *) crypt_key[7], h);
+        _mm_store_si128 ((__m128i *) &crypt_key[0][index], a);
+        _mm_store_si128 ((__m128i *) &crypt_key[1][index], b);
+        _mm_store_si128 ((__m128i *) &crypt_key[2][index], c);
+        _mm_store_si128 ((__m128i *) &crypt_key[3][index], d);
+        _mm_store_si128 ((__m128i *) &crypt_key[4][index], e);
+        _mm_store_si128 ((__m128i *) &crypt_key[5][index], f);
+        _mm_store_si128 ((__m128i *) &crypt_key[6][index], g);
+        _mm_store_si128 ((__m128i *) &crypt_key[7][index], h);
+    }
 
 #if FMT_MAIN_VERSION > 10
-    return *pcount;
+    return count;
 #endif
 }
 
@@ -438,7 +470,11 @@ static int cmp_all (void *binary, int count)
 {
     int i;
 
+#ifdef _OPENMP
+    for (i=0; i < count; i++)
+#else
     for (i=0; i < 2; i++)
+#endif
         if (((uint64_t *) binary)[0] == crypt_key[0][i])
             return 1;
 
@@ -485,10 +521,10 @@ struct fmt_main fmt_rawSHA512_ng = {
 #endif
         MIN_KEYS_PER_CRYPT,
         MAX_KEYS_PER_CRYPT,
-        FMT_CASE | FMT_8_BIT | FMT_SPLIT_UNIFIES_CASE,
+        FMT_CASE | FMT_8_BIT | FMT_SPLIT_UNIFIES_CASE | FMT_OMP,
         tests
     }, {
-        fmt_default_init,
+        init,
 #if FMT_MAIN_VERSION > 10
         fmt_default_done,
         fmt_default_reset,


magnum
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.