|
|
Message-ID: <cc6e296990663cdca418e853a914e66b@smtp.hushmail.com>
Date: Mon, 15 Apr 2013 01:53:32 +0200
From: magnum <john.magnum@...hmail.com>
To: john-dev@...ts.openwall.com
Subject: Re: [patch] sse/xop implementation of raw-sha512
On 15 Apr, 2013, at 0:41 , magnum <john.magnum@...hmail.com> wrote:
> With the fixed version I do get a 2x speedup on Intel, and this despite it being only SSE2 (because Apple's silly OSX assembler can't handle AVX and stuff).
>
> $ ../run/john -t -fo:raw-sha512*
> Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... DONE
> Raw: 2890K c/s real, 2890K c/s virtual
>
> Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... DONE
> Raw: 1449K c/s real, 1449K c/s virtual
>
> ...and just over 2x on Bull:
>
> $ ../run/john -t -fo:raw-sha512*
> Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... DONE
> Raw: 3813K c/s real, 3778K c/s virtual
>
> Benchmarking: Raw SHA-512 [64/64 OpenSSL]... DONE
> Raw: 1889K c/s real, 1889K c/s virtual
I added OMP support. Scales well on Intel, but worse on AMD:
magnum@...r-osx:src [bleeding-jumbo]$ for i in 1 4 8; do OMP_NUM_THREADS=$i ../run/john -t -fo:raw-sha512* ; done
Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... DONE
Raw: 2985K c/s real, 2985K c/s virtual
Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... DONE
Raw: 1499K c/s real, 1484K c/s virtual
All 2 formats passed self-tests!
Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... (4xOMP) DONE
Raw: 8011K c/s real, 2644K c/s virtual
Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... (4xOMP) DONE
Raw: 4014K c/s real, 1238K c/s virtual
All 2 formats passed self-tests!
Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... (8xOMP) DONE
Raw: 8093K c/s real, 1574K c/s virtual
Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... (8xOMP) DONE
Raw: 5079K c/s real, 843694 c/s virtual
All 2 formats passed self-tests!
The drop at 8x is because it's HT and there are only 4 real cores.
magnum@...l:src [bleeding-jumbo]$ for i in 1 4 8; do OMP_NUM_THREADS=$i ../run/john -t -fo:raw-sha512* ; done
Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... DONE
Raw: 3207K c/s real, 3207K c/s virtual
Benchmarking: Raw SHA-512 [64/64 OpenSSL]... DONE
Raw: 1822K c/s real, 1822K c/s virtual
All 2 formats passed self-tests!
Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... (4xOMP) DONE
Raw: 7012K c/s real, 1753K c/s virtual
Benchmarking: Raw SHA-512 [64/64 OpenSSL]... (4xOMP) DONE
Raw: 5215K c/s real, 1316K c/s virtual
All 2 formats passed self-tests!
Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... (8xOMP) DONE
Raw: 8077K c/s real, 1008K c/s virtual
Benchmarking: Raw SHA-512 [64/64 OpenSSL]... (8xOMP) DONE
Raw: 7072K c/s real, 891813 c/s virtual
All 2 formats passed self-tests!
Maybe it can be tweaked. I already use a lower OMP_SCALE for XOP but I haven't looked much into it.
This is committed already but here's a diff -w:
diff --git a/src/rawSHA512_ng_fmt.c b/src/rawSHA512_ng_fmt.c
index 6bcb3da..c58ada5 100644
--- a/src/rawSHA512_ng_fmt.c
+++ b/src/rawSHA512_ng_fmt.c
@@ -7,6 +7,15 @@
*/
+#ifdef _OPENMP
+#include <omp.h>
+#if defined __XOP__
+#define OMP_SCALE 1024 /* AMD */
+#else
+#define OMP_SCALE 2048 /* Intel */
+#endif
+#endif
+
#include "arch.h"
#ifdef MMX_COEF
@@ -95,7 +104,7 @@
#define GATHER(x,y,z) \
{ \
x = _mm_setzero_si128 (); \
- x = _mm_set_epi64x (y[1][z], y[0][z]); \
+ x = _mm_set_epi64x (y[index + 1][z], y[index][z]); \
}
#define S0(x) \
@@ -176,13 +185,25 @@ static struct fmt_tests tests[] = {
{NULL}
};
-#ifdef _MSC_VER
-__declspec(align(16)) static uint64_t saved_key[VWIDTH][80];
-__declspec(align(16)) static uint64_t crypt_key[ 8][VWIDTH];
-#else
-static uint64_t saved_key[VWIDTH][80] __attribute__ ((aligned(16)));
-static uint64_t crypt_key[ 8][VWIDTH] __attribute__ ((aligned(16)));
+static uint64_t (*saved_key)[80];
+static uint64_t *crypt_key[ 8];
+
+
+static void init(struct fmt_main *self)
+{
+ int i;
+#ifdef _OPENMP
+ int omp_t;
+
+ omp_t = omp_get_max_threads();
+ self->params.min_keys_per_crypt *= omp_t;
+ omp_t *= OMP_SCALE;
+ self->params.max_keys_per_crypt *= omp_t;
#endif
+ saved_key = mem_calloc_tiny(sizeof(*saved_key) * self->params.max_keys_per_crypt, MEM_ALIGN_SIMD);
+ for (i = 0; i < 8; i++)
+ crypt_key[i] = mem_calloc_tiny(sizeof(uint64_t) * self->params.max_keys_per_crypt, MEM_ALIGN_SIMD);
+}
static inline void alter_endianity_64 (void *_x, unsigned int size)
@@ -306,6 +327,16 @@ static int crypt_all (int *pcount, struct db_salt *salt)
static void crypt_all (int count)
#endif
{
+#if FMT_MAIN_VERSION > 10
+ int count = *pcount;
+#endif
+ int index = 0;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+ for (index = 0; index < count; index += 2)
+#endif
+ {
int i;
__m128i a, b, c, d, e, f, g, h;
@@ -419,17 +450,18 @@ static void crypt_all (int count)
g = _mm_add_epi64 (g, _mm_set1_epi64x (0x1f83d9abfb41bd6b));
h = _mm_add_epi64 (h, _mm_set1_epi64x (0x5be0cd19137e2179));
- _mm_store_si128 ((__m128i *) crypt_key[0], a);
- _mm_store_si128 ((__m128i *) crypt_key[1], b);
- _mm_store_si128 ((__m128i *) crypt_key[2], c);
- _mm_store_si128 ((__m128i *) crypt_key[3], d);
- _mm_store_si128 ((__m128i *) crypt_key[4], e);
- _mm_store_si128 ((__m128i *) crypt_key[5], f);
- _mm_store_si128 ((__m128i *) crypt_key[6], g);
- _mm_store_si128 ((__m128i *) crypt_key[7], h);
+ _mm_store_si128 ((__m128i *) &crypt_key[0][index], a);
+ _mm_store_si128 ((__m128i *) &crypt_key[1][index], b);
+ _mm_store_si128 ((__m128i *) &crypt_key[2][index], c);
+ _mm_store_si128 ((__m128i *) &crypt_key[3][index], d);
+ _mm_store_si128 ((__m128i *) &crypt_key[4][index], e);
+ _mm_store_si128 ((__m128i *) &crypt_key[5][index], f);
+ _mm_store_si128 ((__m128i *) &crypt_key[6][index], g);
+ _mm_store_si128 ((__m128i *) &crypt_key[7][index], h);
+ }
#if FMT_MAIN_VERSION > 10
- return *pcount;
+ return count;
#endif
}
@@ -438,7 +470,11 @@ static int cmp_all (void *binary, int count)
{
int i;
+#ifdef _OPENMP
+ for (i=0; i < count; i++)
+#else
for (i=0; i < 2; i++)
+#endif
if (((uint64_t *) binary)[0] == crypt_key[0][i])
return 1;
@@ -485,10 +521,10 @@ struct fmt_main fmt_rawSHA512_ng = {
#endif
MIN_KEYS_PER_CRYPT,
MAX_KEYS_PER_CRYPT,
- FMT_CASE | FMT_8_BIT | FMT_SPLIT_UNIFIES_CASE,
+ FMT_CASE | FMT_8_BIT | FMT_SPLIT_UNIFIES_CASE | FMT_OMP,
tests
}, {
- fmt_default_init,
+ init,
#if FMT_MAIN_VERSION > 10
fmt_default_done,
fmt_default_reset,
magnum
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.