From 4d6ad40c39ae7622ad86d2a97b11504f3e728a87 Mon Sep 17 00:00:00 2001
From: magnum
Date: Wed, 1 Apr 2015 00:39:06 +0200
Subject: [PATCH] Add a pseudo-intrinsics header, and use it for raw-sha512-ng.

This adds support for AVX2 and AVX512 (including Xeon Phi), but the AVX512
support is incomplete and totally untested.
---
 src/pseudo_intrinsics.h     | 196 ++++++++++++++++++++++++++++++
 src/rawSHA512_ng_fmt_plug.c | 288 +++++++++++++++++++++-----------------------
 2 files changed, 334 insertions(+), 150 deletions(-)
 create mode 100644 src/pseudo_intrinsics.h

diff --git a/src/pseudo_intrinsics.h b/src/pseudo_intrinsics.h
new file mode 100644
index 0000000..6b52e1f
--- /dev/null
+++ b/src/pseudo_intrinsics.h
@@ -0,0 +1,196 @@
+/*
+ * Minimalistic pseudo-intrinsics for width-agnostic x86 SIMD code.
+ *
+ * This software is Copyright (c) 2015 magnum, and it is hereby released to the
+ * general public under the following terms: Redistribution and use in source
+ * and binary forms, with or without modification, are permitted.
+ *
+ * Synopsis:
+ *
+ * SSE2:   __m128i a = _mm_add_epi32(b, c);
+ * AVX2:   __m256i a = _mm256_add_epi32(b, c);
+ * AVX512: __m512i a = _mm512_add_epi32(b, c);
+ * -> Pseudo: vtype a = vadd_epi32(b, c);
+ *
+ * SSE2:   __m128i a = _mm_load_si128(p);
+ * AVX2:   __m256i a = _mm256_load_si256(p);
+ * AVX512: __m512i a = _mm512_load_si512(p);
+ * -> Pseudo: vtype a = vload(p);
+ *
+ * Intrinsics are emulated where the target does not support them.
+ */
+
+#ifndef _SSE_PSEUDO_H
+#define _SSE_PSEUDO_H
+
+#undef SIMD_COEF_32
+#undef SIMD_COEF_64
+
+#if __MIC__ || __AVX512__
+#include <immintrin.h>
+
+#define SIMD_COEF_32 16
+#define SIMD_COEF_64 8
+
+typedef __m512i vtype;
+
+#define vadd_epi32 _mm512_add_epi32
+#define vadd_epi64 _mm512_add_epi64
+#define vand _mm512_and_si512
+#define vandnot _mm512_andnot_si512
+#define vcmov(y, z, x) vxor(z, vand(x, vxor(y, z)))
+#define vcmpeq_epi32 _mm512_cmpeq_epi32
+#define vcmpeq_epi8 _mm512_cmpeq_epi8
+#define vload _mm512_load_si512
+#define vloadu _mm512_loadu_si512
+#define vmovemask_epi8 _mm512_movemask_epi8
+#define vor _mm512_or_si512
+#define vroti_epi32(x, n) vxor(vsrli_epi32(x, ~n + 1), \
+                               vslli_epi32(x, 32 + n))
+#define vroti_epi64(x, n) vxor(vsrli_epi64(x, ~n + 1), \
+                               vslli_epi64(x, 64 + n))
+#define vset1_epi32 _mm512_set1_epi32
+#define vset1_epi64x _mm512_set1_epi64x
+#define vset_epi64x _mm512_set_epi64x
+#define vsetzero _mm512_setzero_si512
+#define vshuffle_epi8 _mm512_shuffle_epi8
+#define vshuffle_epi32 _mm512_shuffle_epi32
+#define vshufflehi_epi16 _mm512_shufflehi_epi16
+#define vshufflelo_epi16 _mm512_shufflelo_epi16
+#define vslli_epi16 _mm512_slli_epi16
+#define vslli_epi32 _mm512_slli_epi32
+#define vslli_epi64 _mm512_slli_epi64
+#define vsrli_epi16 _mm512_srli_epi16
+#define vsrli_epi32 _mm512_srli_epi32
+#define vsrli_epi64 _mm512_srli_epi64
+#define vstore _mm512_store_si512
+#define vtestz_epi32 _mm512_testz_epi32
+#define vunpackhi_epi32 _mm512_unpackhi_epi32
+#define vunpackhi_epi64 _mm512_unpackhi_epi64
+#define vunpacklo_epi32 _mm512_unpacklo_epi32
+#define vunpacklo_epi64 _mm512_unpacklo_epi64
+#define vxor _mm512_xor_si512
+
+#elif __AVX2__
+#include <immintrin.h>
+
+#define SIMD_COEF_32 8
+#define SIMD_COEF_64 4
+
+typedef __m256i vtype;
+
+#define vadd_epi32 _mm256_add_epi32
+#define vadd_epi64 _mm256_add_epi64
+#define vand _mm256_and_si256
+#define vandnot _mm256_andnot_si256
+#define vcmov(y, z, x) vxor(z, vand(x, vxor(y, z)))
+#define vcmpeq_epi32 _mm256_cmpeq_epi32
+#define vcmpeq_epi8 _mm256_cmpeq_epi8
+#define vload _mm256_load_si256
+#define vloadu _mm256_loadu_si256
+#define vmovemask_epi8 _mm256_movemask_epi8
+#define vor _mm256_or_si256
+#define vroti_epi32(x, n) vxor(vsrli_epi32(x, ~n + 1), \
+                               vslli_epi32(x, 32 + n))
+#define vroti_epi64(x, n) vxor(vsrli_epi64(x, ~n + 1), \
+                               vslli_epi64(x, 64 + n))
+#define vset1_epi32 _mm256_set1_epi32
+#define vset1_epi64x _mm256_set1_epi64x
+#define vset_epi64x _mm256_set_epi64x
+#define vsetzero _mm256_setzero_si256
+#define vshuffle_epi8 _mm256_shuffle_epi8
+#define vshuffle_epi32 _mm256_shuffle_epi32
+#define vshufflehi_epi16 _mm256_shufflehi_epi16
+#define vshufflelo_epi16 _mm256_shufflelo_epi16
+#define vslli_epi16 _mm256_slli_epi16
+#define vslli_epi32 _mm256_slli_epi32
+#define vslli_epi64 _mm256_slli_epi64
+#define vsrli_epi16 _mm256_srli_epi16
+#define vsrli_epi32 _mm256_srli_epi32
+#define vsrli_epi64 _mm256_srli_epi64
+#define vstore _mm256_store_si256
+#define vtestz_epi32 _mm256_testz_epi32
+#define vunpackhi_epi32 _mm256_unpackhi_epi32
+#define vunpackhi_epi64 _mm256_unpackhi_epi64
+#define vunpacklo_epi32 _mm256_unpacklo_epi32
+#define vunpacklo_epi64 _mm256_unpacklo_epi64
+#define vxor _mm256_xor_si256
+
+#elif __SSE2__
+#if __AVX__
+#include <immintrin.h>
+#if __XOP__
+#include <x86intrin.h>
+#endif
+#endif
+#include <emmintrin.h>
+
+#define SIMD_COEF_32 4
+#define SIMD_COEF_64 2
+
+typedef __m128i vtype;
+
+#define vadd_epi32 _mm_add_epi32
+#define vadd_epi64 _mm_add_epi64
+#define vand _mm_and_si128
+#define vandnot _mm_andnot_si128
+#if __XOP__
+#define vcmov _mm_cmov_si128
+#else
+#define vcmov(y, z, x) vxor(z, vand(x, vxor(y, z)))
+#endif
+#define vcmpeq_epi32 _mm_cmpeq_epi32
+#define vcmpeq_epi8 _mm_cmpeq_epi8
+#define vload _mm_load_si128
+#define vloadu _mm_loadu_si128
+#define vmovemask_epi8 _mm_movemask_epi8
+#define vor _mm_or_si128
+#if __XOP__
+#define vroti_epi32 _mm_roti_epi32
+#define vroti_epi64 _mm_roti_epi64
+#else
+#define vroti_epi32(x, n) vxor(vsrli_epi32(x, ~n + 1), \
+                               vslli_epi32(x, 32 + n))
+#define vroti_epi64(x, n) vxor(vsrli_epi64(x, ~n + 1), \
+                               vslli_epi64(x, 64 + n))
+#endif
+#define vset1_epi32 _mm_set1_epi32
+#define vset1_epi64x _mm_set1_epi64x
+#define vset_epi64x _mm_set_epi64x
+#define vsetzero _mm_setzero_si128
+#ifdef __SSSE3__
+#define vshuffle_epi8 _mm_shuffle_epi8
+#endif
+#define vshuffle_epi32 _mm_shuffle_epi32
+#define vshufflehi_epi16 _mm_shufflehi_epi16
+#define vshufflelo_epi16 _mm_shufflelo_epi16
+#define vslli_epi16 _mm_slli_epi16
+#define vslli_epi32 _mm_slli_epi32
+#define vslli_epi64 _mm_slli_epi64
+#define vsrli_epi16 _mm_srli_epi16
+#define vsrli_epi32 _mm_srli_epi32
+#define vsrli_epi64 _mm_srli_epi64
+#define vstore _mm_store_si128
+#define vtestz_epi32 _mm_testz_epi32
+#define vunpackhi_epi32 _mm_unpackhi_epi32
+#define vunpackhi_epi64 _mm_unpackhi_epi64
+#define vunpacklo_epi32 _mm_unpacklo_epi32
+#define vunpacklo_epi64 _mm_unpacklo_epi64
+#define vxor _mm_xor_si128
+
+#elif __MMX__
+#include <mmintrin.h>
+
+#define SIMD_COEF_32 2
+#define SIMD_COEF_64 1
+
+typedef __m64 vtype;
+
+#error MMX intrinsics not implemented
+
+#endif /* __SIMD__ elif __SIMD__ elif __SIMD__ */
+
+#define MEM_ALIGN_SIMD (SIMD_COEF_32 * 4)
+
+#endif /* _SSE_PSEUDO_H */
diff --git a/src/rawSHA512_ng_fmt_plug.c b/src/rawSHA512_ng_fmt_plug.c
index b939a5d..94ada12 100644
--- a/src/rawSHA512_ng_fmt_plug.c
+++ b/src/rawSHA512_ng_fmt_plug.c
@@ -1,5 +1,6 @@
 /*
- * Copyright 2013, epixoip.
+ * Copyright (c) 2013, epixoip.
+ * Copyright (c) 2015, magnum (pseudo-intrinsics also supporting AVX2/AVX512)
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that redistribution of source
@@ -7,7 +8,7 @@
  */

 #include "arch.h"
-#if defined __SSE2__
+#if __SSE2__

 #if FMT_EXTERNS_H
 extern struct fmt_main fmt_rawSHA512_ng;
@@ -17,7 +18,7 @@ john_register_one(&fmt_rawSHA512_ng);

 #ifdef _OPENMP
 #include <omp.h>
-#if defined __XOP__
+#if __XOP__
 #define OMP_SCALE 768 /* AMD */
 #else
 #define OMP_SCALE 2048 /* Intel */
@@ -25,42 +26,42 @@ john_register_one(&fmt_rawSHA512_ng);
 #endif

 // These compilers claim to be __GNUC__ but warn on gcc pragmas.
-#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && !defined(__llvm__) && !defined (_MSC_VER)
+#if __GNUC__ && !__INTEL_COMPILER && !__clang__ && !__llvm__ && !_MSC_VER
 #pragma GCC optimize 3
 #endif

 #include "stdint.h"
 #include <string.h>
-#include <emmintrin.h>
-
-#if defined __XOP__
-#include <x86intrin.h>
-#elif defined __SSSE3__
-#include <tmmintrin.h>
-#endif
+#include "pseudo_intrinsics.h"

 #include "common.h"
 #include "formats.h"
 #include "johnswap.h"
 #include "memdbg.h"

-#if defined __XOP__
-#define SIMD_TYPE "XOP"
-#elif defined __SSSE3__
-#define SIMD_TYPE "SSSE3"
+#if __AVX512__ || __MIC__
+#define SIMD_TYPE "512/512 AVX512 8x"
+#elif __AVX2__
+#define SIMD_TYPE "256/256 AVX2 4x"
+#elif __XOP__
+#define SIMD_TYPE "128/128 XOP 2x"
+#elif __SSSE3__
+#define SIMD_TYPE "128/128 SSSE3 2x"
 #else
-#define SIMD_TYPE "SSE2"
+#define SIMD_TYPE "128/128 SSE2 2x"
 #endif

 #define FORMAT_LABEL "Raw-SHA512-ng"
 #define FORMAT_NAME ""
-#define ALGORITHM_NAME "SHA512 128/128 " SIMD_TYPE " 2x"
+#define ALGORITHM_NAME "SHA512 " SIMD_TYPE

 #define FORMAT_TAG "$SHA512$"
 #define TAG_LENGTH 8

 #define BENCHMARK_COMMENT ""
 #define BENCHMARK_LENGTH -1

+#define VWIDTH SIMD_COEF_64
+
 // max length is not 119, but 8 less than this, or 111. 111 actually makes sense.
 // For SHA512 there are 14 'usable' 8 byte ints, minus 1 byte (for the 0x80).
 // 14*8-1 is 111. This comment left for reference for future sha2 hackers within JtR.
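A note on the rotate convention before the round macros below: the
vroti_epi32()/vroti_epi64() fallbacks in pseudo_intrinsics.h take a negative
count n, meaning rotate right by -n bits. Since ~n + 1 is two's-complement
negation, the right shift is by -n bits and the left shift by 64 + n bits,
and XORing the two disjoint parts yields the rotate. A minimal scalar model
of the 64-bit identity (standalone illustration, not part of the patch; the
helper name is ours):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of the vroti_epi64(x, n) fallback; n is a negative
     * constant, i.e. a right rotate by -n bits. */
    static uint64_t roti64_model(uint64_t x, int n)
    {
        return (x >> (~n + 1)) ^ (x << (64 + n));
    }

    int main(void)
    {
        uint64_t x = 0x0123456789abcdefULL;

        /* S1() below uses vroti_epi64(x, -14): rotate right by 14. */
        assert(roti64_model(x, -14) == ((x >> 14) | (x << (64 - 14))));
        return 0;
    }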
@@ -73,10 +74,10 @@ john_register_one(&fmt_rawSHA512_ng);
 #define BINARY_ALIGN 8
 #define SALT_SIZE 0
 #define SALT_ALIGN 1
-#define MIN_KEYS_PER_CRYPT 2
-#define MAX_KEYS_PER_CRYPT 2
+#define MIN_KEYS_PER_CRYPT VWIDTH
+#define MAX_KEYS_PER_CRYPT VWIDTH

-#if defined (_MSC_VER) && !defined (_M_X64)
+#if _MSC_VER && !_M_X64
 // 32 bit VC does NOT define these intrinsics :((((
 _inline __m128i _mm_set_epi64x(uint64_t a, uint64_t b) {
     __m128i x;
@@ -92,110 +93,117 @@ _inline __m128i _mm_set1_epi64x(uint64_t a) {
 }
 #endif

-#ifndef __XOP__
-#define _mm_roti_epi64(x, n) \
-( \
-    _mm_xor_si128 ( \
-        _mm_srli_epi64(x, ~n + 1), \
-        _mm_slli_epi64(x, 64 + n) \
-    ) \
-)
-
-#define _mm_cmov_si128(y, z, x) \
-( \
-    _mm_xor_si128 (z, \
-        _mm_and_si128 (x, \
-            _mm_xor_si128 (y, z) \
-        ) \
-    ) \
-)
-#endif
-
-#ifdef __SSSE3__
+#if __AVX512__ || __MIC__
+#define SWAP_ENDIAN(n) \
+{ \
+    n = vshuffle_epi8(n, \
+        vset_epi64x(0x38393a3b3c3d3e3f, 0x3031323334353637, \
+                    0x28292a2b2c2d2e2f, 0x2021222324252627, \
+                    0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                    0x08090a0b0c0d0e0f, 0x0001020304050607) \
+    ); \
+}
+#elif __AVX2__
+#define SWAP_ENDIAN(n) \
+{ \
+    n = vshuffle_epi8(n, \
+        vset_epi64x(0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                    0x08090a0b0c0d0e0f, 0x0001020304050607) \
+    ); \
+}
+#elif __SSSE3__
 #define SWAP_ENDIAN(n) \
 { \
-    n = _mm_shuffle_epi8 (n, \
-        _mm_set_epi64x (0x08090a0b0c0d0e0f, 0x0001020304050607) \
+    n = vshuffle_epi8(n, \
+        vset_epi64x(0x08090a0b0c0d0e0f, 0x0001020304050607) \
     ); \
 }
 #else
 #define SWAP_ENDIAN(n) \
 { \
-    n = _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (n, 0xb1), 0xb1); \
-    n = _mm_xor_si128 (_mm_slli_epi16 (n, 8), _mm_srli_epi16 (n, 8)); \
-    n = _mm_shuffle_epi32 (n, 0xb1); \
+    n = vshufflehi_epi16(vshufflelo_epi16(n, 0xb1), 0xb1); \
+    n = vxor(vslli_epi16(n, 8), vsrli_epi16(n, 8)); \
+    n = vshuffle_epi32(n, 0xb1); \
 }
 #endif

+#if __AVX2__
 #define GATHER(x,y,z) \
 { \
-    x = _mm_set_epi64x (y[index + 1][z], y[index][z]); \
+    x = vset_epi64x(y[index + 3][z], y[index + 2][z], \
+                    y[index + 1][z], y[index    ][z]); \
 }
+#else
+#define GATHER(x,y,z) \
+{ \
+    x = vset_epi64x(y[index + 1][z], y[index    ][z]); \
+}
+#endif

 #define S0(x) \
 ( \
-    _mm_xor_si128 ( \
-        _mm_roti_epi64 (x, -39), \
-        _mm_xor_si128 ( \
-            _mm_roti_epi64 (x, -28), \
-            _mm_roti_epi64 (x, -34) \
+    vxor( \
+        vroti_epi64(x, -39), \
+        vxor( \
+            vroti_epi64(x, -28), \
+            vroti_epi64(x, -34) \
         ) \
     ) \
 )

 #define S1(x) \
 ( \
-    _mm_xor_si128 ( \
-        _mm_roti_epi64 (x, -41), \
-        _mm_xor_si128 ( \
-            _mm_roti_epi64 (x, -14), \
-            _mm_roti_epi64 (x, -18) \
+    vxor( \
+        vroti_epi64(x, -41), \
+        vxor( \
+            vroti_epi64(x, -14), \
+            vroti_epi64(x, -18) \
         ) \
     ) \
 )

 #define s0(x) \
 ( \
-    _mm_xor_si128 ( \
-        _mm_srli_epi64 (x, 7), \
-        _mm_xor_si128 ( \
-            _mm_roti_epi64 (x, -1), \
-            _mm_roti_epi64 (x, -8) \
+    vxor( \
+        vsrli_epi64(x, 7), \
+        vxor( \
+            vroti_epi64(x, -1), \
+            vroti_epi64(x, -8) \
         ) \
     ) \
 )

 #define s1(x) \
 ( \
-    _mm_xor_si128 ( \
-        _mm_srli_epi64 (x, 6), \
-        _mm_xor_si128 ( \
-            _mm_roti_epi64 (x, -19), \
-            _mm_roti_epi64 (x, -61) \
+    vxor( \
+        vsrli_epi64(x, 6), \
+        vxor( \
+            vroti_epi64(x, -19), \
+            vroti_epi64(x, -61) \
         ) \
     ) \
 )

-#define Maj(x,y,z) _mm_cmov_si128 (x, y, _mm_xor_si128 (z, y))
+#define Maj(x,y,z) vcmov(x, y, vxor(z, y))

-#define Ch(x,y,z) _mm_cmov_si128 (y, z, x)
+#define Ch(x,y,z) vcmov(y, z, x)

 #define R(t) \
 { \
-    tmp1 = _mm_add_epi64 (s1(w[t - 2]), w[t - 7]); \
-    tmp2 = _mm_add_epi64 (s0(w[t - 15]), w[t - 16]); \
-    w[t] = _mm_add_epi64 (tmp1, tmp2); \
+    tmp1 = vadd_epi64(s1(w[t - 2]), w[t - 7]); \
+    tmp2 = vadd_epi64(s0(w[t - 15]), w[t - 16]); \
+    w[t] = vadd_epi64(tmp1, tmp2); \
 }

 #define SHA512_STEP(a,b,c,d,e,f,g,h,x,K) \
 { \
-    tmp1 = _mm_add_epi64 (h, w[x]); \
-    tmp2 = _mm_add_epi64 (S1(e),_mm_set1_epi64x(K)); \
-    tmp1 = _mm_add_epi64 (tmp1, Ch(e,f,g)); \
-    tmp1 = _mm_add_epi64 (tmp1, tmp2); \
-    tmp2 = _mm_add_epi64 (S0(a),Maj(a,b,c)); \
-    d = _mm_add_epi64 (tmp1, d); \
-    h = _mm_add_epi64 (tmp1, tmp2); \
+    tmp1 = vadd_epi64(h, w[x]); \
+    tmp2 = vadd_epi64(S1(e),vset1_epi64x(K)); \
+    tmp1 = vadd_epi64(tmp1, Ch(e,f,g)); \
+    tmp1 = vadd_epi64(tmp1, tmp2); \
+    tmp2 = vadd_epi64(S0(a),Maj(a,b,c)); \
+    d = vadd_epi64(tmp1, d); \
+    h = vadd_epi64(tmp1, tmp2); \
 }

@@ -259,7 +267,7 @@ static void done(void)
 }

-static inline void alter_endianity_64 (uint64_t *x, unsigned int size)
+static inline void alter_endianity_64(uint64_t *x, unsigned int size)
 {
     int i;

@@ -268,13 +276,13 @@ static inline void alter_endianity_64 (uint64_t *x, unsigned int size)
 }

-static int valid (char *ciphertext, struct fmt_main *self)
+static int valid(char *ciphertext, struct fmt_main *self)
 {
     char *p, *q;

     p = ciphertext;

-    if (! strncmp (p, FORMAT_TAG, TAG_LENGTH))
+    if (! strncmp(p, FORMAT_TAG, TAG_LENGTH))
         p += TAG_LENGTH;

     q = p;
@@ -284,26 +292,22 @@
 }

-#if FMT_MAIN_VERSION > 9
-static char *split (char *ciphertext, int index, struct fmt_main *self)
-#else
-static char *split (char *ciphertext, int index)
-#endif
+static char *split(char *ciphertext, int index, struct fmt_main *self)
 {
     static char out[TAG_LENGTH + CIPHERTEXT_LENGTH + 1];

-    if (!strncmp (ciphertext, FORMAT_TAG, TAG_LENGTH))
+    if (!strncmp(ciphertext, FORMAT_TAG, TAG_LENGTH))
         ciphertext += TAG_LENGTH;

-    memcpy (out, FORMAT_TAG, TAG_LENGTH);
-    memcpy (out + TAG_LENGTH, ciphertext, CIPHERTEXT_LENGTH + 1);
-    strlwr (out + TAG_LENGTH);
+    memcpy(out, FORMAT_TAG, TAG_LENGTH);
+    memcpy(out + TAG_LENGTH, ciphertext, CIPHERTEXT_LENGTH + 1);
+    strlwr(out + TAG_LENGTH);

     return out;
 }

-static void *get_binary (char *ciphertext)
+static void *get_binary(char *ciphertext)
 {
     static union {
         unsigned char c[FULL_BINARY_SIZE];
@@ -312,7 +316,7 @@
     int i;

     if (!out)
-        out = mem_alloc_tiny (FULL_BINARY_SIZE, BINARY_ALIGN);
+        out = mem_alloc_tiny(FULL_BINARY_SIZE, BINARY_ALIGN);

     ciphertext += TAG_LENGTH;

@@ -320,7 +324,7 @@
         out->c[i] = atoi16[ARCH_INDEX(ciphertext[i*2])] * 16 +
             atoi16[ARCH_INDEX(ciphertext[i*2 + 1])];

-    alter_endianity_64 (out->w, FULL_BINARY_SIZE);
+    alter_endianity_64(out->w, FULL_BINARY_SIZE);

     out->w[0] -= 0x6a09e667f3bcc908ULL;
     out->w[1] -= 0xbb67ae8584caa73bULL;
@@ -334,16 +338,16 @@
     return (void *) out;
 }

-static int get_hash_0 (int index) { return crypt_key[0][index] & 0xf; }
-static int get_hash_1 (int index) { return crypt_key[0][index] & 0xff; }
-static int get_hash_2 (int index) { return crypt_key[0][index] & 0xfff; }
-static int get_hash_3 (int index) { return crypt_key[0][index] & 0xffff; }
-static int get_hash_4 (int index) { return crypt_key[0][index] & 0xfffff; }
-static int get_hash_5 (int index) { return crypt_key[0][index] & 0xffffff; }
-static int get_hash_6 (int index) { return crypt_key[0][index] & 0x7ffffff; }
+static int get_hash_0(int index) { return crypt_key[0][index] & 0xf; }
+static int get_hash_1(int index) { return crypt_key[0][index] & 0xff; }
+static int get_hash_2(int index) { return crypt_key[0][index] & 0xfff; }
+static int get_hash_3(int index) { return crypt_key[0][index] & 0xffff; }
+static int get_hash_4(int index) { return crypt_key[0][index] & 0xfffff; }
+static int get_hash_5(int index) { return crypt_key[0][index] & 0xffffff; }
+static int get_hash_6(int index) { return crypt_key[0][index] & 0x7ffffff; }

-static void set_key (char *key, int index)
+static void set_key(char *key, int index)
 {
     uint64_t *buf64 = (uint64_t *) &saved_key[index];
     uint8_t *buf8 = (uint8_t * ) buf64;
@@ -358,7 +362,7 @@
 }

-static char *get_key (int index)
+static char *get_key(int index)
 {
     uint64_t *buf64 = (uint64_t *) &saved_key[index];
     uint8_t *buf8 = (uint8_t * ) buf64;
@@ -375,50 +379,44 @@
 }

-#if FMT_MAIN_VERSION > 10
-static int crypt_all (int *pcount, struct db_salt *salt)
-#else
-static void crypt_all (int count)
-#endif
+static int crypt_all(int *pcount, struct db_salt *salt)
 {
-#if FMT_MAIN_VERSION > 10
     int count = *pcount;
-#endif
     int index = 0;

 #ifdef _OPENMP
 #pragma omp parallel for
-    for (index = 0; index < count; index += 2)
+    for (index = 0; index < count; index += VWIDTH)
 #endif
     {
        int i;

-       __m128i a, b, c, d, e, f, g, h;
-       __m128i w[80], tmp1, tmp2;
+       vtype a, b, c, d, e, f, g, h;
+       vtype w[80], tmp1, tmp2;

        for (i = 0; i < 14; i += 2) {
-           GATHER (tmp1, saved_key, i);
-           GATHER (tmp2, saved_key, i + 1);
-           SWAP_ENDIAN (tmp1);
-           SWAP_ENDIAN (tmp2);
+           GATHER(tmp1, saved_key, i);
+           GATHER(tmp2, saved_key, i + 1);
+           SWAP_ENDIAN(tmp1);
+           SWAP_ENDIAN(tmp2);
            w[i] = tmp1;
            w[i + 1] = tmp2;
        }

-       GATHER (tmp1, saved_key, 14);
-       SWAP_ENDIAN (tmp1);
+       GATHER(tmp1, saved_key, 14);
+       SWAP_ENDIAN(tmp1);
        w[14] = tmp1;
-       GATHER (w[15], saved_key, 15);
+       GATHER(w[15], saved_key, 15);

        for (i = 16; i < 80; i++) R(i);

-       a = _mm_set1_epi64x (0x6a09e667f3bcc908ULL);
-       b = _mm_set1_epi64x (0xbb67ae8584caa73bULL);
-       c = _mm_set1_epi64x (0x3c6ef372fe94f82bULL);
-       d = _mm_set1_epi64x (0xa54ff53a5f1d36f1ULL);
-       e = _mm_set1_epi64x (0x510e527fade682d1ULL);
-       f = _mm_set1_epi64x (0x9b05688c2b3e6c1fULL);
-       g = _mm_set1_epi64x (0x1f83d9abfb41bd6bULL);
-       h = _mm_set1_epi64x (0x5be0cd19137e2179ULL);
+       a = vset1_epi64x(0x6a09e667f3bcc908ULL);
+       b = vset1_epi64x(0xbb67ae8584caa73bULL);
+       c = vset1_epi64x(0x3c6ef372fe94f82bULL);
+       d = vset1_epi64x(0xa54ff53a5f1d36f1ULL);
+       e = vset1_epi64x(0x510e527fade682d1ULL);
+       f = vset1_epi64x(0x9b05688c2b3e6c1fULL);
+       g = vset1_epi64x(0x1f83d9abfb41bd6bULL);
+       h = vset1_epi64x(0x5be0cd19137e2179ULL);

        SHA512_STEP(a, b, c, d, e, f, g, h, 0, 0x428a2f98d728ae22ULL);
        SHA512_STEP(h, a, b, c, d, e, f, g, 1, 0x7137449123ef65cdULL);
@@ -505,30 +503,28 @@ static void crypt_all (int count)
        SHA512_STEP(c, d, e, f, g, h, a, b, 78, 0x5fcb6fab3ad6faecULL);
        SHA512_STEP(b, c, d, e, f, g, h, a, 79, 0x6c44198c4a475817ULL);

-       _mm_store_si128 ((__m128i *) &crypt_key[0][index], a);
-       _mm_store_si128 ((__m128i *) &crypt_key[1][index], b);
-       _mm_store_si128 ((__m128i *) &crypt_key[2][index], c);
-       _mm_store_si128 ((__m128i *) &crypt_key[3][index], d);
-       _mm_store_si128 ((__m128i *) &crypt_key[4][index], e);
-       _mm_store_si128 ((__m128i *) &crypt_key[5][index], f);
-       _mm_store_si128 ((__m128i *) &crypt_key[6][index], g);
-       _mm_store_si128 ((__m128i *) &crypt_key[7][index], h);
+       vstore((vtype*) &crypt_key[0][index], a);
+       vstore((vtype*) &crypt_key[1][index], b);
+       vstore((vtype*) &crypt_key[2][index], c);
+       vstore((vtype*) &crypt_key[3][index], d);
+       vstore((vtype*) &crypt_key[4][index], e);
+       vstore((vtype*) &crypt_key[5][index], f);
+       vstore((vtype*) &crypt_key[6][index], g);
+       vstore((vtype*) &crypt_key[7][index], h);
     }

-#if FMT_MAIN_VERSION > 10
     return count;
-#endif
 }

-static int cmp_all (void *binary, int count)
+static int cmp_all(void *binary, int count)
 {
     int i;

 #ifdef _OPENMP
     for (i=0; i < count; i++)
 #else
-    for (i=0; i < 2; i++)
+    for (i=0; i < VWIDTH; i++)
 #endif
        if (((uint64_t *) binary)[0] == crypt_key[0][i])
            return 1;

@@ -537,18 +533,18 @@ static int cmp_all (void *binary, int count)
 }

-static int cmp_one (void *binary, int index)
+static int cmp_one(void *binary, int index)
 {
     return (((uint64_t *) binary)[0] == crypt_key[0][index]);
 }

-static int cmp_exact (char *source, int index)
+static int cmp_exact(char *source, int index)
 {
     int i;
     uint64_t *bin;

-    bin = (uint64_t *) get_binary (source);
+    bin = (uint64_t *) get_binary(source);

     for (i=1; i < 8; i++)
        if (((uint64_t *) bin)[i] != crypt_key[i][index])
@@ -568,13 +564,9 @@ struct fmt_main fmt_rawSHA512_ng = {
        0,
        MAXLEN,
        BINARY_SIZE,
-#if FMT_MAIN_VERSION > 9
        BINARY_ALIGN,
-#endif
        SALT_SIZE,
-#if FMT_MAIN_VERSION > 9
        SALT_ALIGN,
-#endif
        MIN_KEYS_PER_CRYPT,
        MAX_KEYS_PER_CRYPT,
        FMT_CASE | FMT_8_BIT | FMT_SPLIT_UNIFIES_CASE | FMT_OMP,
@@ -584,21 +576,17 @@ struct fmt_main fmt_rawSHA512_ng = {
        tests
     }, {
        init,
-#if FMT_MAIN_VERSION > 10
        done,
        fmt_default_reset,
-#endif
        fmt_default_prepare,
        valid,
        split,
        get_binary,
        fmt_default_salt,
-#if FMT_MAIN_VERSION > 9
 #if FMT_MAIN_VERSION > 11
        { NULL },
 #endif
        fmt_default_source,
-#endif
        {
            fmt_default_binary_hash_0,
            fmt_default_binary_hash_1,
-- 
2.3.2
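As an illustration of the intent of pseudo_intrinsics.h, a width-agnostic
routine written against it might look like the following sketch (hypothetical
code, not part of the patch): the same source compiles to 128-, 256- or
512-bit instructions depending on which branch of the header the build flags
select.

    #include "pseudo_intrinsics.h"

    /* XOR two aligned buffers of nvec vectors, one vtype at a time.
     * With SSE2 this is 128 bits per iteration, with AVX2 256 bits and
     * with AVX512 512 bits; the code never mentions the width. */
    static void xor_buffers(vtype *dst, const vtype *a, const vtype *b,
                            int nvec)
    {
        int i;

        for (i = 0; i < nvec; i++)
            vstore(&dst[i], vxor(vload(&a[i]), vload(&b[i])));
    }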
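The vcmov(y, z, x) fallback used by Ch() and Maj() is a branchless bitwise
select: z ^ (x & (y ^ z)) takes each bit from y where the corresponding bit
of x is 1 and from z where it is 0, which is what XOP's _mm_cmov_si128 does
in one instruction. A scalar model (ours, for illustration):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of vcmov(y, z, x): per-bit select of y or z by x. */
    static uint64_t cmov_model(uint64_t y, uint64_t z, uint64_t x)
    {
        return z ^ (x & (y ^ z));
    }

    int main(void)
    {
        uint64_t y = 0xaaaaaaaaaaaaaaaaULL;
        uint64_t z = 0x5555555555555555ULL;
        uint64_t x = 0x00000000ffffffffULL;

        /* Low 32 result bits come from y, high 32 bits from z. */
        assert(cmov_model(y, z, x) == 0x55555555aaaaaaaaULL);

        /* SHA-512's Ch(x,y,z) = (x & y) ^ (~x & z) is the same select,
         * matching the Ch() macro's vcmov(y, z, x). */
        assert(cmov_model(y, z, x) == ((x & y) ^ (~x & z)));
        return 0;
    }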
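The SWAP_ENDIAN() shuffle masks byte-reverse each 64-bit lane: in the
vset_epi64x() constants, control byte i of a lane holds the source index
7 - i (0x0001020304050607 is the byte sequence 07 06 05 04 03 02 01 00 in
memory order), so the shuffle converts each input word between little- and
big-endian as SHA-512 requires. A scalar model of the per-lane effect (ours,
for illustration):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of what SWAP_ENDIAN() does to one 64-bit lane. */
    static uint64_t bswap64_model(uint64_t x)
    {
        uint64_t r = 0;
        int i;

        /* Result byte i is input byte 7 - i, mirroring the reversed
         * byte indices in the shuffle-control constants. */
        for (i = 0; i < 8; i++)
            r |= ((x >> (8 * (7 - i))) & 0xff) << (8 * i);
        return r;
    }

    int main(void)
    {
        assert(bswap64_model(0x0102030405060708ULL) ==
               0x0807060504030201ULL);
        return 0;
    }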
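One subtlety in the comparison path: get_binary() subtracts the eight SHA-512
initial state words from the target digest, and crypt_all() accordingly
stores the working variables a..h without the usual final addition of the
initial state, saving eight vector additions per crypt. Since that addition
is modulo 2^64, the comparison still holds; a scalar model (illustrative
only, values made up):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* A real digest word would be a + IV0 (mod 2^64). The format
         * instead compares the raw working variable a against
         * digest - IV0, as prepared by get_binary(). */
        uint64_t iv0 = 0x6a09e667f3bcc908ULL; /* SHA-512 H0 */
        uint64_t a = 0xfedcba9876543210ULL;   /* state after 80 rounds */
        uint64_t digest0 = a + iv0;           /* wraps modulo 2^64 */

        assert(digest0 - iv0 == a);
        return 0;
    }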