From 4d6ad40c39ae7622ad86d2a97b11504f3e728a87 Mon Sep 17 00:00:00 2001
From: magnum
Date: Wed, 1 Apr 2015 00:39:06 +0200
Subject: [PATCH] Add a pseudo-intrinsics header, and use it for raw-sha512-ng.

This adds support for AVX2 and AVX512 (including Xeon Phi), but the AVX512
support is incomplete and totally untested.
---
 src/pseudo_intrinsics.h     | 196 ++++++++++++++++++++++++++++++
 src/rawSHA512_ng_fmt_plug.c | 288 +++++++++++++++++++++-----------------------
 2 files changed, 334 insertions(+), 150 deletions(-)
 create mode 100644 src/pseudo_intrinsics.h

diff --git a/src/pseudo_intrinsics.h b/src/pseudo_intrinsics.h
new file mode 100644
index 0000000..6b52e1f
--- /dev/null
+++ b/src/pseudo_intrinsics.h
@@ -0,0 +1,196 @@
+/*
+ * Minimalistic pseudo-intrinsics for width-agnostic x86 SIMD code.
+ *
+ * This software is Copyright (c) 2015 magnum, and it is hereby released to the
+ * general public under the following terms: Redistribution and use in source
+ * and binary forms, with or without modification, are permitted.
+ *
+ * Synopsis:
+ *
+ * SSE2:   __m128i a = _mm_add_epi32(b, c);
+ * AVX2:   __m256i a = _mm256_add_epi32(b, c);
+ * AVX512: __m512i a = _mm512_add_epi32(b, c);
+ * -> Pseudo: vtype a = vadd_epi32(b, c);
+ *
+ * SSE2:   __m128i a = _mm_load_si128(p);
+ * AVX2:   __m256i a = _mm256_load_si256(p);
+ * AVX512: __m512i a = _mm512_load_si512(p);
+ * -> Pseudo: vtype a = vload(p);
+ *
+ * Intrinsics are emulated where the target does not support them.
+ */
+
+#ifndef _SSE_PSEUDO_H
+#define _SSE_PSEUDO_H
+
+#undef SIMD_COEF_32
+#undef SIMD_COEF_64
+
+#if __MIC__ || __AVX512__
+#include <immintrin.h>
+
+#define SIMD_COEF_32 16
+#define SIMD_COEF_64 8
+
+typedef __m512i vtype;
+
+#define vadd_epi32 _mm512_add_epi32
+#define vadd_epi64 _mm512_add_epi64
+#define vand _mm512_and_si512
+#define vandnot _mm512_andnot_si512
+#define vcmov(y, z, x) vxor(z, vand(x, vxor(y, z)))
+#define vcmpeq_epi32 _mm512_cmpeq_epi32
+#define vcmpeq_epi8 _mm512_cmpeq_epi8
+#define vload _mm512_load_si512
+#define vloadu _mm512_loadu_si512
+#define vmovemask_epi8 _mm512_movemask_epi8
+#define vor _mm512_or_si512
+#define vroti_epi32(x, n) vxor(vsrli_epi32(x, ~n + 1), \
+                               vslli_epi32(x, 32 + n))
+#define vroti_epi64(x, n) vxor(vsrli_epi64(x, ~n + 1), \
+                               vslli_epi64(x, 64 + n))
+#define vset1_epi32 _mm512_set1_epi32
+#define vset1_epi64x _mm512_set1_epi64x
+#define vset_epi64x _mm512_set_epi64x
+#define vsetzero _mm512_setzero_si512
+#define vshuffle_epi8 _mm512_shuffle_epi8
+#define vshuffle_epi32 _mm512_shuffle_epi32
+#define vshufflehi_epi16 _mm512_shufflehi_epi16
+#define vshufflelo_epi16 _mm512_shufflelo_epi16
+#define vslli_epi16 _mm512_slli_epi16
+#define vslli_epi32 _mm512_slli_epi32
+#define vslli_epi64 _mm512_slli_epi64
+#define vsrli_epi16 _mm512_srli_epi16
+#define vsrli_epi32 _mm512_srli_epi32
+#define vsrli_epi64 _mm512_srli_epi64
+#define vstore _mm512_store_si512
+#define vtestz_epi32 _mm512_testz_epi32
+#define vunpackhi_epi32 _mm512_unpackhi_epi32
+#define vunpackhi_epi64 _mm512_unpackhi_epi64
+#define vunpacklo_epi32 _mm512_unpacklo_epi32
+#define vunpacklo_epi64 _mm512_unpacklo_epi64
+#define vxor _mm512_xor_si512
+
+#elif __AVX2__
+#include <immintrin.h>
+
+#define SIMD_COEF_32 8
+#define SIMD_COEF_64 4
+
+typedef __m256i vtype;
+
+#define vadd_epi32 _mm256_add_epi32
+#define vadd_epi64 _mm256_add_epi64
+#define vand _mm256_and_si256
+#define vandnot _mm256_andnot_si256
+#define vcmov(y, z, x) vxor(z, vand(x, vxor(y, z)))
+#define vcmpeq_epi32 _mm256_cmpeq_epi32
+#define vcmpeq_epi8 _mm256_cmpeq_epi8
+#define vload _mm256_load_si256
+#define vloadu _mm256_loadu_si256
+#define vmovemask_epi8 _mm256_movemask_epi8
+#define vor _mm256_or_si256
+#define vroti_epi32(x, n) vxor(vsrli_epi32(x, ~n + 1), \
+                               vslli_epi32(x, 32 + n))
+#define vroti_epi64(x, n) vxor(vsrli_epi64(x, ~n + 1), \
+                               vslli_epi64(x, 64 + n))
+#define vset1_epi32 _mm256_set1_epi32
+#define vset1_epi64x _mm256_set1_epi64x
+#define vset_epi64x _mm256_set_epi64x
+#define vsetzero _mm256_setzero_si256
+#define vshuffle_epi8 _mm256_shuffle_epi8
+#define vshuffle_epi32 _mm256_shuffle_epi32
+#define vshufflehi_epi16 _mm256_shufflehi_epi16
+#define vshufflelo_epi16 _mm256_shufflelo_epi16
+#define vslli_epi16 _mm256_slli_epi16
+#define vslli_epi32 _mm256_slli_epi32
+#define vslli_epi64 _mm256_slli_epi64
+#define vsrli_epi16 _mm256_srli_epi16
+#define vsrli_epi32 _mm256_srli_epi32
+#define vsrli_epi64 _mm256_srli_epi64
+#define vstore _mm256_store_si256
+#define vtestz_epi32 _mm256_testz_epi32
+#define vunpackhi_epi32 _mm256_unpackhi_epi32
+#define vunpackhi_epi64 _mm256_unpackhi_epi64
+#define vunpacklo_epi32 _mm256_unpacklo_epi32
+#define vunpacklo_epi64 _mm256_unpacklo_epi64
+#define vxor _mm256_xor_si256
+
+#elif __SSE2__
+#if __AVX__
+#include <immintrin.h>
+#if __XOP__
+#include <x86intrin.h>
+#endif
+#endif
+#include <emmintrin.h>
+
+#define SIMD_COEF_32 4
+#define SIMD_COEF_64 2
+
+typedef __m128i vtype;
+
+#define vadd_epi32 _mm_add_epi32
+#define vadd_epi64 _mm_add_epi64
+#define vand _mm_and_si128
+#define vandnot _mm_andnot_si128
+#if __XOP__
+#define vcmov _mm_cmov_si128
+#else
+#define vcmov(y, z, x) vxor(z, vand(x, vxor(y, z)))
+#endif
+#define vcmpeq_epi32 _mm_cmpeq_epi32
+#define vcmpeq_epi8 _mm_cmpeq_epi8
+#define vload _mm_load_si128
+#define vloadu _mm_loadu_si128
+#define vmovemask_epi8 _mm_movemask_epi8
+#define vor _mm_or_si128
+#if __XOP__
+#define vroti_epi32 _mm_roti_epi32
+#define vroti_epi64 _mm_roti_epi64
+#else
+#define vroti_epi32(x, n) vxor(vsrli_epi32(x, ~n + 1), \
+                               vslli_epi32(x, 32 + n))
+#define vroti_epi64(x, n) vxor(vsrli_epi64(x, ~n + 1), \
+                               vslli_epi64(x, 64 + n))
+#endif
+#define vset1_epi32 _mm_set1_epi32
+#define vset1_epi64x _mm_set1_epi64x
+#define vset_epi64x _mm_set_epi64x
+#define vsetzero _mm_setzero_si128
+#ifdef __SSSE3__
+#define vshuffle_epi8 _mm_shuffle_epi8
+#endif
+#define vshuffle_epi32 _mm_shuffle_epi32
+#define vshufflehi_epi16 _mm_shufflehi_epi16
+#define vshufflelo_epi16 _mm_shufflelo_epi16
+#define vslli_epi16 _mm_slli_epi16
+#define vslli_epi32 _mm_slli_epi32
+#define vslli_epi64 _mm_slli_epi64
+#define vsrli_epi16 _mm_srli_epi16
+#define vsrli_epi32 _mm_srli_epi32
+#define vsrli_epi64 _mm_srli_epi64
+#define vstore _mm_store_si128
+#define vtestz_epi32 _mm_testz_epi32
+#define vunpackhi_epi32 _mm_unpackhi_epi32
+#define vunpackhi_epi64 _mm_unpackhi_epi64
+#define vunpacklo_epi32 _mm_unpacklo_epi32
+#define vunpacklo_epi64 _mm_unpacklo_epi64
+#define vxor _mm_xor_si128
+
+#elif __MMX__
+#include <mmintrin.h>
+
+#define SIMD_COEF_32 2
+#define SIMD_COEF_64 1
+
+typedef __m64 vtype;
+
+#error MMX intrinsics not implemented
+
+#endif /* __SIMD__ elif __SIMD__ elif __SIMD__ */
+
+#define MEM_ALIGN_SIMD (SIMD_COEF_32 * 4)
+
+#endif /* _SSE_PSEUDO_H */
diff --git a/src/rawSHA512_ng_fmt_plug.c b/src/rawSHA512_ng_fmt_plug.c
index b939a5d..94ada12 100644
--- a/src/rawSHA512_ng_fmt_plug.c
+++ b/src/rawSHA512_ng_fmt_plug.c
@@ -1,5 +1,6 @@
 /*
- * Copyright 2013, epixoip.
+ * Copyright (c) 2013, epixoip.
+ * Copyright (c) 2015, magnum (pseudo-intrinsics also supporting AVX2/AVX512)
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that redistribution of source
@@ -7,7 +8,7 @@
  */

 #include "arch.h"
-#if defined __SSE2__
+#if __SSE2__

 #if FMT_EXTERNS_H
 extern struct fmt_main fmt_rawSHA512_ng;
@@ -17,7 +18,7 @@ john_register_one(&fmt_rawSHA512_ng);

 #ifdef _OPENMP
 #include <omp.h>
-#if defined __XOP__
+#if __XOP__
 #define OMP_SCALE 768 /* AMD */
 #else
 #define OMP_SCALE 2048 /* Intel */
@@ -25,42 +26,42 @@ john_register_one(&fmt_rawSHA512_ng);
 #endif

 // These compilers claim to be __GNUC__ but warn on gcc pragmas.
-#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && !defined(__llvm__) && !defined (_MSC_VER)
+#if __GNUC__ && !__INTEL_COMPILER && !__clang__ && !__llvm__ && !_MSC_VER
 #pragma GCC optimize 3
 #endif

 #include "stdint.h"
 #include <string.h>
-#include <emmintrin.h>
-
-#if defined __XOP__
-#include <x86intrin.h>
-#elif defined __SSSE3__
-#include <tmmintrin.h>
-#endif
+#include "pseudo_intrinsics.h"

 #include "common.h"
 #include "formats.h"
 #include "johnswap.h"
 #include "memdbg.h"

-#if defined __XOP__
-#define SIMD_TYPE "XOP"
-#elif defined __SSSE3__
-#define SIMD_TYPE "SSSE3"
+#if __AVX512__ || __MIC__
+#define SIMD_TYPE "512/512 AVX512 8x"
+#elif __AVX2__
+#define SIMD_TYPE "256/256 AVX2 4x"
+#elif __XOP__
+#define SIMD_TYPE "128/128 XOP 2x"
+#elif __SSSE3__
+#define SIMD_TYPE "128/128 SSSE3 2x"
 #else
-#define SIMD_TYPE "SSE2"
+#define SIMD_TYPE "128/128 SSE2 2x"
 #endif

 #define FORMAT_LABEL "Raw-SHA512-ng"
 #define FORMAT_NAME ""
-#define ALGORITHM_NAME "SHA512 128/128 " SIMD_TYPE " 2x"
+#define ALGORITHM_NAME "SHA512 " SIMD_TYPE

 #define FORMAT_TAG "$SHA512$"
 #define TAG_LENGTH 8

 #define BENCHMARK_COMMENT ""
 #define BENCHMARK_LENGTH -1

+#define VWIDTH SIMD_COEF_64
+
 // max length is not 119, but 8 less than this, or 111. 111 actually makes sense.
 // For SHA512 there are 14 'usable' 8 byte ints, minus 1 byte (for the 0x80).
 // 14*8-1 is 111. This comment left for reference for future sha2 hackers within JtR.
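A note on the rotate convention before the round macros below: the
vroti_epi32()/vroti_epi64() fallbacks in pseudo_intrinsics.h take a negative
count n, meaning rotate right by -n bits. Since ~n + 1 is two's-complement
negation, the right shift is by -n bits and the left shift by 64 + n bits,
and XORing the two disjoint parts yields the rotate. A minimal scalar model
of the 64-bit identity (standalone illustration, not part of the patch; the
helper name is ours):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of the vroti_epi64(x, n) fallback; n is a negative
     * constant, i.e. a right rotate by -n bits. */
    static uint64_t roti64_model(uint64_t x, int n)
    {
        return (x >> (~n + 1)) ^ (x << (64 + n));
    }

    int main(void)
    {
        uint64_t x = 0x0123456789abcdefULL;

        /* S1() below uses vroti_epi64(x, -14): rotate right by 14. */
        assert(roti64_model(x, -14) == ((x >> 14) | (x << (64 - 14))));
        return 0;
    }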
@@ -73,10 +74,10 @@ john_register_one(&fmt_rawSHA512_ng);
 #define BINARY_ALIGN 8
 #define SALT_SIZE 0
 #define SALT_ALIGN 1
-#define MIN_KEYS_PER_CRYPT 2
-#define MAX_KEYS_PER_CRYPT 2
+#define MIN_KEYS_PER_CRYPT VWIDTH
+#define MAX_KEYS_PER_CRYPT VWIDTH

-#if defined (_MSC_VER) && !defined (_M_X64)
+#if _MSC_VER && !_M_X64
 // 32 bit VC does NOT define these intrinsics :((((
 _inline __m128i _mm_set_epi64x(uint64_t a, uint64_t b) {
     __m128i x;
@@ -92,110 +93,117 @@ _inline __m128i _mm_set1_epi64x(uint64_t a) {
 }
 #endif

-#ifndef __XOP__
-#define _mm_roti_epi64(x, n) \
-( \
-    _mm_xor_si128 ( \
-        _mm_srli_epi64(x, ~n + 1), \
-        _mm_slli_epi64(x, 64 + n) \
-    ) \
-)
-
-#define _mm_cmov_si128(y, z, x) \
-( \
-    _mm_xor_si128 (z, \
-        _mm_and_si128 (x, \
-            _mm_xor_si128 (y, z) \
-        ) \
-    ) \
-)
-#endif
-
-#ifdef __SSSE3__
+#if __AVX512__ || __MIC__
+#define SWAP_ENDIAN(n) \
+{ \
+    n = vshuffle_epi8(n, \
+        vset_epi64x(0x38393a3b3c3d3e3f, 0x3031323334353637, \
+                    0x28292a2b2c2d2e2f, 0x2021222324252627, \
+                    0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                    0x08090a0b0c0d0e0f, 0x0001020304050607) \
+    ); \
+}
+#elif __AVX2__
+#define SWAP_ENDIAN(n) \
+{ \
+    n = vshuffle_epi8(n, \
+        vset_epi64x(0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                    0x08090a0b0c0d0e0f, 0x0001020304050607) \
+    ); \
+}
+#elif __SSSE3__
 #define SWAP_ENDIAN(n) \
 { \
-    n = _mm_shuffle_epi8 (n, \
-        _mm_set_epi64x (0x08090a0b0c0d0e0f, 0x0001020304050607) \
+    n = vshuffle_epi8(n, \
+        vset_epi64x(0x08090a0b0c0d0e0f, 0x0001020304050607) \
     ); \
 }
 #else
 #define SWAP_ENDIAN(n) \
 { \
-    n = _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (n, 0xb1), 0xb1); \
-    n = _mm_xor_si128 (_mm_slli_epi16 (n, 8), _mm_srli_epi16 (n, 8)); \
-    n = _mm_shuffle_epi32 (n, 0xb1); \
+    n = vshufflehi_epi16(vshufflelo_epi16(n, 0xb1), 0xb1); \
+    n = vxor(vslli_epi16(n, 8), vsrli_epi16(n, 8)); \
+    n = vshuffle_epi32(n, 0xb1); \
 }
 #endif

+#if __AVX2__
 #define GATHER(x,y,z) \
 { \
-    x = _mm_set_epi64x (y[index + 1][z], y[index][z]); \
+    x = vset_epi64x(y[index + 3][z], y[index + 2][z], \
+                    y[index + 1][z], y[index    ][z]); \
 }
+#else
+#define GATHER(x,y,z) \
+{ \
+    x = vset_epi64x(y[index + 1][z], y[index    ][z]); \
+}
+#endif

 #define S0(x) \
 ( \
-    _mm_xor_si128 ( \
-        _mm_roti_epi64 (x, -39), \
-        _mm_xor_si128 ( \
-            _mm_roti_epi64 (x, -28), \
-            _mm_roti_epi64 (x, -34) \
+    vxor( \
+        vroti_epi64(x, -39), \
+        vxor( \
+            vroti_epi64(x, -28), \
+            vroti_epi64(x, -34) \
         ) \
     ) \
 )

 #define S1(x) \
 ( \
-    _mm_xor_si128 ( \
-        _mm_roti_epi64 (x, -41), \
-        _mm_xor_si128 ( \
-            _mm_roti_epi64 (x, -14), \
-            _mm_roti_epi64 (x, -18) \
+    vxor( \
+        vroti_epi64(x, -41), \
+        vxor( \
+            vroti_epi64(x, -14), \
+            vroti_epi64(x, -18) \
         ) \
     ) \
 )

 #define s0(x) \
 ( \
-    _mm_xor_si128 ( \
-        _mm_srli_epi64 (x, 7), \
-        _mm_xor_si128 ( \
-            _mm_roti_epi64 (x, -1), \
-            _mm_roti_epi64 (x, -8) \
+    vxor( \
+        vsrli_epi64(x, 7), \
+        vxor( \
+            vroti_epi64(x, -1), \
+            vroti_epi64(x, -8) \
         ) \
     ) \
 )

 #define s1(x) \
 ( \
-    _mm_xor_si128 ( \
-        _mm_srli_epi64 (x, 6), \
-        _mm_xor_si128 ( \
-            _mm_roti_epi64 (x, -19), \
-            _mm_roti_epi64 (x, -61) \
+    vxor( \
+        vsrli_epi64(x, 6), \
+        vxor( \
+            vroti_epi64(x, -19), \
+            vroti_epi64(x, -61) \
         ) \
     ) \
 )

-#define Maj(x,y,z) _mm_cmov_si128 (x, y, _mm_xor_si128 (z, y))
+#define Maj(x,y,z) vcmov(x, y, vxor(z, y))

-#define Ch(x,y,z) _mm_cmov_si128 (y, z, x)
+#define Ch(x,y,z) vcmov(y, z, x)

 #define R(t) \
 { \
-    tmp1 = _mm_add_epi64 (s1(w[t - 2]), w[t - 7]); \
-    tmp2 = _mm_add_epi64 (s0(w[t - 15]), w[t - 16]); \
-    w[t] = _mm_add_epi64 (tmp1, tmp2); \
+    tmp1 = vadd_epi64(s1(w[t - 2]), w[t - 7]); \
+    tmp2 = vadd_epi64(s0(w[t - 15]), w[t - 16]); \
+    w[t] = vadd_epi64(tmp1, tmp2); \
 }

 #define SHA512_STEP(a,b,c,d,e,f,g,h,x,K) \
 { \
-    tmp1 = _mm_add_epi64 (h, w[x]); \
-    tmp2 = _mm_add_epi64 (S1(e),_mm_set1_epi64x(K)); \
-    tmp1 = _mm_add_epi64 (tmp1, Ch(e,f,g)); \
-    tmp1 = _mm_add_epi64 (tmp1, tmp2); \
-    tmp2 = _mm_add_epi64 (S0(a),Maj(a,b,c)); \
-    d = _mm_add_epi64 (tmp1, d); \
-    h = _mm_add_epi64 (tmp1, tmp2); \
+    tmp1 = vadd_epi64(h, w[x]); \
+    tmp2 = vadd_epi64(S1(e),vset1_epi64x(K)); \
+    tmp1 = vadd_epi64(tmp1, Ch(e,f,g)); \
+    tmp1 = vadd_epi64(tmp1, tmp2); \
+    tmp2 = vadd_epi64(S0(a),Maj(a,b,c)); \
+    d = vadd_epi64(tmp1, d); \
+    h = vadd_epi64(tmp1, tmp2); \
 }

@@ -259,7 +267,7 @@ static void done(void)
 }

-static inline void alter_endianity_64 (uint64_t *x, unsigned int size)
+static inline void alter_endianity_64(uint64_t *x, unsigned int size)
 {
     int i;

@@ -268,13 +276,13 @@ static inline void alter_endianity_64 (uint64_t *x, unsigned int size)
 }

-static int valid (char *ciphertext, struct fmt_main *self)
+static int valid(char *ciphertext, struct fmt_main *self)
 {
     char *p, *q;

     p = ciphertext;

-    if (! strncmp (p, FORMAT_TAG, TAG_LENGTH))
+    if (! strncmp(p, FORMAT_TAG, TAG_LENGTH))
         p += TAG_LENGTH;

     q = p;
@@ -284,26 +292,22 @@
 }

-#if FMT_MAIN_VERSION > 9
-static char *split (char *ciphertext, int index, struct fmt_main *self)
-#else
-static char *split (char *ciphertext, int index)
-#endif
+static char *split(char *ciphertext, int index, struct fmt_main *self)
 {
     static char out[TAG_LENGTH + CIPHERTEXT_LENGTH + 1];

-    if (!strncmp (ciphertext, FORMAT_TAG, TAG_LENGTH))
+    if (!strncmp(ciphertext, FORMAT_TAG, TAG_LENGTH))
         ciphertext += TAG_LENGTH;

-    memcpy (out, FORMAT_TAG, TAG_LENGTH);
-    memcpy (out + TAG_LENGTH, ciphertext, CIPHERTEXT_LENGTH + 1);
-    strlwr (out + TAG_LENGTH);
+    memcpy(out, FORMAT_TAG, TAG_LENGTH);
+    memcpy(out + TAG_LENGTH, ciphertext, CIPHERTEXT_LENGTH + 1);
+    strlwr(out + TAG_LENGTH);

     return out;
 }

-static void *get_binary (char *ciphertext)
+static void *get_binary(char *ciphertext)
 {
     static union {
         unsigned char c[FULL_BINARY_SIZE];
@@ -312,7 +316,7 @@
     int i;

     if (!out)
-        out = mem_alloc_tiny (FULL_BINARY_SIZE, BINARY_ALIGN);
+        out = mem_alloc_tiny(FULL_BINARY_SIZE, BINARY_ALIGN);

     ciphertext += TAG_LENGTH;

@@ -320,7 +324,7 @@
         out->c[i] = atoi16[ARCH_INDEX(ciphertext[i*2])] * 16 +
             atoi16[ARCH_INDEX(ciphertext[i*2 + 1])];

-    alter_endianity_64 (out->w, FULL_BINARY_SIZE);
+    alter_endianity_64(out->w, FULL_BINARY_SIZE);

     out->w[0] -= 0x6a09e667f3bcc908ULL;
     out->w[1] -= 0xbb67ae8584caa73bULL;
@@ -334,16 +338,16 @@
     return (void *) out;
 }

-static int get_hash_0 (int index) { return crypt_key[0][index] & 0xf; }
-static int get_hash_1 (int index) { return crypt_key[0][index] & 0xff; }
-static int get_hash_2 (int index) { return crypt_key[0][index] & 0xfff; }
-static int get_hash_3 (int index) { return crypt_key[0][index] & 0xffff; }
-static int get_hash_4 (int index) { return crypt_key[0][index] & 0xfffff; }
-static int get_hash_5 (int index) { return crypt_key[0][index] & 0xffffff; }
-static int get_hash_6 (int index) { return crypt_key[0][index] & 0x7ffffff; }
+static int get_hash_0(int index) { return crypt_key[0][index] & 0xf; }
+static int get_hash_1(int index) { return crypt_key[0][index] & 0xff; }
+static int get_hash_2(int index) { return crypt_key[0][index] & 0xfff; }
+static int get_hash_3(int index) { return crypt_key[0][index] & 0xffff; }
+static int get_hash_4(int index) { return crypt_key[0][index] & 0xfffff; }
+static int get_hash_5(int index) { return crypt_key[0][index] & 0xffffff; }
+static int get_hash_6(int index) { return crypt_key[0][index] & 0x7ffffff; }

-static void set_key (char *key, int index)
+static void set_key(char *key, int index)
 {
     uint64_t *buf64 = (uint64_t *) &saved_key[index];
     uint8_t *buf8 = (uint8_t * ) buf64;
@@ -358,7 +362,7 @@
 }

-static char *get_key (int index)
+static char *get_key(int index)
 {
     uint64_t *buf64 = (uint64_t *) &saved_key[index];
     uint8_t *buf8 = (uint8_t * ) buf64;
@@ -375,50 +379,44 @@
 }

-#if FMT_MAIN_VERSION > 10
-static int crypt_all (int *pcount, struct db_salt *salt)
-#else
-static void crypt_all (int count)
-#endif
+static int crypt_all(int *pcount, struct db_salt *salt)
 {
-#if FMT_MAIN_VERSION > 10
     int count = *pcount;
-#endif
     int index = 0;

 #ifdef _OPENMP
 #pragma omp parallel for
-    for (index = 0; index < count; index += 2)
+    for (index = 0; index < count; index += VWIDTH)
 #endif
     {
        int i;

-       __m128i a, b, c, d, e, f, g, h;
-       __m128i w[80], tmp1, tmp2;
+       vtype a, b, c, d, e, f, g, h;
+       vtype w[80], tmp1, tmp2;

        for (i = 0; i < 14; i += 2) {
-           GATHER (tmp1, saved_key, i);
-           GATHER (tmp2, saved_key, i + 1);
-           SWAP_ENDIAN (tmp1);
-           SWAP_ENDIAN (tmp2);
+           GATHER(tmp1, saved_key, i);
+           GATHER(tmp2, saved_key, i + 1);
+           SWAP_ENDIAN(tmp1);
+           SWAP_ENDIAN(tmp2);
            w[i] = tmp1;
            w[i + 1] = tmp2;
        }

-       GATHER (tmp1, saved_key, 14);
-       SWAP_ENDIAN (tmp1);
+       GATHER(tmp1, saved_key, 14);
+       SWAP_ENDIAN(tmp1);
        w[14] = tmp1;
-       GATHER (w[15], saved_key, 15);
+       GATHER(w[15], saved_key, 15);

        for (i = 16; i < 80; i++) R(i);

-       a = _mm_set1_epi64x (0x6a09e667f3bcc908ULL);
-       b = _mm_set1_epi64x (0xbb67ae8584caa73bULL);
-       c = _mm_set1_epi64x (0x3c6ef372fe94f82bULL);
-       d = _mm_set1_epi64x (0xa54ff53a5f1d36f1ULL);
-       e = _mm_set1_epi64x (0x510e527fade682d1ULL);
-       f = _mm_set1_epi64x (0x9b05688c2b3e6c1fULL);
-       g = _mm_set1_epi64x (0x1f83d9abfb41bd6bULL);
-       h = _mm_set1_epi64x (0x5be0cd19137e2179ULL);
+       a = vset1_epi64x(0x6a09e667f3bcc908ULL);
+       b = vset1_epi64x(0xbb67ae8584caa73bULL);
+       c = vset1_epi64x(0x3c6ef372fe94f82bULL);
+       d = vset1_epi64x(0xa54ff53a5f1d36f1ULL);
+       e = vset1_epi64x(0x510e527fade682d1ULL);
+       f = vset1_epi64x(0x9b05688c2b3e6c1fULL);
+       g = vset1_epi64x(0x1f83d9abfb41bd6bULL);
+       h = vset1_epi64x(0x5be0cd19137e2179ULL);

        SHA512_STEP(a, b, c, d, e, f, g, h, 0, 0x428a2f98d728ae22ULL);
        SHA512_STEP(h, a, b, c, d, e, f, g, 1, 0x7137449123ef65cdULL);
@@ -505,30 +503,28 @@ static void crypt_all (int count)
        SHA512_STEP(c, d, e, f, g, h, a, b, 78, 0x5fcb6fab3ad6faecULL);
        SHA512_STEP(b, c, d, e, f, g, h, a, 79, 0x6c44198c4a475817ULL);

-       _mm_store_si128 ((__m128i *) &crypt_key[0][index], a);
-       _mm_store_si128 ((__m128i *) &crypt_key[1][index], b);
-       _mm_store_si128 ((__m128i *) &crypt_key[2][index], c);
-       _mm_store_si128 ((__m128i *) &crypt_key[3][index], d);
-       _mm_store_si128 ((__m128i *) &crypt_key[4][index], e);
-       _mm_store_si128 ((__m128i *) &crypt_key[5][index], f);
-       _mm_store_si128 ((__m128i *) &crypt_key[6][index], g);
-       _mm_store_si128 ((__m128i *) &crypt_key[7][index], h);
+       vstore((vtype*) &crypt_key[0][index], a);
+       vstore((vtype*) &crypt_key[1][index], b);
+       vstore((vtype*) &crypt_key[2][index], c);
+       vstore((vtype*) &crypt_key[3][index], d);
+       vstore((vtype*) &crypt_key[4][index], e);
+       vstore((vtype*) &crypt_key[5][index], f);
+       vstore((vtype*) &crypt_key[6][index], g);
+       vstore((vtype*) &crypt_key[7][index], h);
     }

-#if FMT_MAIN_VERSION > 10
     return count;
-#endif
 }

-static int cmp_all (void *binary, int count)
+static int cmp_all(void *binary, int count)
 {
     int i;

 #ifdef _OPENMP
     for (i=0; i < count; i++)
 #else
-    for (i=0; i < 2; i++)
+    for (i=0; i < VWIDTH; i++)
 #endif
        if (((uint64_t *) binary)[0] == crypt_key[0][i])
            return 1;

@@ -537,18 +533,18 @@ static int cmp_all (void *binary, int count)
 }

-static int cmp_one (void *binary, int index)
+static int cmp_one(void *binary, int index)
 {
     return (((uint64_t *) binary)[0] == crypt_key[0][index]);
 }

-static int cmp_exact (char *source, int index)
+static int cmp_exact(char *source, int index)
 {
     int i;
     uint64_t *bin;

-    bin = (uint64_t *) get_binary (source);
+    bin = (uint64_t *) get_binary(source);

     for (i=1; i < 8; i++)
        if (((uint64_t *) bin)[i] != crypt_key[i][index])
@@ -568,13 +564,9 @@ struct fmt_main fmt_rawSHA512_ng = {
        0,
        MAXLEN,
        BINARY_SIZE,
-#if FMT_MAIN_VERSION > 9
        BINARY_ALIGN,
-#endif
        SALT_SIZE,
-#if FMT_MAIN_VERSION > 9
        SALT_ALIGN,
-#endif
        MIN_KEYS_PER_CRYPT,
        MAX_KEYS_PER_CRYPT,
        FMT_CASE | FMT_8_BIT | FMT_SPLIT_UNIFIES_CASE | FMT_OMP,
@@ -584,21 +576,17 @@ struct fmt_main fmt_rawSHA512_ng = {
        tests
     }, {
        init,
-#if FMT_MAIN_VERSION > 10
        done,
        fmt_default_reset,
-#endif
        fmt_default_prepare,
        valid,
        split,
        get_binary,
        fmt_default_salt,
-#if FMT_MAIN_VERSION > 9
 #if FMT_MAIN_VERSION > 11
        { NULL },
 #endif
        fmt_default_source,
-#endif
        {
            fmt_default_binary_hash_0,
            fmt_default_binary_hash_1,
-- 
2.3.2
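As an illustration of the intent of pseudo_intrinsics.h, a width-agnostic
routine written against it might look like the following sketch (hypothetical
code, not part of the patch): the same source compiles to 128-, 256- or
512-bit instructions depending on which branch of the header the build flags
select.

    #include "pseudo_intrinsics.h"

    /* XOR two aligned buffers of nvec vectors, one vtype at a time.
     * With SSE2 this is 128 bits per iteration, with AVX2 256 bits and
     * with AVX512 512 bits; the code never mentions the width. */
    static void xor_buffers(vtype *dst, const vtype *a, const vtype *b,
                            int nvec)
    {
        int i;

        for (i = 0; i < nvec; i++)
            vstore(&dst[i], vxor(vload(&a[i]), vload(&b[i])));
    }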
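The vcmov(y, z, x) fallback used by Ch() and Maj() is a branchless bitwise
select: z ^ (x & (y ^ z)) takes each bit from y where the corresponding bit
of x is 1 and from z where it is 0, which is what XOP's _mm_cmov_si128 does
in one instruction. A scalar model (ours, for illustration):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of vcmov(y, z, x): per-bit select of y or z by x. */
    static uint64_t cmov_model(uint64_t y, uint64_t z, uint64_t x)
    {
        return z ^ (x & (y ^ z));
    }

    int main(void)
    {
        uint64_t y = 0xaaaaaaaaaaaaaaaaULL;
        uint64_t z = 0x5555555555555555ULL;
        uint64_t x = 0x00000000ffffffffULL;

        /* Low 32 result bits come from y, high 32 bits from z. */
        assert(cmov_model(y, z, x) == 0x55555555aaaaaaaaULL);

        /* SHA-512's Ch(x,y,z) = (x & y) ^ (~x & z) is the same select,
         * matching the Ch() macro's vcmov(y, z, x). */
        assert(cmov_model(y, z, x) == ((x & y) ^ (~x & z)));
        return 0;
    }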
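The SWAP_ENDIAN() shuffle masks byte-reverse each 64-bit lane: in the
vset_epi64x() constants, control byte i of a lane holds the source index
7 - i (0x0001020304050607 is the byte sequence 07 06 05 04 03 02 01 00 in
memory order), so the shuffle converts each input word between little- and
big-endian as SHA-512 requires. A scalar model of the per-lane effect (ours,
for illustration):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of what SWAP_ENDIAN() does to one 64-bit lane. */
    static uint64_t bswap64_model(uint64_t x)
    {
        uint64_t r = 0;
        int i;

        /* Result byte i is input byte 7 - i, mirroring the reversed
         * byte indices in the shuffle-control constants. */
        for (i = 0; i < 8; i++)
            r |= ((x >> (8 * (7 - i))) & 0xff) << (8 * i);
        return r;
    }

    int main(void)
    {
        assert(bswap64_model(0x0102030405060708ULL) ==
               0x0807060504030201ULL);
        return 0;
    }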
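One subtlety in the comparison path: get_binary() subtracts the eight SHA-512
initial state words from the target digest, and crypt_all() accordingly
stores the working variables a..h without the usual final addition of the
initial state, saving eight vector additions per crypt. Since that addition
is modulo 2^64, the comparison still holds; a scalar model (illustrative
only, values made up):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* A real digest word would be a + IV0 (mod 2^64). The format
         * instead compares the raw working variable a against
         * digest - IV0, as prepared by get_binary(). */
        uint64_t iv0 = 0x6a09e667f3bcc908ULL; /* SHA-512 H0 */
        uint64_t a = 0xfedcba9876543210ULL;   /* state after 80 rounds */
        uint64_t digest0 = a + iv0;           /* wraps modulo 2^64 */

        assert(digest0 - iv0 == a);
        return 0;
    }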