|
|
Message-ID: <CA+TsHUAcZA5s9s-co3xmOT9+buawXbPn2Qc4KxT-Cvnsm=TwmA@mail.gmail.com>
Date: Thu, 18 Sep 2014 18:31:15 +0530
From: Sayantan Datta <std2048@...il.com>
To: john-users@...ts.openwall.com
Subject: Re: Re: nVidia Maxwell support (especially descrypt)?
On Thu, Sep 18, 2014 at 5:36 PM, Roman Rusakov <rusakovster@...il.com>
wrote:
> Good day.
>
> I started work to optimize bitslice DES sboxes for Nvidia Maxwell
> instruction set (particularly for LOP3.LUT).
> Looks like gate count can be decreased by 9 (from ~33 to 24).
> For example, S4 (only 17 gates).
> ===
> #include <stdio.h>
>
> typedef unsigned long long vtype;
>
> vtype lut3(vtype x,vtype y,vtype z,unsigned char m)
> {
> int i;
> vtype r=0;
> for(i=0;i<sizeof(vtype)*8;i++)
> r|=(vtype)((m>>( (((x>>i)&1)<<2) | (((y>>i)&1)<<1) | ((z>>i)&1) ))&1) << i;
> return r;
> }
>
> void s4(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
> vtype * out1, vtype * out2, vtype * out3, vtype * out4)
> {
> vtype x55AAFF00=lut3(a1,a4,a5,0x36);
> vtype x00F00F00=lut3(a3,a4,a5,0x24);
> vtype x1926330C=lut3(a2,a3,x55AAFF00,0xA4);
> vtype x4CA36B59=lut3(x00F00F00,a1,x1926330C,0xB6);
>
> vtype x00FF55AA=lut3(a1,a4,a5,0x6C);
> vtype x3FCC6E9D=lut3(a2,a3,x00FF55AA,0x5E);
> vtype x6A7935C8=lut3(a1,x00F00F00,x3FCC6E9D,0xD6);
>
> vtype x5D016B55=lut3(a1,x4CA36B59,x00FF55AA,0xD4);
> vtype x07AE9F5A=lut3(a3,x55AAFF00,x5D016B55,0xD6);
> vtype x61C8F93C=lut3(a1,a2,x07AE9F5A,0x96);
>
> vtype x3=lut3(a6,x4CA36B59,x61C8F93C,0xC9);
> vtype x4=lut3(a6,x4CA36B59,x61C8F93C,0x93);
> *out3^=x3;
> *out4^=x4;
>
> vtype x26DA5E91=x4CA36B59^x6A7935C8;
> vtype x37217F22=lut3(a2,a4,x26DA5E91,0x72);
> vtype x56E9861E=x37217F22^x61C8F93C;
>
> vtype x1=lut3(a6,x56E9861E,x6A7935C8,0x5C);
> vtype x2=lut3(a6,x56E9861E,x6A7935C8,0x35);
> *out1^=x1;
> *out2^=x2;
> }
>
> void main()
> {
> vtype a1,a2,a3,a4,a5,a6,x1,x2,x3,x4;
> a1=0x5555555555555555;
> a2=0x3333333333333333;
> a3=0x0F0F0F0F0F0F0F0F;
> a4=0x00FF00FF00FF00FF;
> a5=0x0000FFFF0000FFFF;
> a6=0x00000000FFFFFFFF;
> x1=x2=x3=x4=0;
> s4(a1,a2,a3,a4,a5,a6,&x1,&x2,&x3,&x4);
> printf("%016llX\n",x1);
> printf("%016llX\n",x2);
> printf("%016llX\n",x3);
> printf("%016llX\n",x4);
> }
> ===
>
How much impact does it have on performance? I guess it also reduces the
number of instructions, which could translate into better utilization of the
i-cache, provided you do the same for all 8 sboxes.
Regards,
Sayantan
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.