sha256/512: removed static values for thread safety

pull/36/head
Takashi Kojo 2015-03-25 18:26:46 +09:00
parent 28109b01f7
commit 994ffa96e0
3 changed files with 165 additions and 166 deletions

View File

@ -238,7 +238,7 @@ static void set_Transform(void) {
/* Dummy asm whose clobber list marks the XMM/YMM registers as modified, so the compiler preserves any live values in them on behalf of Transform */
#if defined(HAVE_INTEL_AVX2)&& !defined(HAVE_INTEL_AVX1)
#define SAVE_XMM_YMM __asm__ volatile("vpxor %%ymm7, %%ymm7, %%ymm7":::\
"%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
"%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
#elif defined(HAVE_INTEL_AVX1)
#define SAVE_XMM_YMM __asm__ volatile("vpxor %%xmm7, %%xmm7, %%xmm7":::\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
@ -956,29 +956,28 @@ __asm__ volatile("movl %r8d, "#h"\n\t"); \
#define W_K_from_buff\
{/* X0..3(xmm4..7) = sha256->buffer[0.15]; */\
static word64 buff[16] ;\
buff[0] = *(word64*)&sha256->buffer[0] ;\
buff[1] = *(word64*)&sha256->buffer[2] ;\
__asm__ volatile("vmovaps %1, %%xmm4\n\t"\
"vpshufb %0, %%xmm4, %%xmm4\n\t"\
: "=m"(mBYTE_FLIP_MASK[0]),"=m"(buff) :) ;\
buff[2] = *(word64*)&sha256->buffer[4] ;\
buff[3] = *(word64*)&sha256->buffer[6] ;\
__asm__ volatile("\n\tvmovaps %1, %%xmm5\n\t"\
"vpshufb %0, %%xmm5, %%xmm5\n\t"\
: "=m"(mBYTE_FLIP_MASK[0]),"=m"(buff[2]) :) ;\
buff[4] = *(word64*)&sha256->buffer[8] ;\
buff[5] = *(word64*)&sha256->buffer[10] ;\
__asm__ volatile("vmovaps %1, %%xmm6\n\t"\
"vpshufb %0, %%xmm6, %%xmm6\n\t"\
: "=m"(mBYTE_FLIP_MASK[0]),"=m"(buff[4]) :) ;\
buff[6] = *(word64*)&sha256->buffer[12] ;\
buff[7] = *(word64*)&sha256->buffer[14] ;\
__asm__ volatile("vmovaps %1, %%xmm7\n\t"\
"vpshufb %0, %%xmm7, %%xmm7\n\t"\
: "=m"(mBYTE_FLIP_MASK[0]),"=m"(buff[6]) :) ;\
}
{ word64 _buff[2] ;/* X0..3(xmm4..7) = sha256->buffer[0.15]; */\
_buff[0] = *(word64*)&sha256->buffer[0] ;\
_buff[1] = *(word64*)&sha256->buffer[2] ;\
__asm__ volatile("vmovaps %0, %%xmm4\n\t"\
"vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\
:: "m"(_buff[0]):"%xmm4") ;\
_buff[0] = *(word64*)&sha256->buffer[4] ;\
_buff[1] = *(word64*)&sha256->buffer[6] ;\
__asm__ volatile("vmovaps %0, %%xmm5\n\t"\
"vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\
::"m"(_buff[0]):"%xmm5") ;\
_buff[0] = *(word64*)&sha256->buffer[8] ;\
_buff[1] = *(word64*)&sha256->buffer[10] ;\
__asm__ volatile("vmovaps %0, %%xmm6\n\t"\
"vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\
::"m"(_buff[0]):"%xmm6") ;\
_buff[0] = *(word64*)&sha256->buffer[12] ;\
_buff[1] = *(word64*)&sha256->buffer[14] ;\
__asm__ volatile("vmovaps %0, %%xmm7\n\t"\
"vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\
::"m"(_buff[0]):"%xmm7") ;\
}\
#define _SET_W_K_XFER(reg, i)\
__asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs) ;\
@ -992,9 +991,9 @@ static word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
#define _Init_Masks(mask1, mask2, mask3)\
__asm__ volatile("vmovaps %0, %"#mask1 :"=m"(mBYTE_FLIP_MASK)) ;\
__asm__ volatile("vmovaps %0, %"#mask2 :"=m"(mSHUF_00BA)) ;\
__asm__ volatile("vmovaps %0, %"#mask3 :"=m"(mSHUF_DC00)) ;
__asm__ volatile("vmovaps %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0]):) ;\
__asm__ volatile("vmovaps %0, %"#mask2 ::"m"(mSHUF_00BA[0]):) ;\
__asm__ volatile("vmovaps %0, %"#mask3 ::"m"(mSHUF_DC00[0]):) ;
#define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\
_Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
@ -1022,6 +1021,7 @@ __asm__ volatile("vmovaps %0, %"#mask3 :"=m"(mSHUF_DC00)) ;
static int Transform_AVX1(Sha256* sha256)
{
word32 W_K[64] ; /* temp for W+K */
#if defined(DEBUG_XMM)
@ -1115,6 +1115,7 @@ static int Transform_AVX1(Sha256* sha256)
#if defined(HAVE_INTEL_RORX)
static int Transform_AVX1_RORX(Sha256* sha256)
{
word32 W_K[64] ; /* temp for W+K */
#if defined(DEBUG_XMM)
@ -1126,7 +1127,6 @@ static int Transform_AVX1_RORX(Sha256* sha256)
W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */
DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
SET_W_K_XFER(X0, 0) ;
MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
@ -1250,7 +1250,7 @@ static int Transform_AVX1_RORX(Sha256* sha256)
#define DUMP_ALL DUMP_REG(W_I_16) ; DUMP_REG(W_I_15) ; DUMP_REG(W_I_7) ; DUMP_REG(W_I_2) ; DUMP_REG(W_I) ;
#define _MOVE_to_REG(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs) ;
#define _MOVE_to_MEM(mem, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" :: "m"(mem):YMM_REGs) ;
#define _MOVE_to_MEM(mem, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs) ;
#define _BYTE_SWAP(ymm, map) __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\
:: "m"(map):YMM_REGs) ;
#define _MOVE_128(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"#map", %%"\
@ -1448,7 +1448,7 @@ static int Transform_AVX2(Sha256* sha256)
if (W_K == NULL)
return MEMORY_E;
#else
static word32 W_K[64];
word32 W_K[64] ;
#endif
MOVE_to_REG(W_I_16, sha256->buffer[0]); BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]) ;
@ -1457,8 +1457,6 @@ static int Transform_AVX2(Sha256* sha256)
MOVE_to_REG(W_I_7, sha256->buffer[16-7]) ; BYTE_SWAP(W_I_7, mBYTE_FLIP_MASK_7[0]) ;
MOVE_to_REG(W_I_2, sha256->buffer[16-2]) ; BYTE_SWAP(W_I_2, mBYTE_FLIP_MASK_2[0]) ;
DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
ADD_MEM(W_K_TEMP, W_I_16, K[0]) ;

View File

@ -97,6 +97,7 @@ int wc_Sha384Hash(const byte* data, word32 len, byte* out)
#endif /* min */
//#undef USE_INTEL_SPEEDUP
#if defined(USE_INTEL_SPEEDUP)
#define HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX2
@ -235,14 +236,15 @@ static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
return 0 ;
}
static void set_cpuid_flags(void) {
/*
 * Probe the CPU feature bits once and cache the result in cpuid_flags.
 *
 * While cpuid_check is 0, every cpuid_flag(leaf, sub, reg, bit) query that
 * reports non-zero ORs the matching CPUID_* bit into cpuid_flags; afterwards
 * cpuid_check is set to 1 so that later calls skip the probe.
 *
 * Returns 0 on the call that performed the probe, 1 when the probe had
 * already been done.
 *
 * NOTE(review): set_Transform() and set_Transform384() both return early
 * when this function returns non-zero, so whichever of them runs second
 * appears to leave its Transform pointer at the initial
 * _Transform/_Transform384 value - confirm this is intended.
 * NOTE(review): cpuid_check and cpuid_flags are declared outside this view
 * and are updated here without any visible locking - confirm that callers
 * serialize the first call.
 */
static int set_cpuid_flags(void) {
if(cpuid_check==0) {
if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;} /* leaf 1, ECX bit 28 */
if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; } /* leaf 7, sub-leaf 0, EBX bit 5 */
if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; } /* leaf 1, ECX bit 30 */
if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; } /* leaf 7, sub-leaf 0, EBX bit 18 */
cpuid_check = 1 ; /* remember that the probe has run */
return 0 ; /* this call did the probing */
}
return 1 ; /* flags were already cached by an earlier call */
}
@ -268,7 +270,7 @@ static int (*Transform_p)(Sha512* sha512) = _Transform ;
#define Transform(sha512) (*Transform_p)(sha512)
static void set_Transform(void) {
set_cpuid_flags() ;
if(set_cpuid_flags()) return ;
#if defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ;
@ -290,13 +292,13 @@ static void set_Transform(void) {
#endif
/* Dummy asm whose clobber list marks the XMM/YMM registers as modified, so the compiler preserves any live values in them on behalf of Transform */
#if defined(HAVE_INTEL_AVX2)
#define SAVE_XMM_YMM __asm__ volatile("orq %%r8, %%r8":::\
"%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11",\
"%ymm12","%ymm13","%ymm14","%ymm15")
#elif defined(HAVE_INTEL_AVX1)
//#if defined(HAVE_INTEL_AVX2)
//#define SAVE_XMM_YMM __asm__ volatile("orq %%r8, %%r8":::\
// "%ymm0","%ymm1","%ymm2","%ymm3","%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11",\
// "%ymm12","%ymm13","%ymm14","%ymm15")
#if defined(HAVE_INTEL_AVX1)
#define SAVE_XMM_YMM __asm__ volatile("orq %%r8, %%r8":::\
"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13")
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15")
#else
#define SAVE_XMM_YMM
#endif
@ -769,7 +771,7 @@ static word64 mBYTE_FLIP_MASK[] = { 0x0001020304050607, 0x08090a0b0c0d0e0f } ;
#define s0_(dest, src) s0_1(dest, src) ; s0_2(dest, src) ; s0_3(dest, src)
#define s1_(dest, src) s1_1(dest, src) ; s1_2(dest, src) ; s1_3(dest, src)
static word64 W_X[16+4];
//static word64 W_X[16+4];
#define Block_xx_1(i) \
MOVE_to_REG(W_I_15, W_X[(i-15)&15]) ;\
MOVE_to_REG(W_I_7, W_X[(i- 7)&15]) ;\
@ -818,57 +820,57 @@ static word64 W_X[16+4];
if((i)==0)\
MOVE_to_MEM(W_X,16, W_I) ;\
static inline void Block_0_1(void) { Block_xx_1(0) ; }
static inline void Block_0_2(void) { Block_xx_2(0) ; }
static inline void Block_0_1(word64 *W_X) { Block_xx_1(0) ; }
static inline void Block_0_2(word64 *W_X) { Block_xx_2(0) ; }
static inline void Block_0_3(void) { Block_xx_3(0) ; }
static inline void Block_0_4(void) { Block_xx_4(0) ; }
static inline void Block_0_5(void) { Block_xx_5(0) ; }
static inline void Block_0_6(void) { Block_xx_6(0) ; }
static inline void Block_0_7(void) { Block_xx_7(2) ; }
static inline void Block_0_8(void) { Block_xx_8(2) ; }
static inline void Block_0_6(word64 *W_X) { Block_xx_6(0) ; }
static inline void Block_0_7(word64 *W_X) { Block_xx_7(2) ; }
static inline void Block_0_8(word64 *W_X) { Block_xx_8(2) ; }
static inline void Block_0_9(void) { Block_xx_9(2) ; }
static inline void Block_0_10(void){ Block_xx_10(2) ; }
static inline void Block_0_11(void){ Block_xx_11(2) ; }
static inline void Block_0_12(void){ Block_xx_12(2) ; }
static inline void Block_0_12(word64 *W_X){ Block_xx_12(2) ; }
static inline void Block_4_1(void) { Block_xx_1(4) ; }
static inline void Block_4_2(void) { Block_xx_2(4) ; }
static inline void Block_4_1(word64 *W_X) { Block_xx_1(4) ; }
static inline void Block_4_2(word64 *W_X) { Block_xx_2(4) ; }
static inline void Block_4_3(void) { Block_xx_3(4) ; }
static inline void Block_4_4(void) { Block_xx_4(4) ; }
static inline void Block_4_5(void) { Block_xx_5(4) ; }
static inline void Block_4_6(void) { Block_xx_6(4) ; }
static inline void Block_4_7(void) { Block_xx_7(6) ; }
static inline void Block_4_8(void) { Block_xx_8(6) ; }
static inline void Block_4_6(word64 *W_X) { Block_xx_6(4) ; }
static inline void Block_4_7(word64 *W_X) { Block_xx_7(6) ; }
static inline void Block_4_8(word64 *W_X) { Block_xx_8(6) ; }
static inline void Block_4_9(void) { Block_xx_9(6) ; }
static inline void Block_4_10(void){ Block_xx_10(6) ; }
static inline void Block_4_11(void){ Block_xx_11(6) ; }
static inline void Block_4_12(void){ Block_xx_12(6) ; }
static inline void Block_4_12(word64 *W_X){ Block_xx_12(6) ; }
static inline void Block_8_1(void) { Block_xx_1(8) ; }
static inline void Block_8_2(void) { Block_xx_2(8) ; }
static inline void Block_8_1(word64 *W_X) { Block_xx_1(8) ; }
static inline void Block_8_2(word64 *W_X) { Block_xx_2(8) ; }
static inline void Block_8_3(void) { Block_xx_3(8) ; }
static inline void Block_8_4(void) { Block_xx_4(8) ; }
static inline void Block_8_5(void) { Block_xx_5(8) ; }
static inline void Block_8_6(void) { Block_xx_6(8) ; }
static inline void Block_8_7(void) { Block_xx_7(10) ; }
static inline void Block_8_8(void) { Block_xx_8(10) ; }
static inline void Block_8_6(word64 *W_X) { Block_xx_6(8) ; }
static inline void Block_8_7(word64 *W_X) { Block_xx_7(10) ; }
static inline void Block_8_8(word64 *W_X) { Block_xx_8(10) ; }
static inline void Block_8_9(void) { Block_xx_9(10) ; }
static inline void Block_8_10(void){ Block_xx_10(10) ; }
static inline void Block_8_11(void){ Block_xx_11(10) ; }
static inline void Block_8_12(void){ Block_xx_12(10) ; }
static inline void Block_8_12(word64 *W_X){ Block_xx_12(10) ; }
static inline void Block_12_1(void) { Block_xx_1(12) ; }
static inline void Block_12_2(void) { Block_xx_2(12) ; }
static inline void Block_12_1(word64 *W_X) { Block_xx_1(12) ; }
static inline void Block_12_2(word64 *W_X) { Block_xx_2(12) ; }
static inline void Block_12_3(void) { Block_xx_3(12) ; }
static inline void Block_12_4(void) { Block_xx_4(12) ; }
static inline void Block_12_5(void) { Block_xx_5(12) ; }
static inline void Block_12_6(void) { Block_xx_6(12) ; }
static inline void Block_12_7(void) { Block_xx_7(14) ; }
static inline void Block_12_8(void) { Block_xx_8(14) ; }
static inline void Block_12_6(word64 *W_X) { Block_xx_6(12) ; }
static inline void Block_12_7(word64 *W_X) { Block_xx_7(14) ; }
static inline void Block_12_8(word64 *W_X) { Block_xx_8(14) ; }
static inline void Block_12_9(void) { Block_xx_9(14) ; }
static inline void Block_12_10(void){ Block_xx_10(14) ; }
static inline void Block_12_11(void){ Block_xx_11(14) ; }
static inline void Block_12_12(void){ Block_xx_12(14) ; }
static inline void Block_12_12(word64 *W_X){ Block_xx_12(14) ; }
#endif
@ -1011,13 +1013,14 @@ static unsigned long mBYTE_FLIP_MASK_Y[] =
#endif
/*** Transform Body ***/
#if defined(HAVE_INTEL_AVX1)
static int Transform_AVX1(Sha512* sha512)
{
const word64* K = K512;
word64 W_X[16+4];
word32 j;
word64 T[8];
/* Copy digest to working vars */
@ -1025,25 +1028,25 @@ static int Transform_AVX1(Sha512* sha512)
W_from_buff(W_X, sha512->buffer) ;
for (j = 0; j < 80; j += 16) {
Rx_1( 0); Block_0_1(); Rx_2( 0); Block_0_2(); Rx_3( 0); Block_0_3();
Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6();
Rx_1( 2); Block_0_7(); Rx_2( 2); Block_0_8(); Rx_3( 2); Block_0_9();
Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12();
Rx_1( 0); Block_0_1(W_X); Rx_2( 0); Block_0_2(W_X); Rx_3( 0); Block_0_3();
Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6(W_X);
Rx_1( 2); Block_0_7(W_X); Rx_2( 2); Block_0_8(W_X); Rx_3( 2); Block_0_9();
Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12(W_X);
Rx_1( 4); Block_4_1(); Rx_2( 4); Block_4_2(); Rx_3( 4); Block_4_3();
Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6();
Rx_1( 6); Block_4_7(); Rx_2( 6); Block_4_8(); Rx_3( 6); Block_4_9();
Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12();
Rx_1( 4); Block_4_1(W_X); Rx_2( 4); Block_4_2(W_X); Rx_3( 4); Block_4_3();
Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6(W_X);
Rx_1( 6); Block_4_7(W_X); Rx_2( 6); Block_4_8(W_X); Rx_3( 6); Block_4_9();
Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12(W_X);
Rx_1( 8); Block_8_1(); Rx_2( 8); Block_8_2(); Rx_3( 8); Block_8_3();
Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6();
Rx_1(10); Block_8_7(); Rx_2(10); Block_8_8(); Rx_3(10); Block_8_9();
Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12();
Rx_1( 8); Block_8_1(W_X); Rx_2( 8); Block_8_2(W_X); Rx_3( 8); Block_8_3();
Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6(W_X);
Rx_1(10); Block_8_7(W_X); Rx_2(10); Block_8_8(W_X); Rx_3(10); Block_8_9();
Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12(W_X);
Rx_1(12); Block_12_1(); Rx_2(12); Block_12_2(); Rx_3(12); Block_12_3();
Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6();
Rx_1(14); Block_12_7(); Rx_2(14); Block_12_8(); Rx_3(14); Block_12_9();
Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12();
Rx_1(12); Block_12_1(W_X); Rx_2(12); Block_12_2(W_X); Rx_3(12); Block_12_3();
Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6(W_X);
Rx_1(14); Block_12_7(W_X); Rx_2(14); Block_12_8(W_X); Rx_3(14); Block_12_9();
Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12(W_X);
}
/* Add the working vars back into digest */
@ -1073,7 +1076,7 @@ static int Transform_AVX1(Sha512* sha512)
static int Transform_AVX1_RORX(Sha512* sha512)
{
const word64* K = K512;
word64 W_X[16+4];
word32 j;
word64 T[8];
/* Copy digest to working vars */
@ -1081,41 +1084,41 @@ static int Transform_AVX1_RORX(Sha512* sha512)
W_from_buff(W_X, sha512->buffer) ;
for (j = 0; j < 80; j += 16) {
Rx_RORX_1( 0); Block_0_1(); Rx_RORX_2( 0); Block_0_2();
Rx_RORX_1( 0); Block_0_1(W_X); Rx_RORX_2( 0); Block_0_2(W_X);
Rx_RORX_3( 0); Block_0_3();
Rx_RORX_1( 1); Block_0_4(); Rx_RORX_2( 1); Block_0_5();
Rx_RORX_3( 1); Block_0_6();
Rx_RORX_1( 2); Block_0_7(); Rx_RORX_2( 2); Block_0_8();
Rx_RORX_3( 1); Block_0_6(W_X);
Rx_RORX_1( 2); Block_0_7(W_X); Rx_RORX_2( 2); Block_0_8(W_X);
Rx_RORX_3( 2); Block_0_9();
Rx_RORX_1( 3); Block_0_10();Rx_RORX_2( 3); Block_0_11();
Rx_RORX_3( 3); Block_0_12();
Rx_RORX_3( 3); Block_0_12(W_X);
Rx_RORX_1( 4); Block_4_1(); Rx_RORX_2( 4); Block_4_2();
Rx_RORX_1( 4); Block_4_1(W_X); Rx_RORX_2( 4); Block_4_2(W_X);
Rx_RORX_3( 4); Block_4_3();
Rx_RORX_1( 5); Block_4_4(); Rx_RORX_2( 5); Block_4_5();
Rx_RORX_3( 5); Block_4_6();
Rx_RORX_1( 6); Block_4_7(); Rx_RORX_2( 6); Block_4_8();
Rx_RORX_3( 5); Block_4_6(W_X);
Rx_RORX_1( 6); Block_4_7(W_X); Rx_RORX_2( 6); Block_4_8(W_X);
Rx_RORX_3( 6); Block_4_9();
Rx_RORX_1( 7); Block_4_10();Rx_RORX_2( 7); Block_4_11();
Rx_RORX_3( 7); Block_4_12();
Rx_RORX_3( 7); Block_4_12(W_X);
Rx_RORX_1( 8); Block_8_1(); Rx_RORX_2( 8); Block_8_2();
Rx_RORX_1( 8); Block_8_1(W_X); Rx_RORX_2( 8); Block_8_2(W_X);
Rx_RORX_3( 8); Block_8_3();
Rx_RORX_1( 9); Block_8_4(); Rx_RORX_2( 9); Block_8_5();
Rx_RORX_3( 9); Block_8_6();
Rx_RORX_1(10); Block_8_7(); Rx_RORX_2(10); Block_8_8();
Rx_RORX_3( 9); Block_8_6(W_X);
Rx_RORX_1(10); Block_8_7(W_X); Rx_RORX_2(10); Block_8_8(W_X);
Rx_RORX_3(10); Block_8_9();
Rx_RORX_1(11); Block_8_10();Rx_RORX_2(11); Block_8_11();
Rx_RORX_3(11); Block_8_12();
Rx_RORX_3(11); Block_8_12(W_X);
Rx_RORX_1(12); Block_12_1(); Rx_RORX_2(12); Block_12_2();
Rx_RORX_1(12); Block_12_1(W_X); Rx_RORX_2(12); Block_12_2(W_X);
Rx_RORX_3(12); Block_12_3();
Rx_RORX_1(13); Block_12_4(); Rx_RORX_2(13); Block_12_5();
Rx_RORX_3(13); Block_12_6();
Rx_RORX_1(14); Block_12_7(); Rx_RORX_2(14); Block_12_8();
Rx_RORX_3(13); Block_12_6(W_X);
Rx_RORX_1(14); Block_12_7(W_X); Rx_RORX_2(14); Block_12_8(W_X);
Rx_RORX_3(14); Block_12_9();
Rx_RORX_1(15); Block_12_10();Rx_RORX_2(15); Block_12_11();
Rx_RORX_3(15); Block_12_12();
Rx_RORX_3(15); Block_12_12(W_X);
}
/* Add the working vars back into digest */
@ -1153,7 +1156,6 @@ static int Transform_AVX1_RORX(Sha512* sha512)
#define blk384(i) (W[i] = sha384->buffer[i])
static word64 w[4] ;
#define Block_Y_xx_1(i, w_0, w_4, w_8, w_12)\
MOVE_W_to_W_I_15(W_I_15y, w_0, w_4) ;\
@ -1209,7 +1211,7 @@ static inline void Block_Y_0_8(void) { Block_Y_xx_8(0, W_0y, W_4y, W_8y, W_12y)
static inline void Block_Y_0_9(void) { Block_Y_xx_9(0, W_0y, W_4y, W_8y, W_12y) ; }
static inline void Block_Y_0_10(void){ Block_Y_xx_10(0, W_0y, W_4y, W_8y, W_12y) ; }
static inline void Block_Y_0_11(void){ Block_Y_xx_11(0, W_0y, W_4y, W_8y, W_12y) ; }
static inline void Block_Y_0_12(void){ Block_Y_xx_12(0, W_0y, W_4y, W_8y, W_12y) ; }
static inline void Block_Y_0_12(word64 *w){ Block_Y_xx_12(0, W_0y, W_4y, W_8y, W_12y) ; }
static inline void Block_Y_4_1(void) { Block_Y_xx_1(4, W_4y, W_8y, W_12y, W_0y) ; }
static inline void Block_Y_4_2(void) { Block_Y_xx_2(4, W_4y, W_8y, W_12y, W_0y) ; }
@ -1222,7 +1224,7 @@ static inline void Block_Y_4_8(void) { Block_Y_xx_8(4, W_4y, W_8y, W_12y, W_0y)
static inline void Block_Y_4_9(void) { Block_Y_xx_9(4, W_4y, W_8y, W_12y, W_0y) ; }
static inline void Block_Y_4_10(void) { Block_Y_xx_10(4, W_4y, W_8y, W_12y, W_0y) ; }
static inline void Block_Y_4_11(void) { Block_Y_xx_11(4, W_4y, W_8y, W_12y, W_0y) ; }
static inline void Block_Y_4_12(void) { Block_Y_xx_12(4, W_4y, W_8y, W_12y, W_0y) ; }
static inline void Block_Y_4_12(word64 *w) { Block_Y_xx_12(4, W_4y, W_8y, W_12y, W_0y) ; }
static inline void Block_Y_8_1(void) { Block_Y_xx_1(8, W_8y, W_12y, W_0y, W_4y) ; }
static inline void Block_Y_8_2(void) { Block_Y_xx_2(8, W_8y, W_12y, W_0y, W_4y) ; }
@ -1235,7 +1237,7 @@ static inline void Block_Y_8_8(void) { Block_Y_xx_8(8, W_8y, W_12y, W_0y, W_4y)
static inline void Block_Y_8_9(void) { Block_Y_xx_9(8, W_8y, W_12y, W_0y, W_4y) ; }
static inline void Block_Y_8_10(void) { Block_Y_xx_10(8, W_8y, W_12y, W_0y, W_4y) ; }
static inline void Block_Y_8_11(void) { Block_Y_xx_11(8, W_8y, W_12y, W_0y, W_4y) ; }
static inline void Block_Y_8_12(void) { Block_Y_xx_12(8, W_8y, W_12y, W_0y, W_4y) ; }
static inline void Block_Y_8_12(word64 *w) { Block_Y_xx_12(8, W_8y, W_12y, W_0y, W_4y) ; }
static inline void Block_Y_12_1(void) { Block_Y_xx_1(12, W_12y, W_0y, W_4y, W_8y) ; }
static inline void Block_Y_12_2(void) { Block_Y_xx_2(12, W_12y, W_0y, W_4y, W_8y) ; }
@ -1248,13 +1250,13 @@ static inline void Block_Y_12_8(void) { Block_Y_xx_8(12, W_12y, W_0y, W_4y, W_8y
static inline void Block_Y_12_9(void) { Block_Y_xx_9(12, W_12y, W_0y, W_4y, W_8y) ; }
static inline void Block_Y_12_10(void) { Block_Y_xx_10(12, W_12y, W_0y, W_4y, W_8y) ; }
static inline void Block_Y_12_11(void) { Block_Y_xx_11(12, W_12y, W_0y, W_4y, W_8y) ; }
static inline void Block_Y_12_12(void) { Block_Y_xx_12(12, W_12y, W_0y, W_4y, W_8y) ; }
static inline void Block_Y_12_12(word64 *w) { Block_Y_xx_12(12, W_12y, W_0y, W_4y, W_8y) ; }
static int Transform_AVX2(Sha512* sha512)
{
const word64* K = K512;
word64 w[4] ;
word32 j /*, k*/;
word64 T[8];
/* Copy digest to working vars */
@ -1270,7 +1272,7 @@ static int Transform_AVX2(Sha512* sha512)
Ry_1( 2, w[2]); Block_Y_0_7(); Ry_2( 2, w[2]); Block_Y_0_8();
Ry_3( 2, w[2]); Block_Y_0_9();
Ry_1( 3, w[3]); Block_Y_0_10();Ry_2( 3, w[3]); Block_Y_0_11();
Ry_3( 3, w[3]); Block_Y_0_12();
Ry_3( 3, w[3]); Block_Y_0_12(w);
Ry_1( 4, w[0]); Block_Y_4_1(); Ry_2( 4, w[0]); Block_Y_4_2();
Ry_3( 4, w[0]); Block_Y_4_3();
@ -1279,7 +1281,7 @@ static int Transform_AVX2(Sha512* sha512)
Ry_1( 6, w[2]); Block_Y_4_7(); Ry_2( 6, w[2]); Block_Y_4_8();
Ry_3( 6, w[2]); Block_Y_4_9();
Ry_1( 7, w[3]); Block_Y_4_10(); Ry_2( 7, w[3]);Block_Y_4_11();
Ry_3( 7, w[3]);Block_Y_4_12();
Ry_3( 7, w[3]);Block_Y_4_12(w);
Ry_1( 8, w[0]); Block_Y_8_1(); Ry_2( 8, w[0]); Block_Y_8_2();
Ry_3( 8, w[0]); Block_Y_8_3();
@ -1288,7 +1290,7 @@ static int Transform_AVX2(Sha512* sha512)
Ry_1(10, w[2]); Block_Y_8_7(); Ry_2(10, w[2]); Block_Y_8_8();
Ry_3(10, w[2]); Block_Y_8_9();
Ry_1(11, w[3]); Block_Y_8_10();Ry_2(11, w[3]); Block_Y_8_11();
Ry_3(11, w[3]); Block_Y_8_12();
Ry_3(11, w[3]); Block_Y_8_12(w);
Ry_1(12, w[0]); Block_Y_12_1(); Ry_2(12, w[0]); Block_Y_12_2();
Ry_3(12, w[0]); Block_Y_12_3();
@ -1297,7 +1299,7 @@ static int Transform_AVX2(Sha512* sha512)
Ry_1(14, w[2]); Block_Y_12_7(); Ry_2(14, w[2]); Block_Y_12_8();
Ry_3(14, w[2]); Block_Y_12_9();
Ry_1(15, w[3]); Block_Y_12_10();Ry_2(15, w[3]); Block_Y_12_11();
Ry_3(15, w[3]);Block_Y_12_12();
Ry_3(15, w[3]);Block_Y_12_12(w);
}
/* Add the working vars back into digest */
@ -1343,7 +1345,7 @@ static int (*Transform384_p)(Sha384* sha384) = _Transform384 ;
#define Transform384(sha384) (*Transform384_p)(sha384)
static void set_Transform384(void) {
set_cpuid_flags() ;
if(set_cpuid_flags())return ;
#if defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
Transform384_p = ((IS_INTEL_AVX1) ? Transform384_AVX1 : _Transform384) ;
@ -1593,7 +1595,7 @@ int wc_Sha384Hash(const byte* data, word32 len, byte* hash)
static int Transform384_AVX1(Sha384* sha384)
{
const word64* K = K512;
word64 W_X[16+4];
word32 j;
word64 T[8];
@ -1601,25 +1603,25 @@ static int Transform384_AVX1(Sha384* sha384)
XMEMCPY(T, sha384->digest, sizeof(T));
W_from_buff(W_X, sha384->buffer) ;
for (j = 0; j < 80; j += 16) {
Rx_1( 0); Block_0_1(); Rx_2( 0); Block_0_2(); Rx_3( 0); Block_0_3();
Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6();
Rx_1( 2); Block_0_7(); Rx_2( 2); Block_0_8(); Rx_3( 2); Block_0_9();
Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12();
Rx_1( 0); Block_0_1(W_X); Rx_2( 0); Block_0_2(W_X); Rx_3( 0); Block_0_3();
Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6(W_X);
Rx_1( 2); Block_0_7(W_X); Rx_2( 2); Block_0_8(W_X); Rx_3( 2); Block_0_9();
Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12(W_X);
Rx_1( 4); Block_4_1(); Rx_2( 4); Block_4_2(); Rx_3( 4); Block_4_3();
Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6();
Rx_1( 6); Block_4_7(); Rx_2( 6); Block_4_8(); Rx_3( 6); Block_4_9();
Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12();
Rx_1( 4); Block_4_1(W_X); Rx_2( 4); Block_4_2(W_X); Rx_3( 4); Block_4_3();
Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6(W_X);
Rx_1( 6); Block_4_7(W_X); Rx_2( 6); Block_4_8(W_X); Rx_3( 6); Block_4_9();
Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12(W_X);
Rx_1( 8); Block_8_1(); Rx_2( 8); Block_8_2(); Rx_3( 8); Block_8_3();
Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6();
Rx_1(10); Block_8_7(); Rx_2(10); Block_8_8(); Rx_3(10); Block_8_9();
Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12();
Rx_1( 8); Block_8_1(W_X); Rx_2( 8); Block_8_2(W_X); Rx_3( 8); Block_8_3();
Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6(W_X);
Rx_1(10); Block_8_7(W_X); Rx_2(10); Block_8_8(W_X); Rx_3(10); Block_8_9();
Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12(W_X);
Rx_1(12); Block_12_1(); Rx_2(12); Block_12_2(); Rx_3(12); Block_12_3();
Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6();
Rx_1(14); Block_12_7(); Rx_2(14); Block_12_8(); Rx_3(14); Block_12_9();
Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12();
Rx_1(12); Block_12_1(W_X); Rx_2(12); Block_12_2(W_X); Rx_3(12); Block_12_3();
Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6(W_X);
Rx_1(14); Block_12_7(W_X); Rx_2(14); Block_12_8(W_X); Rx_3(14); Block_12_9();
Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12(W_X);
}
/* Add the working vars back into digest */
@ -1648,7 +1650,7 @@ static int Transform384_AVX1(Sha384* sha384)
static int Transform384_AVX1_RORX(Sha384* sha384)
{
const word64* K = K512;
word64 W_X[16+4];
word32 j;
word64 T[8];
@ -1657,41 +1659,41 @@ static int Transform384_AVX1_RORX(Sha384* sha384)
W_from_buff(W_X, sha384->buffer) ;
for (j = 0; j < 80; j += 16) {
Rx_RORX_1( 0); Block_0_1(); Rx_RORX_2( 0);
Block_0_2(); Rx_RORX_3( 0); Block_0_3();
Rx_RORX_1( 0); Block_0_1(W_X); Rx_RORX_2( 0);
Block_0_2(W_X); Rx_RORX_3( 0); Block_0_3();
Rx_RORX_1( 1); Block_0_4(); Rx_RORX_2( 1);
Block_0_5(); Rx_RORX_3( 1); Block_0_6();
Rx_RORX_1( 2); Block_0_7(); Rx_RORX_2( 2);
Block_0_8(); Rx_RORX_3( 2); Block_0_9();
Block_0_5(); Rx_RORX_3( 1); Block_0_6(W_X);
Rx_RORX_1( 2); Block_0_7(W_X); Rx_RORX_2( 2);
Block_0_8(W_X); Rx_RORX_3( 2); Block_0_9();
Rx_RORX_1( 3); Block_0_10();Rx_RORX_2( 3);
Block_0_11();Rx_RORX_3( 3); Block_0_12();
Block_0_11();Rx_RORX_3( 3); Block_0_12(W_X);
Rx_RORX_1( 4); Block_4_1(); Rx_RORX_2( 4);
Block_4_2(); Rx_RORX_3( 4); Block_4_3();
Rx_RORX_1( 4); Block_4_1(W_X); Rx_RORX_2( 4);
Block_4_2(W_X); Rx_RORX_3( 4); Block_4_3();
Rx_RORX_1( 5); Block_4_4(); Rx_RORX_2( 5);
Block_4_5(); Rx_RORX_3( 5); Block_4_6();
Rx_RORX_1( 6); Block_4_7(); Rx_RORX_2( 6);
Block_4_8(); Rx_RORX_3( 6); Block_4_9();
Block_4_5(); Rx_RORX_3( 5); Block_4_6(W_X);
Rx_RORX_1( 6); Block_4_7(W_X); Rx_RORX_2( 6);
Block_4_8(W_X); Rx_RORX_3( 6); Block_4_9();
Rx_RORX_1( 7); Block_4_10();Rx_RORX_2( 7);
Block_4_11();Rx_RORX_3( 7); Block_4_12();
Block_4_11();Rx_RORX_3( 7); Block_4_12(W_X);
Rx_RORX_1( 8); Block_8_1(); Rx_RORX_2( 8);
Block_8_2(); Rx_RORX_3( 8); Block_8_3();
Rx_RORX_1( 8); Block_8_1(W_X); Rx_RORX_2( 8);
Block_8_2(W_X); Rx_RORX_3( 8); Block_8_3();
Rx_RORX_1( 9); Block_8_4(); Rx_RORX_2( 9);
Block_8_5(); Rx_RORX_3( 9); Block_8_6();
Rx_RORX_1(10); Block_8_7(); Rx_RORX_2(10);
Block_8_8(); Rx_RORX_3(10); Block_8_9();
Block_8_5(); Rx_RORX_3( 9); Block_8_6(W_X);
Rx_RORX_1(10); Block_8_7(W_X); Rx_RORX_2(10);
Block_8_8(W_X); Rx_RORX_3(10); Block_8_9();
Rx_RORX_1(11); Block_8_10();Rx_RORX_2(11);
Block_8_11();Rx_RORX_3(11); Block_8_12();
Block_8_11();Rx_RORX_3(11); Block_8_12(W_X);
Rx_RORX_1(12); Block_12_1(); Rx_RORX_2(12);
Block_12_2(); Rx_RORX_3(12); Block_12_3();
Rx_RORX_1(12); Block_12_1(W_X); Rx_RORX_2(12);
Block_12_2(W_X); Rx_RORX_3(12); Block_12_3();
Rx_RORX_1(13); Block_12_4(); Rx_RORX_2(13);
Block_12_5(); Rx_RORX_3(13); Block_12_6();
Rx_RORX_1(14); Block_12_7(); Rx_RORX_2(14);
Block_12_8(); Rx_RORX_3(14); Block_12_9();
Block_12_5(); Rx_RORX_3(13); Block_12_6(W_X);
Rx_RORX_1(14); Block_12_7(W_X); Rx_RORX_2(14);
Block_12_8(W_X); Rx_RORX_3(14); Block_12_9();
Rx_RORX_1(15); Block_12_10();Rx_RORX_2(15);
Block_12_11();Rx_RORX_3(15); Block_12_12();
Block_12_11();Rx_RORX_3(15); Block_12_12(W_X);
}
/* Add the working vars back into digest */
@ -1720,7 +1722,7 @@ static int Transform384_AVX1_RORX(Sha384* sha384)
static int Transform384_AVX2(Sha384* sha384)
{
const word64* K = K512;
word64 w[4] ;
word32 j;
word64 T[8];
@ -1741,7 +1743,7 @@ static int Transform384_AVX2(Sha384* sha384)
Ry_1( 2, w[2]); Block_Y_0_7(); Ry_2( 2, w[2]);
Block_Y_0_8(); Ry_3( 2, w[2]); Block_Y_0_9();
Ry_1( 3, w[3]); Block_Y_0_10();Ry_2( 3, w[3]);
Block_Y_0_11();Ry_3( 3, w[3]); Block_Y_0_12();
Block_Y_0_11();Ry_3( 3, w[3]); Block_Y_0_12(w);
Ry_1( 4, w[0]); Block_Y_4_1(); Ry_2( 4, w[0]);
Block_Y_4_2(); Ry_3( 4, w[0]); Block_Y_4_3();
@ -1750,7 +1752,7 @@ static int Transform384_AVX2(Sha384* sha384)
Ry_1( 6, w[2]); Block_Y_4_7(); Ry_2( 6, w[2]);
Block_Y_4_8(); Ry_3( 6, w[2]); Block_Y_4_9();
Ry_1( 7, w[3]); Block_Y_4_10(); Ry_2( 7, w[3]);
Block_Y_4_11(); Ry_3( 7, w[3]);Block_Y_4_12();
Block_Y_4_11(); Ry_3( 7, w[3]);Block_Y_4_12(w);
Ry_1( 8, w[0]); Block_Y_8_1(); Ry_2( 8, w[0]);
Block_Y_8_2(); Ry_3( 8, w[0]); Block_Y_8_3();
@ -1759,7 +1761,7 @@ static int Transform384_AVX2(Sha384* sha384)
Ry_1(10, w[2]); Block_Y_8_7(); Ry_2(10, w[2]);
Block_Y_8_8(); Ry_3(10, w[2]); Block_Y_8_9();
Ry_1(11, w[3]); Block_Y_8_10();Ry_2(11, w[3]);
Block_Y_8_11();Ry_3(11, w[3]); Block_Y_8_12();
Block_Y_8_11();Ry_3(11, w[3]); Block_Y_8_12(w);
Ry_1(12, w[0]); Block_Y_12_1(); Ry_2(12, w[0]);
Block_Y_12_2(); Ry_3(12, w[0]); Block_Y_12_3();
@ -1768,7 +1770,7 @@ static int Transform384_AVX2(Sha384* sha384)
Ry_1(14, w[2]); Block_Y_12_7(); Ry_2(14, w[2]);
Block_Y_12_8(); Ry_3(14, w[2]); Block_Y_12_9();
Ry_1(15, w[3]); Block_Y_12_10();Ry_2(15, w[3]);
Block_Y_12_11();Ry_3(15, w[3]); Block_Y_12_12();
Block_Y_12_11();Ry_3(15, w[3]); Block_Y_12_12(w);
}
/* Add the working vars back into digest */

View File

@ -3221,7 +3221,6 @@ int random_test(void)
ret = wc_RNG_GenerateBlock(&rng, block, sizeof(block));
if (ret != 0) return -40;
return 0;
wc_FreeRng(&rng);
return 0;