mirror of https://github.com/drowe67/LPCNet.git
Suppress aarch64 specific code for non-aarch64 builds.
parent
4041719fdc
commit
855b0a692b
215
src/vec_neon.h
215
src/vec_neon.h
|
@ -276,6 +276,8 @@ static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, co
|
|||
#define MAX_INPUTS 2048
|
||||
#define MAX_OUTPUTS 8192
|
||||
|
||||
#if __aarch64__
|
||||
|
||||
#if __ARM_FEATURE_DOTPROD
|
||||
static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
return vdotq_s32(acc, a, b);
|
||||
|
@ -361,3 +363,216 @@ static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows
|
|||
vst1q_f32(&_out[i+4], vmulq_f32(scale_1, vcvtq_f32_s32(acc1)));
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#ifdef DOT_PROD
|
||||
|
||||
#define SCALE (128.f*127.f)
|
||||
#define SCALE_1 (1.f/128.f/127.f)
|
||||
|
||||
|
||||
#ifdef USE_SU_BIAS
|
||||
|
||||
static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
|
||||
{
|
||||
int i, j;
|
||||
unsigned char x[MAX_INPUTS];
|
||||
(void)col_stride;
|
||||
for (i=0;i<rows;i++) out[i] *= SCALE;
|
||||
for (i=0;i<cols;i++) x[i] = 127+(int)floor(.5+127*_x[i]);
|
||||
for (i=0;i<rows;i+=8)
|
||||
{
|
||||
for (j=0;j<cols;j+=4)
|
||||
{
|
||||
float * restrict y;
|
||||
float xj0, xj1, xj2, xj3;
|
||||
xj0 = x[j+0];
|
||||
xj1 = x[j+1];
|
||||
xj2 = x[j+2];
|
||||
xj3 = x[j+3];
|
||||
y = &out[i];
|
||||
y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
|
||||
y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
|
||||
y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
|
||||
y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
|
||||
y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
|
||||
y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
|
||||
y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
|
||||
y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
|
||||
w += 32;
|
||||
}
|
||||
}
|
||||
for (i=0;i<rows;i++) out[i] *= SCALE_1;
|
||||
}
|
||||
|
||||
static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
|
||||
{
|
||||
int i, j;
|
||||
unsigned char x[MAX_INPUTS];
|
||||
for (i=0;i<rows;i++) out[i] *= SCALE;
|
||||
for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
|
||||
for (i=0;i<rows;i+=8)
|
||||
{
|
||||
int colblocks;
|
||||
colblocks = *idx++;
|
||||
for (j=0;j<colblocks;j++)
|
||||
{
|
||||
int pos;
|
||||
float * restrict y;
|
||||
int xj0, xj1, xj2, xj3;
|
||||
pos = (*idx++);
|
||||
xj0 = x[pos+0];
|
||||
xj1 = x[pos+1];
|
||||
xj2 = x[pos+2];
|
||||
xj3 = x[pos+3];
|
||||
y = &out[i];
|
||||
y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
|
||||
y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
|
||||
y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
|
||||
y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
|
||||
y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
|
||||
y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
|
||||
y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
|
||||
y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
|
||||
w += 32;
|
||||
}
|
||||
}
|
||||
for (i=0;i<rows;i++) out[i] *= SCALE_1;
|
||||
}
|
||||
#else /*USE_SU_BIAS*/
|
||||
|
||||
static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
|
||||
{
|
||||
int i, j;
|
||||
signed char x[MAX_INPUTS];
|
||||
(void)col_stride;
|
||||
for (i=0;i<rows;i++) out[i] *= SCALE;
|
||||
for (i=0;i<cols;i++) x[i] = (int)floor(.5+127*_x[i]);
|
||||
for (i=0;i<rows;i+=8)
|
||||
{
|
||||
for (j=0;j<cols;j+=4)
|
||||
{
|
||||
float * restrict y;
|
||||
float xj0, xj1, xj2, xj3;
|
||||
xj0 = x[j+0];
|
||||
xj1 = x[j+1];
|
||||
xj2 = x[j+2];
|
||||
xj3 = x[j+3];
|
||||
y = &out[i];
|
||||
y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
|
||||
y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
|
||||
y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
|
||||
y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
|
||||
y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
|
||||
y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
|
||||
y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
|
||||
y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
|
||||
w += 32;
|
||||
}
|
||||
}
|
||||
for (i=0;i<rows;i++) out[i] *= SCALE_1;
|
||||
}
|
||||
|
||||
static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
|
||||
{
|
||||
int i, j;
|
||||
signed char x[MAX_INPUTS];
|
||||
for (i=0;i<rows;i++) out[i] *= SCALE;
|
||||
for (i=0;i<cols;i++) x[i] = floor(.5+127*_x[i]);
|
||||
for (i=0;i<rows;i+=8)
|
||||
{
|
||||
int colblocks;
|
||||
colblocks = *idx++;
|
||||
for (j=0;j<colblocks;j++)
|
||||
{
|
||||
int pos;
|
||||
float * restrict y;
|
||||
int xj0, xj1, xj2, xj3;
|
||||
pos = (*idx++);
|
||||
xj0 = x[pos+0];
|
||||
xj1 = x[pos+1];
|
||||
xj2 = x[pos+2];
|
||||
xj3 = x[pos+3];
|
||||
y = &out[i];
|
||||
y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
|
||||
y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
|
||||
y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
|
||||
y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
|
||||
y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
|
||||
y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
|
||||
y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
|
||||
y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
|
||||
w += 32;
|
||||
}
|
||||
}
|
||||
for (i=0;i<rows;i++) out[i] *= SCALE_1;
|
||||
}
|
||||
#endif /*USE_SU_BIAS*/
|
||||
|
||||
#else /*DOT_PROD*/
|
||||
|
||||
#define sgemv_accum8x4 sgemv_accum
|
||||
|
||||
|
||||
static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int ignore, const int *idx, const float *x)
|
||||
{
|
||||
int i, j;
|
||||
(void)ignore;
|
||||
for (i=0;i<rows;i+=8)
|
||||
{
|
||||
int cols;
|
||||
cols = *idx++;
|
||||
for (j=0;j<cols;j++)
|
||||
{
|
||||
int pos;
|
||||
float * restrict y;
|
||||
float xj0, xj1, xj2, xj3;
|
||||
pos = (*idx++);
|
||||
xj0 = x[pos+0];
|
||||
xj1 = x[pos+1];
|
||||
xj2 = x[pos+2];
|
||||
xj3 = x[pos+3];
|
||||
y = &out[i];
|
||||
y[0] += w[0]*xj0;
|
||||
y[1] += w[1]*xj0;
|
||||
y[2] += w[2]*xj0;
|
||||
y[3] += w[3]*xj0;
|
||||
y[4] += w[4]*xj0;
|
||||
y[5] += w[5]*xj0;
|
||||
y[6] += w[6]*xj0;
|
||||
y[7] += w[7]*xj0;
|
||||
|
||||
y[0] += w[8]*xj1;
|
||||
y[1] += w[9]*xj1;
|
||||
y[2] += w[10]*xj1;
|
||||
y[3] += w[11]*xj1;
|
||||
y[4] += w[12]*xj1;
|
||||
y[5] += w[13]*xj1;
|
||||
y[6] += w[14]*xj1;
|
||||
y[7] += w[15]*xj1;
|
||||
|
||||
y[0] += w[16]*xj2;
|
||||
y[1] += w[17]*xj2;
|
||||
y[2] += w[18]*xj2;
|
||||
y[3] += w[19]*xj2;
|
||||
y[4] += w[20]*xj2;
|
||||
y[5] += w[21]*xj2;
|
||||
y[6] += w[22]*xj2;
|
||||
y[7] += w[23]*xj2;
|
||||
|
||||
y[0] += w[24]*xj3;
|
||||
y[1] += w[25]*xj3;
|
||||
y[2] += w[26]*xj3;
|
||||
y[3] += w[27]*xj3;
|
||||
y[4] += w[28]*xj3;
|
||||
y[5] += w[29]*xj3;
|
||||
y[6] += w[30]*xj3;
|
||||
y[7] += w[31]*xj3;
|
||||
w += 32;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /*DOT_PROD*/
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
|
Loading…
Reference in New Issue