Commit b18e3579 authored by Maksim Shabunin, committed by Vadim Pisarevsky

dnn: fixed GEMM1T AVX2 implementation (#1231)

parent 81283e9d
@@ -169,7 +169,7 @@ public:
             for( k = 0; k < vecsize; k += 4 )
             {
-                vfloat32x4 v = v_load_aligned(sptr + k);
+                vfloat32x4 v = v_load(sptr + k);
                 vs0 += v*v_load_aligned(wptr + k);
                 vs1 += v*v_load_aligned(wptr + wstep + k);
                 vs2 += v*v_load_aligned(wptr + wstep*2 + k);
...
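The hunk above applies the same reasoning as the AVX2 hunks below, but in OpenCV's universal intrinsics: v_load works with any address, while v_load_aligned assumes a 16-byte aligned pointer, and sptr (the input row) carries no such guarantee, whereas the weight rows do. A minimal sketch of that pattern, assuming the classic operator-based universal-intrinsics API from <opencv2/core/hal/intrin.hpp> (dot4 is an illustrative helper, not the library code):

    // Illustrative sketch, not the OpenCV source. Requires a CV_SIMD128-capable build.
    #include <opencv2/core/hal/intrin.hpp>

    static float dot4(const float* sptr, const float* wptr, int vecsize)
    {
        cv::v_float32x4 vs0 = cv::v_setzero_f32();
        for( int k = 0; k < vecsize; k += 4 )
        {
            cv::v_float32x4 v = cv::v_load(sptr + k);   // sptr has no alignment guarantee
            vs0 += v * cv::v_load_aligned(wptr + k);    // weight rows are kept 16-byte aligned
        }
        return cv::v_reduce_sum(vs0);                   // horizontal sum of the 4 lanes
    }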
@@ -204,7 +204,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
         for( int k = 0; k < vecsize; k += 8, wptr += 8 )
         {
-            __m256 v = _mm256_load_ps(vec + k);
+            __m256 v = _mm256_loadu_ps(vec + k);
             vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
             vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
@@ -237,7 +237,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
         for( int k = 0; k < vecsize; k += 8, wptr += 8 )
         {
-            __m256 v = _mm256_load_ps(vec + k);
+            __m256 v = _mm256_loadu_ps(vec + k);
             vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
         }
...
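For context, the reason the vector load had to change: _mm256_load_ps faults when its address is not 32-byte aligned, while _mm256_loadu_ps accepts any address. The input vector passed to fastGEMM1T_avx2 is not guaranteed to be aligned, whereas the packed weight rows are, so only the vector load needs the unaligned form. A self-contained sketch of that inner loop (compile with -mavx2 -mfma; dotRow, the buffer layout and the horizontal-sum tail are illustrative, not the OpenCV code):

    // Illustrative sketch, not the OpenCV source.
    #include <immintrin.h>
    #include <cstdio>

    // Dot product of a possibly-unaligned `vec` with a 32-byte aligned weight row.
    static float dotRow(const float* vec, const float* wptr, int vecsize)
    {
        __m256 vs0 = _mm256_setzero_ps();
        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
        {
            __m256 v = _mm256_loadu_ps(vec + k);                  // unaligned-safe load of the input
            vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);  // weights stay 32-byte aligned
        }
        // horizontal sum of the 8 lanes
        __m128 s = _mm_add_ps(_mm256_castps256_ps128(vs0), _mm256_extractf128_ps(vs0, 1));
        s = _mm_hadd_ps(s, s);
        s = _mm_hadd_ps(s, s);
        return _mm_cvtss_f32(s);
    }

    int main()
    {
        const int vecsize = 16;                        // multiple of 8, as the real loop assumes
        alignas(32) float w[16];                       // aligned, like the packed weights
        alignas(32) float buf[24];
        float* vec = buf + 1;                          // deliberately NOT 32-byte aligned
        for( int i = 0; i < vecsize; i++ ) { w[i] = 1.0f; vec[i] = (float)i; }
        printf("dot = %f\n", dotRow(vec, w, vecsize)); // 0+1+...+15 = 120
        return 0;
    }

Passing vec one float past a 32-byte boundary, as main() does here, exercises exactly the case that crashed before this commit.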