c - How Do I Attain Peak CPU Performance With Dot Product? -


problem

i have been studying hpc, using matrix multiplication project (see other posts in profile). achieve performance in those, not enough. taking step see how can dot product calculation.

dot product vs. matrix multiplication

the dot product simpler, , allow me test hpc concepts without dealing packing , other related issues. cache blocking still issue, forms second question.

algorithm

multiply n corresponding elements in 2 double arrays a , b , sum them. double dot product in assembly series of movapd, mulpd, addpd. unrolled , arranged in clever way, possible have groups of movapd/mulpd/addpd operate on different xmm registers , independent, optimizing pipelining. of course, turns out not matter cpu has out-of-order execution. note re-arrangement requires peeling off last iteration.

other assumptions

i not writing code general dot products. code specific sizes , not handling fringe cases. test hpc concepts , see type of cpu usage can attain.

results

compiled gcc -std=c99 -o2 -m32 -mincoming-stack-boundary=2 -msse3 -mfpmath=sse,387 -masm=intel. on different computer usual. computer has i5 540m can obtain 2.8 ghz * 4 flops/cycle/core = 11.2 gflops/s per core after two-step intel turbo boost (both cores on right gets 2 step...a 4 step boost possible if turn off 1 core). 32 bit linpack gets around 9.5 gflops/s when set run 1 thread.

       n   total gflops/s         residual      256         5.580521    1.421085e-014      384         5.734344   -2.842171e-014      512         5.791168    0.000000e+000      640         5.821629    0.000000e+000      768         5.814255    2.842171e-014      896         5.807132    0.000000e+000     1024         5.817208   -1.421085e-013     1152         5.805388    0.000000e+000     1280         5.830746   -5.684342e-014     1408         5.881937   -5.684342e-014     1536         5.872159   -1.705303e-013     1664         5.881536    5.684342e-014     1792         5.906261   -2.842171e-013     1920         5.477966    2.273737e-013     2048         5.620931    0.000000e+000     2176         3.998713    1.136868e-013     2304         3.370095   -3.410605e-013     2432         3.371386   -3.410605e-013 

question 1

how can better this? not coming close peak performance. have optimized assembly code high heaven. further unrolling might boost little more, less unrolling seems degrade performance.

question 2

when n > 2048, can see drop in performance. because l1 cache 32kb, , when n = 2048 , a , b double, take entire cache. bigger , streamed memory.

i tried cache blocking (not shown in source), maybe did wrong. can provide code or explain how block dot product cache?

source code

    #include <stdio.h>     #include <time.h>     #include <stdlib.h>     #include <string.h>     #include <x86intrin.h>     #include <math.h>     #include <omp.h>     #include <stdint.h>     #include <windows.h>      // computes 8 dot products #define kernel(address) \             "movapd     xmm4, xmmword ptr [eax+"#address"]      \n\t" \             "mulpd      xmm7, xmmword ptr [edx+48+"#address"]   \n\t" \             "addpd      xmm2, xmm6                              \n\t" \             "movapd     xmm5, xmmword ptr [eax+16+"#address"]   \n\t" \             "mulpd      xmm4, xmmword ptr [edx+"#address"]      \n\t" \             "addpd      xmm3, xmm7                              \n\t" \             "movapd     xmm6, xmmword ptr [eax+96+"#address"]   \n\t" \             "mulpd      xmm5, xmmword ptr [edx+16+"#address"]   \n\t" \             "addpd      xmm0, xmm4                              \n\t" \             "movapd     xmm7, xmmword ptr [eax+112+"#address"]  \n\t" \             "mulpd      xmm6, xmmword ptr [edx+96+"#address"]   \n\t" \             "addpd      xmm1, xmm5                              \n\t"  #define peeled(address) \             "movapd     xmm4, xmmword ptr [eax+"#address"]      \n\t" \             "mulpd      xmm7, [edx+48+"#address"]               \n\t" \             "addpd      xmm2, xmm6                  \n\t" \             "movapd     xmm5, xmmword ptr [eax+16+"#address"]   \n\t" \             "mulpd      xmm4, xmmword ptr [edx+"#address"]      \n\t" \             "addpd      xmm3, xmm7                  \n\t" \             "mulpd      xmm5, xmmword ptr [edx+16+"#address"]   \n\t" \             "addpd      xmm0, xmm4                  \n\t" \             "addpd      xmm1, xmm5                  \n\t"  inline double  __attribute__ ((gnu_inline))         __attribute__ ((aligned(64))) ddot_ref(     int n,     const double* restrict a,     const double* restrict b) {     double sum0 = 0.0;     double sum1 = 0.0;     double sum2 = 0.0;     double sum3 = 0.0;     double sum;     for(int = 0; < n; i+=4) {         sum0 += *(a +  ) * *(b +  );         sum1 += *(a + i+1) * *(b + i+1);         sum2 += *(a + i+2) * *(b + i+2);         sum3 += *(a + i+3) * *(b + i+3);     }     sum = sum0+sum1+sum2+sum3;     return(sum); }  inline double  __attribute__ ((gnu_inline))         __attribute__ ((aligned(64))) ddot_asm (   int n,     const double* restrict a,     const double* restrict b) {          double sum;              __asm__ __volatile__         (             "mov        eax, %[a]                   \n\t"             "mov        edx, %[b]                   \n\t"             "mov        ecx, %[n]                   \n\t"             "pxor       xmm0, xmm0                  \n\t"             "pxor       xmm1, xmm1                  \n\t"             "pxor       xmm2, xmm2                  \n\t"             "pxor       xmm3, xmm3                  \n\t"             "movapd     xmm6, xmmword ptr [eax+32]  \n\t"             "movapd     xmm7, xmmword ptr [eax+48]  \n\t"             "mulpd      xmm6, xmmword ptr [edx+32]  \n\t"             "sar        ecx, 7                      \n\t"             "sub        ecx, 1                      \n\t" // peel             "l%=:                                   \n\t"             kernel(64   *   0)             kernel(64   *   1)             kernel(64   *   2)             kernel(64   *   3)             kernel(64   *   4)             kernel(64   *   5)             kernel(64   *   6)             kernel(64   *   7)             kernel(64   *   8)             kernel(64   *   9)             kernel(64   *   10)             kernel(64   *   11)             kernel(64   *   12)             kernel(64   *   13)             kernel(64   *   14)             kernel(64   *   15)             "lea        eax, [eax+1024]             \n\t"             "lea        edx, [edx+1024]             \n\t"             "                                       \n\t"             "dec        ecx                         \n\t"             "jnz        l%=                         \n\t" // end loop             "                                       \n\t"             kernel(64   *   0)             kernel(64   *   1)             kernel(64   *   2)             kernel(64   *   3)             kernel(64   *   4)             kernel(64   *   5)             kernel(64   *   6)             kernel(64   *   7)             kernel(64   *   8)             kernel(64   *   9)             kernel(64   *   10)             kernel(64   *   11)             kernel(64   *   12)             kernel(64   *   13)             kernel(64   *   14)             peeled(64   *   15)             "                                       \n\t"             "addpd      xmm0, xmm1                  \n\t" // summing result             "addpd      xmm2, xmm3                  \n\t"             "addpd      xmm0, xmm2                  \n\t" // cascading add             "movapd     xmm1, xmm0                  \n\t" // copy xmm0             "shufpd     xmm1, xmm0, 0x03            \n\t" // shuffle             "addsd      xmm0, xmm1                  \n\t" // add low qword             "movsd      %[sum], xmm0                \n\t" // mov low qw sum             : // outputs             [sum]   "=m"    (sum)             : // inputs             [a] "m" (a),             [b] "m" (b),              [n] "m" (n)             : //register clobber             "memory",             "eax","ecx","edx","edi",             "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"             );         return(sum); }  int main() {     // timers     large_integer frequency, time1, time2;     double time3;     queryperformancefrequency(&frequency);     // clock_t time1, time2;     double gflops;      int nmax = 4096;     int trials = 1e7;     double sum, residual;     file *f = fopen("soddot.txt","w+");      printf("%16s %16s %16s\n","n","total gflops/s","residual");     fprintf(f,"%16s %16s %16s\n","n","total gflops/s","residual");      for(int n = 256; n <= nmax; n += 128 ) {         double* = null;         double* b = null;         = _mm_malloc(n*sizeof(*a), 64); if (!a) {printf("a failed\n"); return(1);}         b = _mm_malloc(n*sizeof(*b), 64); if (!b) {printf("b failed\n"); return(1);}          srand(time(null));          // create arrays         for(int = 0; < n; ++i) {             *(a + i) = (double) rand()/rand_max;             *(b + i) = (double) rand()/rand_max;         }          // warmup         sum = ddot_asm(n,a,b);          queryperformancecounter(&time1);         // time1 = clock();         (int count = 0; count < trials; count++){             // sum = ddot_ref(n,a,b);             sum = ddot_asm(n,a,b);         }         queryperformancecounter(&time2);         time3 = (double)(time2.quadpart - time1.quadpart) / frequency.quadpart;         // time3 = (double) (clock() - time1)/clocks_per_sec;         gflops = (double) (2.0*n*trials)/time3/1.0e9;         residual = ddot_ref(n,a,b) - sum;         printf("%16d %16f %16e\n",n,gflops,residual);         fprintf(f,"%16d %16f %16e\n",n,gflops,residual);          _mm_free(a);         _mm_free(b);     }     fclose(f);     return(0); // successful completion } 

edit: explanation of assembly

a dot product repeat sum of products of 2 numbers: sum += a[i]*b[i]. sum must initialized 0 before first iteration. vectorized, can 2 sums @ time must summed @ end: [sum0 sum1] = [a[i] a[i+1]]*[b[i] b[i+1]], sum = sum0 + sum1. in (intel) assembly, 3 steps (after initialization):

pxor   xmm0, xmm0              // accumulator [sum0 sum1] = [0 0] movapd xmm1, xmmword ptr [eax] // load [a[i] a[i+1]] xmm1 mulpd  xmm1, xmmword ptr [edx] // xmm1 = xmm1 * [b[i] b[i+1]] addpd  xmm0, xmm1              // xmm0 = xmm0 + xmm1 

at point have nothing special, compiler can come this. can better performance unrolling code enough times use xmm registers available (8 registers in 32 bit mode). if unroll 4 times allows utilize 8 registers xmm0 through xmm7. have 4 accumulators , 4 registers storing results of movapd , addpd. again, compiler can come this. real thinking part trying come way pipeline code, i.e., make each instruction in group of mov/mul/add operate on different registers 3 instructions execute @ same time (usually case on cpus). that's how beat compiler. have pattern 4x unrolled code that, may require loading vectors ahead of time , peeling off first or last iteration. kernel(address) is. made macro of 4x unrolled pipelined code convenience. way can unroll in multiples of 4 changing address. each kernel computes 8 dot products.

to answer overall question can't achieve peak performance dot product.

the problem cpu can 1 128-bit load per clock cycle , dot product need 2 128-bit loads per clock cycle.

but it's worse large n. answer second question dot product memory bound , not compute bound , cannot parallelize large n fast cores. explained better here why-vectorizing-the-loop-does-not-have-performance-improvement. big problem parallelization fast cores. took me while figure out it's important learn.

there few basic algorithms can benefit parallelization on fast cores. in terms of blas algorithms it's level-3 algorithms (o(n^3)) such matrix multiplication benefit parallelization. situation better on slow cores e.g. gpus , xeon phi because discrepancy between memory speed , core speed smaller.

if want find algorithm can close peak flops small n try e.g. scalar * vector or sum of scalar * vector. first case should 1 load, 1 mult, , 1 store every clock cycle , second case 1 mult, 1 add, , 1 load every clock cycle.

i tested following code on core 2 duo p9600@2.67ghz in knoppix 7.3 32-bit. i 75% of peak scalar product , 75% of peakfor sum of scalar product. flops/cycle scalar product 2 , sum of scalar product it's 4.

compiled g++ -msse2 -o3 -fopenmp foo.cpp -ffast-math

#include <stdio.h> #include <stdlib.h> #include <omp.h> #include <x86intrin.h>  void scalar_product(double * __restrict a, int n) {     = (double*)__builtin_assume_aligned (a, 64);     double k = 3.14159;     for(int i=0; i<n; i++) {         a[i] = k*a[i];      } }  void scalar_product_sse(double * __restrict a, int n) {     = (double*)__builtin_assume_aligned (a, 64);     __m128d k = _mm_set1_pd(3.14159);         for(int i=0; i<n; i+=8) {         __m128d t1 = _mm_load_pd(&a[i+0]);         _mm_store_pd(&a[i],_mm_mul_pd(k,t1));         __m128d t2 = _mm_load_pd(&a[i+2]);         _mm_store_pd(&a[i+2],_mm_mul_pd(k,t2));         __m128d t3 = _mm_load_pd(&a[i+4]);         _mm_store_pd(&a[i+4],_mm_mul_pd(k,t3));         __m128d t4 = _mm_load_pd(&a[i+6]);         _mm_store_pd(&a[i+6],_mm_mul_pd(k,t4));     } }  double scalar_sum(double * __restrict a, int n) {     = (double*)__builtin_assume_aligned (a, 64);     double sum = 0.0;         double k = 3.14159;     for(int i=0; i<n; i++) {         sum += k*a[i];      }     return sum; }  double scalar_sum_sse(double * __restrict a, int n) {     = (double*)__builtin_assume_aligned (a, 64);     __m128d sum1 = _mm_setzero_pd();     __m128d sum2 = _mm_setzero_pd();     __m128d sum3 = _mm_setzero_pd();     __m128d sum4 = _mm_setzero_pd();     __m128d k = _mm_set1_pd(3.14159);        for(int i=0; i<n; i+=8) {         __m128d t1 = _mm_load_pd(&a[i+0]);         sum1 = _mm_add_pd(_mm_mul_pd(k,t1),sum1);         __m128d t2 = _mm_load_pd(&a[i+2]);         sum2 = _mm_add_pd(_mm_mul_pd(k,t2),sum2);         __m128d t3 = _mm_load_pd(&a[i+4]);         sum3 = _mm_add_pd(_mm_mul_pd(k,t3),sum3);         __m128d t4 = _mm_load_pd(&a[i+6]);         sum4 = _mm_add_pd(_mm_mul_pd(k,t4),sum4);           }     double tmp[8];     _mm_storeu_pd(&tmp[0],sum1);     _mm_storeu_pd(&tmp[2],sum2);     _mm_storeu_pd(&tmp[4],sum3);     _mm_storeu_pd(&tmp[6],sum4);     double sum = 0;     for(int i=0; i<8; i++) sum+=tmp[i];     return sum; }  int main() {     //_mm_set_flush_zero_mode(_mm_flush_zero_on);     //_mm_setcsr(_mm_getcsr() | 0x8040);     double dtime, peak, flops, sum;     int repeat = 1<<18;     const int n = 2048;     double *a = (double*)_mm_malloc(sizeof(double)*n,64);     double *b = (double*)_mm_malloc(sizeof(double)*n,64);     for(int i=0; i<n; i++) a[i] = 1.0*rand()/rand_max;      dtime = omp_get_wtime();     for(int r=0; r<repeat; r++) {         scalar_product_sse(a,n);     }     dtime = omp_get_wtime() - dtime;     peak = 2*2.67;     flops = 1.0*n/dtime*1e-9*repeat;     printf("time %f, %f, %f\n", dtime,flops, flops/peak);      //for(int i=0; i<n; i++) a[i] = 1.0*rand()/rand_max;     //sum = 0.0;         dtime = omp_get_wtime();     for(int r=0; r<repeat; r++) {         scalar_sum_sse(a,n);     }     dtime = omp_get_wtime() - dtime;     peak = 2*2*2.67;     flops = 2.0*n/dtime*1e-9*repeat;     printf("time %f, %f, %f\n", dtime,flops, flops/peak);     //printf("sum %f\n", sum);  } 

Comments

Popular posts from this blog

C# random value from dictionary and tuple -

cgi - How do I interpret URLs without extension as files rather than missing directories in nginx? -

.htaccess - htaccess convert request to clean url and add slash at the end of the url -