c - How Do I Attain Peak CPU Performance With Dot Product?
Problem
I have been studying HPC, specifically using matrix multiplication as my project (see my other posts in my profile). I achieve decent performance in those, but not good enough. So I am taking a step back to see how well I can do with a dot product calculation.
Dot product vs. matrix multiplication
The dot product is simpler, and it allows me to test HPC concepts without dealing with packing and other related issues. Cache blocking is still an issue, which forms my second question.
Algorithm
Multiply n corresponding elements in two double arrays a and b and sum them. A double dot product in assembly is just a series of movapd, mulpd, addpd. Unrolled and arranged in a clever way, it is possible to have groups of movapd/mulpd/addpd that operate on different xmm registers and are thus independent, optimizing pipelining. Of course, it turns out that this does not matter so much, as my CPU has out-of-order execution. Also note that the re-arrangement requires peeling off the last iteration.
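To make the grouping idea concrete, here is a minimal sketch in SSE2 intrinsics rather than assembly (my tuned asm kernel is in the source code below). It assumes n is a multiple of 8 and that a and b are 16-byte aligned; ddot_sketch is just an illustrative name:

    #include <x86intrin.h>

    // four independent accumulators: the mul/add chains do not depend on
    // each other, so they can overlap in the pipeline
    double ddot_sketch(int n, const double* a, const double* b)
    {
        __m128d acc0 = _mm_setzero_pd();
        __m128d acc1 = _mm_setzero_pd();
        __m128d acc2 = _mm_setzero_pd();
        __m128d acc3 = _mm_setzero_pd();
        for (int i = 0; i < n; i += 8) {
            acc0 = _mm_add_pd(acc0, _mm_mul_pd(_mm_load_pd(a+i  ), _mm_load_pd(b+i  )));
            acc1 = _mm_add_pd(acc1, _mm_mul_pd(_mm_load_pd(a+i+2), _mm_load_pd(b+i+2)));
            acc2 = _mm_add_pd(acc2, _mm_mul_pd(_mm_load_pd(a+i+4), _mm_load_pd(b+i+4)));
            acc3 = _mm_add_pd(acc3, _mm_mul_pd(_mm_load_pd(a+i+6), _mm_load_pd(b+i+6)));
        }
        // reduce the four partial vectors to a scalar
        __m128d acc = _mm_add_pd(_mm_add_pd(acc0, acc1), _mm_add_pd(acc2, acc3));
        __m128d hi  = _mm_unpackhi_pd(acc, acc);
        return _mm_cvtsd_f64(_mm_add_sd(acc, hi));
    }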
Other assumptions
I am not writing the code for general dot products. The code is for specific sizes and I am not handling fringe cases. It is just to test HPC concepts and to see what type of CPU usage I can attain.
Results
Compiled with

    gcc -std=c99 -O2 -m32 -mincoming-stack-boundary=2 -msse3 -mfpmath=sse,387 -masm=intel

I am on a different computer than usual. This computer has an i5-540M, which can obtain 2.8 GHz * 4 flops/cycle/core = 11.2 Gflops/s per core after two-step Intel Turbo Boost (both cores are on right now, so it only gets the 2-step boost... a 4-step boost is possible if I turn off one core). 32-bit LINPACK gets around 9.5 Gflops/s when set to run with one thread.
           N   Total Gflops/s         Residual
         256         5.580521    1.421085e-014
         384         5.734344   -2.842171e-014
         512         5.791168    0.000000e+000
         640         5.821629    0.000000e+000
         768         5.814255    2.842171e-014
         896         5.807132    0.000000e+000
        1024         5.817208   -1.421085e-013
        1152         5.805388    0.000000e+000
        1280         5.830746   -5.684342e-014
        1408         5.881937   -5.684342e-014
        1536         5.872159   -1.705303e-013
        1664         5.881536    5.684342e-014
        1792         5.906261   -2.842171e-013
        1920         5.477966    2.273737e-013
        2048         5.620931    0.000000e+000
        2176         3.998713    1.136868e-013
        2304         3.370095   -3.410605e-013
        2432         3.371386   -3.410605e-013
Question 1
How can I do better than this? I am not even coming close to the peak performance. I have optimized the assembly code to high heaven. Further unrolling might boost it a little more, but less unrolling seems to degrade performance.
Question 2
When n > 2048, you can see the drop in performance. This is because my L1 cache is 32KB, and when n = 2048 and a and b are double, the two arrays together take up the entire cache (2 * 2048 * 8 bytes = 32KB). Anything bigger is streamed from memory.
I tried cache blocking (not shown in the source), but maybe I did it wrong. Can anyone provide some code or explain how to block a dot product for the cache?
Source code
    #include <stdio.h>
    #include <time.h>
    #include <stdlib.h>
    #include <string.h>
    #include <x86intrin.h>
    #include <math.h>
    #include <omp.h>
    #include <stdint.h>
    #include <windows.h>

    // computes 8 dot products
    #define kernel(address) \
        "movapd     xmm4, xmmword ptr [eax+"#address"]      \n\t" \
        "mulpd      xmm7, xmmword ptr [edx+48+"#address"]   \n\t" \
        "addpd      xmm2, xmm6                              \n\t" \
        "movapd     xmm5, xmmword ptr [eax+16+"#address"]   \n\t" \
        "mulpd      xmm4, xmmword ptr [edx+"#address"]      \n\t" \
        "addpd      xmm3, xmm7                              \n\t" \
        "movapd     xmm6, xmmword ptr [eax+96+"#address"]   \n\t" \
        "mulpd      xmm5, xmmword ptr [edx+16+"#address"]   \n\t" \
        "addpd      xmm0, xmm4                              \n\t" \
        "movapd     xmm7, xmmword ptr [eax+112+"#address"]  \n\t" \
        "mulpd      xmm6, xmmword ptr [edx+96+"#address"]   \n\t" \
        "addpd      xmm1, xmm5                              \n\t"

    #define peeled(address) \
        "movapd     xmm4, xmmword ptr [eax+"#address"]      \n\t" \
        "mulpd      xmm7, [edx+48+"#address"]               \n\t" \
        "addpd      xmm2, xmm6                              \n\t" \
        "movapd     xmm5, xmmword ptr [eax+16+"#address"]   \n\t" \
        "mulpd      xmm4, xmmword ptr [edx+"#address"]      \n\t" \
        "addpd      xmm3, xmm7                              \n\t" \
        "mulpd      xmm5, xmmword ptr [edx+16+"#address"]   \n\t" \
        "addpd      xmm0, xmm4                              \n\t" \
        "addpd      xmm1, xmm5                              \n\t"

    inline double __attribute__ ((gnu_inline)) __attribute__ ((aligned(64))) ddot_ref(
        int n,
        const double* restrict a,
        const double* restrict b)
    {
        double sum0 = 0.0;
        double sum1 = 0.0;
        double sum2 = 0.0;
        double sum3 = 0.0;
        double sum;
        for(int i = 0; i < n; i+=4) {
            sum0 += *(a + i  ) * *(b + i  );
            sum1 += *(a + i+1) * *(b + i+1);
            sum2 += *(a + i+2) * *(b + i+2);
            sum3 += *(a + i+3) * *(b + i+3);
        }
        sum = sum0+sum1+sum2+sum3;
        return(sum);
    }

    inline double __attribute__ ((gnu_inline)) __attribute__ ((aligned(64))) ddot_asm(
        int n,
        const double* restrict a,
        const double* restrict b)
    {
        double sum;

        __asm__ __volatile__ (
            "mov        eax, %[a]                   \n\t"
            "mov        edx, %[b]                   \n\t"
            "mov        ecx, %[n]                   \n\t"
            "pxor       xmm0, xmm0                  \n\t"
            "pxor       xmm1, xmm1                  \n\t"
            "pxor       xmm2, xmm2                  \n\t"
            "pxor       xmm3, xmm3                  \n\t"
            "movapd     xmm6, xmmword ptr [eax+32]  \n\t"
            "movapd     xmm7, xmmword ptr [eax+48]  \n\t"
            "mulpd      xmm6, xmmword ptr [edx+32]  \n\t"
            "sar        ecx, 7                      \n\t"
            "sub        ecx, 1                      \n\t" // peel
            "l%=:                                   \n\t"
            kernel(64 *  0)
            kernel(64 *  1)
            kernel(64 *  2)
            kernel(64 *  3)
            kernel(64 *  4)
            kernel(64 *  5)
            kernel(64 *  6)
            kernel(64 *  7)
            kernel(64 *  8)
            kernel(64 *  9)
            kernel(64 * 10)
            kernel(64 * 11)
            kernel(64 * 12)
            kernel(64 * 13)
            kernel(64 * 14)
            kernel(64 * 15)
            "lea        eax, [eax+1024]             \n\t"
            "lea        edx, [edx+1024]             \n\t"
            "dec        ecx                         \n\t"
            "jnz        l%=                         \n\t" // end loop
            kernel(64 *  0)
            kernel(64 *  1)
            kernel(64 *  2)
            kernel(64 *  3)
            kernel(64 *  4)
            kernel(64 *  5)
            kernel(64 *  6)
            kernel(64 *  7)
            kernel(64 *  8)
            kernel(64 *  9)
            kernel(64 * 10)
            kernel(64 * 11)
            kernel(64 * 12)
            kernel(64 * 13)
            kernel(64 * 14)
            peeled(64 * 15)
            "addpd      xmm0, xmm1                  \n\t" // summing result
            "addpd      xmm2, xmm3                  \n\t"
            "addpd      xmm0, xmm2                  \n\t" // cascading add
            "movapd     xmm1, xmm0                  \n\t" // copy xmm0
            "shufpd     xmm1, xmm0, 0x03            \n\t" // shuffle
            "addsd      xmm0, xmm1                  \n\t" // add low qword
            "movsd      %[sum], xmm0                \n\t" // mov low qw to sum
            : // outputs
            [sum]   "=m"    (sum)
            : // inputs
            [a] "m" (a),
            [b] "m" (b),
            [n] "m" (n)
            : // register clobber
            "memory",
            "eax","ecx","edx","edi",
            "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"
            );
        return(sum);
    }

    int main()
    {
        // timers
        LARGE_INTEGER frequency, time1, time2;
        double time3;
        QueryPerformanceFrequency(&frequency);
        // clock_t time1, time2;
        double gflops;

        int nmax = 4096;
        int trials = 1e7;
        double sum, residual;
        FILE *f = fopen("soddot.txt","w+");

        printf("%16s %16s %16s\n","N","Total Gflops/s","Residual");
        fprintf(f,"%16s %16s %16s\n","N","Total Gflops/s","Residual");

        for(int n = 256; n <= nmax; n += 128 ) {
            double* a = NULL;
            double* b = NULL;
            a = _mm_malloc(n*sizeof(*a), 64); if (!a) {printf("a failed\n"); return(1);}
            b = _mm_malloc(n*sizeof(*b), 64); if (!b) {printf("b failed\n"); return(1);}

            srand(time(NULL));

            // create arrays
            for(int i = 0; i < n; ++i) {
                *(a + i) = (double) rand()/RAND_MAX;
                *(b + i) = (double) rand()/RAND_MAX;
            }

            // warmup
            sum = ddot_asm(n,a,b);

            QueryPerformanceCounter(&time1);
            // time1 = clock();
            for (int count = 0; count < trials; count++){
                // sum = ddot_ref(n,a,b);
                sum = ddot_asm(n,a,b);
            }
            QueryPerformanceCounter(&time2);
            time3 = (double)(time2.QuadPart - time1.QuadPart) / frequency.QuadPart;
            // time3 = (double) (clock() - time1)/CLOCKS_PER_SEC;
            gflops = (double) (2.0*n*trials)/time3/1.0e9;

            residual = ddot_ref(n,a,b) - sum;
            printf("%16d %16f %16e\n",n,gflops,residual);
            fprintf(f,"%16d %16f %16e\n",n,gflops,residual);

            _mm_free(a);
            _mm_free(b);
        }
        fclose(f);
        return(0); // successful completion
    }
Edit: explanation of assembly
A dot product is a repeated sum of products of two numbers: sum += a[i]*b[i]. sum must be initialized to 0 before the first iteration. Vectorized, you can do 2 sums at a time, which must be summed at the end: [sum0 sum1] = [a[i] a[i+1]]*[b[i] b[i+1]] and finally sum = sum0 + sum1. In (Intel) assembly, this is 3 steps (after the initialization):
    pxor   xmm0, xmm0               // accumulator [sum0 sum1] = [0 0]
    movapd xmm1, xmmword ptr [eax]  // load [a[i] a[i+1]] into xmm1
    mulpd  xmm1, xmmword ptr [edx]  // xmm1 = xmm1 * [b[i] b[i+1]]
    addpd  xmm0, xmm1               // xmm0 = xmm0 + xmm1
At this point you have nothing special, since the compiler can come up with this. You can usually get better performance by unrolling the code enough times to use all the xmm registers available to you (8 registers in 32-bit mode). So if you unroll it 4 times, that allows you to utilize all 8 registers xmm0 through xmm7: you will have 4 accumulators and 4 registers for storing the results of movapd and addpd. Again, the compiler can come up with this. The real thinking part is trying to come up with a way to pipeline the code, i.e., make each instruction in the group of MOV/MUL/ADD operate on different registers so that all 3 instructions execute at the same time (usually the case on most CPUs). That's how you beat the compiler. So you have to pattern the 4x unrolled code to do just that, which may require loading vectors ahead of time and peeling off the first or last iteration. This is what kernel(address) is: I made a macro of the 4x unrolled and pipelined code for convenience, so that I can easily unroll it in multiples of 4 by just changing address. Each kernel computes 8 dot products.
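To illustrate the pattern without the full 4x unrolling, here is a minimal sketch in intrinsics with just 2 accumulators: the next group's loads/multiplies are issued before the previous group's adds, and the final adds are peeled off after the loop. It assumes n is a multiple of 4 (n >= 4) and 16-byte aligned arrays; ddot_pipelined_sketch is just an illustrative name:

    #include <x86intrin.h>

    double ddot_pipelined_sketch(int n, const double* a, const double* b)
    {
        __m128d acc0 = _mm_setzero_pd();
        __m128d acc1 = _mm_setzero_pd();
        // load and multiply the first group ahead of the loop
        __m128d p0 = _mm_mul_pd(_mm_load_pd(a  ), _mm_load_pd(b  ));
        __m128d p1 = _mm_mul_pd(_mm_load_pd(a+2), _mm_load_pd(b+2));
        for (int i = 4; i < n; i += 4) {
            // the adds consume the previous group's products while the
            // loads/multiplies for the current group are still in flight
            acc0 = _mm_add_pd(acc0, p0);
            acc1 = _mm_add_pd(acc1, p1);
            p0 = _mm_mul_pd(_mm_load_pd(a+i  ), _mm_load_pd(b+i  ));
            p1 = _mm_mul_pd(_mm_load_pd(a+i+2), _mm_load_pd(b+i+2));
        }
        // peeled last iteration: only the adds remain
        acc0 = _mm_add_pd(acc0, p0);
        acc1 = _mm_add_pd(acc1, p1);
        __m128d acc = _mm_add_pd(acc0, acc1);
        __m128d hi  = _mm_unpackhi_pd(acc, acc);
        return _mm_cvtsd_f64(_mm_add_sd(acc, hi));
    }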
Answer

To answer your overall question: you can't achieve peak performance with the dot product.
The problem is that your CPU can only do one 128-bit load per clock cycle, and to keep the dot product going you need two 128-bit loads per clock cycle.
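A back-of-the-envelope check with the numbers from your question shows why this caps you at about half of peak:

    peak       = 2.8 GHz * 4 flops/cycle (1 mulpd + 1 addpd) = 11.2 Gflops/s
    each mulpd/addpd pair (4 flops) needs 2 loads, but only
    1 load/cycle is available, so a pair issues every 2 cycles
    load bound = 11.2 / 2                                    =  5.6 Gflops/s

which is right where the ~5.6-5.9 Gflops/s plateau in your results table sits.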
But it gets even worse for large n. The answer to your second question is that the dot product is memory bound and not compute bound, and so it cannot be parallelized for large n with fast cores. This is explained much better here: why-vectorizing-the-loop-does-not-have-performance-improvement. This is a big problem with parallelization with fast cores. It took me a while to figure this out, but it's very important to learn.
There are in fact few basic algorithms that truly benefit from parallelization on fast cores. In terms of BLAS algorithms, it's only the Level-3 algorithms (O(n^3)), such as matrix multiplication, that really benefit from parallelization: they do O(n^3) flops on only O(n^2) data, so the ratio of compute to memory traffic grows with n. The situation is better on slow cores, e.g. GPUs and the Xeon Phi, because the discrepancy between memory speed and core speed is much smaller.
If you want to find an algorithm that can get close to peak flops for small n, try e.g. scalar * vector or the sum of scalar * vector. The first case should do one load, one multiply, and one store every clock cycle, and the second case one multiply, one add, and one load every clock cycle.
I tested the following code on a Core 2 Duo P9600 @ 2.67GHz in Knoppix 7.3 32-bit. I get about 75% of the peak for the scalar product and 75% of the peak for the sum of the scalar product. The flops/cycle for the scalar product is 2, and for the sum of the scalar product it's 4.
Compiled with

    g++ -msse2 -O3 -fopenmp foo.cpp -ffast-math
    #include <stdio.h>
    #include <stdlib.h>
    #include <omp.h>
    #include <x86intrin.h>

    void scalar_product(double * __restrict a, int n) {
        a = (double*)__builtin_assume_aligned (a, 64);
        double k = 3.14159;
        for(int i=0; i<n; i++) {
            a[i] = k*a[i];
        }
    }

    void scalar_product_SSE(double * __restrict a, int n) {
        a = (double*)__builtin_assume_aligned (a, 64);
        __m128d k = _mm_set1_pd(3.14159);
        for(int i=0; i<n; i+=8) {
            __m128d t1 = _mm_load_pd(&a[i+0]);
            _mm_store_pd(&a[i],_mm_mul_pd(k,t1));
            __m128d t2 = _mm_load_pd(&a[i+2]);
            _mm_store_pd(&a[i+2],_mm_mul_pd(k,t2));
            __m128d t3 = _mm_load_pd(&a[i+4]);
            _mm_store_pd(&a[i+4],_mm_mul_pd(k,t3));
            __m128d t4 = _mm_load_pd(&a[i+6]);
            _mm_store_pd(&a[i+6],_mm_mul_pd(k,t4));
        }
    }

    double scalar_sum(double * __restrict a, int n) {
        a = (double*)__builtin_assume_aligned (a, 64);
        double sum = 0.0;
        double k = 3.14159;
        for(int i=0; i<n; i++) {
            sum += k*a[i];
        }
        return sum;
    }

    double scalar_sum_SSE(double * __restrict a, int n) {
        a = (double*)__builtin_assume_aligned (a, 64);
        __m128d sum1 = _mm_setzero_pd();
        __m128d sum2 = _mm_setzero_pd();
        __m128d sum3 = _mm_setzero_pd();
        __m128d sum4 = _mm_setzero_pd();
        __m128d k = _mm_set1_pd(3.14159);
        for(int i=0; i<n; i+=8) {
            __m128d t1 = _mm_load_pd(&a[i+0]);
            sum1 = _mm_add_pd(_mm_mul_pd(k,t1),sum1);
            __m128d t2 = _mm_load_pd(&a[i+2]);
            sum2 = _mm_add_pd(_mm_mul_pd(k,t2),sum2);
            __m128d t3 = _mm_load_pd(&a[i+4]);
            sum3 = _mm_add_pd(_mm_mul_pd(k,t3),sum3);
            __m128d t4 = _mm_load_pd(&a[i+6]);
            sum4 = _mm_add_pd(_mm_mul_pd(k,t4),sum4);
        }
        double tmp[8];
        _mm_storeu_pd(&tmp[0],sum1);
        _mm_storeu_pd(&tmp[2],sum2);
        _mm_storeu_pd(&tmp[4],sum3);
        _mm_storeu_pd(&tmp[6],sum4);
        double sum = 0;
        for(int i=0; i<8; i++) sum+=tmp[i];
        return sum;
    }

    int main() {
        //_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
        //_mm_setcsr(_mm_getcsr() | 0x8040);
        double dtime, peak, flops, sum;
        int repeat = 1<<18;
        const int n = 2048;
        double *a = (double*)_mm_malloc(sizeof(double)*n,64);
        double *b = (double*)_mm_malloc(sizeof(double)*n,64);
        for(int i=0; i<n; i++) a[i] = 1.0*rand()/RAND_MAX;

        dtime = omp_get_wtime();
        for(int r=0; r<repeat; r++) {
            scalar_product_SSE(a,n);
        }
        dtime = omp_get_wtime() - dtime;
        peak = 2*2.67;
        flops = 1.0*n/dtime*1e-9*repeat;
        printf("time %f, %f, %f\n", dtime,flops, flops/peak);

        //for(int i=0; i<n; i++) a[i] = 1.0*rand()/RAND_MAX;
        //sum = 0.0;
        dtime = omp_get_wtime();
        for(int r=0; r<repeat; r++) {
            scalar_sum_SSE(a,n);
        }
        dtime = omp_get_wtime() - dtime;
        peak = 2*2*2.67;
        flops = 2.0*n/dtime*1e-9*repeat;
        printf("time %f, %f, %f\n", dtime,flops, flops/peak);
        //printf("sum %f\n", sum);
    }