4、+){doublex=(double)i/dt;pi+=delta/(1.0+x*x);}returnpi*4;}doubleget_pi_sse(size_tdt){doublepi=0.0;doubledelta=1.0/dt;__m128dxmm0,xmm1,xmm2,xmm3,xmm4;xmm0=_mm_set1_pd(1.0);xmm1=_mm_set1_pd(delta);xmm2=_mm_set_pd(delta,0.0);xmm4=_mm_setzero_pd();for(longint
5、i=0;i<=dt-2;i+=2){xmm3=_mm_set1_pd((double)i*delta);xmm3=_mm_add_pd(xmm3,xmm2);xmm3=_mm_mul_pd(xmm3,xmm3);xmm3=_mm_add_pd(xmm0,xmm3);xmm3=_mm_div_pd(xmm1,xmm3);xmm4=_mm_add_pd(xmm4,xmm3);}doubletmp[2]__attribute__((aligned(16)));_mm_store_pd(tmp,xmm4);pi
6、+=tmp[0]+tmp[1]/*+tmp[2]+tmp[3]*/;returnpi*4.0;}intmain(){intdx;doublepai;doublestart,finish;dx=N;start=clock();pai=get_pi_sse(dx);finish=clock();printf("%.8lf",pai);printf("%.8lfS",(double)((finish-start)/CLOCKS_PER_SEC));return0;}時(shí)間運(yùn)行如下:第一次:time=0.
7、S第二次:time=0.S第三次:time=0.S三次平均為:0.00783S以下是SSE單精度的代碼:#include#include#include#defineNfloatget_pi_sse(size_tdt){floatpi=0.0;floatdelta=1.0/dt;__m128xmm0,xmm1,xmm2,xmm3,xmm4;xmm0=_mm_set1_ps(1.0);xmm1=_mm_set1_ps(delta);xmm2=_m
8、m_set_ps(delta*3,delta*2,delta,0.0);xmm4=_mm_setzero_ps();for(longinti=0;i<=dt-4;i+=4){xmm3=_mm_set1_ps((float)i*delta);xmm3=_mm_add_ps(xmm3,xmm2);xmm3=_mm_mul_ps(xmm3,xmm3);xmm3=_mm_add_ps(xmm0,xmm3);xmm3=_mm_div_ps(xmm1,