I'm working through the code for su3_rhmd_hisq and calculating flops. I'm currently looking at the main loop in update_u().
|
FORALLSITES_OMP(i,s,private(dir,link,temp1,temp2,htemp)) { |
|
for(dir=XUP; dir <=TUP; dir++){ |
|
uncompress_anti_hermitian( &(s->mom[dir]) , &htemp ); |
|
link = &(s->link[dir]); |
|
mult_su3_nn(&htemp,link,&temp1); |
|
scalar_mult_add_su3_matrix(link,&temp1,t8,&temp2); |
|
mult_su3_nn(&htemp,&temp2,&temp1); |
|
scalar_mult_add_su3_matrix(link,&temp1,t7,&temp2); |
|
mult_su3_nn(&htemp,&temp2,&temp1); |
|
scalar_mult_add_su3_matrix(link,&temp1,t6,&temp2); |
|
mult_su3_nn(&htemp,&temp2,&temp1); |
|
scalar_mult_add_su3_matrix(link,&temp1,t5,&temp2); |
|
mult_su3_nn(&htemp,&temp2,&temp1); |
|
scalar_mult_add_su3_matrix(link,&temp1,t4,&temp2); |
|
mult_su3_nn(&htemp,&temp2,&temp1); |
|
scalar_mult_add_su3_matrix(link,&temp1,t3,&temp2); |
|
mult_su3_nn(&htemp,&temp2,&temp1); |
|
scalar_mult_add_su3_matrix(link,&temp1,t2,&temp2); |
|
mult_su3_nn(&htemp,&temp2,&temp1); |
|
scalar_mult_add_su3_matrix(link,&temp1,eps ,&temp2); |
|
su3mat_copy(&temp2,link); |
|
} |
|
} END_LOOP_OMP |
It appears that the reported Mflops/second is based on a flop count of 5616.
|
node0_printf("LINK_UPDATE: time = %e mflops = %e\n", |
|
dtime, (double)(5616.0*volume/(1.0e6*dtime*numnodes())) ); |
If I calculate flops myself I get 7488 not 5616.
For a complex 3x3 matrix multiply I'm expecting:
flops = 6*n^3 + 2*n^2*(n-1) = 6*3^3 + 2*3^2*(3-1) = 198
and for the scalar multiply add I'm expecting:
flops = 2 * 2*n^2 = 2 * 2*3^2 = 36
Since matmul and scalar mul plus add occur 8 times each and there are 4 directions:
flops = 4 * 8 * (198 + 36) = 7488
5616 happens to be exactly 3/4 of 7488.
Saving temps and looking at the preprocessed output seems to confirm my count:
For the matmuls there are 36 multiplies and 30 add/sub in each of 3 loop iterations, so 198 flops.
int iii;
for (iii = 0; iii < 3; iii++)
{
Real c0r, c0i, c1r, c1i, c2r, c2i;
Real aikr, aiki;
aikr = (((fsu3_matrix *)(&htemp))->e[iii][0].real);
aiki = (((fsu3_matrix *)(&htemp))->e[iii][0].imag);
c0r = aikr * (((fsu3_matrix *)(link))->e[0][0].real);
c0i = aikr * (((fsu3_matrix *)(link))->e[0][0].imag);
c1r = aikr * (((fsu3_matrix *)(link))->e[0][1].real);
c1i = aikr * (((fsu3_matrix *)(link))->e[0][1].imag);
c2r = aikr * (((fsu3_matrix *)(link))->e[0][2].real);
c2i = aikr * (((fsu3_matrix *)(link))->e[0][2].imag);
c0r -= aiki * (((fsu3_matrix *)(link))->e[0][0].imag);
c0i += aiki * (((fsu3_matrix *)(link))->e[0][0].real);
c1r -= aiki * (((fsu3_matrix *)(link))->e[0][1].imag);
c1i += aiki * (((fsu3_matrix *)(link))->e[0][1].real);
c2r -= aiki * (((fsu3_matrix *)(link))->e[0][2].imag);
c2i += aiki * (((fsu3_matrix *)(link))->e[0][2].real);
aikr = (((fsu3_matrix *)(&htemp))->e[iii][1].real);
aiki = (((fsu3_matrix *)(&htemp))->e[iii][1].imag);
c0r += aikr * (((fsu3_matrix *)(link))->e[1][0].real);
c0i += aikr * (((fsu3_matrix *)(link))->e[1][0].imag);
c1r += aikr * (((fsu3_matrix *)(link))->e[1][1].real);
c1i += aikr * (((fsu3_matrix *)(link))->e[1][1].imag);
c2r += aikr * (((fsu3_matrix *)(link))->e[1][2].real);
c2i += aikr * (((fsu3_matrix *)(link))->e[1][2].imag);
c0r -= aiki * (((fsu3_matrix *)(link))->e[1][0].imag);
c0i += aiki * (((fsu3_matrix *)(link))->e[1][0].real);
c1r -= aiki * (((fsu3_matrix *)(link))->e[1][1].imag);
c1i += aiki * (((fsu3_matrix *)(link))->e[1][1].real);
c2r -= aiki * (((fsu3_matrix *)(link))->e[1][2].imag);
c2i += aiki * (((fsu3_matrix *)(link))->e[1][2].real);
aikr = (((fsu3_matrix *)(&htemp))->e[iii][2].real);
aiki = (((fsu3_matrix *)(&htemp))->e[iii][2].imag);
c0r += aikr * (((fsu3_matrix *)(link))->e[2][0].real);
c0i += aikr * (((fsu3_matrix *)(link))->e[2][0].imag);
c1r += aikr * (((fsu3_matrix *)(link))->e[2][1].real);
c1i += aikr * (((fsu3_matrix *)(link))->e[2][1].imag);
c2r += aikr * (((fsu3_matrix *)(link))->e[2][2].real);
c2i += aikr * (((fsu3_matrix *)(link))->e[2][2].imag);
c0r -= aiki * (((fsu3_matrix *)(link))->e[2][0].imag);
c0i += aiki * (((fsu3_matrix *)(link))->e[2][0].real);
c1r -= aiki * (((fsu3_matrix *)(link))->e[2][1].imag);
c1i += aiki * (((fsu3_matrix *)(link))->e[2][1].real);
c2r -= aiki * (((fsu3_matrix *)(link))->e[2][2].imag);
c2i += aiki * (((fsu3_matrix *)(link))->e[2][2].real);
(((fsu3_matrix *)(&temp1))->e[iii][0].real) = c0r;
(((fsu3_matrix *)(&temp1))->e[iii][0].imag) = c0i;
(((fsu3_matrix *)(&temp1))->e[iii][1].real) = c1r;
(((fsu3_matrix *)(&temp1))->e[iii][1].imag) = c1i;
(((fsu3_matrix *)(&temp1))->e[iii][2].real) = c2r;
(((fsu3_matrix *)(&temp1))->e[iii][2].imag) = c2i;
}
And for the scalar multiply adds there are 6 adds and 6 multiplies in each of 3 loop iterations, for 36 flops.
Real _temp = t8;
{
register fsu3_matrix *aaa, *bbb, *ccc;
register Real sss;
register int iii;
aaa = (link);
bbb = (&temp1);
ccc = (&temp2);
sss = (_temp);
for (iii = 0; iii < 3; iii++)
{
(ccc)->e[iii][0].real = (aaa)->e[iii][0].real + (sss) * (bbb)->e[iii][0].real;
(ccc)->e[iii][0].imag = (aaa)->e[iii][0].imag + (sss) * (bbb)->e[iii][0].imag;
(ccc)->e[iii][1].real = (aaa)->e[iii][1].real + (sss) * (bbb)->e[iii][1].real;
(ccc)->e[iii][1].imag = (aaa)->e[iii][1].imag + (sss) * (bbb)->e[iii][1].imag;
(ccc)->e[iii][2].real = (aaa)->e[iii][2].real + (sss) * (bbb)->e[iii][2].real;
(ccc)->e[iii][2].imag = (aaa)->e[iii][2].imag + (sss) * (bbb)->e[iii][2].imag;
}
};
I'm wondering what accounts for the lower count of 5616...
I'm working through the code for su3_rhmd_hisq and calculating flops. I'm currently looking at the main loop in update_u().
milc_qcd/ks_imp_rhmc/update_u.c
Lines 84 to 106 in 1e11e12
It appears that the reported Mflops/second is based on a flop count of 5616.
milc_qcd/ks_imp_rhmc/update_u.c
Lines 110 to 111 in 1e11e12
If I calculate flops myself I get 7488 not 5616.
For a complex 3x3 matrix multiply I'm expecting:
flops = 6*n^3 + 2*n^2*(n-1) = 6*3^3 + 2*3^2*(3-1) = 198
and for the scalar multiply add I'm expecting:
flops = 2 * 2*n^2 = 2 * 2*3^2 = 36
Since matmul and scalar mul plus add occur 8 times each and there are 4 directions:
flops = 4 * 8 * (198 + 36) = 7488
5616 happens to be exactly 3/4 of 7488.
Saving temps and looking at the preprocessed output seems to confirm my count:
For the matmuls there are 36 multiplies and 30 add/sub in each of 3 loop iterations, so 198 flops.
And for the scalar multiply adds there are 6 adds and 6 multiplies in each of 3 loop iterations, for 36 flops.
I'm wondering what accounts for the lower count of 5616...