Skip to content

update_u flop calculation #67

@mneilly

Description

@mneilly

I'm working through the code for su3_rhmd_hisq and calculating flops. I'm currently looking at the main loop in update_u().

FORALLSITES_OMP(i,s,private(dir,link,temp1,temp2,htemp)) {
for(dir=XUP; dir <=TUP; dir++){
uncompress_anti_hermitian( &(s->mom[dir]) , &htemp );
link = &(s->link[dir]);
mult_su3_nn(&htemp,link,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t8,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t7,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t6,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t5,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t4,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t3,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t2,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,eps ,&temp2);
su3mat_copy(&temp2,link);
}
} END_LOOP_OMP

It appears that the reported Mflops/second is based on a flop count of 5616.

node0_printf("LINK_UPDATE: time = %e mflops = %e\n",
dtime, (double)(5616.0*volume/(1.0e6*dtime*numnodes())) );

If I calculate flops myself I get 7488 not 5616.

For a complex 3x3 matrix multiply I'm expecting:

flops = 6*n^3 + 2*n^2*(n-1) = 6*3^3 + 2*3^2*(3-1) = 198

and for the scalar multiply add I'm expecting:

flops = 2 * 2*n^2 = 2 * 2*3^2 = 36

Since matmul and scalar mul plus add occur 8 times each and there are 4 directions:

flops = 4 * 8 * (198 + 36) = 7488

5616 happens to be exactly 3/4 of 7488.

Saving temps and looking at the preprocessed output seems to confirm my count:

For the matmuls there are 36 multiplies and 30 add/sub in each of 3 loops so 198 flops.

int iii;
for (iii = 0; iii < 3; iii++)
{
    Real c0r, c0i, c1r, c1i, c2r, c2i;
    Real aikr, aiki;
    aikr = (((fsu3_matrix *)(&htemp))->e[iii][0].real);
    aiki = (((fsu3_matrix *)(&htemp))->e[iii][0].imag);
    c0r = aikr * (((fsu3_matrix *)(link))->e[0][0].real);
    c0i = aikr * (((fsu3_matrix *)(link))->e[0][0].imag);
    c1r = aikr * (((fsu3_matrix *)(link))->e[0][1].real);
    c1i = aikr * (((fsu3_matrix *)(link))->e[0][1].imag);
    c2r = aikr * (((fsu3_matrix *)(link))->e[0][2].real);
    c2i = aikr * (((fsu3_matrix *)(link))->e[0][2].imag);
    c0r -= aiki * (((fsu3_matrix *)(link))->e[0][0].imag);
    c0i += aiki * (((fsu3_matrix *)(link))->e[0][0].real);
    c1r -= aiki * (((fsu3_matrix *)(link))->e[0][1].imag);
    c1i += aiki * (((fsu3_matrix *)(link))->e[0][1].real);
    c2r -= aiki * (((fsu3_matrix *)(link))->e[0][2].imag);
    c2i += aiki * (((fsu3_matrix *)(link))->e[0][2].real);
    aikr = (((fsu3_matrix *)(&htemp))->e[iii][1].real);
    aiki = (((fsu3_matrix *)(&htemp))->e[iii][1].imag);
    c0r += aikr * (((fsu3_matrix *)(link))->e[1][0].real);
    c0i += aikr * (((fsu3_matrix *)(link))->e[1][0].imag);
    c1r += aikr * (((fsu3_matrix *)(link))->e[1][1].real);
    c1i += aikr * (((fsu3_matrix *)(link))->e[1][1].imag);
    c2r += aikr * (((fsu3_matrix *)(link))->e[1][2].real);
    c2i += aikr * (((fsu3_matrix *)(link))->e[1][2].imag);
    c0r -= aiki * (((fsu3_matrix *)(link))->e[1][0].imag);
    c0i += aiki * (((fsu3_matrix *)(link))->e[1][0].real);
    c1r -= aiki * (((fsu3_matrix *)(link))->e[1][1].imag);
    c1i += aiki * (((fsu3_matrix *)(link))->e[1][1].real);
    c2r -= aiki * (((fsu3_matrix *)(link))->e[1][2].imag);
    c2i += aiki * (((fsu3_matrix *)(link))->e[1][2].real);
    aikr = (((fsu3_matrix *)(&htemp))->e[iii][2].real);
    aiki = (((fsu3_matrix *)(&htemp))->e[iii][2].imag);
    c0r += aikr * (((fsu3_matrix *)(link))->e[2][0].real);
    c0i += aikr * (((fsu3_matrix *)(link))->e[2][0].imag);
    c1r += aikr * (((fsu3_matrix *)(link))->e[2][1].real);
    c1i += aikr * (((fsu3_matrix *)(link))->e[2][1].imag);
    c2r += aikr * (((fsu3_matrix *)(link))->e[2][2].real);
    c2i += aikr * (((fsu3_matrix *)(link))->e[2][2].imag);
    c0r -= aiki * (((fsu3_matrix *)(link))->e[2][0].imag);
    c0i += aiki * (((fsu3_matrix *)(link))->e[2][0].real);
    c1r -= aiki * (((fsu3_matrix *)(link))->e[2][1].imag);
    c1i += aiki * (((fsu3_matrix *)(link))->e[2][1].real);
    c2r -= aiki * (((fsu3_matrix *)(link))->e[2][2].imag);
    c2i += aiki * (((fsu3_matrix *)(link))->e[2][2].real);
    (((fsu3_matrix *)(&temp1))->e[iii][0].real) = c0r;
    (((fsu3_matrix *)(&temp1))->e[iii][0].imag) = c0i;
    (((fsu3_matrix *)(&temp1))->e[iii][1].real) = c1r;
    (((fsu3_matrix *)(&temp1))->e[iii][1].imag) = c1i;
    (((fsu3_matrix *)(&temp1))->e[iii][2].real) = c2r;
    (((fsu3_matrix *)(&temp1))->e[iii][2].imag) = c2i;
}

And for the scalar multiply adds there are 6 adds and 6 multiples in each of 3 loop iterations for 36 flops.

Real _temp = t8;
{
    register fsu3_matrix *aaa, *bbb, *ccc;
    register Real sss;
    register int iii;
    aaa = (link);
    bbb = (&temp1);
    ccc = (&temp2);
    sss = (_temp);
    for (iii = 0; iii < 3; iii++)
    {
        (ccc)->e[iii][0].real = (aaa)->e[iii][0].real + (sss) * (bbb)->e[iii][0].real;
        (ccc)->e[iii][0].imag = (aaa)->e[iii][0].imag + (sss) * (bbb)->e[iii][0].imag;
        (ccc)->e[iii][1].real = (aaa)->e[iii][1].real + (sss) * (bbb)->e[iii][1].real;
        (ccc)->e[iii][1].imag = (aaa)->e[iii][1].imag + (sss) * (bbb)->e[iii][1].imag;
        (ccc)->e[iii][2].real = (aaa)->e[iii][2].real + (sss) * (bbb)->e[iii][2].real;
        (ccc)->e[iii][2].imag = (aaa)->e[iii][2].imag + (sss) * (bbb)->e[iii][2].imag;
    }
};

I'm wondering what accounts for the lower count of 5616...

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions