Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
294f146
rns fgemv with parseqhelper adapted but still need to wrap the functi…
May 3, 2019
545645e
rns fgemv with parseqhelper adapted and its corresponding benchmark-f…
May 6, 2019
3d38bed
cleaned up for code review
May 7, 2019
3663c83
Rolled back the benchmark-fgemv-mp and adopted benchmark-fgemv-rns fo…
May 13, 2019
f632d38
Ready for rns benchmark
May 16, 2019
27dd46c
Instant backup for code review
May 16, 2019
c8928bb
Check if it is required impl
May 17, 2019
59326f1
rns for fgemv implemented but no obvious speedup can be found
May 21, 2019
7bc53b9
Having used SYNCH_GROUP to avoid segmentation fault but still no obs…
May 28, 2019
93cda61
add benchmark-fgemv-rns
May 29, 2019
d194b9a
Instant backup before code review
May 29, 2019
3bfb919
detailed timing
Jun 3, 2019
5913444
Instant backup before code review
Jun 12, 2019
ba60d52
Use Givaro::absCompare in InfNorm() to compute bound on the output
Jun 19, 2019
4db380d
Ready for benchmarks on a server
Jun 24, 2019
636a6d8
Merge branch 'master' into parallel-rns-fgemv
Jun 24, 2019
3b12ce7
Got ready for benchmark on a server
Jun 24, 2019
14a3dea
Fallback to PARFOR1D in the rns-double::init
Jun 25, 2019
45eb9fa
Instant backup before code review
Jun 25, 2019
34a1277
Further improved to validate on hpac
Jun 25, 2019
58d0706
Check if freduce is sped up
Jun 25, 2019
63f157b
Corrected the wrong parallelization of freduce
Jun 25, 2019
e601bb4
Instant backup for code review
Jun 26, 2019
83ce47e
Merge branch 'master' into parallel-rns-fgemv
Jun 26, 2019
f1db3bc
Fixed templated fgemv confusion for Givaro::Modular<RecInt::ruint<K>>
Jun 26, 2019
af2f7e1
Used FOR1D instead of FOR1DBLOCK for both InfNorm and freduce(rns_dou…
Jun 27, 2019
10c90d9
Cleaned up for code review before PR
Jun 27, 2019
02d1d85
Corrected bug in InfNorm by falling back to FOR1DBLOCK
Jun 27, 2019
ba96e19
Trying to add test for rns fgemv
Jul 1, 2019
8f17bd4
Updated test-fgemv-rns
Jul 1, 2019
418b010
Merge branch 'master' into parallel-rns-fgemv
Jul 1, 2019
18d2d14
Fall back to 3.3.3 version for the libtool in the configure.ac
Jul 17, 2019
cd29283
Merged with Master
Jul 17, 2019
6808a05
Changed LT_PREREQ to 2.4.2
Jul 17, 2019
03f1430
Changed back LT_PREREQ to 2.4.3
Jul 17, 2019
a30ca0b
Corrected Makefile.am typo error for test-fgemv-rns
Jul 17, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion benchmarks/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ endif

PERFPUBLISHERFILE=benchmarks-report.xml

FFLA_BENCH = benchmark-fgemm benchmark-fgemm-rns benchmark-wino benchmark-ftrsm benchmark-fgesv benchmark-ftrsv benchmark-ftrtri benchmark-inverse benchmark-fsytrf benchmark-fsyrk benchmark-lqup benchmark-pluq benchmark-charpoly benchmark-charpoly-mp benchmark-fgemm-mp benchmark-fgemv-mp benchmark-ftrsm-mp benchmark-lqup-mp benchmark-checkers benchmark-fadd-lvl2 benchmark-fdot benchmark-fgemv
FFLA_BENCH = benchmark-fgemm benchmark-fgemm-rns benchmark-wino benchmark-ftrsm benchmark-fgesv benchmark-ftrsv benchmark-ftrtri benchmark-inverse benchmark-fsytrf benchmark-fsyrk benchmark-lqup benchmark-pluq benchmark-charpoly benchmark-charpoly-mp benchmark-fgemm-mp benchmark-fgemv-mp benchmark-ftrsm-mp benchmark-lqup-mp benchmark-checkers benchmark-fadd-lvl2 benchmark-fdot benchmark-fgemv benchmark-fgemv-rns

BLAS_BENCH = benchmark-sgemm$(EXEEXT) benchmark-dgemm benchmark-dtrsm
LAPA_BENCH = benchmark-dtrtri benchmark-dgetri benchmark-dgetrf benchmark-dsytrf
Expand Down Expand Up @@ -85,6 +85,7 @@ benchmark_checkers_SOURCES = benchmark-checkers.C
benchmark_fadd_lvl2_SOURCES = benchmark-fadd-lvl2.C
benchmark_fdot_SOURCES = benchmark-fdot.C
benchmark_fgemv_SOURCES = benchmark-fgemv.C
benchmark_fgemv_rns_SOURCES = benchmark-fgemv-rns.C

benchmark_sgemm_CXXFLAGS = $(AM_CXXFLAGS) -D__SGEMM__

Expand Down
135 changes: 93 additions & 42 deletions benchmarks/benchmark-fgemv-mp.C
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

/*
* Copyright (C) FFLAS-FFPACK
* Written by Pascal Giorgi <pascal.giorgi@lirmm.fr>
Expand Down Expand Up @@ -28,23 +29,24 @@
// everywhere in the call stack
#define __FFLASFFPACK_OPENBLAS_NT_ALREADY_SET 1


#if not defined(MG_DEFAULT)
#define MG_DEFAULT MG_ACTIVE
#endif
#if not defined(STD_RECINT_SIZE)
#define STD_RECINT_SIZE 8
#endif


#include "fflas-ffpack/fflas-ffpack-config.h"
#include <iostream>
#include <typeinfo>
#include <vector>
#include <string>
using namespace std;

#include "fflas-ffpack/utils/timer.h"
#include "fflas-ffpack/fflas/fflas.h"
#include "fflas-ffpack/utils/fflas_io.h"
#include "fflas-ffpack/utils/timer.h"
#include "fflas-ffpack/utils/args-parser.h"
#include "givaro/modular-integer.h"
#include "givaro/givcaster.h"
Expand All @@ -53,47 +55,32 @@ using namespace std;
#include "recint/recint.h"
#endif


template<typename T>
std::ostream& write_matrix(std::ostream& out, Givaro::Integer p, size_t m, size_t n, T* C, size_t ldc){

size_t www(size_t((double(p.bitsize())*log(2.))/log(10.)));
out<<"Matrix("<<m<<','<<n<<",[[";
out.width(www+1);
out<<std::right<<C[0];
for (size_t j=1;j<n;++j){
out<<',';
out.width(www);
out<<std::right<<C[j];
}
out<<']';
for (size_t i=1;i<m;++i){
out<<endl<<",[";
out.width(www+1);
out<<std::right<<C[i*ldc];
for (size_t j=1;j<n;++j){
out<<',';
out.width(www);
out<<std::right<<C[i*ldc+j];
}
out<<']';
}
return out<<"])";
#ifdef BENCH_FLINT
#define __GMP_BITS_PER_MP_LIMB 64
extern "C" {
#include "flint/longlong.h"
#include "flint/long_extras.h"
#include "flint/fmpz_mat.h"
#include "flint/fmpz.h"
#include "flint/flint.h"
}
#endif


// Benchmark parameters with their default values. Each one is bound to a
// command-line flag by the Argument table `as` that follows.
// -i: number of repetitions of the timed loop.
static size_t iters = 3 ;
// -q: field characteristic (-1 for random).
static Givaro::Integer q = -1 ;
// -b: bitsize of the random characteristic.
static unsigned long b = 512 ;
// -m, -k, -n: matrix dimensions.
static size_t m = 512 ;
static size_t k = 512 ;
static size_t n = 512 ;
// -w: number of winograd levels (-1 for random).
static int nbw = -1 ;
// -s: seed handed to the field's random iterator; defaults to the current time.
static size_t seed= time(NULL);
static Argument as[] = {
{ 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q },
{ 'b', "-b B", "Set the bitsize of the random characteristic.", TYPE_INT , &b },
{ 'm', "-m M", "Set the dimension m of the matrix.", TYPE_INT , &m },
{ 'k', "-k K", "Set the dimension k of the matrix.", TYPE_INT , &k },
{ 'n', "-n N", "Set the dimension n of the matrix.", TYPE_INT , &n },
{ 'w', "-w N", "Set the number of winograd levels (-1 for random).", TYPE_INT , &nbw },
{ 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iters },
{ 's', "-s S", "Sets seed.", TYPE_INT , &seed },
Expand All @@ -109,7 +96,10 @@ int tmain(){
typedef Givaro::Modular<Ints> Field;
Givaro::Integer p;
FFLAS::Timer chrono, TimFreivalds;
double time=0.;
double time=0.,timev=0.;
#ifdef BENCH_FLINT
double timeFlint=0.;
#endif
for (size_t loop=0;loop<iters;loop++){
Givaro::Integer::random_exact_2exp(p, b);
Givaro::IntPrimeDom IPD;
Expand All @@ -120,8 +110,8 @@ int tmain(){
Field F(ip);
size_t lda,ldb,ldc;
lda=k;
ldb=1;
ldc=1;
ldb=n;
ldc=n;

typename Field::RandIter Rand(F,seed);
typename Field::Element_ptr A,B,C;
Expand All @@ -140,43 +130,104 @@ int tmain(){
// Rand.random(C[i*ldc+j]);

PAR_BLOCK { FFLAS::pfrand(F,Rand, m,k,A,m/size_t(MAX_THREADS)); }
PAR_BLOCK { FFLAS::pfrand(F,Rand, k,1,B,k/MAX_THREADS); }
PAR_BLOCK { FFLAS::pfzero(F, m,1,C,m/MAX_THREADS); }
PAR_BLOCK { FFLAS::pfrand(F,Rand, k,n,B,k/MAX_THREADS); }
PAR_BLOCK { FFLAS::pfzero(F, m,n,C,m/MAX_THREADS); }


Ints alpha,beta;
alpha=F.one;
beta=F.zero;


#ifdef BENCH_FLINT
// FLINT MUL //
fmpz_t modp,tmp;
fmpz_init(modp);
fmpz_init(tmp);
fmpz_set_mpz(modp, *(reinterpret_cast<const mpz_t*>(&p)));
fmpz_mat_t AA,BB,CC,DD;
fmpz_mat_init (AA, m, k);
fmpz_mat_init (BB, k, n);
fmpz_mat_init (CC, m, n);
fmpz_mat_init (DD, m, n);
fmpz_t aalpha, bbeta;
fmpz_set_mpz(aalpha,*(reinterpret_cast<const mpz_t*>(&alpha)));
fmpz_set_mpz(bbeta,*(reinterpret_cast<const mpz_t*>(&beta)));

for (size_t i=0;i<m;++i)
for (size_t j=0;j<k;++j)
fmpz_set_mpz(fmpz_mat_entry(AA,i,j),*(reinterpret_cast<const mpz_t*>(A+i*lda+j)));
for (size_t i=0;i<k;++i)
for (size_t j=0;j<n;++j)
fmpz_set_mpz(fmpz_mat_entry(BB,i,j),*(reinterpret_cast<const mpz_t*>(B+i*ldb+j)));
for (size_t i=0;i<m;++i)
for (size_t j=0;j<n;++j)
fmpz_set_mpz(fmpz_mat_entry(CC,i,j),*(reinterpret_cast<const mpz_t*>(C+i*ldc+j)));
chrono.clear();chrono.start();
// DD= A.B
fmpz_mat_mul(DD,AA,BB);
// CC = beta.C
fmpz_mat_scalar_mul_fmpz(CC,CC,bbeta);
// CC = CC + DD.alpha
fmpz_mat_scalar_addmul_fmpz(CC,DD,aalpha);
// CC = CC mod p
for (size_t i=0;i<m;++i)
for (size_t j=0;j<n;++j)
fmpz_mod(fmpz_mat_entry(CC,i,j),fmpz_mat_entry(CC,i,j),modp);

chrono.stop();
timeFlint+=chrono.usertime();
fmpz_mat_clear(AA);
fmpz_mat_clear(BB);
#endif
//END FLINT CODE //
using FFLAS::CuttingStrategy::Recursive;
using FFLAS::StrategyParameter::TwoDAdaptive;
// RNS MUL_LA
chrono.clear();chrono.start();
// PAR_BLOCK{
// FFLAS::fgemm(F,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc, SPLITTER(NUM_THREADS,Recursive,TwoDAdaptive) );
// }
{
FFLAS::ParSeqHelper::Sequential seqH;
FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc,seqH);
FFLAS::fgemm(F,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,FFLAS::ParSeqHelper::Sequential());
}

chrono.stop();
time+=chrono.realtime();

TimFreivalds.start();
bool pass = FFLAS::freivalds(F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, m,n,k, alpha, A, k, B, n, C,n);
TimFreivalds.stop();
timev+=TimFreivalds.usertime();
if (!pass) {
std::cout<<"FAILED"<<std::endl;
std::cout << "p:=" << p << ';'<<std::endl;
FFLAS::WriteMatrix (std::cout<<"A:=",F,m,k,A,lda)<<';'<<std::endl;
FFLAS::WriteMatrix(std::cout<<"B:=",F,k,n,B,ldb)<<';'<<std::endl;
FFLAS::WriteMatrix(std::cout<<"C:=",F,m,n,C,ldc)<<';'<<std::endl;
}

FFLAS::fflas_delete(A);
FFLAS::fflas_delete(B);
FFLAS::fflas_delete(C);

}

double Mflops=((2.*double(m)-1)/1000.*double(k)/1000.0) /time * double(iters);
// Mflops*=p.bitsize()/16.;
cout << "Time: "<< (time/double(iters)) <<" Gfops: "<<Mflops*1.0/1000.0
double Gflops=(2.*double(m)/1000.*double(n)/1000.*double(k)/1000.0) / time * double(iters);
// Gflops*=p.bitsize()/16.;
cout << "Time: "<< (time/double(iters))
<<" Gfops: "<<Gflops
<< " (total:" << time <<") "
<<typeid(Ints).name()
<<" perword: "<< (Mflops*double(p.bitsize()))/64. ;
FFLAS::writeCommandString(std::cout << " | " << p << " (" << p.bitsize()<<")|", as) << std::endl;
return 0;
}
<<" | perword: "<< (Gflops*double(p.bitsize()))/64. ;

FFLAS::writeCommandString(std::cout << '|' << p << " (" << p.bitsize()<<")|", as) << " | Freivalds: "<< timev/double(iters) << std::endl;

#ifdef BENCH_FLINT
cout<<"Time FLINT: "<<timeFlint<<endl;
#endif
return 0;
}

int main(int argc, char** argv){

Expand Down
Loading