From 294f146080f7a8cda8f8951a5ed865df940d8006 Mon Sep 17 00:00:00 2001 From: ZHG Date: Fri, 3 May 2019 17:01:22 +0200 Subject: [PATCH 01/31] rns fgemv with parseqhelper adapted but still need to wrap the function into pfgemv which will no more require to be labeled with PAR_BLOCK --- benchmarks/benchmark-fgemv-mp.C | 12 ++++++++++-- fflas-ffpack/fflas/fflas_fgemv_mp.inl | 12 +++++++----- tests/test-fgemv.C | 4 ++-- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark-fgemv-mp.C b/benchmarks/benchmark-fgemv-mp.C index 13db28fc9..7301ae8f6 100644 --- a/benchmarks/benchmark-fgemv-mp.C +++ b/benchmarks/benchmark-fgemv-mp.C @@ -150,8 +150,16 @@ int tmain(){ // RNS MUL_LA chrono.clear();chrono.start(); { - FFLAS::ParSeqHelper::Sequential seqH; - FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc,seqH); +//@TODO: Still need to use PAR_BLOCK to label the parallel region, impl as pDet to wrap this into one function +// FFLAS::ParSeqHelper::Sequential seqH; +// FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc,seqH); +PAR_BLOCK{ + FFLAS::ParSeqHelper::Parallel parH; +parH.set_numthreads(NUM_THREADS); + + FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc,parH); +} } chrono.stop(); time+=chrono.realtime(); diff --git a/fflas-ffpack/fflas/fflas_fgemv_mp.inl b/fflas-ffpack/fflas/fflas_fgemv_mp.inl index 4f7017f40..22cee36b8 100644 --- a/fflas-ffpack/fflas/fflas_fgemv_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemv_mp.inl @@ -83,6 +83,7 @@ namespace FFLAS { // BB hack. might not work. 
// Calling fgemm, TODO: really specialize fgemv // specialization of the fgemv function for the field Givaro::ZRing + template inline Givaro::Integer* fgemv (const Givaro::ZRing& F, const FFLAS_TRANSPOSE ta, const size_t m, const size_t n, @@ -91,15 +92,16 @@ namespace FFLAS { Givaro::Integer* X, const size_t ldx, Givaro::Integer beta, Givaro::Integer* Y, const size_t ldy, - MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo > & H) + MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> & H) { - MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeqHelper::Sequential> H2; + MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2; fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H2); return Y; } // specialization of the fgemv function for the field Givaro::Modular // Calling fgemm, TODO: really specialize fgemv + template inline Givaro::Integer* fgemv (const Givaro::Modular& F, const FFLAS_TRANSPOSE ta, const size_t m, const size_t n, @@ -108,9 +110,9 @@ namespace FFLAS { Givaro::Integer* X, const size_t ldx, Givaro::Integer beta, Givaro::Integer* Y, const size_t ldy, - MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo > & H) + MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> & H) { - MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeqHelper::Sequential> H2; + MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2; fgemm(F,ta,FFLAS::FflasNoTrans,(ta==FFLAS::FflasNoTrans)?m:n,1,(ta==FFLAS::FflasNoTrans)?n:m,alpha,A,lda,X,ldx,beta,Y,ldy,H2); return Y; } @@ -131,7 +133,7 @@ namespace FFLAS { MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq > & H) { - MMHelper,RecInt::ruint >, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeqHelper::Sequential> H2; + MMHelper,RecInt::ruint >, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2; fgemm 
(F,ta,FflasNoTrans,(ta==FFLAS::FflasNoTrans)?m:n,1,(ta==FFLAS::FflasNoTrans)?n:m,alpha,A,lda,X,incx,beta,Y,incy,H2); return Y; } diff --git a/tests/test-fgemv.C b/tests/test-fgemv.C index a8b9e2197..b1bf2714e 100644 --- a/tests/test-fgemv.C +++ b/tests/test-fgemv.C @@ -400,7 +400,7 @@ int main(int argc, char** argv) bool ok = true; srand(seed); - do{ + do{/* ok = ok && run_with_field >(q,b,m,k,iters,p, seed); ok = ok && run_with_field >(q,b,m,k,iters,p, seed); ok = ok && run_with_field >(q,b,m,k,iters,p, seed); @@ -411,7 +411,7 @@ int main(int argc, char** argv) ok = ok && run_with_field >(q,b,m,k,iters, p, seed); ok = ok && run_with_field > >(q,b?b:127_ui64,m,k,iters, p, seed); - ok = ok && run_with_field,RecInt::ruint<8> > >(q,b?b:127_ui64,m,k,iters, p, seed); + ok = ok && run_with_field,RecInt::ruint<8> > >(q,b?b:127_ui64,m,k,iters, p, seed);*/ ok = ok && run_with_field >(q,(b?b:512_ui64),m,k,iters,p, seed); ok = ok && run_with_field >(0,(b?b:512_ui64),m,k,iters,p, seed); } while (loop && ok); From 545645edb2598335de7b8ebfa14009fce2b89daa Mon Sep 17 00:00:00 2001 From: ZHG Date: Mon, 6 May 2019 17:18:14 +0200 Subject: [PATCH 02/31] rns fgemv with parseqhelper adapted and its corresponding benchmark-fgemv-mp has been restructured for different parameter values --- benchmarks/benchmark-fgemv-mp.C | 163 +++++++++++++++++------------ fflas-ffpack/fflas/fflas_fgemv.inl | 25 +++++ tests/test-fgemv.C | 4 +- 3 files changed, 121 insertions(+), 71 deletions(-) diff --git a/benchmarks/benchmark-fgemv-mp.C b/benchmarks/benchmark-fgemv-mp.C index 7301ae8f6..a7aa11ed3 100644 --- a/benchmarks/benchmark-fgemv-mp.C +++ b/benchmarks/benchmark-fgemv-mp.C @@ -31,7 +31,6 @@ #define STD_RECINT_SIZE 8 #endif - #include "fflas-ffpack/fflas-ffpack-config.h" #include #include @@ -50,34 +49,6 @@ using namespace std; #endif -template -std::ostream& write_matrix(std::ostream& out, Givaro::Integer p, size_t m, size_t n, T* C, size_t ldc){ - - size_t 
www(size_t((double(p.bitsize())*log(2.))/log(10.))); - out<<"Matrix("<, 2 for , 3 for .", + TYPE_INT , &par }, + { 'g', "-g G", "Sets GrainSize.", TYPE_INT , &GrainSize }, + { 't', "-t T", "number of virtual threads to drive the partition.", TYPE_INT , &t }, END_OF_ARGUMENTS }; +template +bool check_result(Field& F, size_t m, size_t lda, Matrix& A, Vector& X, size_t incX, Vector& Y, size_t incY){ + //Naive result checking by comparing result from pfgemv against the one from fgemv + typename Field::Element_ptr Y2 = FFLAS::fflas_new(F,m,1); + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y2, incY); + + for(size_t j=0; j +void benchmark_disp(Field& F, double& time, size_t iters, int p, size_t m, size_t k, arg& as){ + + std::cout << "Time: " << time / double(iters) + << " Gflops: " << (2.*double(m)/1000.*double(k)/1000.0/1000.0) / time * double(iters); + FFLAS::writeCommandString(std::cout, as) << std::endl; +} + template int tmain(){ srand( (int)seed); @@ -124,17 +127,8 @@ int tmain(){ A= FFLAS::fflas_new(F,m,lda); B= FFLAS::fflas_new(F,k,ldb); C= FFLAS::fflas_new(F,m,ldc); - - // for (size_t i=0;i parH; -parH.set_numthreads(NUM_THREADS); - - FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc,parH); -} + chrono.clear(); + + //@TODO: Still need to use PAR_BLOCK to label the parallel region, impl as pDet to wrap this into one function + PAR_BLOCK { + if (par){ + typedef FFLAS::CuttingStrategy::Row row; + typedef FFLAS::CuttingStrategy::Recursive rec; + typedef FFLAS::StrategyParameter::Threads threads; + typedef FFLAS::StrategyParameter::Grain grain; + + if (loop) { chrono.start(); } + + switch (par){ + + case 1:{ + FFLAS::ParSeqHelper::Parallel H(t); + FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); + break; + } + case 2:{ + FFLAS::ParSeqHelper::Parallel H(t); + FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); + break; + } + case 3:{ + FFLAS::ParSeqHelper::Parallel 
H(GrainSize); + FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); + break; + } + default:{ + FFLAS::ParSeqHelper::Sequential H; + FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); + break; + } + } + + if (loop) {chrono.stop(); time+=chrono.realtime();} + }else{ + if (loop) chrono.start(); + FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc); + if (loop) {chrono.stop(); time+=chrono.realtime();} + } + + time+=chrono.realtime(); + + FFLAS::fflas_delete(A); + FFLAS::fflas_delete(B); + FFLAS::fflas_delete(C); + + } + + if(!check_result(F, m, lda, A, B, ldb, C, ldc)){ + std::cerr<<"Computation failed with wrong result"<(); diff --git a/fflas-ffpack/fflas/fflas_fgemv.inl b/fflas-ffpack/fflas/fflas_fgemv.inl index 5cddce3a0..519536fe6 100644 --- a/fflas-ffpack/fflas/fflas_fgemv.inl +++ b/fflas-ffpack/fflas/fflas_fgemv.inl @@ -494,6 +494,31 @@ namespace FFLAS{ MMHelper pH(F,m,n,1,seqH); return fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); } +//TODO: Not sure about the defaut parameters, it is required to benchmark for different cutting strategies and parameters so as to find out the best defaut values for cutting strategies and parameters +/* + template + typename Field::Element_ptr + pfgemv(const Field& F, + const FFLAS_TRANSPOSE ta, + const size_t m, + const size_t n, + const typename Field::Element alpha, + const typename Field::ConstElement_ptr A, const size_t lda, + const typename Field::ConstElement_ptr X, const size_t incX, + const typename Field::Element beta, + typename Field::Element_ptr Y, const size_t incY){ + ParSeqHelper::Parallel parH; + PAR_BLOCK{ + parH.set_numthreads(NUM_THREADS); + MMHelper::value, + ParSeqHelper::Parallel > pH (F,m,n,1,parH); + fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); + } + return Y; + } +*/ } #endif // __FFLASFFPACK_fgemv_INL diff --git a/tests/test-fgemv.C b/tests/test-fgemv.C index b1bf2714e..a8b9e2197 100644 --- 
a/tests/test-fgemv.C +++ b/tests/test-fgemv.C @@ -400,7 +400,7 @@ int main(int argc, char** argv) bool ok = true; srand(seed); - do{/* + do{ ok = ok && run_with_field >(q,b,m,k,iters,p, seed); ok = ok && run_with_field >(q,b,m,k,iters,p, seed); ok = ok && run_with_field >(q,b,m,k,iters,p, seed); @@ -411,7 +411,7 @@ int main(int argc, char** argv) ok = ok && run_with_field >(q,b,m,k,iters, p, seed); ok = ok && run_with_field > >(q,b?b:127_ui64,m,k,iters, p, seed); - ok = ok && run_with_field,RecInt::ruint<8> > >(q,b?b:127_ui64,m,k,iters, p, seed);*/ + ok = ok && run_with_field,RecInt::ruint<8> > >(q,b?b:127_ui64,m,k,iters, p, seed); ok = ok && run_with_field >(q,(b?b:512_ui64),m,k,iters,p, seed); ok = ok && run_with_field >(0,(b?b:512_ui64),m,k,iters,p, seed); } while (loop && ok); From 3d38beda39767f0788b6c541348931864f4fbdc0 Mon Sep 17 00:00:00 2001 From: ZHG Date: Tue, 7 May 2019 10:58:03 +0200 Subject: [PATCH 03/31] cleaned up for code review --- benchmarks/benchmark-fgemv-mp.C | 35 +++++++++++++++++---------------- tests/test-fgemv.C | 1 - 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/benchmarks/benchmark-fgemv-mp.C b/benchmarks/benchmark-fgemv-mp.C index a7aa11ed3..7626d2aea 100644 --- a/benchmarks/benchmark-fgemv-mp.C +++ b/benchmarks/benchmark-fgemv-mp.C @@ -54,10 +54,10 @@ static Givaro::Integer q = -1 ; static unsigned long b = 512 ; static size_t m = 512 ; static size_t k = 512 ; -static int nbw = -1 ; + static size_t seed= time(NULL); static int par = 0; -int t; +int t = 1; size_t GrainSize = 64; static Argument as[] = { @@ -65,7 +65,7 @@ static Argument as[] = { { 'b', "-b B", "Set the bitsize of the random characteristic.", TYPE_INT , &b }, { 'm', "-m M", "Set the dimension m of the matrix.", TYPE_INT , &m }, { 'k', "-k K", "Set the dimension k of the matrix.", TYPE_INT , &k }, - { 'w', "-w N", "Set the number of winograd levels (-1 for random).", TYPE_INT , &nbw }, + { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iters }, { 
's', "-s S", "Sets seed.", TYPE_INT , &seed }, { 'p', "-p P", "0 for sequential, 1 for , 2 for , 3 for .", @@ -157,24 +157,24 @@ int tmain(){ switch (par){ case 1:{ - FFLAS::ParSeqHelper::Parallel H(t); - FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); - break; + FFLAS::ParSeqHelper::Parallel H(t); + FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); + break; } case 2:{ - FFLAS::ParSeqHelper::Parallel H(t); - FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); - break; + FFLAS::ParSeqHelper::Parallel H(t); + FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); + break; } case 3:{ - FFLAS::ParSeqHelper::Parallel H(GrainSize); - FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); - break; + FFLAS::ParSeqHelper::Parallel H(GrainSize); + FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); + break; } default:{ - FFLAS::ParSeqHelper::Sequential H; - FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); - break; + FFLAS::ParSeqHelper::Sequential H; + FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); + break; } } @@ -192,11 +192,12 @@ int tmain(){ FFLAS::fflas_delete(C); } - +/* if(!check_result(F, m, lda, A, B, ldb, C, ldc)){ std::cerr<<"Computation failed with wrong result"< >(q,b,m,k,iters,p, seed); ok = ok && run_with_field >(q,b,m,k,iters, p, seed); ok = ok && run_with_field >(q,b,m,k,iters, p, seed); - ok = ok && run_with_field > >(q,b?b:127_ui64,m,k,iters, p, seed); ok = ok && run_with_field,RecInt::ruint<8> > >(q,b?b:127_ui64,m,k,iters, p, seed); ok = ok && run_with_field >(q,(b?b:512_ui64),m,k,iters,p, seed); From 3663c8391ae63f0c89505355cef2c3f791b660bf Mon Sep 17 00:00:00 2001 From: ZHG Date: Mon, 13 May 2019 16:18:05 +0200 Subject: [PATCH 04/31] Rolled back the benchmark-fgemv-mp and adopted benchmark-fgemv-rns for benchmarking fgemv in the rns field --- benchmarks/benchmark-fgemv-mp.C | 230 
++++++++++++++------------ benchmarks/benchmark-fgemv.C | 17 +- fflas-ffpack/fflas/fflas_fgemv.inl | 7 +- fflas-ffpack/fflas/fflas_fgemv_mp.inl | 10 +- fflas-ffpack/paladin/pfgemv.inl | 5 +- 5 files changed, 149 insertions(+), 120 deletions(-) diff --git a/benchmarks/benchmark-fgemv-mp.C b/benchmarks/benchmark-fgemv-mp.C index 7626d2aea..23c5f8735 100644 --- a/benchmarks/benchmark-fgemv-mp.C +++ b/benchmarks/benchmark-fgemv-mp.C @@ -1,3 +1,4 @@ + /* * Copyright (C) FFLAS-FFPACK * Written by Pascal Giorgi @@ -24,6 +25,11 @@ *. */ +// declare that the call to openblas_set_numthread will be made here, hence don't do it +// everywhere in the call stack +#define __FFLASFFPACK_OPENBLAS_NT_ALREADY_SET 1 + + #if not defined(MG_DEFAULT) #define MG_DEFAULT MG_ACTIVE #endif @@ -38,8 +44,9 @@ #include using namespace std; -#include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" +#include "fflas-ffpack/utils/fflas_io.h" +#include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/args-parser.h" #include "givaro/modular-integer.h" #include "givaro/givcaster.h" @@ -48,57 +55,38 @@ using namespace std; #include "recint/recint.h" #endif +#ifdef BENCH_FLINT +#define __GMP_BITS_PER_MP_LIMB 64 +extern "C" { +#include "flint/longlong.h" +#include "flint/long_extras.h" +#include "flint/fmpz_mat.h" +#include "flint/fmpz.h" +#include "flint/flint.h" +} +#endif + static size_t iters = 3 ; static Givaro::Integer q = -1 ; static unsigned long b = 512 ; static size_t m = 512 ; static size_t k = 512 ; - +static size_t n = 512 ; +static int nbw = -1 ; static size_t seed= time(NULL); -static int par = 0; -int t = 1; - -size_t GrainSize = 64; static Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'b', "-b B", "Set the bitsize of the random characteristic.", TYPE_INT , &b }, { 'm', "-m M", "Set the dimension m of the matrix.", TYPE_INT , &m }, { 'k', "-k K", "Set the dimension k of the matrix.", TYPE_INT , &k }, - 
+ { 'n', "-n N", "Set the dimension n of the matrix.", TYPE_INT , &n }, + { 'w', "-w N", "Set the number of winograd levels (-1 for random).", TYPE_INT , &nbw }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iters }, { 's', "-s S", "Sets seed.", TYPE_INT , &seed }, - { 'p', "-p P", "0 for sequential, 1 for , 2 for , 3 for .", - TYPE_INT , &par }, - { 'g', "-g G", "Sets GrainSize.", TYPE_INT , &GrainSize }, - { 't', "-t T", "number of virtual threads to drive the partition.", TYPE_INT , &t }, END_OF_ARGUMENTS }; -template -bool check_result(Field& F, size_t m, size_t lda, Matrix& A, Vector& X, size_t incX, Vector& Y, size_t incY){ - //Naive result checking by comparing result from pfgemv against the one from fgemv - typename Field::Element_ptr Y2 = FFLAS::fflas_new(F,m,1); - FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y2, incY); - - for(size_t j=0; j -void benchmark_disp(Field& F, double& time, size_t iters, int p, size_t m, size_t k, arg& as){ - - std::cout << "Time: " << time / double(iters) - << " Gflops: " << (2.*double(m)/1000.*double(k)/1000.0/1000.0) / time * double(iters); - FFLAS::writeCommandString(std::cout, as) << std::endl; -} - template int tmain(){ srand( (int)seed); @@ -108,7 +96,10 @@ int tmain(){ typedef Givaro::Modular Field; Givaro::Integer p; FFLAS::Timer chrono, TimFreivalds; - double time=0.; + double time=0.,timev=0.; +#ifdef BENCH_FLINT + double timeFlint=0.; +#endif for (size_t loop=0;loop(&p))); + fmpz_mat_t AA,BB,CC,DD; + fmpz_mat_init (AA, m, k); + fmpz_mat_init (BB, k, n); + fmpz_mat_init (CC, m, n); + fmpz_mat_init (DD, m, n); + fmpz_t aalpha, bbeta; + fmpz_set_mpz(aalpha,*(reinterpret_cast(&alpha))); + fmpz_set_mpz(bbeta,*(reinterpret_cast(&beta))); + + for (size_t i=0;i(A+i*lda+j))); + for (size_t i=0;i(B+i*ldb+j))); + for (size_t i=0;i(C+i*ldc+j))); + chrono.clear();chrono.start(); + // DD= A.B + fmpz_mat_mul(DD,AA,BB); + // CC = beta.C + fmpz_mat_scalar_mul_fmpz(CC,CC,bbeta); + // CC = CC 
+ DD.alpha + fmpz_mat_scalar_addmul_fmpz(CC,DD,aalpha); + // CC = CC mod p + for (size_t i=0;i H(t); - FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); - break; - } - case 2:{ - FFLAS::ParSeqHelper::Parallel H(t); - FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); - break; - } - case 3:{ - FFLAS::ParSeqHelper::Parallel H(GrainSize); - FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); - break; - } - default:{ - FFLAS::ParSeqHelper::Sequential H; - FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc, H); - break; - } - } - - if (loop) {chrono.stop(); time+=chrono.realtime();} - }else{ - if (loop) chrono.start(); - FFLAS::fgemv(F,FFLAS::FflasNoTrans,m,k,alpha,A,lda,B,ldb,beta,C,ldc); - if (loop) {chrono.stop(); time+=chrono.realtime();} - } - - time+=chrono.realtime(); - - FFLAS::fflas_delete(A); - FFLAS::fflas_delete(B); - FFLAS::fflas_delete(C); - - } -/* - if(!check_result(F, m, lda, A, B, ldb, C, ldc)){ - std::cerr<<"Computation failed with wrong result"< H(t); + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); + break; + } default:{ FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY); break; @@ -259,15 +264,15 @@ int main(int argc, char** argv) { } }else{ PAR_BLOCK { - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); + 
//benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); } } diff --git a/fflas-ffpack/fflas/fflas_fgemv.inl b/fflas-ffpack/fflas/fflas_fgemv.inl index 519536fe6..63f551d5a 100644 --- a/fflas-ffpack/fflas/fflas_fgemv.inl +++ b/fflas-ffpack/fflas/fflas_fgemv.inl @@ -33,7 +33,7 @@ #if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) and defined(__x86_64__) #include "fflas-ffpack/fflas/fflas_igemm/igemm.h" #endif - +//!Function converting to float field for the provided field namespace FFLAS{ namespace Protected { template inline typename Field::Element_ptr @@ -82,7 +82,7 @@ namespace FFLAS{ namespace Protected { } }// Protected }// FFLAS - +//!Convert to either float or double according to field's cardinality namespace FFLAS { template inline typename Field::Element_ptr @@ -114,7 +114,7 @@ namespace FFLAS { // Computes Y <- alpha.op(A).X + beta.Y // A is M*N, //--------------------------------------------------------------------- - + //! Performs Matrix Vector Multiplication with delayed mod reductions. Ensures result is reduced. 
template inline typename Field::Element_ptr fgemv (const Field& F, const FFLAS_TRANSPOSE ta, @@ -126,7 +126,6 @@ namespace FFLAS { typename Field::Element_ptr Y, const size_t incY, MMHelper & H) { - if (!M) {return Y;} size_t Ydim = (ta == FflasNoTrans)?M:N; size_t Xdim = (ta == FflasNoTrans)?N:M; diff --git a/fflas-ffpack/fflas/fflas_fgemv_mp.inl b/fflas-ffpack/fflas/fflas_fgemv_mp.inl index 22cee36b8..71cdf7a58 100644 --- a/fflas-ffpack/fflas/fflas_fgemv_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemv_mp.inl @@ -31,6 +31,7 @@ namespace FFLAS { +//@TODO: Only RNSInteger and RNSIntegerMod are implemented other type like Integer is not taken into account // specialization of the fgemv function for the field RNSInteger inline FFPACK::rns_double::Element_ptr @@ -70,7 +71,7 @@ namespace FFLAS { MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag> & H) { //std::cout<<"HERE 1"<, MMHelperAlgo::Classic, ModeCategories::DefaultTag > H2; + MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag > H2(H); //std::cout<<"HERE 2"< @@ -94,7 +94,7 @@ namespace FFLAS { Givaro::Integer* Y, const size_t ldy, MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> & H) { - MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2; + MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2(H); fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H2); return Y; } @@ -112,7 +112,7 @@ namespace FFLAS { Givaro::Integer* Y, const size_t ldy, MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> & H) { - MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2; + MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2(H); fgemm(F,ta,FFLAS::FflasNoTrans,(ta==FFLAS::FflasNoTrans)?m:n,1,(ta==FFLAS::FflasNoTrans)?n:m,alpha,A,lda,X,ldx,beta,Y,ldy,H2); return Y; } @@ -133,7 +133,7 @@ namespace FFLAS { MMHelperAlgo::Classic, 
ModeCategories::ConvertTo, ParSeq > & H) { - MMHelper,RecInt::ruint >, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2; + MMHelper,RecInt::ruint >, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2(H); fgemm (F,ta,FflasNoTrans,(ta==FFLAS::FflasNoTrans)?m:n,1,(ta==FFLAS::FflasNoTrans)?n:m,alpha,A,lda,X,incx,beta,Y,incy,H2); return Y; } diff --git a/fflas-ffpack/paladin/pfgemv.inl b/fflas-ffpack/paladin/pfgemv.inl index c6f0234ad..a7780148b 100644 --- a/fflas-ffpack/paladin/pfgemv.inl +++ b/fflas-ffpack/paladin/pfgemv.inl @@ -25,7 +25,7 @@ namespace FFLAS { - + // specialization of the fgemv function for the MMHelper with CuttingStrategy::Recursive template typename Field::Element_ptr fgemv(const Field& F, @@ -38,7 +38,6 @@ namespace FFLAS const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, MMHelper > & H){ - if (H.parseq.numthreads()==1 || m <= 1){ fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY); @@ -75,7 +74,7 @@ namespace FFLAS return Y; } - + // specialization of the fgemv function for the MMHelper with CuttingStrategy::Row template typename Field::Element_ptr fgemv(const Field& F, From f632d389ddac0662a962eb9041fbecbc74a35530 Mon Sep 17 00:00:00 2001 From: ZHG Date: Thu, 16 May 2019 10:03:52 +0200 Subject: [PATCH 05/31] Ready for rns benchmark --- fflas-ffpack/fflas/fflas_fgemv.inl | 7 ++++++- fflas-ffpack/fflas/fflas_fgemv_mp.inl | 3 --- fflas-ffpack/field/rns-double.h | 2 +- fflas-ffpack/paladin/pfgemv.inl | 4 ++-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fflas-ffpack/fflas/fflas_fgemv.inl b/fflas-ffpack/fflas/fflas_fgemv.inl index 63f551d5a..94b74dff0 100644 --- a/fflas-ffpack/fflas/fflas_fgemv.inl +++ b/fflas-ffpack/fflas/fflas_fgemv.inl @@ -403,6 +403,8 @@ namespace FFLAS{ #endif return Y; } + + //specialization for ZRing inline Givaro::DoubleDomain::Element_ptr fgemv (const Givaro::DoubleDomain& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, @@ -441,6 
+443,7 @@ namespace FFLAS{ return fgemv(F, ta, M, N, alpha, A, lda, X, incX, beta, Y, incY, Hb); } + ////specialization for ZRing inline Givaro::FloatDomain::Element_ptr fgemv (const Givaro::FloatDomain& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, @@ -462,6 +465,7 @@ namespace FFLAS{ return Y; } + //Common interface for fgemv with ParSeqHelper::Parallel input parameter in which the corresponding parallel implementation will be called for the given field ref. pfgemv.inl template typename Field::Element_ptr fgemv(const Field& F, @@ -478,6 +482,7 @@ namespace FFLAS{ return fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); } + //Common interface for fgemv with ParSeqHelper::Sequential input parameter in which the corresponding sequential implementation will be called for the given field type either for common field implementated as above or multiprcesion field ref. fflas_fgemv_mp.inl template typename Field::Element_ptr fgemv(const Field& F, @@ -493,7 +498,7 @@ namespace FFLAS{ MMHelper pH(F,m,n,1,seqH); return fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); } -//TODO: Not sure about the defaut parameters, it is required to benchmark for different cutting strategies and parameters so as to find out the best defaut values for cutting strategies and parameters +//TODO: Not sure about the defaut parameters, it is required to benchmark for different cutting strategies and parameters so as to find out the best defaut values for the defaut parallel implementation /* template typename Field::Element_ptr diff --git a/fflas-ffpack/fflas/fflas_fgemv_mp.inl b/fflas-ffpack/fflas/fflas_fgemv_mp.inl index 71cdf7a58..96d409523 100644 --- a/fflas-ffpack/fflas/fflas_fgemv_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemv_mp.inl @@ -31,8 +31,6 @@ namespace FFLAS { -//@TODO: Only RNSInteger and RNSIntegerMod are implemented other type like Integer is not taken into account - // specialization of the fgemv function for the field RNSInteger inline 
FFPACK::rns_double::Element_ptr fgemv (const FFPACK::RNSInteger& F, const FFLAS_TRANSPOSE ta, @@ -58,7 +56,6 @@ namespace FFLAS { return Y; } - // specialization of the fgemv function for the field RNSIntegerMod inline FFPACK::rns_double::Element_ptr fgemv (const FFPACK::RNSIntegerMod& F, const FFLAS_TRANSPOSE ta, diff --git a/fflas-ffpack/field/rns-double.h b/fflas-ffpack/field/rns-double.h index 45b6586e0..767615a21 100644 --- a/fflas-ffpack/field/rns-double.h +++ b/fflas-ffpack/field/rns-double.h @@ -53,7 +53,7 @@ namespace FFPACK { /* Structure that handles rns representation given a bound and bitsize for prime moduli - * support sign representation (i.e. the bound must be twice larger then ||A||) + * support sign representation (i.e. the bound must be twice larger than ||A||) */ struct rns_double { typedef Givaro::Integer integer; diff --git a/fflas-ffpack/paladin/pfgemv.inl b/fflas-ffpack/paladin/pfgemv.inl index a7780148b..bd9b9bd92 100644 --- a/fflas-ffpack/paladin/pfgemv.inl +++ b/fflas-ffpack/paladin/pfgemv.inl @@ -25,7 +25,7 @@ namespace FFLAS { - // specialization of the fgemv function for the MMHelper with CuttingStrategy::Recursive + // specialization of the fgemv function for the MMHelper with CuttingStrategy::Recursive but templated for all possible field type so that the corresponding templated sequential implementation will be invoked in the parallel code region template typename Field::Element_ptr fgemv(const Field& F, @@ -74,7 +74,7 @@ namespace FFLAS return Y; } - // specialization of the fgemv function for the MMHelper with CuttingStrategy::Row + // specialization of the fgemv function for the MMHelper with CuttingStrategy::Row but templated for all possible field type so that the corresponding templated sequential implementation will be invoked in the parallel code region template typename Field::Element_ptr fgemv(const Field& F, From 27dd46c7bc76b732e5c21c963bd98943aa0a03e1 Mon Sep 17 00:00:00 2001 From: ZHG Date: Thu, 16 May 2019 17:46:49 
+0200 Subject: [PATCH 06/31] Instant backup for code review --- benchmarks/benchmark-fgemv.C | 4 ++-- fflas-ffpack/paladin/pfgemv.inl | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark-fgemv.C b/benchmarks/benchmark-fgemv.C index 204c73b3f..9c795181c 100644 --- a/benchmarks/benchmark-fgemv.C +++ b/benchmarks/benchmark-fgemv.C @@ -151,12 +151,12 @@ bool benchmark_with_timer(Field& F, int p, Matrix& A, Vector& X, Vector& Y, size FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY); if (i) {chrono.stop(); time+=chrono.realtime();} } - +/* if(!check_result(F, m, lda, A, X, incX, Y, incY)){ pass = false; break; } - +*/ } return pass; } diff --git a/fflas-ffpack/paladin/pfgemv.inl b/fflas-ffpack/paladin/pfgemv.inl index bd9b9bd92..440a1f650 100644 --- a/fflas-ffpack/paladin/pfgemv.inl +++ b/fflas-ffpack/paladin/pfgemv.inl @@ -105,7 +105,6 @@ namespace FFLAS return Y; } - } // FFLAS /* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ From c8928bbb0f02758ad32a66ce1bbb4e0d83dff24f Mon Sep 17 00:00:00 2001 From: ZHG Date: Fri, 17 May 2019 16:25:17 +0200 Subject: [PATCH 07/31] Check if it is required impl --- benchmarks/benchmark-fgemv.C | 22 +++++++++++++++++----- fflas-ffpack/fflas/fflas_fgemv.inl | 26 ++++++++++++++++++++++++-- fflas-ffpack/fflas/fflas_fgemv_mp.inl | 4 ++-- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark-fgemv.C b/benchmarks/benchmark-fgemv.C index 9c795181c..acda21ddf 100644 --- a/benchmarks/benchmark-fgemv.C +++ b/benchmarks/benchmark-fgemv.C @@ -112,14 +112,14 @@ bool benchmark_with_timer(Field& F, int p, Matrix& A, Vector& X, Vector& Y, size if (p){ - typedef CuttingStrategy::Row row; + typedef CuttingStrategy::Row row;typedef CuttingStrategy::Block block; typedef CuttingStrategy::Recursive rec; typedef StrategyParameter::Threads threads; typedef StrategyParameter::Grain grain; if (i) { chrono.start(); } - switch (p){ + 
switch (p){/* case 1:{ ParSeqHelper::Parallel H(t); FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); @@ -133,12 +133,24 @@ bool benchmark_with_timer(Field& F, int p, Matrix& A, Vector& X, Vector& Y, size ParSeqHelper::Parallel H(GrainSize); FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); break; - } + }*/ case 4:{ - ParSeqHelper::Parallel H(t); + ParSeqHelper::Parallel PSH(t); + MMHelper::value, ParSeqHelper::Parallel> H(F,m,k,1,PSH); +// ParSeqHelper::Compose, ParSeqHelper::Parallel> H(1,1); + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); + break; + }/* + case 5:{ + ParSeqHelper::Compose, ParSeqHelper::Parallel> H(t); FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); break; } + case 6:{ + ParSeqHelper::Compose, ParSeqHelper::Parallel> H(GrainSize); + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); + break; + }*/ default:{ FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY); break; @@ -271,7 +283,7 @@ int main(int argc, char** argv) { //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); - benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); } } diff --git a/fflas-ffpack/fflas/fflas_fgemv.inl b/fflas-ffpack/fflas/fflas_fgemv.inl index 94b74dff0..52d1328d4 100644 --- a/fflas-ffpack/fflas/fflas_fgemv.inl +++ b/fflas-ffpack/fflas/fflas_fgemv.inl @@ -477,9 +477,10 @@ namespace FFLAS{ const typename Field::ConstElement_ptr X, const size_t incX, const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, - ParSeqHelper::Parallel& parH){ + 
ParSeqHelper::Parallel& parH){std::cout<>>>>>>>>>>>>>>>>>>>>> "<::value, ParSeqHelper::Parallel > pH (F,m,n,1,parH); - return fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); + fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); + return Y; } //Common interface for fgemv with ParSeqHelper::Sequential input parameter in which the corresponding sequential implementation will be called for the given field type either for common field implementated as above or multiprcesion field ref. fflas_fgemv_mp.inl @@ -523,6 +524,27 @@ namespace FFLAS{ return Y; } */ + + + template + typename Field::Element_ptr + fgemv(const Field& F, + const FFLAS_TRANSPOSE ta, + const size_t m, + const size_t n, + const typename Field::Element alpha, + const typename Field::ConstElement_ptr A, const size_t lda, + const typename Field::ConstElement_ptr X, const size_t incX, + const typename Field::Element beta, + typename Field::Element_ptr Y, const size_t incY, + FFLAS::ParSeqHelper::Compose, + FFLAS::ParSeqHelper::Parallel >& cpsH){ + MMHelper::value, ParSeqHelper::Parallel > pH(F,-1,cpsH); +std::cout<>>>>>>>>>>>>>>>>>>>>> "<, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> & H) - { + {std::cout<<"("< <<<<<<<<<<<<<<<<< "<, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2(H); fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H2); return Y; @@ -108,7 +108,7 @@ namespace FFLAS { Givaro::Integer beta, Givaro::Integer* Y, const size_t ldy, MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> & H) - { + {std::cout<<"("< <<<<<<<<<<<<<<<<< "<, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2(H); fgemm(F,ta,FFLAS::FflasNoTrans,(ta==FFLAS::FflasNoTrans)?m:n,1,(ta==FFLAS::FflasNoTrans)?n:m,alpha,A,lda,X,ldx,beta,Y,ldy,H2); return Y; From 59326f14e054dadde5cfcf6a4d2ef1ddc363dd57 Mon Sep 17 00:00:00 2001 From: ZHG Date: Tue, 21 May 2019 17:24:08 +0200 Subject: 
[PATCH 08/31] rns for fgemv implemented but no obvious speedup can be found --- benchmarks/benchmark-fgemv.C | 25 ++-------- .../fflas/fflas_fgemm/fgemm_classical_mp.inl | 18 ++++--- fflas-ffpack/fflas/fflas_fgemv.inl | 48 +++++++++++++++++-- fflas-ffpack/fflas/fflas_fgemv_mp.inl | 44 +++++++++++++---- fflas-ffpack/field/rns-double.inl | 3 +- fflas-ffpack/field/rns-integer.h | 6 +-- fflas-ffpack/paladin/blockcuts.inl | 2 +- 7 files changed, 100 insertions(+), 46 deletions(-) diff --git a/benchmarks/benchmark-fgemv.C b/benchmarks/benchmark-fgemv.C index acda21ddf..8e49752c6 100644 --- a/benchmarks/benchmark-fgemv.C +++ b/benchmarks/benchmark-fgemv.C @@ -112,14 +112,14 @@ bool benchmark_with_timer(Field& F, int p, Matrix& A, Vector& X, Vector& Y, size if (p){ - typedef CuttingStrategy::Row row;typedef CuttingStrategy::Block block; + typedef CuttingStrategy::Row row; typedef CuttingStrategy::Recursive rec; typedef StrategyParameter::Threads threads; typedef StrategyParameter::Grain grain; if (i) { chrono.start(); } - switch (p){/* + switch (p){ case 1:{ ParSeqHelper::Parallel H(t); FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); @@ -133,24 +133,7 @@ bool benchmark_with_timer(Field& F, int p, Matrix& A, Vector& X, Vector& Y, size ParSeqHelper::Parallel H(GrainSize); FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); break; - }*/ - case 4:{ - ParSeqHelper::Parallel PSH(t); - MMHelper::value, ParSeqHelper::Parallel> H(F,m,k,1,PSH); -// ParSeqHelper::Compose, ParSeqHelper::Parallel> H(1,1); - FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); - break; - }/* - case 5:{ - ParSeqHelper::Compose, ParSeqHelper::Parallel> H(t); - FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); - break; } - case 6:{ - ParSeqHelper::Compose, ParSeqHelper::Parallel> H(GrainSize); - FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, 
lda, X, incX, F.zero, Y, incY, H); - break; - }*/ default:{ FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY); break; @@ -248,7 +231,7 @@ int main(int argc, char** argv) { int t; PAR_BLOCK { t = NUM_THREADS; } int NBK = -1; - int b=0; + int b=100; size_t GrainSize = 64; Argument as[] = { @@ -283,7 +266,7 @@ int main(int argc, char** argv) { //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); + benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); } } diff --git a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl index 4fef8e4e4..fcf126ed1 100644 --- a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl @@ -192,7 +192,7 @@ namespace FFLAS { } // fgemm for RnsInteger: handle the moduli in parallel - template + template inline typename FFPACK::RNSInteger::Element_ptr fgemm (const FFPACK::RNSInteger &F, const FFLAS_TRANSPOSE ta, @@ -203,13 +203,14 @@ namespace FFLAS { typename FFPACK::RNSInteger::ConstElement_ptr Bd, const size_t ldb, const typename FFPACK::RNSInteger::Element beta, typename FFPACK::RNSInteger::Element_ptr Cd, const size_t ldc, - MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeqHelper::Compose, ParSeqTrait> > & H) + MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeqHelper::Compose, ParSeqTrait> > & H) { #ifdef PROFILE_FGEMM_MP Givaro::Timer t;t.start(); #endif size_t rns_size = F.size(); - typedef MMHelper::value, ParSeqTrait> SubHelper; + typedef MMHelper::value, ParSeqTrait> SubHelper; + if(H.parseq.second_component().numthreads()>1){ FORBLOCK1D(iter, rns_size, H.parseq.first_component(), 
TASK(MODE(CONSTREFERENCE(F,H)), { @@ -225,6 +226,9 @@ namespace FFLAS { } }) ); + }else{ + FFLAS::fgemm(F,ta,tb,m, n, k, alpha,Ad, lda,Bd, ldb, beta, Cd, ldc); + } #ifdef PROFILE_FGEMM_MP t.stop(); std::cerr<<"=========================================="<::ConstElement_ptr Bd, const size_t ldb, const typename FFPACK::RNSInteger::Element beta, typename FFPACK::RNSInteger::Element_ptr Cd, const size_t ldc, - MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeqHelper::Parallel > & H) + MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeqHelper::Parallel > & H) { // compute each fgemm componentwise size_t rns_size = F.size(); @@ -322,12 +326,13 @@ namespace FFLAS { Givaro::Integer* C, const size_t ldc, MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq > & H) { - //std::cerr<<"Entering fgemm> ParSeq"<> ParSeq"< + Givaro::Integer* + fgemv(const Givaro::ZRing& F, + const FFLAS_TRANSPOSE ta, + const size_t m, + const size_t n, + const Givaro::Integer alpha, + const Givaro::Integer* A, const size_t lda, + const Givaro::Integer* X, const size_t incX, + const Givaro::Integer beta, + Givaro::Integer* Y, const size_t incY, + ParSeqHelper::Compose& parH){ + MMHelper, MMHelperAlgo::Auto, FFLAS::ModeTraits>::value, ParSeqHelper::Compose> pH (F,m,n,1,parH); + fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); + return Y; + } + + template + Givaro::Integer* + fgemv(const Givaro::Modular& F, + const FFLAS_TRANSPOSE ta, + const size_t m, + const size_t n, + const Givaro::Integer alpha, + const Givaro::Integer* A, const size_t lda, + const Givaro::Integer* X, const size_t incX, + const Givaro::Integer beta, + Givaro::Integer* Y, const size_t incY, + ParSeqHelper::Compose& parH){ + MMHelper, MMHelperAlgo::Auto, FFLAS::ModeTraits>::value, ParSeqHelper::Compose> pH (F,m,n,1,parH); + fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); + return Y; + } + + + //Common interface for fgemv with ParSeqHelper::Parallel input 
parameter in which the corresponding parallel implementation will be called for the given field ref. pfgemv.inl template typename Field::Element_ptr @@ -477,8 +514,8 @@ namespace FFLAS{ const typename Field::ConstElement_ptr X, const size_t incX, const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, - ParSeqHelper::Parallel& parH){std::cout<>>>>>>>>>>>>>>>>>>>>> "<::value, ParSeqHelper::Parallel > pH (F,m,n,1,parH); + ParSeqHelper::Parallel& parH){ + MMHelper::value, ParSeqHelper::Parallel > pH (F,m,n,1,parH); fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); return Y; } @@ -496,7 +533,7 @@ namespace FFLAS{ const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, ParSeqHelper::Sequential& seqH ){ - MMHelper pH(F,m,n,1,seqH); + MMHelper pH(F,m,n,1,seqH); return fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); } //TODO: Not sure about the defaut parameters, it is required to benchmark for different cutting strategies and parameters so as to find out the best defaut values for the defaut parallel implementation @@ -525,7 +562,7 @@ namespace FFLAS{ } */ - +/* template typename Field::Element_ptr fgemv(const Field& F, @@ -544,9 +581,10 @@ std::cout<>>>>>>>>>>>>>>>>>>>>> "<s,f0,{0,g0,(0,\:0,t0,+0,=s + diff --git a/fflas-ffpack/fflas/fflas_fgemv_mp.inl b/fflas-ffpack/fflas/fflas_fgemv_mp.inl index e3124929e..578889064 100644 --- a/fflas-ffpack/fflas/fflas_fgemv_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemv_mp.inl @@ -80,22 +80,48 @@ namespace FFLAS { // BB hack. might not work. 
// Calling fgemm, TODO: really specialize fgemv // specialization of the fgemv function for the field Givaro::ZRing - template + template inline Givaro::Integer* fgemv (const Givaro::ZRing& F, const FFLAS_TRANSPOSE ta, const size_t m, const size_t n, const Givaro::Integer alpha, - Givaro::Integer* A, const size_t lda, - Givaro::Integer* X, const size_t ldx, + const Givaro::Integer* A, const size_t lda, // @fixme Why not originally const? + const Givaro::Integer* X, const size_t ldx, Givaro::Integer beta, Givaro::Integer* Y, const size_t ldy, - MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> & H) - {std::cout<<"("< <<<<<<<<<<<<<<<<< "<, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2(H); - fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H2); - return Y; + MMHelper, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Sequential> & H) { + fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H); + return Y; + } + template + inline Givaro::Integer* fgemv (const Givaro::ZRing& F, + const FFLAS_TRANSPOSE ta, + const size_t m, const size_t n, + const Givaro::Integer alpha, + const Givaro::Integer* A, const size_t lda, // @fixme Why not originally const? + const Givaro::Integer* X, const size_t ldx, + Givaro::Integer beta, + Givaro::Integer* Y, const size_t ldy, + MMHelper, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Parallel> & H){ + MMHelper, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Compose,ParSeqHelper::Sequential>> Hc(H); + fgemv(F,ta, m, n, alpha,A,lda,X,ldx,beta,Y,ldy,Hc); + return Y; + } + template + inline Givaro::Integer* fgemv (const Givaro::ZRing& F, + const FFLAS_TRANSPOSE ta, + const size_t m, const size_t n, + const Givaro::Integer alpha, + const Givaro::Integer* A, const size_t lda, // @fixme Why not originally const? 
+ const Givaro::Integer* X, const size_t ldx, + Givaro::Integer beta, + Givaro::Integer* Y, const size_t ldy, + MMHelper, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Compose,ComposeArgs...>> & H){ + fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H); + return Y; } + // specialization of the fgemv function for the field Givaro::Modular // Calling fgemm, TODO: really specialize fgemv template @@ -108,7 +134,7 @@ namespace FFLAS { Givaro::Integer beta, Givaro::Integer* Y, const size_t ldy, MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> & H) - {std::cout<<"("< <<<<<<<<<<<<<<<<< "<, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2(H); fgemm(F,ta,FFLAS::FflasNoTrans,(ta==FFLAS::FflasNoTrans)?m:n,1,(ta==FFLAS::FflasNoTrans)?n:m,alpha,A,lda,X,ldx,beta,Y,ldy,H2); return Y; diff --git a/fflas-ffpack/field/rns-double.inl b/fflas-ffpack/field/rns-double.inl index 7713129d3..a3060cf5f 100644 --- a/fflas-ffpack/field/rns-double.inl +++ b/fflas-ffpack/field/rns-double.inl @@ -58,7 +58,8 @@ namespace FFPACK { //for(size_t i=0;iinit(1,1,x._ptr,x._stride, &y,1,k); + init(x); + size_t k =(y.bitsize())/16+((y.bitsize())%16?1:0); + _rns->init(1,1,x._ptr,x._stride, &y,1,k); return x; } Element& reduce (Element& x, const Element& y) const {return assign (x,y);} diff --git a/fflas-ffpack/paladin/blockcuts.inl b/fflas-ffpack/paladin/blockcuts.inl index 96b337c22..50d2cc2f6 100644 --- a/fflas-ffpack/paladin/blockcuts.inl +++ b/fflas-ffpack/paladin/blockcuts.inl @@ -40,7 +40,7 @@ namespace FFLAS { struct Column{}; struct Block{}; struct Recursive{}; - typedef Row RNSModulus; + struct RNSModulus{}; } namespace StrategyParameter{ From 7bc53b90cf56634834ec60b09a376781b1d2b48e Mon Sep 17 00:00:00 2001 From: ZHG Date: Tue, 28 May 2019 14:08:39 +0200 Subject: [PATCH 09/31] Having used SYNCH_GROUP to avoid segmmentation fault but still no observable speedup for rns fgemv --- 
benchmarks/Makefile.am | 3 +- fflas-ffpack/fflas/fflas_fgemm.inl | 4 ++- .../fflas/fflas_fgemm/fgemm_classical_mp.inl | 30 +++++++++++-------- fflas-ffpack/fflas/fflas_fgemv_mp.inl | 11 ++++--- fflas-ffpack/paladin/blockcuts.inl | 8 ++--- 5 files changed, 34 insertions(+), 22 deletions(-) diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am index 161626145..9b5b92d32 100755 --- a/benchmarks/Makefile.am +++ b/benchmarks/Makefile.am @@ -35,7 +35,7 @@ endif PERFPUBLISHERFILE=benchmarks-report.xml -FFLA_BENCH = benchmark-fgemm benchmark-fgemm-rns benchmark-wino benchmark-ftrsm benchmark-fgesv benchmark-ftrsv benchmark-ftrtri benchmark-inverse benchmark-fsytrf benchmark-fsyrk benchmark-lqup benchmark-pluq benchmark-charpoly benchmark-charpoly-mp benchmark-fgemm-mp benchmark-fgemv-mp benchmark-ftrsm-mp benchmark-lqup-mp benchmark-checkers benchmark-fadd-lvl2 benchmark-fdot benchmark-fgemv +FFLA_BENCH = benchmark-fgemm benchmark-fgemm-rns benchmark-wino benchmark-ftrsm benchmark-fgesv benchmark-ftrsv benchmark-ftrtri benchmark-inverse benchmark-fsytrf benchmark-fsyrk benchmark-lqup benchmark-pluq benchmark-charpoly benchmark-charpoly-mp benchmark-fgemm-mp benchmark-fgemv-mp benchmark-ftrsm-mp benchmark-lqup-mp benchmark-checkers benchmark-fadd-lvl2 benchmark-fdot benchmark-fgemv benchmark-fgemv-rns BLAS_BENCH = benchmark-sgemm$(EXEEXT) benchmark-dgemm benchmark-dtrsm LAPA_BENCH = benchmark-dtrtri benchmark-dgetri benchmark-dgetrf benchmark-dsytrf @@ -85,6 +85,7 @@ benchmark_checkers_SOURCES = benchmark-checkers.C benchmark_fadd_lvl2_SOURCES = benchmark-fadd-lvl2.C benchmark_fdot_SOURCES = benchmark-fdot.C benchmark_fgemv_SOURCES = benchmark-fgemv.C +benchmark_fgemv_rns_SOURCES = benchmark-fgemv-rns.C benchmark_sgemm_CXXFLAGS = $(AM_CXXFLAGS) -D__SGEMM__ diff --git a/fflas-ffpack/fflas/fflas_fgemm.inl b/fflas-ffpack/fflas/fflas_fgemm.inl index 3d99c52b6..e0d5b577d 100644 --- a/fflas-ffpack/fflas/fflas_fgemm.inl +++ b/fflas-ffpack/fflas/fflas_fgemm.inl @@ -410,7 
+410,9 @@ namespace FFLAS { else if (!std::is_same >::value){ if (F.characteristic() < DOUBLE_TO_FLOAT_CROSSOVER) return Protected::fgemm_convert,Field>(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H); - else if (!std::is_same >::value && 16*F.cardinality() < Givaro::ModularBalanced::maxCardinality()) + else if (!std::is_same >::value && + !std::is_same >::value && + 16*F.cardinality() < Givaro::ModularBalanced::maxCardinality()) return Protected::fgemm_convert,Field>(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H); } } diff --git a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl index fcf126ed1..c85fb0bad 100644 --- a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl @@ -55,7 +55,7 @@ namespace FFLAS { MMHelper() : normA(0), normB(0), recLevel(-1) {} template MMHelper(MMHelper H2) : - normA(H2.normA), normB(H2.normB), recLevel(H2.recLevel), parseq(H2.parseq) {} + normA(H2.normA), normB(H2.normB), recLevel(H2.recLevel), parseq(H2.parseq) {std::cerr<<"copy constructor of MMHelper: H2 = "<::value, ParSeqTrait> SubHelper; - if(H.parseq.second_component().numthreads()>1){ - FORBLOCK1D(iter, rns_size, H.parseq.first_component(), - TASK(MODE(CONSTREFERENCE(F,H)), +SYNCH_GROUP({ + // if(H.parseq.first_component().numthreads()>1){ +// std::cout<<"Thread("<"<>>>> works on i:="<, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Parallel> & H){ - MMHelper, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Compose,ParSeqHelper::Sequential>> Hc(H); - fgemv(F,ta, m, n, alpha,A,lda,X,ldx,beta,Y,ldy,Hc); - return Y; - } + ParSeqHelper::Compose,ParSeqHelper::Sequential> CompHelper (H.parseq, ParSeqHelper::Sequential()); + MMHelper, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Compose,ParSeqHelper::Sequential>> Hc(F,m,1,n, CompHelper); + fgemv(F,ta, m, n, alpha,A,lda,X,ldx,beta,Y,ldy,Hc); + return Y; + } template inline Givaro::Integer* fgemv (const 
Givaro::ZRing& F, const FFLAS_TRANSPOSE ta, @@ -117,7 +118,9 @@ namespace FFLAS { Givaro::Integer beta, Givaro::Integer* Y, const size_t ldy, MMHelper, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Compose,ComposeArgs...>> & H){ + fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H); + return Y; } diff --git a/fflas-ffpack/paladin/blockcuts.inl b/fflas-ffpack/paladin/blockcuts.inl index 50d2cc2f6..9ec8e9ff0 100644 --- a/fflas-ffpack/paladin/blockcuts.inl +++ b/fflas-ffpack/paladin/blockcuts.inl @@ -96,10 +96,10 @@ namespace FFLAS { struct Compose{ Compose() : _comp1 (), _comp2 () {} - Compose(const Compose & other) : _comp1 (other.first_component()), _comp2 (other.second_component()) {} + Compose(const Compose & other) : _comp1 (other.first_component()), _comp2 (other.second_component()) {} Compose(const Sequential & S) : _comp1 (1), _comp2 (1) {} Compose(size_t th1, size_t th2) : _comp1 (th1), _comp2 (th2) {} - Compose(const H1 & o1, const H2 & o2) : _comp1 (o1), _comp2 (o2) {} + Compose(const H1 & o1, const H2 & o2) : _comp1 (o1), _comp2 (o2) {} H1 first_component () const { return _comp1; } H2 second_component () const { return _comp2; } @@ -336,8 +336,8 @@ namespace FFLAS { if ( Protected::AreEqual::value ) { numBlock = std::max((blocksize_t)(H.numthreads()),(blocksize_t)1); - } else if ( Protected::AreEqual::value ) { - numBlock = std::max(n/ (blocksize_t)(H.numthreads()), (blocksize_t)1); + } else if ( Protected::AreEqual::value ) {std::cout<<"H: "< numBlock:="< Date: Wed, 29 May 2019 10:11:43 +0200 Subject: [PATCH 10/31] add benchmark-fgemv-rns --- benchmarks/benchmark-fgemv-rns.C | 271 +++++++++++++++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 benchmarks/benchmark-fgemv-rns.C diff --git a/benchmarks/benchmark-fgemv-rns.C b/benchmarks/benchmark-fgemv-rns.C new file mode 100644 index 000000000..9849a0621 --- /dev/null +++ b/benchmarks/benchmark-fgemv-rns.C 
@@ -0,0 +1,271 @@ +/* Copyright (c) FFLAS-FFPACK + * ========LICENCE======== + * This file is part of the library FFLAS-FFPACK. + * + * FFLAS-FFPACK is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * ========LICENCE======== + */ + +//#include "goto-def.h" + +#include "fflas-ffpack/fflas-ffpack-config.h" +#include +#include + +#include "fflas-ffpack/config-blas.h" +#include "fflas-ffpack/fflas/fflas.h" +#include "fflas-ffpack/utils/timer.h" +#include "fflas-ffpack/utils/args-parser.h" + +#include "fflas-ffpack/utils/fflas_io.h" +#include "fflas-ffpack/utils/test-utils.h" + +#include "fflas-ffpack/utils/timer.h" +#include "givaro/modular-integer.h" +#include "givaro/givcaster.h" + +using namespace FFPACK; + +using namespace std; +using namespace FFLAS; + +template +struct need_field_characteristic { static constexpr bool value = false; }; +template +struct need_field_characteristic>{ static constexpr bool value = true; }; +template +struct need_field_characteristic>{ static constexpr bool value = true; }; + +template +struct compatible_data_type { static constexpr bool value = true; }; +template <> +struct compatible_data_type>{ static constexpr bool value = false; }; +template <> +struct compatible_data_type>{ static constexpr bool value = false; }; + + +template +void fill_value(Field& F, 
RandIter& Rand, + Matrix& A, Vector& X, Vector& Y, + size_t m, size_t k, size_t incX, size_t incY, size_t lda, int NBK){ + // TODO: replace by a 1D pfrand + SYNCH_GROUP( + FORBLOCK1D(iter, m, SPLITTER(NBK, CuttingStrategy::Row, StrategyParameter::Threads), + TASK(MODE(CONSTREFERENCE(F,Rand,A)), + { + frand(F, Rand, iter.end()-iter.begin(), k, A+iter.begin()*lda, lda); + } + ); + ); + ); + //FFLAS::pfrand(F,Rand, m,k,A,m/NBK); + FFLAS::frand(F,Rand, k,1,X,incX); + FFLAS::fzero(F, m,1,Y,incY); +} + +template +void genData(Field& F, + Matrix& A, Vector& X, Vector& Y, + size_t m, size_t k, size_t incX, size_t incY, size_t lda, int NBK, + int bitsize, uint64_t seed){ + typename Field::RandIter Rand(F,bitsize,seed); + fill_value(F, Rand, A, X, Y, m, k, incX, incY, lda, NBK); + std::cerr<<"filled A:"< +bool check_result(Field& F, size_t m, size_t lda, Matrix& A, Vector& X, size_t incX, Vector& Y, size_t incY){ + //Naive result checking by comparing result from pfgemv against the one from fgemv + typename Field::Element_ptr Y2 = FFLAS::fflas_new(F,m,1); + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y2, incY); + + for(size_t j=0; j check_result"< +bool benchmark_with_timer(Field& F, int p, Matrix& A, Vector& X, Vector& Y, size_t m, size_t k, size_t incX, + size_t incY, size_t lda, size_t iters, int t, double& time, size_t GrainSize){ + Timer chrono; + bool pass = true; + for (size_t i=0;i<=iters;++i){ + + chrono.clear(); + + if (p){ + + //typedef CuttingStrategy::Row row; + typedef CuttingStrategy::Recursive rec; + typedef StrategyParameter::Threads threads; + typedef StrategyParameter::Grain grain; + + if (i) { chrono.start(); } + + switch (p){ + case 1:{ + ParSeqHelper::Parallel H(GrainSize);//TODO: The exectuion always appears toe be sequential, but why ? + std::cerr<<"fresh PSH -> "< H(t);//TODO: The exectuion always appears toe be sequential, but why ? 
+ std::cerr<<"fresh PSH -> "<, ParSeqHelper::Parallel> H(GrainSize,t);//TODO: GrainSize should be >= t ot have parallelization otherwise the execution will be sequential, but why ? + + // FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); + break; + } + default:{ + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY); + break; + } + } + if (i) {chrono.stop(); time+=chrono.realtime();} + }else{ + if (i) chrono.start(); + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY); + if (i) {chrono.stop(); time+=chrono.realtime();} + } +/* + if(!check_result(F, m, lda, A, X, incX, Y, incY)){ + pass = false; + break; + } +*/ + } + return pass; +} + +template +void benchmark_disp(Field& F, bool pass, double& time, size_t iters, int p, size_t m, size_t k, arg& as){ + if(pass){ + std::cout << "Time: " << time / double(iters) + << " Gflops: " << (2.*double(m)/1000.*double(k)/1000.0/1000.0) / time * double(iters); + writeCommandString(std::cout, as) << std::endl; + }else{ + std::cout<<"FAILED for "<::value, + "The provided data type for ZRing is not compatible for the desired operation and could lead to inconsistent result !"); + + benchmark_in_Field(F, p, m, k, NBK, bitsize, seed, iters, t, as, GrainSize); + +} + +template +void benchmark_with_field(const Givaro::Integer& q, int p, size_t m, size_t k, + int NBK, int bitsize, uint64_t seed, size_t iters, int t, + arg& as, size_t GrainSize){ + Field F(q); + benchmark_in_Field(F, p, m, k, NBK, bitsize, seed, iters, t, as, GrainSize); +} + +int main(int argc, char** argv) { + + int p=0; + + size_t iters = 3; + Givaro::Integer q = 131071; + size_t m = 4000; + size_t k = 4000; + + uint64_t seed = getSeed(); + int t; + PAR_BLOCK { t = NUM_THREADS; } + int NBK = -1; + int b=100; + size_t GrainSize = 64; + + Argument as[] = { + { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, + { 'b', "-b B", "Set 
the bitsize of input.", TYPE_INT , &b }, + { 'p', "-p P", "0 for sequential, 1 for , 2 for , 3 for .", + TYPE_INT , &p }, + { 'm', "-m M", "Set the dimension m of the matrix.", TYPE_INT , &m }, + { 'k', "-k K", "Set the dimension k of the matrix.", TYPE_INT , &k }, + { 't', "-t T", "number of virtual threads to drive the partition.", TYPE_INT , &t }, + { 'N', "-n N", "number of numa blocks per dimension for the numa placement", TYPE_INT , &NBK }, + { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iters }, + { 's', "-s S", "Sets seed.", TYPE_INT , &seed }, + { 'g', "-g G", "Sets GrainSize.", TYPE_INT , &GrainSize }, + END_OF_ARGUMENTS + }; + + parseArguments(argc,argv,as); + + if (NBK==-1) NBK = t; + + PAR_BLOCK { + //benchmark_with_field>( p, m, k, NBK, b, seed, iters, t, as, GrainSize); + benchmark_with_field>( p, m, k, NBK, b, seed, iters, t, as, GrainSize); + } + + + return 0; +} +/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s From d194b9adfbca3f7e61f55748ae171cf3a00b40ca Mon Sep 17 00:00:00 2001 From: ZHG Date: Wed, 29 May 2019 13:08:02 +0200 Subject: [PATCH 11/31] Instant backup before code review --- benchmarks/benchmark-fgemv-rns.C | 17 +++--- .../fflas/fflas_fgemm/fgemm_classical_mp.inl | 52 +++++++++---------- fflas-ffpack/paladin/blockcuts.inl | 4 +- 3 files changed, 32 insertions(+), 41 deletions(-) diff --git a/benchmarks/benchmark-fgemv-rns.C b/benchmarks/benchmark-fgemv-rns.C index 9849a0621..28a7438ec 100644 --- a/benchmarks/benchmark-fgemv-rns.C +++ b/benchmarks/benchmark-fgemv-rns.C @@ -82,9 +82,6 @@ void genData(Field& F, int bitsize, uint64_t seed){ typename Field::RandIter Rand(F,bitsize,seed); fill_value(F, Rand, A, X, Y, m, k, incX, incY, lda, NBK); - std::cerr<<"filled A:"< @@ -93,11 +90,11 @@ bool check_result(Field& F, size_t m, size_t lda, Matrix& A, Vector& X, size_t i typename Field::Element_ptr Y2 = FFLAS::fflas_new(F,m,1); 
FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y2, incY); - for(size_t j=0; j check_result"< H(GrainSize);//TODO: The exectuion always appears toe be sequential, but why ? - std::cerr<<"fresh PSH -> "< H(GrainSize); FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); break; } case 2:{ - ParSeqHelper::Parallel H(t);//TODO: The exectuion always appears toe be sequential, but why ? - std::cerr<<"fresh PSH -> "< H(t); FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); break; } case 3:{ - // ParSeqHelper::Compose, ParSeqHelper::Parallel> H(GrainSize,t);//TODO: GrainSize should be >= t ot have parallelization otherwise the execution will be sequential, but why ? + ParSeqHelper::Compose, ParSeqHelper::Parallel> H(GrainSize,t); - // FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); break; } default:{ diff --git a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl index c85fb0bad..fa75d1264 100644 --- a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl @@ -55,7 +55,7 @@ namespace FFLAS { MMHelper() : normA(0), normB(0), recLevel(-1) {} template MMHelper(MMHelper H2) : - normA(H2.normA), normB(H2.normB), recLevel(H2.recLevel), parseq(H2.parseq) {std::cerr<<"copy constructor of MMHelper: H2 = "<::value, ParSeqTrait> SubHelper; -SYNCH_GROUP({ - // if(H.parseq.first_component().numthreads()>1){ -// std::cout<<"Thread("<"<>>>> works on i:="<::value ) { numBlock = std::max((blocksize_t)(H.numthreads()),(blocksize_t)1); - } else if ( Protected::AreEqual::value ) {std::cout<<"H: "< numBlock:="<::value ) { + numBlock = std::max(n/ (blocksize_t)(H.numthreads()), (blocksize_t)1); } else { numBlock = 
std::max(n/(blocksize_t)(__FFLASFFPACK_MINBLOCKCUTS),(blocksize_t)1); } From 3bfb91968f2f78e624f0c764e7d12c58db730331 Mon Sep 17 00:00:00 2001 From: ZHG Date: Mon, 3 Jun 2019 12:59:57 +0200 Subject: [PATCH 12/31] detailed timing --- benchmarks/benchmark-fgemv-rns.C | 2 +- fflas-ffpack/fflas/fflas_fgemv.inl | 12 ++++++------ fflas-ffpack/fflas/fflas_fgemv_mp.inl | 3 +-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark-fgemv-rns.C b/benchmarks/benchmark-fgemv-rns.C index 28a7438ec..5d2ddacb0 100644 --- a/benchmarks/benchmark-fgemv-rns.C +++ b/benchmarks/benchmark-fgemv-rns.C @@ -17,7 +17,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ - +#define PROFILE_FGEMM_MP //#include "goto-def.h" #include "fflas-ffpack/fflas-ffpack-config.h" diff --git a/fflas-ffpack/fflas/fflas_fgemv.inl b/fflas-ffpack/fflas/fflas_fgemv.inl index a65258056..f270df69e 100644 --- a/fflas-ffpack/fflas/fflas_fgemv.inl +++ b/fflas-ffpack/fflas/fflas_fgemv.inl @@ -465,7 +465,7 @@ namespace FFLAS{ return Y; } - + //specialization for Givaro::ZRing with ParSeqHelper::Compose template Givaro::Integer* fgemv(const Givaro::ZRing& F, @@ -483,6 +483,7 @@ namespace FFLAS{ return Y; } + //specialization for Givaro::Modular with ParSeqHelper::Compose template Givaro::Integer* fgemv(const Givaro::Modular& F, @@ -502,7 +503,7 @@ namespace FFLAS{ - //Common interface for fgemv with ParSeqHelper::Parallel input parameter in which the corresponding parallel implementation will be called for the given field ref. pfgemv.inl + //Common interface for fgemv with ParSeqHelper::Parallel input parameter in which the corresponding parallel implementation will be called for the given field (ref. 
pfgemv.inl) template typename Field::Element_ptr fgemv(const Field& F, @@ -536,7 +537,7 @@ namespace FFLAS{ MMHelper pH(F,m,n,1,seqH); return fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); } -//TODO: Not sure about the defaut parameters, it is required to benchmark for different cutting strategies and parameters so as to find out the best defaut values for the defaut parallel implementation + /* template typename Field::Element_ptr @@ -560,9 +561,7 @@ namespace FFLAS{ } return Y; } -*/ -/* template typename Field::Element_ptr fgemv(const Field& F, @@ -581,7 +580,8 @@ std::cout<>>>>>>>>>>>>>>>>>>>>> "<, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Compose,ComposeArgs...>> & H){ - - fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H); + fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H); return Y; } From 5913444a7bd22ec01286eff0a496af5486a863ca Mon Sep 17 00:00:00 2001 From: ZHG Date: Wed, 12 Jun 2019 09:00:21 +0200 Subject: [PATCH 13/31] Instant backup before code review --- benchmarks/benchmark-fgemv-rns.C | 11 ++---- fflas-ffpack/fflas/fflas_bounds.inl | 15 ++++++++ .../fflas/fflas_fgemm/fgemm_classical_mp.inl | 38 +++++++++++++++++-- fflas-ffpack/field/rns-double.h | 2 +- fflas-ffpack/field/rns-double.inl | 37 +++++++++++++++--- fflas-ffpack/field/rns-integer-mod.h | 2 +- 6 files changed, 87 insertions(+), 18 deletions(-) diff --git a/benchmarks/benchmark-fgemv-rns.C b/benchmarks/benchmark-fgemv-rns.C index 5d2ddacb0..d2d25f00b 100644 --- a/benchmarks/benchmark-fgemv-rns.C +++ b/benchmarks/benchmark-fgemv-rns.C @@ -60,7 +60,7 @@ template void fill_value(Field& F, RandIter& Rand, Matrix& A, Vector& X, Vector& Y, size_t m, size_t k, size_t incX, size_t incY, size_t lda, int NBK){ - // TODO: replace by a 1D pfrand + SYNCH_GROUP( FORBLOCK1D(iter, m, SPLITTER(NBK, CuttingStrategy::Row, 
StrategyParameter::Threads), TASK(MODE(CONSTREFERENCE(F,Rand,A)), @@ -147,13 +147,8 @@ bool benchmark_with_timer(Field& F, int p, Matrix& A, Vector& X, Vector& Y, size FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY); if (i) {chrono.stop(); time+=chrono.realtime();} } -/* - if(!check_result(F, m, lda, A, X, incX, Y, incY)){ - pass = false; - break; - } -*/ } + if(!check_result(F, m, lda, A, X, incX, Y, incY)) pass = false; return pass; } @@ -238,7 +233,7 @@ int main(int argc, char** argv) { Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'b', "-b B", "Set the bitsize of input.", TYPE_INT , &b }, - { 'p', "-p P", "0 for sequential, 1 for , 2 for , 3 for .", + { 'p', "-p P", "0 for sequential, 1 for , 2 for , 3 for Compose<, >.", TYPE_INT , &p }, { 'm', "-m M", "Set the dimension m of the matrix.", TYPE_INT , &m }, { 'k', "-k K", "Set the dimension k of the matrix.", TYPE_INT , &k }, diff --git a/fflas-ffpack/fflas/fflas_bounds.inl b/fflas-ffpack/fflas/fflas_bounds.inl index 419871fe1..38249a28f 100644 --- a/fflas-ffpack/fflas/fflas_bounds.inl +++ b/fflas-ffpack/fflas/fflas_bounds.inl @@ -37,6 +37,10 @@ #include #include +#ifdef PROFILE_FGEMM_MP +#include "fflas-ffpack/utils/timer.h" +#endif + namespace FFLAS { namespace Protected { template @@ -116,6 +120,11 @@ namespace FFLAS { InfNorm (const size_t M, const size_t N, const Givaro::Integer* A, const size_t lda){ Givaro::Integer max = 0; size_t log=0; +#ifdef PROFILE_FGEMM_MP + Timer chrono; + chrono.start(); +#endif + for (size_t i=0; i> ParSeq"<>=1; ++lk;} size_t prime_bitsize= (53-lk)>>1; +#ifdef PROFILE_FGEMM_MP + chrono.stop(); + std::cout<<"-------------------------------"< RnsDomain; RnsDomain Zrns(RNS); @@ -380,11 +396,20 @@ namespace FFLAS { Bp = FFLAS::fflas_new(Zrns,Browd,Bcold); Cp = FFLAS::fflas_new(Zrns,m,n); +#ifdef PROFILE_FGEMM_MP + chrono.stop(); + std::cout<<"-------------------------------"<_ldm){ 
FFPACK::failure()(__func__,__FILE__,__LINE__,"rns_double [init] -> rns basis is too small to handle integers with 2^(16*k) values "); std::cerr<<"with k="<1 && n>1) std::cerr<<"Kronecker : "< void finit_rns(const FFPACK::RNSIntegerMod &F, const size_t m, const size_t n, size_t k, const Givaro::Integer *B, const size_t ldb, typename RNS::Element_ptr A) - { + {std::cout<<"Thread("<>>>>>> integer_mod: finit_rns"< From 4db380d934b2e6c92c17bfa72fc462d80c407b98 Mon Sep 17 00:00:00 2001 From: ZHG Date: Mon, 24 Jun 2019 10:21:08 +0200 Subject: [PATCH 14/31] Ready for benchmarks on a server --- fflas-ffpack/fflas/fflas_bounds.inl | 10 --- .../fflas/fflas_fgemm/fgemm_classical_mp.inl | 84 +------------------ fflas-ffpack/field/rns-double.inl | 59 +++++++++++-- fflas-ffpack/field/rns-integer-mod.h | 2 +- 4 files changed, 57 insertions(+), 98 deletions(-) diff --git a/fflas-ffpack/fflas/fflas_bounds.inl b/fflas-ffpack/fflas/fflas_bounds.inl index 6e87f39ba..bac069346 100644 --- a/fflas-ffpack/fflas/fflas_bounds.inl +++ b/fflas-ffpack/fflas/fflas_bounds.inl @@ -120,11 +120,6 @@ namespace FFLAS { InfNorm (const size_t M, const size_t N, const Givaro::Integer* A, const size_t lda){ Givaro::Integer max = 0; -#ifdef PROFILE_FGEMM_MP - Timer chrono; - chrono.start(); -#endif - #if 1 //Sequential ////////////////////////////////////////////////// for (size_t i=0; i> ParSeq"<>=1; ++lk;} size_t prime_bitsize= (53-lk)>>1; -#ifdef PROFILE_FGEMM_MP - chrono.stop(); - std::cout<<"-------------------------------"< H2(Zrns,H.recLevel,H.parseq); @@ -432,35 +400,15 @@ namespace FFLAS { Zrns.init(alphap, alpha); Zrns.init(betap, F.zero); -#ifdef PROFILE_FGEMM_MP - chrono.stop(); - std::cout<<"-------------------------------"< > ParSeq with alpha="<1){ chrono.stop(); - std::cout<<"-------------------------------"<1){ chrono.stop(); - std::cout<<"-------------------------------"<1){ chrono.stop(); - std::cout<<"-------------------------------"< void finit_rns(const FFPACK::RNSIntegerMod &F, const 
size_t m, const size_t n, size_t k, const Givaro::Integer *B, const size_t ldb, typename RNS::Element_ptr A) - {std::cout<<"Thread("<>>>>>> integer_mod: finit_rns"< From 3b12ce7823bfa88ca69f664985d43f7aaaf6c508 Mon Sep 17 00:00:00 2001 From: ZHG Date: Mon, 24 Jun 2019 11:30:58 +0200 Subject: [PATCH 15/31] Got ready for benchmark on a server --- benchmarks/benchmark-fgemv-rns.C | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark-fgemv-rns.C b/benchmarks/benchmark-fgemv-rns.C index d2d25f00b..37862da5c 100644 --- a/benchmarks/benchmark-fgemv-rns.C +++ b/benchmarks/benchmark-fgemv-rns.C @@ -80,7 +80,7 @@ void genData(Field& F, Matrix& A, Vector& X, Vector& Y, size_t m, size_t k, size_t incX, size_t incY, size_t lda, int NBK, int bitsize, uint64_t seed){ - typename Field::RandIter Rand(F,bitsize,seed); + typename Field::RandIter Rand(F,seed,bitsize); //Field::RandIter's parameters order has been changed between seed and bitsize fill_value(F, Rand, A, X, Y, m, k, incX, incY, lda, NBK); } From 14a3dea0d9905addbd7ab9930a4b890edcd23746 Mon Sep 17 00:00:00 2001 From: ZHG Date: Tue, 25 Jun 2019 14:11:53 +0200 Subject: [PATCH 16/31] Fallback to PARFOR1D in the rsn-double::init --- fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl | 1 + fflas-ffpack/field/rns-double.inl | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl index f750159bc..19ef84ec0 100644 --- a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl @@ -210,6 +210,7 @@ namespace FFLAS { #endif size_t rns_size = F.size(); typedef MMHelper::value, ParSeqTrait> SubHelper; + SYNCH_GROUP({ FORBLOCK1D(iter, rns_size, H.parseq.first_component(), TASK(MODE(CONSTREFERENCE(F,H)), diff --git a/fflas-ffpack/field/rns-double.inl b/fflas-ffpack/field/rns-double.inl index 6addd3cf6..f532c7e02 100644 --- 
a/fflas-ffpack/field/rns-double.inl +++ b/fflas-ffpack/field/rns-double.inl @@ -64,9 +64,9 @@ namespace FFPACK { //for(size_t i=0;i(Aiter+j+i*lda); From 45eb9fa72ba0d51ed27bc03a99561d507e3e0bf9 Mon Sep 17 00:00:00 2001 From: ZHG Date: Tue, 25 Jun 2019 15:02:46 +0200 Subject: [PATCH 17/31] Instant backup before code review --- fflas-ffpack/field/rns-double.inl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fflas-ffpack/field/rns-double.inl b/fflas-ffpack/field/rns-double.inl index f532c7e02..6addd3cf6 100644 --- a/fflas-ffpack/field/rns-double.inl +++ b/fflas-ffpack/field/rns-double.inl @@ -64,9 +64,9 @@ namespace FFPACK { //for(size_t i=0;i(Aiter+j+i*lda); From 34a1277fcd002c15341b6d1976281b773affc527 Mon Sep 17 00:00:00 2001 From: ZHG Date: Tue, 25 Jun 2019 16:17:44 +0200 Subject: [PATCH 18/31] Further improved to validate on hpac --- fflas-ffpack/fflas/fflas_bounds.inl | 4 +--- fflas-ffpack/field/rns-double.inl | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fflas-ffpack/fflas/fflas_bounds.inl b/fflas-ffpack/fflas/fflas_bounds.inl index bac069346..1c249cd03 100644 --- a/fflas-ffpack/fflas/fflas_bounds.inl +++ b/fflas-ffpack/fflas/fflas_bounds.inl @@ -120,7 +120,7 @@ namespace FFLAS { InfNorm (const size_t M, const size_t N, const Givaro::Integer* A, const size_t lda){ Givaro::Integer max = 0; -#if 1 //Sequential ////////////////////////////////////////////////// +#if 0 //Sequential ////////////////////////////////////////////////// for (size_t i=0; i vmax(M,0); auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Row,FFLAS::StrategyParameter::Threads); SYNCH_GROUP({ - FORBLOCK1D(iter, M, sp, TASK(MODE(CONSTREFERENCE(A,max,vmax) ), { @@ -145,7 +144,6 @@ namespace FFLAS { } }) ); - }); max=vmax[0]; for (size_t i=0; i1){ FFLAS::fflas_delete( A_beta); -#if 1 //Sequential ////////////////////////////////////////////////// +#if 0 //Sequential ////////////////////////////////////////////////// #ifdef CHECK_RNS bool 
ok=true; @@ -179,11 +179,11 @@ if(n>1){ auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Row,FFLAS::StrategyParameter::Threads); SYNCH_GROUP({ - FORBLOCK1D(i, m, sp, + FORBLOCK1D(iter, m, sp, TASK(MODE(CONSTREFERENCE(A,_basis,Arns) ), { - + for (auto i=iter.begin(); i!=iter.end(); ++i) for(size_t j=0;j Date: Tue, 25 Jun 2019 16:43:19 +0200 Subject: [PATCH 19/31] Check if freduce is speeded up --- fflas-ffpack/field/rns-double.inl | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fflas-ffpack/field/rns-double.inl b/fflas-ffpack/field/rns-double.inl index e153f8340..659127042 100644 --- a/fflas-ffpack/field/rns-double.inl +++ b/fflas-ffpack/field/rns-double.inl @@ -175,7 +175,7 @@ if(n>1){ #else //Parallel ///////////////////////////////////////////////////// #ifdef CHECK_RNS - bool ok=true; + std::vector vok(m,true); auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Row,FFLAS::StrategyParameter::Threads); SYNCH_GROUP({ @@ -185,10 +185,13 @@ if(n>1){ for (auto i=iter.begin(); i!=iter.end(); ++i) for(size_t j=0;j Date: Tue, 25 Jun 2019 17:09:14 +0200 Subject: [PATCH 20/31] Corrected the wrong parallelization of freduce --- fflas-ffpack/field/rns-double.inl | 61 +++++++------------------------ 1 file changed, 14 insertions(+), 47 deletions(-) diff --git a/fflas-ffpack/field/rns-double.inl b/fflas-ffpack/field/rns-double.inl index 659127042..76859d807 100644 --- a/fflas-ffpack/field/rns-double.inl +++ b/fflas-ffpack/field/rns-double.inl @@ -152,8 +152,6 @@ if(n>1){ FFLAS::fflas_delete( A_beta); -#if 0 //Sequential ////////////////////////////////////////////////// - #ifdef CHECK_RNS bool ok=true; for (size_t i=0;i1){ std::cout<<"RNS freduce ... 
"<<(ok?"OK":"ERROR")< vok(m,true); - auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Row,FFLAS::StrategyParameter::Threads); - SYNCH_GROUP({ - - FORBLOCK1D(iter, m, sp, - TASK(MODE(CONSTREFERENCE(A,_basis,Arns) ), - { - - for (auto i=iter.begin(); i!=iter.end(); ++i) - for(size_t j=0;j1){ chrono.stop(); @@ -610,10 +565,22 @@ if(n>1){ // #else // auto sp=SPLITTER(1); // #endif - PARFOR1D(i,_size,SPLITTER(NUM_THREADS), +#if 0 + PARFOR1D(i,_size,SPLITTER(NUM_THREADS),{std::cout<<"Thread("<>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"< Date: Wed, 26 Jun 2019 09:31:50 +0200 Subject: [PATCH 21/31] Instant backup for code review --- fflas-ffpack/field/rns-double.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fflas-ffpack/field/rns-double.inl b/fflas-ffpack/field/rns-double.inl index 76859d807..a2911dd1d 100644 --- a/fflas-ffpack/field/rns-double.inl +++ b/fflas-ffpack/field/rns-double.inl @@ -566,7 +566,7 @@ if(n>1){ // auto sp=SPLITTER(1); // #endif #if 0 - PARFOR1D(i,_size,SPLITTER(NUM_THREADS),{std::cout<<"Thread("<>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"< Date: Wed, 26 Jun 2019 18:02:06 +0200 Subject: [PATCH 22/31] Fixed templated fgemv confusion for Givaro::Modular> --- .../fflas/fflas_fgemm/fgemm_classical_mp.inl | 77 +++++++++++++++++++ fflas-ffpack/fflas/fflas_fgemv.inl | 1 + fflas-ffpack/fflas/fflas_fgemv_mp.inl | 7 +- 3 files changed, 82 insertions(+), 3 deletions(-) diff --git a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl index 19ef84ec0..c3cdbf5dd 100644 --- a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl @@ -241,9 +241,86 @@ namespace FFLAS { return Cd; } + // fgemm for RnsInteger default parallel version template inline typename FFPACK::RNSInteger::Element_ptr + fgemm (const FFPACK::RNSInteger &F, + const FFLAS_TRANSPOSE ta, + const FFLAS_TRANSPOSE tb, + const size_t 
m, const size_t n,const size_t k, + const typename FFPACK::RNSInteger::Element alpha, + typename FFPACK::RNSInteger::ConstElement_ptr Ad, const size_t lda, + typename FFPACK::RNSInteger::ConstElement_ptr Bd, const size_t ldb, + const typename FFPACK::RNSInteger::Element beta, + typename FFPACK::RNSInteger::Element_ptr Cd, const size_t ldc, + MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeqHelper::Parallel > & H) + { + // compute each fgemm componentwise + size_t rns_size = F.size(); + size_t nt = H.parseq.numthreads(); + size_t loop_nt = std::min (rns_size, nt); + size_t iter_nt = nt / loop_nt; + size_t leftover_nt = nt % loop_nt; + ParSeqHelper::Parallel Hloop (loop_nt); +#ifdef PROFILE_FGEMM_MP + Givaro::Timer t;t.start(); +#endif + typedef MMHelper::value, + ParSeqHelper::Parallel > SubPar; + + typedef MMHelper::value, + ParSeqHelper::Sequential> SubSeq; + + FORBLOCK1D(iter, rns_size, Hloop, + TASK(MODE(CONSTREFERENCE(F,H)), + { + for(auto i=iter.begin(); i!=iter.end(); ++i) + { + size_t fgemm_nt = iter_nt; + if (i < leftover_nt) + fgemm_nt++; + if (fgemm_nt>1) // Running a parallel fgemm + { + SubPar H2(F.rns()._field_rns[i], H.recLevel, ParSeqHelper::Parallel(fgemm_nt)); + fgemm(F.rns()._field_rns[i], ta, tb, m, n, k, + alpha._ptr[i*alpha._stride], Ad._ptr+i*Ad._stride, + lda, Bd._ptr+i*Bd._stride, ldb, + beta._ptr[i*beta._stride], Cd._ptr+i*Cd._stride, + ldc, H2); + } + else // Running a sequential fgemm + { + SubSeq H2(F.rns()._field_rns[i], H.recLevel, ParSeqHelper::Sequential()); + fgemm(F.rns()._field_rns[i], ta, tb, m, n, k, + alpha._ptr[i*alpha._stride], Ad._ptr+i*Ad._stride, + lda, Bd._ptr+i*Bd._stride, ldb, + beta._ptr[i*beta._stride], Cd._ptr+i*Cd._stride, + ldc, H2); + } + } + }); // TASK + ); // FLORBLOCK1D + + +#ifdef PROFILE_FGEMM_MP + t.stop(); + + std::cerr<<"=========================================="< + inline typename FFPACK::RNSInteger::Element_ptr fgemm (const FFPACK::RNSInteger &F, const FFLAS_TRANSPOSE ta, const 
FFLAS_TRANSPOSE tb, diff --git a/fflas-ffpack/fflas/fflas_fgemv.inl b/fflas-ffpack/fflas/fflas_fgemv.inl index d7044b345..605d956c3 100644 --- a/fflas-ffpack/fflas/fflas_fgemv.inl +++ b/fflas-ffpack/fflas/fflas_fgemv.inl @@ -521,6 +521,7 @@ namespace FFLAS{ return Y; } + //Common interface for fgemv with ParSeqHelper::Sequential input parameter in which the corresponding sequential implementation will be called for the given field type either for common field implementated as above or multiprcesion field ref. fflas_fgemv_mp.inl template typename Field::Element_ptr diff --git a/fflas-ffpack/fflas/fflas_fgemv_mp.inl b/fflas-ffpack/fflas/fflas_fgemv_mp.inl index b00279803..40675f349 100644 --- a/fflas-ffpack/fflas/fflas_fgemv_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemv_mp.inl @@ -144,7 +144,8 @@ namespace FFLAS { // specialization of the fgemv function for the field Givaro::Modular> // Calling fgemm, TODO: really specialize fgemv - template + //@FastFix: This is only the sequential implementation and any call to parallel fgemv for the field Givaro::Modular> will refer to the implementation in the pfgemv.inl file + template inline RecInt::ruint* fgemv (const Givaro::Modular,RecInt::ruint >& F, const FFLAS_TRANSPOSE ta, @@ -157,8 +158,8 @@ namespace FFLAS { MMHelper,RecInt::ruint >, MMHelperAlgo::Classic, ModeCategories::ConvertTo, - ParSeq > & H) { - MMHelper,RecInt::ruint >, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2(H); + ParSeqHelper::Sequential > & H) { + MMHelper,RecInt::ruint >, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeqHelper::Sequential> H2(H); fgemm (F,ta,FflasNoTrans,(ta==FFLAS::FflasNoTrans)?m:n,1,(ta==FFLAS::FflasNoTrans)?n:m,alpha,A,lda,X,incx,beta,Y,incy,H2); return Y; } From af2f7e1b4c0a33bbdae4c7516784bc75afc6cbe4 Mon Sep 17 00:00:00 2001 From: ZHG Date: Thu, 27 Jun 2019 10:37:09 +0200 Subject: [PATCH 23/31] Used FOR1D instead of FOR1DBLOCK for both InfNorm and freduce(rns_double::reduce) --- 
benchmarks/benchmark-fgemv-rns.C | 10 +++---- fflas-ffpack/fflas/fflas_bounds.inl | 39 +++++++-------------------- fflas-ffpack/fflas/fflas_fgemv_mp.inl | 2 +- fflas-ffpack/field/rns-double.inl | 20 ++++---------- 4 files changed, 20 insertions(+), 51 deletions(-) diff --git a/benchmarks/benchmark-fgemv-rns.C b/benchmarks/benchmark-fgemv-rns.C index 37862da5c..2f7f916bf 100644 --- a/benchmarks/benchmark-fgemv-rns.C +++ b/benchmarks/benchmark-fgemv-rns.C @@ -17,7 +17,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ -#define PROFILE_FGEMM_MP +//#define PROFILE_FGEMM_MP //#include "goto-def.h" #include "fflas-ffpack/fflas-ffpack-config.h" @@ -57,7 +57,7 @@ struct compatible_data_type>{ static constexpr bool value template -void fill_value(Field& F, RandIter& Rand, +void fill_value(Field& F, RandIter& Rand, Matrix& A, Vector& X, Vector& Y, size_t m, size_t k, size_t incX, size_t incY, size_t lda, int NBK){ @@ -76,7 +76,7 @@ void fill_value(Field& F, RandIter& Rand, } template -void genData(Field& F, +void genData(Field& F, Matrix& A, Vector& X, Vector& Y, size_t m, size_t k, size_t incX, size_t incY, size_t lda, int NBK, int bitsize, uint64_t seed){ @@ -134,7 +134,7 @@ bool benchmark_with_timer(Field& F, int p, Matrix& A, Vector& X, Vector& Y, size ParSeqHelper::Compose, ParSeqHelper::Parallel> H(GrainSize,t); FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); - break; + break; } default:{ FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY); @@ -187,7 +187,7 @@ void benchmark_in_Field(Field& F, int p, size_t m, size_t k, int NBK, int bitsi FFLAS::fflas_delete(A); FFLAS::fflas_delete(X); - FFLAS::fflas_delete(Y); + FFLAS::fflas_delete(Y); } diff --git a/fflas-ffpack/fflas/fflas_bounds.inl b/fflas-ffpack/fflas/fflas_bounds.inl index 1c249cd03..66d9ea812 100644 --- a/fflas-ffpack/fflas/fflas_bounds.inl +++ 
b/fflas-ffpack/fflas/fflas_bounds.inl @@ -118,41 +118,20 @@ namespace FFLAS { inline Givaro::Integer InfNorm (const size_t M, const size_t N, const Givaro::Integer* A, const size_t lda){ - Givaro::Integer max = 0; - -#if 0 //Sequential ////////////////////////////////////////////////// - for (size_t i=0; i0) max = x; - } -#else //Parallel ///////////////////////////////////////////////////// - - std::vector vmax(M,0); - auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Row,FFLAS::StrategyParameter::Threads); - SYNCH_GROUP({ - FORBLOCK1D(iter, M, sp, - TASK(MODE(CONSTREFERENCE(A,max,vmax) ), - { - for(auto i=iter.begin(); i!=iter.end(); ++i) - { - for (size_t j=0; j0){ vmax[i] = x;} - } - - } - }) - ); - }); + Givaro::Integer max = 0; + std::vector vmax(M,0); + auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Row,FFLAS::StrategyParameter::Threads); + FOR1D(i, M, sp, { + for (size_t j=0; j0){ vmax[i] = x;} + } + }); max=vmax[0]; for (size_t i=0; i0){ max = vmax[i];} } -#endif /////////////////////////////////////////////////////////////// - return abs(max); - } namespace Protected { diff --git a/fflas-ffpack/fflas/fflas_fgemv_mp.inl b/fflas-ffpack/fflas/fflas_fgemv_mp.inl index 40675f349..b04867456 100644 --- a/fflas-ffpack/fflas/fflas_fgemv_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemv_mp.inl @@ -144,7 +144,7 @@ namespace FFLAS { // specialization of the fgemv function for the field Givaro::Modular> // Calling fgemm, TODO: really specialize fgemv - //@FastFix: This is only the sequential implementation and any call to parallel fgemv for the field Givaro::Modular> will refer to the implementation in the pfgemv.inl file + //@QuickFix: This is only the sequential implementation and any call to parallel fgemv for the field Givaro::Modular> will refer to the implementation in the pfgemv.inl file template inline RecInt::ruint* fgemv (const Givaro::Modular,RecInt::ruint >& F, diff --git a/fflas-ffpack/field/rns-double.inl b/fflas-ffpack/field/rns-double.inl index 
a2911dd1d..6abb3e441 100644 --- a/fflas-ffpack/field/rns-double.inl +++ b/fflas-ffpack/field/rns-double.inl @@ -565,22 +565,12 @@ if(n>1){ // #else // auto sp=SPLITTER(1); // #endif -#if 0 - PARFOR1D(i,_size,SPLITTER(NUM_THREADS),{ - //for(size_t i=0;i<_size;i++) + + auto sp=SPLITTER(NUM_THREADS); + FOR1D(i,_size,sp,{ FFLAS::freduce (_field_rns[i],n,Arns+i*rda,1); - }); -#else - SYNCH_GROUP( - FORBLOCK1D(iter,_size,SPLITTER(NUM_THREADS), - TASK(MODE(CONSTREFERENCE(Arns)), - { - for(auto i=iter.begin(); i!=iter.end(); ++i) - FFLAS::freduce (_field_rns[i],n,Arns+i*rda,1); - }) - ) - ); -#endif + }); + } } From 10c90d98dbc2046973ad4e5013279ec3b3d2e082 Mon Sep 17 00:00:00 2001 From: ZHG Date: Thu, 27 Jun 2019 15:11:22 +0200 Subject: [PATCH 24/31] Cleaned up for code review before PR --- .../fflas/fflas_fgemm/fgemm_classical_mp.inl | 68 ++++++++++++++++--- fflas-ffpack/fflas/fflas_fgemv.inl | 44 ------------ fflas-ffpack/field/rns-double.inl | 30 +------- 3 files changed, 58 insertions(+), 84 deletions(-) diff --git a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl index c3cdbf5dd..b772783d5 100644 --- a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl @@ -309,9 +309,8 @@ namespace FFLAS { #ifdef PROFILE_FGEMM_MP t.stop(); - std::cerr<<"=========================================="<>=1; ++lk;} @@ -439,16 +437,12 @@ namespace FFLAS { mC = 2*uint64_t(k)*H.normA*H.normB*abs(alpha); // need to use 2x bound to reach both positive and negative -#ifdef PROFILE_FGEMM_MP - chrono.stop(); - std::cout<<"FGEMM_MP: InfNorm compute bound on the output "< RnsDomain; RnsDomain Zrns(RNS); @@ -465,10 +459,25 @@ namespace FFLAS { Bp = FFLAS::fflas_new(Zrns,Browd,Bcold); Cp = FFLAS::fflas_new(Zrns,m,n); +#ifdef PROFILE_FGEMM_MP + chrono.stop(); + std::cout<<"-------------------------------"< H2(Zrns,H.recLevel,H.parseq); @@ -481,12 +490,25 @@ namespace FFLAS { // 
call fgemm fgemm(Zrns,ta,tb,m,n,k,alphap,Ap,Acold,Bp,Bcold,betap,Cp,n,H2); +#ifdef PROFILE_FGEMM_MP + chrono.stop(); + std::cout<<"FGEMM_MP: RNS Mul: "< > ParSeq with alpha="< - typename Field::Element_ptr - pfgemv(const Field& F, - const FFLAS_TRANSPOSE ta, - const size_t m, - const size_t n, - const typename Field::Element alpha, - const typename Field::ConstElement_ptr A, const size_t lda, - const typename Field::ConstElement_ptr X, const size_t incX, - const typename Field::Element beta, - typename Field::Element_ptr Y, const size_t incY){ - ParSeqHelper::Parallel parH; - PAR_BLOCK{ - parH.set_numthreads(NUM_THREADS); - MMHelper::value, - ParSeqHelper::Parallel > pH (F,m,n,1,parH); - fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); - } - return Y; - } - - template - typename Field::Element_ptr - fgemv(const Field& F, - const FFLAS_TRANSPOSE ta, - const size_t m, - const size_t n, - const typename Field::Element alpha, - const typename Field::ConstElement_ptr A, const size_t lda, - const typename Field::ConstElement_ptr X, const size_t incX, - const typename Field::Element beta, - typename Field::Element_ptr Y, const size_t incY, - FFLAS::ParSeqHelper::Compose, - FFLAS::ParSeqHelper::Parallel >& cpsH){ - MMHelper::value, ParSeqHelper::Parallel > pH(F,-1,cpsH); -std::cout<>>>>>>>>>>>>>>>>>>>>> "<_ldm){ FFPACK::failure()(__func__,__FILE__,__LINE__,"rns_double [init] -> rns basis is too small to handle integers with 2^(16*k) values "); std::cerr<<"with k="<1 && n>1) std::cerr<<"Kronecker : "<1){ #endif } -#ifdef PROFILE_FGEMM_MP -if(n>1){ - chrono.stop(); - std::cout<<"--------------------------------------------"<1){ std::cout<<"RNS freduce ... 
"<<(ok?"OK":"ERROR")<1){ - chrono.stop(); - std::cout<<"--------------------------------------------"< Date: Thu, 27 Jun 2019 16:26:19 +0200 Subject: [PATCH 25/31] Corrected bug in InfNorm by falling back to FOR1DBLOCK --- fflas-ffpack/fflas/fflas_bounds.inl | 33 ++++++++++++++++++----------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/fflas-ffpack/fflas/fflas_bounds.inl b/fflas-ffpack/fflas/fflas_bounds.inl index 66d9ea812..4cea3e8fa 100644 --- a/fflas-ffpack/fflas/fflas_bounds.inl +++ b/fflas-ffpack/fflas/fflas_bounds.inl @@ -118,19 +118,28 @@ namespace FFLAS { inline Givaro::Integer InfNorm (const size_t M, const size_t N, const Givaro::Integer* A, const size_t lda){ - Givaro::Integer max = 0; - std::vector vmax(M,0); - auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Row,FFLAS::StrategyParameter::Threads); - FOR1D(i, M, sp, { - for (size_t j=0; j0){ vmax[i] = x;} + Givaro::Integer max = 0; + std::vector vmax(M,0); + auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Row,FFLAS::StrategyParameter::Threads); + SYNCH_GROUP({ + FORBLOCK1D(iter, M, sp, + TASK(MODE(CONSTREFERENCE(A,max,vmax) ), + { + for(auto i=iter.begin(); i!=iter.end(); ++i) + { + for (size_t j=0; j0){ vmax[i] = x;} + } + + } + }) + ); + }); + max=vmax[0]; + for (size_t i=0; i0){ max = vmax[i];} } - }); - max=vmax[0]; - for (size_t i=0; i0){ max = vmax[i];} - } return abs(max); } From ba96e1912c01e6936825b772598eaf5fd4187ce5 Mon Sep 17 00:00:00 2001 From: ZHG Date: Mon, 1 Jul 2019 15:06:30 +0200 Subject: [PATCH 26/31] Trying to add test for rns fgemv --- tests/Makefile.am | 2 + tests/test-fgemv-rns.C | 413 +++++++++++++++++++++++++++++++++++++++++ tests/test-fgemv.C | 10 +- 3 files changed, 423 insertions(+), 2 deletions(-) create mode 100644 tests/test-fgemv-rns.C diff --git a/tests/Makefile.am b/tests/Makefile.am index 4326820ec..a67e7bb2e 100755 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -67,6 +67,7 @@ BASIC_TESTS = \ test-fgesv \ test-simd \ test-fgemv \ + 
test-fgemv-rns \ test-nullspace \ regression-check @@ -162,6 +163,7 @@ regression_check_SOURCES = regression-check.C test_solve_SOURCES = test-solve.C test_simd_SOURCES = test-simd.C test_fgemv_SOURCES = test-fgemv.C +test_fgemv_rns__SOURCES = test-fgemv-rns.C #test_pfgemm_DSL_SOURCES = test-pfgemm-DSL.C diff --git a/tests/test-fgemv-rns.C b/tests/test-fgemv-rns.C new file mode 100644 index 000000000..4f4fa453b --- /dev/null +++ b/tests/test-fgemv-rns.C @@ -0,0 +1,413 @@ +/* + * Copyright (C) the FFLAS-FFPACK group + * Written by Clément Pernet + * Brice Boyer (briceboyer) + * This file is Free Software and part of FFLAS-FFPACK. + * + * ========LICENCE======== + * This file is part of the library FFLAS-FFPACK. + * + * FFLAS-FFPACK is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * ========LICENCE======== + *. 
+ */ + +// #ifndef NEWINO +// #define NEWWINO +// #endif + +// #define WINOTHRESHOLD 100 +// #define OLD_DYNAMIC_PEELING + + + +#include "fflas-ffpack/fflas-ffpack-config.h" + +#include +#include + +#include + +#include + +#include "fflas-ffpack/utils/timer.h" +#include "fflas-ffpack/fflas/fflas.h" + +#include "fflas-ffpack/utils/args-parser.h" +#include "fflas-ffpack/utils/test-utils.h" + +using namespace std; +using namespace FFPACK; +using namespace FFLAS; + +using Givaro::Modular; +using Givaro::ModularBalanced; + +template +struct rsn_compatible_data_type { static constexpr bool value = false; }; +template <> +struct rsn_compatible_data_type>{ static constexpr bool value = true; }; +template +struct rsn_compatible_data_type>{ static constexpr bool value = false; }; +template +struct rsn_compatible_data_type>{ static constexpr bool value = false; }; + +// checks that D = beta . Y + alpha . A ^ta * X +template +bool check_MV(const Field & F, + const typename Field::Element_ptr Cd, // c0 + enum FFLAS_TRANSPOSE & ta, + const size_t m, + const size_t k, + const typename Field::Element & alpha, + const typename Field::Element_ptr A, size_t lda, + const typename Field::Element_ptr X, size_t incX, + const typename Field::Element & beta, + const typename Field::Element_ptr Y, size_t incY) +{ + bool wrong = false; + typename Field::Element_ptr D; + if (ta == FflasNoTrans){ + D = fflas_new(F,m); + fassign (F, m, Cd, 1, D, 1); + for (size_t i=0; iwrite(std::cerr) << std::endl; +#endif + typedef typename Field::Element Element ; + typename Field::RandIter R(*F,seed++); + typename Field::NonZeroRandIter NZR(R); + + //size_t k = 0 ; + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->one,F->zero,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->zero,F->zero,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->mOne,F->zero,iters, par, 
R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->one ,F->one,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->zero,F->one,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->mOne,F->one,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->one ,F->mOne,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->zero,F->mOne,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->mOne,F->mOne,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + + Element alpha,beta ; + NZR.random(alpha); + ok = ok && launch_MV_dispatch(*F,m,k,F->one ,alpha,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->zero,alpha,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->mOne,alpha,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,alpha,F->one ,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,alpha,F->zero,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,alpha,F->mOne,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + + for (size_t j = 0 ; j < 3 ; ++j) { + R.random(alpha); + R.random(beta); + ok = ok && launch_MV_dispatch(*F,m,k,alpha,beta,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + } + //std::cout< >(0,(b?b:512_ui64),m,k,iters,p, seed); + } while (loop && ok); + + + + + return !ok ; +} +/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s diff --git a/tests/test-fgemv.C b/tests/test-fgemv.C index 
55c73d536..6a250f7f6 100644 --- a/tests/test-fgemv.C +++ b/tests/test-fgemv.C @@ -55,6 +55,14 @@ using namespace FFLAS; using Givaro::Modular; using Givaro::ModularBalanced; +template +struct rsn_compatible_data_type { static constexpr bool value = false; }; +template <> +struct rsn_compatible_data_type>{ static constexpr bool value = true; }; +template +struct rsn_compatible_data_type>{ static constexpr bool value = false; }; +template +struct rsn_compatible_data_type>{ static constexpr bool value = false; }; // checks that D = beta . Y + alpha . A ^ta * X template @@ -243,8 +251,6 @@ bool launch_MV(const Field & F, break; } - - } return ok ; } From 8f17bd4918ca0a0fa3e1108139cc392b72e71158 Mon Sep 17 00:00:00 2001 From: ZHG Date: Mon, 1 Jul 2019 16:10:27 +0200 Subject: [PATCH 27/31] Updated test-fgemv-rns --- tests/test-fgemv-rns.C | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test-fgemv-rns.C b/tests/test-fgemv-rns.C index 4f4fa453b..fc687998b 100644 --- a/tests/test-fgemv-rns.C +++ b/tests/test-fgemv-rns.C @@ -177,7 +177,8 @@ bool launch_MV(const Field & F, Element_ptr Y2 = fflas_new (F, Ydim, incY); fassign (F, Ydim, Y, incY, Y2, incY); - + fassign (F, Ydim, Y2, incY, D, 1); + fassign (F, Ydim, Y2, incY, Y, incY); if (par){ { ParSeqHelper::Parallel WH(1); @@ -200,11 +201,14 @@ bool launch_MV(const Field & F, break; } + + fassign (F, Ydim, Y2, incY, D, 1); + fassign (F, Ydim, Y2, incY, Y, incY); if (par){ { - ParSeqHelper::Parallel WH; PAR_BLOCK{ + ParSeqHelper::Parallel WH(NUM_THREADS); fgemv(F, ta, m,k,alpha, A,lda, X, incX, beta, Y, incY, WH); } } @@ -222,11 +226,13 @@ bool launch_MV(const Field & F, break; } + fassign (F, Ydim, Y2, incY, D, 1); + fassign (F, Ydim, Y2, incY, Y, incY); if (par){ { - ParSeqHelper::Compose, ParSeqHelper::Parallel> WH; PAR_BLOCK{ + ParSeqHelper::Compose, ParSeqHelper::Parallel> WH(1,NUM_THREADS); fgemv(F, ta, m,k,alpha, A,lda, X, incX, beta, Y, incY, WH); } } From 
18d2d149af3a1693c882d6a2d0b9d77b1118fdda Mon Sep 17 00:00:00 2001 From: ZHG Date: Wed, 17 Jul 2019 09:15:01 +0200 Subject: [PATCH 28/31] Fall back to 2.3.3 version for the libtool in the configure.ac --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 04df5b84f..671177a66 100644 --- a/configure.ac +++ b/configure.ac @@ -158,7 +158,7 @@ AC_PROG_LIBTOOL AC_PROG_EGREP AC_PROG_SED # newer libtool... -LT_PREREQ([2.4.3]) +LT_PREREQ([2.3.3]) LT_INIT From 6808a05567e02f13d0516cc6d55dd78cb95f4b50 Mon Sep 17 00:00:00 2001 From: ZHG Date: Wed, 17 Jul 2019 12:47:03 +0200 Subject: [PATCH 29/31] Changed LT_PREREQ to 2.4.2 --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 671177a66..cd6a8ee62 100644 --- a/configure.ac +++ b/configure.ac @@ -158,7 +158,7 @@ AC_PROG_LIBTOOL AC_PROG_EGREP AC_PROG_SED # newer libtool... -LT_PREREQ([2.3.3]) +LT_PREREQ([2.4.2]) LT_INIT From 03f143004d789b505001c4f7feafd5b38ec0e178 Mon Sep 17 00:00:00 2001 From: ZHG Date: Wed, 17 Jul 2019 13:27:37 +0200 Subject: [PATCH 30/31] Changed back LT_PREREQ to 2.4.3 --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index cd6a8ee62..04df5b84f 100644 --- a/configure.ac +++ b/configure.ac @@ -158,7 +158,7 @@ AC_PROG_LIBTOOL AC_PROG_EGREP AC_PROG_SED # newer libtool... -LT_PREREQ([2.4.2]) +LT_PREREQ([2.4.3]) LT_INIT From a30ca0b8ff183595ecdf1d5859f100c05f9489ad Mon Sep 17 00:00:00 2001 From: ZHG Date: Wed, 17 Jul 2019 15:02:32 +0200 Subject: [PATCH 31/31] Corrected Makefile.am typo error for test-fgemv-rns --- configure.ac | 2 +- tests/Makefile.am | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 04df5b84f..cd6a8ee62 100644 --- a/configure.ac +++ b/configure.ac @@ -158,7 +158,7 @@ AC_PROG_LIBTOOL AC_PROG_EGREP AC_PROG_SED # newer libtool... 
-LT_PREREQ([2.4.3]) +LT_PREREQ([2.4.2]) LT_INIT diff --git a/tests/Makefile.am b/tests/Makefile.am index a67e7bb2e..a623fe333 100755 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -163,7 +163,7 @@ regression_check_SOURCES = regression-check.C test_solve_SOURCES = test-solve.C test_simd_SOURCES = test-simd.C test_fgemv_SOURCES = test-fgemv.C -test_fgemv_rns__SOURCES = test-fgemv-rns.C +test_fgemv_rns_SOURCES = test-fgemv-rns.C #test_pfgemm_DSL_SOURCES = test-pfgemm-DSL.C