diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am index 161626145..9b5b92d32 100755 --- a/benchmarks/Makefile.am +++ b/benchmarks/Makefile.am @@ -35,7 +35,7 @@ endif PERFPUBLISHERFILE=benchmarks-report.xml -FFLA_BENCH = benchmark-fgemm benchmark-fgemm-rns benchmark-wino benchmark-ftrsm benchmark-fgesv benchmark-ftrsv benchmark-ftrtri benchmark-inverse benchmark-fsytrf benchmark-fsyrk benchmark-lqup benchmark-pluq benchmark-charpoly benchmark-charpoly-mp benchmark-fgemm-mp benchmark-fgemv-mp benchmark-ftrsm-mp benchmark-lqup-mp benchmark-checkers benchmark-fadd-lvl2 benchmark-fdot benchmark-fgemv +FFLA_BENCH = benchmark-fgemm benchmark-fgemm-rns benchmark-wino benchmark-ftrsm benchmark-fgesv benchmark-ftrsv benchmark-ftrtri benchmark-inverse benchmark-fsytrf benchmark-fsyrk benchmark-lqup benchmark-pluq benchmark-charpoly benchmark-charpoly-mp benchmark-fgemm-mp benchmark-fgemv-mp benchmark-ftrsm-mp benchmark-lqup-mp benchmark-checkers benchmark-fadd-lvl2 benchmark-fdot benchmark-fgemv benchmark-fgemv-rns BLAS_BENCH = benchmark-sgemm$(EXEEXT) benchmark-dgemm benchmark-dtrsm LAPA_BENCH = benchmark-dtrtri benchmark-dgetri benchmark-dgetrf benchmark-dsytrf @@ -85,6 +85,7 @@ benchmark_checkers_SOURCES = benchmark-checkers.C benchmark_fadd_lvl2_SOURCES = benchmark-fadd-lvl2.C benchmark_fdot_SOURCES = benchmark-fdot.C benchmark_fgemv_SOURCES = benchmark-fgemv.C +benchmark_fgemv_rns_SOURCES = benchmark-fgemv-rns.C benchmark_sgemm_CXXFLAGS = $(AM_CXXFLAGS) -D__SGEMM__ diff --git a/benchmarks/benchmark-fgemv-mp.C b/benchmarks/benchmark-fgemv-mp.C index 4cb25ee06..2758990ab 100644 --- a/benchmarks/benchmark-fgemv-mp.C +++ b/benchmarks/benchmark-fgemv-mp.C @@ -1,3 +1,4 @@ + /* * Copyright (C) FFLAS-FFPACK * Written by Pascal Giorgi @@ -28,6 +29,7 @@ // everywhere in the call stack #define __FFLASFFPACK_OPENBLAS_NT_ALREADY_SET 1 + #if not defined(MG_DEFAULT) #define MG_DEFAULT MG_ACTIVE #endif @@ -35,7 +37,6 @@ #define STD_RECINT_SIZE 8 #endif - #include "fflas-ffpack/fflas-ffpack-config.h" #include #include @@ -43,8 +44,9 @@ #include using namespace std; -#include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" +#include "fflas-ffpack/utils/fflas_io.h" +#include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/args-parser.h" #include "givaro/modular-integer.h" #include "givaro/givcaster.h" @@ -53,33 +55,16 @@ using namespace std; #include "recint/recint.h" #endif - -template -std::ostream& write_matrix(std::ostream& out, Givaro::Integer p, size_t m, size_t n, T* C, size_t ldc){ - - size_t www(size_t((double(p.bitsize())*log(2.))/log(10.))); - out<<"Matrix("< Field; Givaro::Integer p; FFLAS::Timer chrono, TimFreivalds; - double time=0.; + double time=0.,timev=0.; +#ifdef BENCH_FLINT + double timeFlint=0.; +#endif for (size_t loop=0;loop(&p))); + fmpz_mat_t AA,BB,CC,DD; + fmpz_mat_init (AA, m, k); + fmpz_mat_init (BB, k, n); + fmpz_mat_init (CC, m, n); + fmpz_mat_init (DD, m, n); + fmpz_t aalpha, bbeta; + fmpz_set_mpz(aalpha,*(reinterpret_cast(&alpha))); + fmpz_set_mpz(bbeta,*(reinterpret_cast(&beta))); + + for (size_t i=0;i(A+i*lda+j))); + for (size_t i=0;i(B+i*ldb+j))); + for (size_t i=0;i(C+i*ldc+j))); + chrono.clear();chrono.start(); + // DD= A.B + fmpz_mat_mul(DD,AA,BB); + // CC = beta.C + fmpz_mat_scalar_mul_fmpz(CC,CC,bbeta); + // CC = CC + DD.alpha + fmpz_mat_scalar_addmul_fmpz(CC,DD,aalpha); + // CC = CC mod p + for (size_t i=0;i +#include + +#include "fflas-ffpack/config-blas.h" +#include "fflas-ffpack/fflas/fflas.h" +#include "fflas-ffpack/utils/timer.h" +#include "fflas-ffpack/utils/args-parser.h" + +#include "fflas-ffpack/utils/fflas_io.h" +#include "fflas-ffpack/utils/test-utils.h" + +#include "fflas-ffpack/utils/timer.h" +#include "givaro/modular-integer.h" +#include "givaro/givcaster.h" + +using namespace FFPACK; + +using namespace std; +using namespace FFLAS; + +template +struct need_field_characteristic { static constexpr bool value = false; }; +template +struct need_field_characteristic>{ static constexpr bool value = true; }; +template +struct need_field_characteristic>{ static constexpr bool value = true; }; + +template +struct compatible_data_type { static constexpr bool value = true; }; +template <> +struct compatible_data_type>{ static constexpr bool value = false; }; +template <> +struct compatible_data_type>{ static constexpr bool value = false; }; + + +template +void fill_value(Field& F, RandIter& Rand, + Matrix& A, Vector& X, Vector& Y, + size_t m, size_t k, size_t incX, size_t incY, size_t lda, int NBK){ + + SYNCH_GROUP( + FORBLOCK1D(iter, m, SPLITTER(NBK, CuttingStrategy::Row, StrategyParameter::Threads), + TASK(MODE(CONSTREFERENCE(F,Rand,A)), + { + frand(F, Rand, iter.end()-iter.begin(), k, A+iter.begin()*lda, lda); + } + ); + ); + ); + //FFLAS::pfrand(F,Rand, m,k,A,m/NBK); + FFLAS::frand(F,Rand, k,1,X,incX); + FFLAS::fzero(F, m,1,Y,incY); +} + +template +void genData(Field& F, + Matrix& A, Vector& X, Vector& Y, + size_t m, size_t k, size_t incX, size_t incY, size_t lda, int NBK, + int bitsize, uint64_t seed){ + typename Field::RandIter Rand(F,seed,bitsize); //Field::RandIter's parameters order has been changed between seed and bitsize + fill_value(F, Rand, A, X, Y, m, k, incX, incY, lda, NBK); +} + +template +bool check_result(Field& F, size_t m, size_t lda, Matrix& A, Vector& X, size_t incX, Vector& Y, size_t incY){ + //Naive result checking by comparing result from pfgemv against the one from fgemv + typename Field::Element_ptr Y2 = FFLAS::fflas_new(F,m,1); + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y2, incY); + + for(size_t j=0; j +bool benchmark_with_timer(Field& F, int p, Matrix& A, Vector& X, Vector& Y, size_t m, size_t k, size_t incX, + size_t incY, size_t lda, size_t iters, int t, double& time, size_t GrainSize){ + Timer chrono; + bool pass = true; + for (size_t i=0;i<=iters;++i){ + + chrono.clear(); + + if (p){ + + //typedef CuttingStrategy::Row row; + typedef CuttingStrategy::Recursive rec; + typedef StrategyParameter::Threads threads; + typedef StrategyParameter::Grain grain; + + if (i) { chrono.start(); } + + switch (p){ + case 1:{ + ParSeqHelper::Parallel H(GrainSize); + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); + break; + } + case 2:{ + ParSeqHelper::Parallel H(t); + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); + break; + } + case 3:{ + ParSeqHelper::Compose, ParSeqHelper::Parallel> H(GrainSize,t); + + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY, H); + break; + } + default:{ + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY); + break; + } + } + if (i) {chrono.stop(); time+=chrono.realtime();} + }else{ + if (i) chrono.start(); + FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY); + if (i) {chrono.stop(); time+=chrono.realtime();} + } + } + if(!check_result(F, m, lda, A, X, incX, Y, incY)) pass = false; + return pass; +} + +template +void benchmark_disp(Field& F, bool pass, double& time, size_t iters, int p, size_t m, size_t k, arg& as){ + if(pass){ + std::cout << "Time: " << time / double(iters) + << " Gflops: " << (2.*double(m)/1000.*double(k)/1000.0/1000.0) / time * double(iters); + writeCommandString(std::cout, as) << std::endl; + }else{ + std::cout<<"FAILED for "<::value, + "The provided data type for ZRing is not compatible for the desired operation and could lead to inconsistent result !"); + + benchmark_in_Field(F, p, m, k, NBK, bitsize, seed, iters, t, as, GrainSize); + +} + +template +void benchmark_with_field(const Givaro::Integer& q, int p, size_t m, size_t k, + int NBK, int bitsize, uint64_t seed, size_t iters, int t, + arg& as, size_t GrainSize){ + Field F(q); + benchmark_in_Field(F, p, m, k, NBK, bitsize, seed, iters, t, as, GrainSize); +} + +int main(int argc, char** argv) { + + int p=0; + + size_t iters = 3; + Givaro::Integer q = 131071; + size_t m = 4000; + size_t k = 4000; + + uint64_t seed = getSeed(); + int t; + PAR_BLOCK { t = NUM_THREADS; } + int NBK = -1; + int b=100; + size_t GrainSize = 64; + + Argument as[] = { + { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, + { 'b', "-b B", "Set the bitsize of input.", TYPE_INT , &b }, + { 'p', "-p P", "0 for sequential, 1 for , 2 for , 3 for Compose<, >.", + TYPE_INT , &p }, + { 'm', "-m M", "Set the dimension m of the matrix.", TYPE_INT , &m }, + { 'k', "-k K", "Set the dimension k of the matrix.", TYPE_INT , &k }, + { 't', "-t T", "number of virtual threads to drive the partition.", TYPE_INT , &t }, + { 'N', "-n N", "number of numa blocks per dimension for the numa placement", TYPE_INT , &NBK }, + { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iters }, + { 's', "-s S", "Sets seed.", TYPE_INT , &seed }, + { 'g', "-g G", "Sets GrainSize.", TYPE_INT , &GrainSize }, + END_OF_ARGUMENTS + }; + + parseArguments(argc,argv,as); + + if (NBK==-1) NBK = t; + + PAR_BLOCK { + //benchmark_with_field>( p, m, k, NBK, b, seed, iters, t, as, GrainSize); + benchmark_with_field>( p, m, k, NBK, b, seed, iters, t, as, GrainSize); + } + + + return 0; +} +/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s diff --git a/benchmarks/benchmark-fgemv.C b/benchmarks/benchmark-fgemv.C index 5fb04802b..544c5c3c3 100644 --- a/benchmarks/benchmark-fgemv.C +++ b/benchmarks/benchmark-fgemv.C @@ -151,12 +151,12 @@ bool benchmark_with_timer(Field& F, int p, Matrix& A, Vector& X, Vector& Y, size FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, lda, F.one, A, lda, X, incX, F.zero, Y, incY); if (i) {chrono.stop(); time+=chrono.realtime();} } - +/* if(!check_result(F, m, lda, A, X, incX, Y, incY)){ pass = false; break; } - +*/ } return pass; } @@ -240,7 +240,7 @@ int main(int argc, char** argv) { int t; PAR_BLOCK { t = NUM_THREADS; } int NBK = -1; - uint64_t b=0; + uint64_t b=100; size_t GrainSize = 64; Argument as[] = { @@ -268,15 +268,15 @@ int main(int argc, char** argv) { } }else{ PAR_BLOCK { - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); - //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as); + //benchmark_with_field>(q, p, m, k, NBK, b, seed, iters, t, as, GrainSize); } } diff --git a/configure.ac b/configure.ac index 04df5b84f..cd6a8ee62 100644 --- a/configure.ac +++ b/configure.ac @@ -158,7 +158,7 @@ AC_PROG_LIBTOOL AC_PROG_EGREP AC_PROG_SED # newer libtool... -LT_PREREQ([2.4.3]) +LT_PREREQ([2.4.2]) LT_INIT diff --git a/fflas-ffpack/fflas/fflas_bounds.inl b/fflas-ffpack/fflas/fflas_bounds.inl index 896cd2484..4cea3e8fa 100644 --- a/fflas-ffpack/fflas/fflas_bounds.inl +++ b/fflas-ffpack/fflas/fflas_bounds.inl @@ -37,6 +37,10 @@ #include #include +#ifdef PROFILE_FGEMM_MP +#include "fflas-ffpack/utils/timer.h" +#endif + namespace FFLAS { namespace Protected { template @@ -115,11 +119,27 @@ namespace FFLAS { inline Givaro::Integer InfNorm (const size_t M, const size_t N, const Givaro::Integer* A, const size_t lda){ Givaro::Integer max = 0; - for (size_t i=0; i0) max = x; - } + std::vector vmax(M,0); + auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Row,FFLAS::StrategyParameter::Threads); + SYNCH_GROUP({ + FORBLOCK1D(iter, M, sp, + TASK(MODE(CONSTREFERENCE(A,max,vmax) ), + { + for(auto i=iter.begin(); i!=iter.end(); ++i) + { + for (size_t j=0; j0){ vmax[i] = x;} + } + + } + }) + ); + }); + max=vmax[0]; + for (size_t i=0; i0){ max = vmax[i];} + } return abs(max); } diff --git a/fflas-ffpack/fflas/fflas_fgemm.inl b/fflas-ffpack/fflas/fflas_fgemm.inl index a87f8b21e..ac7ac1ed9 100644 --- a/fflas-ffpack/fflas/fflas_fgemm.inl +++ b/fflas-ffpack/fflas/fflas_fgemm.inl @@ -410,7 +410,9 @@ namespace FFLAS { else if (!std::is_same >::value){ if (F.characteristic() < DOUBLE_TO_FLOAT_CROSSOVER) return Protected::fgemm_convert,Field>(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H); - else if (!std::is_same >::value && 16*F.cardinality() < Givaro::ModularBalanced::maxCardinality()) + else if (!std::is_same >::value && + !std::is_same >::value && + 16*F.cardinality() < Givaro::ModularBalanced::maxCardinality()) return Protected::fgemm_convert,Field>(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H); } } diff --git a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl index 4fef8e4e4..b772783d5 100644 --- a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl @@ -192,7 +192,7 @@ namespace FFLAS { } // fgemm for RnsInteger: handle the moduli in parallel - template + template inline typename FFPACK::RNSInteger::Element_ptr fgemm (const FFPACK::RNSInteger &F, const FFLAS_TRANSPOSE ta, @@ -203,28 +203,110 @@ namespace FFLAS { typename FFPACK::RNSInteger::ConstElement_ptr Bd, const size_t ldb, const typename FFPACK::RNSInteger::Element beta, typename FFPACK::RNSInteger::Element_ptr Cd, const size_t ldc, - MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeqHelper::Compose, ParSeqTrait> > & H) + MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeqHelper::Compose, ParSeqTrait> > & H) { #ifdef PROFILE_FGEMM_MP Givaro::Timer t;t.start(); #endif size_t rns_size = F.size(); - typedef MMHelper::value, ParSeqTrait> SubHelper; - FORBLOCK1D(iter, rns_size, H.parseq.first_component(), + typedef MMHelper::value, ParSeqTrait> SubHelper; + + SYNCH_GROUP({ + FORBLOCK1D(iter, rns_size, H.parseq.first_component(), + TASK(MODE(CONSTREFERENCE(F,H)), + { + for(auto i=iter.begin(); i!=iter.end(); ++i) + { + SubHelper Hsub (F.rns()._field_rns[i], H.recLevel, H.parseq.second_component()); + FFLAS::fgemm(F.rns()._field_rns[i],ta,tb, + m, n, k, alpha._ptr[i*alpha._stride], + Ad._ptr+i*Ad._stride, lda, + Bd._ptr+i*Bd._stride, ldb, + beta._ptr[i*beta._stride], + Cd._ptr+i*Cd._stride, ldc + , Hsub + ); + } + }) + ); + + }); + +#ifdef PROFILE_FGEMM_MP + t.stop(); + std::cerr<<"=========================================="< + inline typename FFPACK::RNSInteger::Element_ptr + fgemm (const FFPACK::RNSInteger &F, + const FFLAS_TRANSPOSE ta, + const FFLAS_TRANSPOSE tb, + const size_t m, const size_t n,const size_t k, + const typename FFPACK::RNSInteger::Element alpha, + typename FFPACK::RNSInteger::ConstElement_ptr Ad, const size_t lda, + typename FFPACK::RNSInteger::ConstElement_ptr Bd, const size_t ldb, + const typename FFPACK::RNSInteger::Element beta, + typename FFPACK::RNSInteger::Element_ptr Cd, const size_t ldc, + MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeqHelper::Parallel > & H) + { + // compute each fgemm componentwise + size_t rns_size = F.size(); + size_t nt = H.parseq.numthreads(); + size_t loop_nt = std::min (rns_size, nt); + size_t iter_nt = nt / loop_nt; + size_t leftover_nt = nt % loop_nt; + ParSeqHelper::Parallel Hloop (loop_nt); +#ifdef PROFILE_FGEMM_MP + Givaro::Timer t;t.start(); +#endif + typedef MMHelper::value, + ParSeqHelper::Parallel > SubPar; + + typedef MMHelper::value, + ParSeqHelper::Sequential> SubSeq; + + FORBLOCK1D(iter, rns_size, Hloop, TASK(MODE(CONSTREFERENCE(F,H)), { for(auto i=iter.begin(); i!=iter.end(); ++i) { - SubHelper Hsub (F.rns()._field_rns[i], H.recLevel, H.parseq.second_component()); - FFLAS::fgemm(F.rns()._field_rns[i],ta,tb, - m, n, k, alpha._ptr[i*alpha._stride], - Ad._ptr+i*Ad._stride, lda, - Bd._ptr+i*Bd._stride, ldb, - beta._ptr[i*beta._stride], - Cd._ptr+i*Cd._stride, ldc, Hsub); + size_t fgemm_nt = iter_nt; + if (i < leftover_nt) + fgemm_nt++; + if (fgemm_nt>1) // Running a parallel fgemm + { + SubPar H2(F.rns()._field_rns[i], H.recLevel, ParSeqHelper::Parallel(fgemm_nt)); + fgemm(F.rns()._field_rns[i], ta, tb, m, n, k, + alpha._ptr[i*alpha._stride], Ad._ptr+i*Ad._stride, + lda, Bd._ptr+i*Bd._stride, ldb, + beta._ptr[i*beta._stride], Cd._ptr+i*Cd._stride, + ldc, H2); + } + else // Running a sequential fgemm + { + SubSeq H2(F.rns()._field_rns[i], H.recLevel, ParSeqHelper::Sequential()); + fgemm(F.rns()._field_rns[i], ta, tb, m, n, k, + alpha._ptr[i*alpha._stride], Ad._ptr+i*Ad._stride, + lda, Bd._ptr+i*Bd._stride, ldb, + beta._ptr[i*beta._stride], Cd._ptr+i*Cd._stride, + ldc, H2); } - }) - ); + } + }); // TASK + ); // FLORBLOCK1D + + #ifdef PROFILE_FGEMM_MP t.stop(); std::cerr<<"=========================================="< inline typename FFPACK::RNSInteger::Element_ptr fgemm (const FFPACK::RNSInteger &F, @@ -246,7 +329,7 @@ namespace FFLAS { typename FFPACK::RNSInteger::ConstElement_ptr Bd, const size_t ldb, const typename FFPACK::RNSInteger::Element beta, typename FFPACK::RNSInteger::Element_ptr Cd, const size_t ldc, - MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeqHelper::Parallel > & H) + MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeqHelper::Parallel > & H) { // compute each fgemm componentwise size_t rns_size = F.size(); @@ -298,9 +381,9 @@ namespace FFLAS { }); // TASK ); // FLORBLOCK1D + #ifdef PROFILE_FGEMM_MP t.stop(); - std::cerr<<"=========================================="<, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq > & H) { - //std::cerr<<"Entering fgemm> ParSeq"<> ParSeq"<>=1; ++lk;} @@ -352,6 +437,7 @@ namespace FFLAS { mC = 2*uint64_t(k)*H.normA*H.normB*abs(alpha); // need to use 2x bound to reach both positive and negative + // A or B is zero, no need to modify C if (mC == 0) return C; @@ -381,6 +467,7 @@ namespace FFLAS { chrono.start(); #endif + // convert the input matrices to RNS representation finit_rns(Zrns,Arowd,Acold,(logA/16)+((logA%16)?1:0),A,lda,Ap); finit_rns(Zrns,Browd,Bcold,(logB/16)+((logB%16)?1:0),B,ldb,Bp); @@ -422,6 +509,7 @@ namespace FFLAS { std::cout<<"-------------------------------"< inline typename Field::Element_ptr @@ -82,7 +82,7 @@ namespace FFLAS{ namespace Protected { } }// Protected }// FFLAS - +//!Convert to either float or double according to field's cardinality namespace FFLAS { template inline typename Field::Element_ptr @@ -114,7 +114,7 @@ namespace FFLAS { // Computes Y <- alpha.op(A).X + beta.Y // A is M*N, //--------------------------------------------------------------------- - + //! Performs Matrix Vector Multiplication with delayed mod reductions. Ensures result is reduced. template inline typename Field::Element_ptr fgemv (const Field& F, const FFLAS_TRANSPOSE ta, @@ -126,7 +126,6 @@ namespace FFLAS { typename Field::Element_ptr Y, const size_t incY, MMHelper & H) { - if (!M) {return Y;} size_t Ydim = (ta == FflasNoTrans)?M:N; size_t Xdim = (ta == FflasNoTrans)?N:M; @@ -404,6 +403,8 @@ namespace FFLAS{ #endif return Y; } + + //specialization for ZRing inline Givaro::DoubleDomain::Element_ptr fgemv (const Givaro::DoubleDomain& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, @@ -442,6 +443,7 @@ namespace FFLAS{ return fgemv(F, ta, M, N, alpha, A, lda, X, incX, beta, Y, incY, Hb); } + ////specialization for ZRing inline Givaro::FloatDomain::Element_ptr fgemv (const Givaro::FloatDomain& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, @@ -463,6 +465,45 @@ namespace FFLAS{ return Y; } + //specialization for Givaro::ZRing with ParSeqHelper::Compose + template + Givaro::Integer* + fgemv(const Givaro::ZRing& F, + const FFLAS_TRANSPOSE ta, + const size_t m, + const size_t n, + const Givaro::Integer alpha, + const Givaro::Integer* A, const size_t lda, + const Givaro::Integer* X, const size_t incX, + const Givaro::Integer beta, + Givaro::Integer* Y, const size_t incY, + ParSeqHelper::Compose& parH){ + MMHelper, MMHelperAlgo::Auto, FFLAS::ModeTraits>::value, ParSeqHelper::Compose> pH (F,m,n,1,parH); + fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); + return Y; + } + + //specialization for Givaro::Modular with ParSeqHelper::Compose + template + Givaro::Integer* + fgemv(const Givaro::Modular& F, + const FFLAS_TRANSPOSE ta, + const size_t m, + const size_t n, + const Givaro::Integer alpha, + const Givaro::Integer* A, const size_t lda, + const Givaro::Integer* X, const size_t incX, + const Givaro::Integer beta, + Givaro::Integer* Y, const size_t incY, + ParSeqHelper::Compose& parH){ + MMHelper, MMHelperAlgo::Auto, FFLAS::ModeTraits>::value, ParSeqHelper::Compose> pH (F,m,n,1,parH); + fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); + return Y; + } + + + + //Common interface for fgemv with ParSeqHelper::Parallel input parameter in which the corresponding parallel implementation will be called for the given field (ref. pfgemv.inl) template typename Field::Element_ptr fgemv(const Field& F, @@ -475,10 +516,13 @@ namespace FFLAS{ const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, ParSeqHelper::Parallel& parH){ - MMHelper::value, ParSeqHelper::Parallel > pH (F,m,n,1,parH); - return fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); + MMHelper::value, ParSeqHelper::Parallel > pH (F,m,n,1,parH); + fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); + return Y; } + + //Common interface for fgemv with ParSeqHelper::Sequential input parameter in which the corresponding sequential implementation will be called for the given field type either for common field implementated as above or multiprcesion field ref. fflas_fgemv_mp.inl template typename Field::Element_ptr fgemv(const Field& F, @@ -491,11 +535,13 @@ namespace FFLAS{ const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, ParSeqHelper::Sequential& seqH ){ - MMHelper pH(F,m,n,1,seqH); + MMHelper pH(F,m,n,1,seqH); return fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY, pH); } + } #endif // __FFLASFFPACK_fgemv_INL /* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s + diff --git a/fflas-ffpack/fflas/fflas_fgemv_mp.inl b/fflas-ffpack/fflas/fflas_fgemv_mp.inl index 4f7017f40..b04867456 100644 --- a/fflas-ffpack/fflas/fflas_fgemv_mp.inl +++ b/fflas-ffpack/fflas/fflas_fgemv_mp.inl @@ -31,7 +31,6 @@ namespace FFLAS { - // specialization of the fgemv function for the field RNSInteger inline FFPACK::rns_double::Element_ptr fgemv (const FFPACK::RNSInteger& F, const FFLAS_TRANSPOSE ta, @@ -57,7 +56,6 @@ namespace FFLAS { return Y; } - // specialization of the fgemv function for the field RNSIntegerMod inline FFPACK::rns_double::Element_ptr fgemv (const FFPACK::RNSIntegerMod& F, const FFLAS_TRANSPOSE ta, @@ -70,7 +68,7 @@ namespace FFLAS { MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag> & H) { //std::cout<<"HERE 1"<, MMHelperAlgo::Classic, ModeCategories::DefaultTag > H2; + MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag > H2(H); //std::cout<<"HERE 2"< + template inline Givaro::Integer* fgemv (const Givaro::ZRing& F, const FFLAS_TRANSPOSE ta, const size_t m, const size_t n, const Givaro::Integer alpha, - Givaro::Integer* A, const size_t lda, - Givaro::Integer* X, const size_t ldx, + const Givaro::Integer* A, const size_t lda, // @fixme Why not originally const? + const Givaro::Integer* X, const size_t ldx, Givaro::Integer beta, Givaro::Integer* Y, const size_t ldy, - MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo > & H) - { - MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeqHelper::Sequential> H2; - fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H2); - return Y; + MMHelper, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Sequential> & H) { + fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H); + return Y; + } + template + inline Givaro::Integer* fgemv (const Givaro::ZRing& F, + const FFLAS_TRANSPOSE ta, + const size_t m, const size_t n, + const Givaro::Integer alpha, + const Givaro::Integer* A, const size_t lda, // @fixme Why not originally const? + const Givaro::Integer* X, const size_t ldx, + Givaro::Integer beta, + Givaro::Integer* Y, const size_t ldy, + MMHelper, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Parallel> & H){ + ParSeqHelper::Compose,ParSeqHelper::Sequential> CompHelper (H.parseq, ParSeqHelper::Sequential()); + MMHelper, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Compose,ParSeqHelper::Sequential>> Hc(F,m,1,n, CompHelper); + fgemv(F,ta, m, n, alpha,A,lda,X,ldx,beta,Y,ldy,Hc); + return Y; + } + template + inline Givaro::Integer* fgemv (const Givaro::ZRing& F, + const FFLAS_TRANSPOSE ta, + const size_t m, const size_t n, + const Givaro::Integer alpha, + const Givaro::Integer* A, const size_t lda, // @fixme Why not originally const? + const Givaro::Integer* X, const size_t ldx, + Givaro::Integer beta, + Givaro::Integer* Y, const size_t ldy, + MMHelper, AlgoT, ModeCategories::ConvertTo, ParSeqHelper::Compose,ComposeArgs...>> & H){ + fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H); + + return Y; } + // specialization of the fgemv function for the field Givaro::Modular // Calling fgemm, TODO: really specialize fgemv + template inline Givaro::Integer* fgemv (const Givaro::Modular& F, const FFLAS_TRANSPOSE ta, const size_t m, const size_t n, @@ -108,16 +135,17 @@ namespace FFLAS { Givaro::Integer* X, const size_t ldx, Givaro::Integer beta, Givaro::Integer* Y, const size_t ldy, - MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo > & H) + MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> & H) { - MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeqHelper::Sequential> H2; + MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2(H); fgemm(F,ta,FFLAS::FflasNoTrans,(ta==FFLAS::FflasNoTrans)?m:n,1,(ta==FFLAS::FflasNoTrans)?n:m,alpha,A,lda,X,ldx,beta,Y,ldy,H2); return Y; } // specialization of the fgemv function for the field Givaro::Modular> // Calling fgemm, TODO: really specialize fgemv - template + //@QuickFix: This is only the sequential implementation and any call to parallel fgemv for the field Givaro::Modular> will refer to the implementation in the pfgemv.inl file + template inline RecInt::ruint* fgemv (const Givaro::Modular,RecInt::ruint >& F, const FFLAS_TRANSPOSE ta, @@ -130,8 +158,8 @@ namespace FFLAS { MMHelper,RecInt::ruint >, MMHelperAlgo::Classic, ModeCategories::ConvertTo, - ParSeq > & H) { - MMHelper,RecInt::ruint >, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeqHelper::Sequential> H2; + ParSeqHelper::Sequential > & H) { + MMHelper,RecInt::ruint >, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeqHelper::Sequential> H2(H); fgemm (F,ta,FflasNoTrans,(ta==FFLAS::FflasNoTrans)?m:n,1,(ta==FFLAS::FflasNoTrans)?n:m,alpha,A,lda,X,incx,beta,Y,incy,H2); return Y; } diff --git a/fflas-ffpack/field/rns-double.h b/fflas-ffpack/field/rns-double.h index 8286c8768..69bffb28c 100644 --- a/fflas-ffpack/field/rns-double.h +++ b/fflas-ffpack/field/rns-double.h @@ -53,7 +53,7 @@ namespace FFPACK { /* Structure that handles rns representation given a bound and bitsize for prime moduli - * support sign representation (i.e. the bound must be twice larger then ||A||) + * support sign representation (i.e. the bound must be twice larger than ||A||) */ struct rns_double { typedef Givaro::Integer integer; @@ -209,7 +209,7 @@ namespace FFPACK { // std::cout<<"t2="<_ldm){ FFPACK::failure()(__func__,__FILE__,__LINE__,"rns_double [init] -> rns basis is too small to handle integers with 2^(16*k) values "); std::cerr<<"with k="<init(1,1,x._ptr,x._stride, &y,1,k); + init(x); + size_t k =(y.bitsize())/16+((y.bitsize())%16?1:0); + _rns->init(1,1,x._ptr,x._stride, &y,1,k); return x; } Element& reduce (Element& x, const Element& y) const {return assign (x,y);} diff --git a/fflas-ffpack/paladin/blockcuts.inl b/fflas-ffpack/paladin/blockcuts.inl index 96b337c22..1ffd02d09 100644 --- a/fflas-ffpack/paladin/blockcuts.inl +++ b/fflas-ffpack/paladin/blockcuts.inl @@ -40,7 +40,7 @@ namespace FFLAS { struct Column{}; struct Block{}; struct Recursive{}; - typedef Row RNSModulus; + struct RNSModulus{}; } namespace StrategyParameter{ @@ -96,10 +96,10 @@ namespace FFLAS { struct Compose{ Compose() : _comp1 (), _comp2 () {} - Compose(const Compose & other) : _comp1 (other.first_component()), _comp2 (other.second_component()) {} + Compose(const Compose & other) : _comp1 (other.first_component()), _comp2 (other.second_component()) {} Compose(const Sequential & S) : _comp1 (1), _comp2 (1) {} Compose(size_t th1, size_t th2) : _comp1 (th1), _comp2 (th2) {} - Compose(const H1 & o1, const H2 & o2) : _comp1 (o1), _comp2 (o2) {} + Compose(const H1 & o1, const H2 & o2) : _comp1 (o1), _comp2 (o2) {} H1 first_component () const { return _comp1; } H2 second_component () const { return _comp2; } diff --git a/fflas-ffpack/paladin/pfgemv.inl b/fflas-ffpack/paladin/pfgemv.inl index c6f0234ad..440a1f650 100644 --- a/fflas-ffpack/paladin/pfgemv.inl +++ b/fflas-ffpack/paladin/pfgemv.inl @@ -25,7 +25,7 @@ namespace FFLAS { - + // specialization of the fgemv function for the MMHelper with CuttingStrategy::Recursive but templated for all possible field type so that the corresponding templated sequential implementation will be invoked in the parallel code region template typename Field::Element_ptr fgemv(const Field& F, @@ -38,7 +38,6 @@ namespace FFLAS const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, MMHelper > & H){ - if (H.parseq.numthreads()==1 || m <= 1){ fgemv(F, ta, m, n, alpha, A, lda, X, incX, beta, Y, incY); @@ -75,7 +74,7 @@ namespace FFLAS return Y; } - + // specialization of the fgemv function for the MMHelper with CuttingStrategy::Row but templated for all possible field type so that the corresponding templated sequential implementation will be invoked in the parallel code region template typename Field::Element_ptr fgemv(const Field& F, @@ -106,7 +105,6 @@ namespace FFLAS return Y; } - } // FFLAS /* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ diff --git a/tests/Makefile.am b/tests/Makefile.am index 4326820ec..a623fe333 100755 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -67,6 +67,7 @@ BASIC_TESTS = \ test-fgesv \ test-simd \ test-fgemv \ + test-fgemv-rns \ test-nullspace \ regression-check @@ -162,6 +163,7 @@ regression_check_SOURCES = regression-check.C test_solve_SOURCES = test-solve.C test_simd_SOURCES = test-simd.C test_fgemv_SOURCES = test-fgemv.C +test_fgemv_rns_SOURCES = test-fgemv-rns.C #test_pfgemm_DSL_SOURCES = test-pfgemm-DSL.C diff --git a/tests/test-fgemv-rns.C b/tests/test-fgemv-rns.C new file mode 100644 index 000000000..75c357214 --- /dev/null +++ b/tests/test-fgemv-rns.C @@ -0,0 +1,419 @@ +/* + * Copyright (C) the FFLAS-FFPACK group + * Written by Clément Pernet + * Brice Boyer (briceboyer) + * This file is Free Software and part of FFLAS-FFPACK. + * + * ========LICENCE======== + * This file is part of the library FFLAS-FFPACK. + * + * FFLAS-FFPACK is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * ========LICENCE======== + *. + */ + +// #ifndef NEWINO +// #define NEWWINO +// #endif + +// #define WINOTHRESHOLD 100 +// #define OLD_DYNAMIC_PEELING + + + +#include "fflas-ffpack/fflas-ffpack-config.h" + +#include +#include + +#include + +#include + +#include "fflas-ffpack/utils/timer.h" +#include "fflas-ffpack/fflas/fflas.h" + +#include "fflas-ffpack/utils/args-parser.h" +#include "fflas-ffpack/utils/test-utils.h" + +using namespace std; +using namespace FFPACK; +using namespace FFLAS; + +using Givaro::Modular; +using Givaro::ModularBalanced; + +template +struct rsn_compatible_data_type { static constexpr bool value = false; }; +template <> +struct rsn_compatible_data_type>{ static constexpr bool value = true; }; +template +struct rsn_compatible_data_type>{ static constexpr bool value = false; }; +template +struct rsn_compatible_data_type>{ static constexpr bool value = false; }; + +// checks that D = beta . Y + alpha . A ^ta * X +template +bool check_MV(const Field & F, + const typename Field::Element_ptr Cd, // c0 + enum FFLAS_TRANSPOSE & ta, + const size_t m, + const size_t k, + const typename Field::Element & alpha, + const typename Field::Element_ptr A, size_t lda, + const typename Field::Element_ptr X, size_t incX, + const typename Field::Element & beta, + const typename Field::Element_ptr Y, size_t incY) +{ + bool wrong = false; + typename Field::Element_ptr D; + if (ta == FflasNoTrans){ + D = fflas_new(F,m); + fassign (F, m, Cd, 1, D, 1); + for (size_t i=0; iwrite(std::cerr) << std::endl; +#endif + typedef typename Field::Element Element ; + typename Field::RandIter R(*F,seed++); + typename Field::NonZeroRandIter NZR(R); + + //size_t k = 0 ; + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->one,F->zero,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->zero,F->zero,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->mOne,F->zero,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->one ,F->one,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->zero,F->one,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->mOne,F->one,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->one ,F->mOne,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->zero,F->mOne,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->mOne,F->mOne,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + + Element alpha,beta ; + NZR.random(alpha); + ok = ok && launch_MV_dispatch(*F,m,k,F->one ,alpha,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->zero,alpha,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,F->mOne,alpha,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,alpha,F->one ,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,alpha,F->zero,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + ok = ok && launch_MV_dispatch(*F,m,k,alpha,F->mOne,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + + for (size_t j = 0 ; j < 3 ; ++j) { + R.random(alpha); + R.random(beta); + ok = ok && launch_MV_dispatch(*F,m,k,alpha,beta,iters, par, R); + //std::cout << k << "/24" << std::endl; ++k; + } + //std::cout< >(0,(b?b:512_ui64),m,k,iters,p, seed); + } while (loop && ok); + + + + + return !ok ; +} +/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s diff --git a/tests/test-fgemv.C b/tests/test-fgemv.C index 7ddf8a14b..6a250f7f6 100644 --- a/tests/test-fgemv.C +++ b/tests/test-fgemv.C @@ -55,6 +55,14 @@ using namespace FFLAS; using Givaro::Modular; using Givaro::ModularBalanced; +template +struct rsn_compatible_data_type { static constexpr bool value = false; }; +template <> +struct rsn_compatible_data_type>{ static constexpr bool value = true; }; +template +struct rsn_compatible_data_type>{ static constexpr bool value = false; }; +template +struct rsn_compatible_data_type>{ static constexpr bool value = false; }; // checks that D = beta . Y + alpha . A ^ta * X template @@ -243,8 +251,6 @@ bool launch_MV(const Field & F, break; } - - } return ok ; } @@ -409,7 +415,6 @@ int main(int argc, char** argv) ok = ok && run_with_field >(q,b,m,k,iters,p, seed); ok = ok && run_with_field >(q,b,m,k,iters, p, seed); ok = ok && run_with_field >(q,b,m,k,iters, p, seed); - ok = ok && run_with_field > >(q,b?b:127_ui64,m,k,iters, p, seed); ok = ok && run_with_field,RecInt::ruint<8> > >(q,b?b:127_ui64,m,k,iters, p, seed); ok = ok && run_with_field >(q,(b?b:512_ui64),m,k,iters,p, seed);