diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc index 135e8a0af55ec..7860ee31142a1 100644 --- a/paddle/fluid/memory/memory.cc +++ b/paddle/fluid/memory/memory.cc @@ -42,6 +42,8 @@ void* Alloc(platform::CPUPlace place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); VLOG(10) << " pointer=" << p; + // For debug + memset(p, 0, size); return p; } diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 193f776e760d2..f82f0c71078a5 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -43,6 +43,9 @@ class CUDNNConvOpKernel : public framework::OpKernel { auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); + auto* alg = ctx.Input("Algorithm"); + auto* algOut = ctx.Output("AlgorithmOut"); + algOut->mutable_data(platform::CPUPlace()); std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -126,13 +129,30 @@ class CUDNNConvOpKernel : public framework::OpKernel { ScalingParamType alpha = 1.0f, beta = 0.0f; miopenConvAlgoPerf_t perfRes; int algoCount = 0; + + VLOG(3) << "ctx: " << &ctx << " op: " << &ctx.op() << " scope: " << &ctx.scope(); + VLOG(3) << "alg: " << alg << " get alg str: " << ctx.op().Input("Algorithm"); + VLOG(3) << "get alg ptr: " << ctx.scope().FindVar(ctx.op().Input("Algorithm")); + VLOG(3) << "Input: " << alg->data() << " Output: " << algOut->mutable_data(platform::CPUPlace()); + int pre_alg = (alg->data())[0]; + // New allocated memory is initialized as 0 + if (pre_alg == 0) + { + PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionForwardAlgorithm( + handle, cudnn_input_desc, input_data, + cudnn_filter_desc, filter_data, + cudnn_conv_desc, cudnn_output_desc, output_data, + 1, &algoCount, &perfRes, cudnn_workspace, workspace_size_in_bytes, false)); + (algOut->data())[0] = (int)(perfRes.fwd_algo) + 1; + VLOG(3) << "Find Kernel: store " << (algOut->data()) << " kernel :" << perfRes.fwd_algo; + } + else + { + perfRes.fwd_algo = (miopenConvFwdAlgorithm_t)(pre_alg - 1); + VLOG(3) << "Find Kernel: load " << (alg->data()) << " kernel :" << perfRes.fwd_algo; + } + for (int i = 0; i < groups; i++) { - // ------------------- cudnn conv algorithm --------------------- - PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionForwardAlgorithm( - handle, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_filter_desc, filter_data + i * group_offset_filter, - cudnn_conv_desc, cudnn_output_desc, output_data + i * group_offset_out, - 1, &algoCount, &perfRes, cudnn_workspace, workspace_size_in_bytes, false)); // ------------------- cudnn conv forward --------------------- PADDLE_ENFORCE(platform::dynload::miopenConvolutionForward( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, @@ -157,6 +177,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto output_grad = ctx.Input(framework::GradVarName("Output")); auto input_grad = ctx.Output(framework::GradVarName("Input")); auto filter_grad = ctx.Output(framework::GradVarName("Filter")); +#if 0 + // This block is commented out since it triggers assertion. + auto* alg = ctx.Input("Algorithm"); + auto* algOut = ctx.Output("AlgorithmOut"); + if (alg == nullptr || algOut == nullptr) + { + VLOG(3) << "GradOp alg: " << alg << " algOut : " << algOut; + } +#endif const T* input_data = input->data(); const T* output_grad_data = output_grad->data(); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index e3f39218f5db7..681f16fcccab5 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -129,9 +129,13 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "H is the height of the filter, and W is the width of the filter. " "If the groups attribute is greater than 1, C equals the number of " "input image channels divided by the groups."); + AddInput("Algorithm", + "Selected algorithm for conv2d"); AddOutput("Output", "(Tensor) The output tensor of convolution operator. " "The format of output tensor is also NCHW."); + AddOutput("AlgorithmOut", + "Tuned algorithm for conv2d"); AddAttr>("strides", "(vector default:{1, 1}), the " "strides(h_stride, w_stride) of " @@ -225,9 +229,13 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "is the width of the filter." "If the groups attribute is greater than 1, C equals the number of " "input image channels divided by the groups."); + AddInput("Algorithm", + "Selected algorithm for conv3d"); AddOutput("Output", "(Tensor) The output tensor of convolution operator." "The format of output tensor is also NCDHW."); + AddOutput("AlgorithmOut", + "Tuned algorithm for conv3d"); AddAttr>("strides", "(vector, default:{1, 1, 1}), the " "strides(d_stride, h_stride, w_stride) of " diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d2e7d58524bfb..d7a20657f087c 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1293,6 +1293,13 @@ def _get_default_param_initializer(): dtype=dtype, default_initializer=_get_default_param_initializer()) + algorithm = helper.create_parameter( + attr=ParamAttr(name=None, initializer=Constant(0), trainable=False), + shape=[3], + dtype='int') + + algorithm_out = algorithm + pre_bias = helper.create_tmp_variable(dtype) helper.append_op( @@ -1300,8 +1307,11 @@ def _get_default_param_initializer(): inputs={ 'Input': input, 'Filter': filter_param, + 'Algorithm': algorithm, + }, + outputs={'Output': pre_bias, + 'AlgorithmOut': algorithm_out, }, - outputs={"Output": pre_bias}, attrs={ 'strides': stride, 'paddings': padding,