diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc
index 135e8a0af55ec..7860ee31142a1 100644
--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -42,6 +42,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void* p = GetCPUBuddyAllocator()->Alloc(size);
   VLOG(10) << "  pointer=" << p;
+  // For debugging: zero-fill the newly allocated block.
+  memset(p, 0, size);
   return p;
 }
 
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 193f776e760d2..f82f0c71078a5 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -43,6 +43,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     auto* input = ctx.Input<Tensor>("Input");
     auto* filter = ctx.Input<Tensor>("Filter");
     auto* output = ctx.Output<Tensor>("Output");
+    auto* alg    = ctx.Input<Tensor>("Algorithm");
+    auto* algOut = ctx.Output<Tensor>("AlgorithmOut");
+    algOut->mutable_data<int>(platform::CPUPlace());
 
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
@@ -126,13 +129,30 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     miopenConvAlgoPerf_t perfRes;
     int algoCount = 0;
+
+    VLOG(3) << "ctx: " << &ctx << " op: " << &ctx.op() << " scope: " << &ctx.scope();
+    VLOG(3) << "alg: " << alg << " get alg str: " << ctx.op().Input("Algorithm");
+    VLOG(3) << "get alg ptr: " << ctx.scope().FindVar(ctx.op().Input("Algorithm"));
+    VLOG(3) << "Input: " << alg->data<int>() << " Output: " << algOut->mutable_data<int>(platform::CPUPlace());
+    int pre_alg = (alg->data<int>())[0];
+    // Newly allocated memory is zero-initialized, so 0 means no algorithm is stored yet (stored values are fwd_algo + 1).
+    if (pre_alg == 0)
+    {
+        PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionForwardAlgorithm(
+            handle, cudnn_input_desc, input_data,
+            cudnn_filter_desc, filter_data,
+            cudnn_conv_desc, cudnn_output_desc, output_data,
+            1, &algoCount, &perfRes, cudnn_workspace, workspace_size_in_bytes, false));
+        (algOut->data<int>())[0] = (int)(perfRes.fwd_algo) + 1;
+        VLOG(3) << "Find Kernel: store " << (algOut->data<int>()) << " kernel :" << perfRes.fwd_algo;
+    }
+    else
+    {
+        perfRes.fwd_algo = (miopenConvFwdAlgorithm_t)(pre_alg - 1);
+        VLOG(3) << "Find Kernel:  load  " << (alg->data<int>()) << " kernel :" << perfRes.fwd_algo;
+    }
+
     for (int i = 0; i < groups; i++) {
-      // ------------------- cudnn conv algorithm ---------------------
-      PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionForwardAlgorithm(
-          handle, cudnn_input_desc, input_data + i * group_offset_in,
-          cudnn_filter_desc, filter_data + i * group_offset_filter,
-	  cudnn_conv_desc, cudnn_output_desc, output_data + i * group_offset_out,
-          1, &algoCount, &perfRes, cudnn_workspace, workspace_size_in_bytes, false));
       // ------------------- cudnn conv forward ---------------------
       PADDLE_ENFORCE(platform::dynload::miopenConvolutionForward(
           handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
@@ -157,6 +177,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
     auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
+#if 0
+    // This block is disabled with #if 0 because it triggers an assertion.
+    auto* alg    = ctx.Input<Tensor>("Algorithm");
+    auto* algOut = ctx.Output<Tensor>("AlgorithmOut");
+    if (alg == nullptr || algOut == nullptr)
+    {
+      VLOG(3) << "GradOp alg: " << alg << " algOut : " << algOut;
+    }
+#endif
 
     const T* input_data = input->data<T>();
     const T* output_grad_data = output_grad->data<T>();
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index e3f39218f5db7..681f16fcccab5 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -129,9 +129,13 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
            "H is the height of the filter, and W is the width of the filter. "
            "If the groups attribute is greater than 1, C equals the number of "
            "input image channels divided by the groups.");
+  AddInput("Algorithm",
+           "Selected algorithm for conv2d");
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator. "
             "The format of output tensor is also NCHW.");
+  AddOutput("AlgorithmOut",
+            "Tuned algorithm for conv2d");
   AddAttr<std::vector<int>>("strides",
                             "(vector<int> default:{1, 1}), the "
                             "strides(h_stride, w_stride) of "
@@ -225,9 +229,13 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
            "is the width of the filter."
            "If the groups attribute is greater than 1, C equals the number of "
            "input image channels divided by the groups.");
+  AddInput("Algorithm",
+           "Selected algorithm for conv3d");
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator."
             "The format of output tensor is also NCDHW.");
+  AddOutput("AlgorithmOut",
+            "Tuned algorithm for conv3d");
   AddAttr<std::vector<int>>("strides",
                             "(vector<int>, default:{1, 1, 1}), the "
                             "strides(d_stride, h_stride, w_stride) of "
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index d2e7d58524bfb..d7a20657f087c 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1293,6 +1293,13 @@ def _get_default_param_initializer():
         dtype=dtype,
         default_initializer=_get_default_param_initializer())
 
+    algorithm = helper.create_parameter(
+        attr=ParamAttr(name=None, initializer=Constant(0), trainable=False),
+        shape=[3],
+        dtype='int')
+
+    algorithm_out = algorithm
+
     pre_bias = helper.create_tmp_variable(dtype)
 
     helper.append_op(
@@ -1300,8 +1307,11 @@ def _get_default_param_initializer():
         inputs={
             'Input': input,
             'Filter': filter_param,
+            'Algorithm': algorithm,
+        },
+        outputs={'Output': pre_bias,
+                 'AlgorithmOut': algorithm_out,
         },
-        outputs={"Output": pre_bias},
         attrs={
             'strides': stride,
             'paddings': padding,