Commit 4659d60d authored by Chris Sullivan, committed by Jayaram Bobba

Fix incorrect hash strings for softmax and 1d maxpool. (#1195)

* Bug fix in softmax cache parameters.

* Additional bug fix for maxpool1d cache parameters.

* Formatting.

* Use nthreads in primitive hash.
parent b5e69eaa
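
Background on the fix: in the GPU emitter, each runtime primitive is memoized under a string hash, and the cached closure captures its launch parameters. If the hash omits a captured parameter such as nthreads, two ops that emit the same kernel but differ in output size collide in the cache, and the second op silently reuses the first one's launch configuration. The sketch below illustrates that failure mode; PrimitiveCache and the hash helpers are illustrative stand-ins, not nGraph API.

// Minimal sketch (illustrative names, not the nGraph API): a string-keyed
// primitive cache and two ways of building the 1-D max-pool hash.
#include <cstddef>
#include <iostream>
#include <limits>
#include <string>
#include <unordered_map>

struct PrimitiveCache
{
    // Return the cached index for `hash`, or SIZE_MAX if absent.
    std::size_t lookup(const std::string& hash) const
    {
        auto it = m_index.find(hash);
        return it == m_index.end() ? std::numeric_limits<std::size_t>::max() : it->second;
    }
    // Insert (or fetch) the primitive index for `hash`.
    std::size_t insert(const std::string& hash)
    {
        return m_index.emplace(hash, m_index.size()).first->second;
    }
    std::unordered_map<std::string, std::size_t> m_index;
};

// Pre-fix hash: only the kernel signature (window parameters).
std::string old_hash(std::size_t window_width, std::size_t window_stride)
{
    return "max_pool_w" + std::to_string(window_width) + "_wst" + std::to_string(window_stride);
}

// Post-fix hash: also encodes nthreads, the output element count the cached
// primitive captures for its kernel launch.
std::string new_hash(std::size_t window_width, std::size_t window_stride, std::size_t nthreads)
{
    return old_hash(window_width, window_stride) + "_n" + std::to_string(nthreads);
}

int main()
{
    PrimitiveCache cache;
    // First max-pool: window 3 / stride 1, 64 output elements.
    cache.insert(old_hash(3, 1));
    // Second max-pool: same window, 128 output elements. With the old hash it
    // finds the first primitive and would reuse its 64-element launch.
    bool collides = cache.lookup(old_hash(3, 1)) != std::numeric_limits<std::size_t>::max();
    // With the fixed hash the two primitives stay distinct.
    cache.insert(new_hash(3, 1, 64));
    bool distinct = cache.lookup(new_hash(3, 1, 128)) == std::numeric_limits<std::size_t>::max();
    std::cout << collides << " " << distinct << "\n"; // prints "1 1"
    return 0;
}

The same reasoning applies to the softmax cache parameters mentioned above: any value baked into a cached primitive must also appear in its cache key.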
@@ -607,9 +607,8 @@ size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const GPURuntimeContext* ctx
         std::to_string(window_width) + "_wst" + std::to_string(window_stride);
     std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
-    // primitive hash and kernel name are equivalent for maxpool_1d
-    auto hash = kernel_name;
+    size_t nthreads = shape_size(output_shape);
+    std::string hash = kernel_name + "_n" + std::to_string(nthreads);
     // check if the requested kernel is already an inserted primitive
     size_t primitive_index = m_primitive_emitter->lookup(hash);
     if (primitive_index != std::numeric_limits<size_t>::max())
@@ -617,12 +616,6 @@ size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const GPURuntimeContext* ctx
         return primitive_index;
     }
-    size_t nthreads = shape_size(output_shape);
-    //TODO: currently we set it to 64, will add tuning method later
-    uint32_t block_size_x = 64;
-    uint32_t aligned_grid_size_x =
-        align_to_block_size(static_cast<uint32_t>(nthreads), block_size_x);
     // if the kernel has not been compiled, build it
     auto compiled_kernel = ctx->compiled_kernel_pool->get(hash);
     if (compiled_kernel == nullptr)
@@ -630,9 +623,14 @@ size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const GPURuntimeContext* ctx
         codegen::CodeWriter writer;
         CudaKernelBuilder::get_max_pool_1d(
             writer, kernel_name, dtypes, input_width, output_width, window_width, window_stride);
-        compiled_kernel = ctx->compiled_kernel_pool->set(hash, writer.get_code());
+        compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
     }
+    //TODO: currently we set it to 64, will add tuning method later
+    uint32_t block_size_x = 64;
+    uint32_t aligned_grid_size_x =
+        align_to_block_size(static_cast<uint32_t>(nthreads), block_size_x);
     std::unique_ptr<gpu::primitive> pool(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
             void* args_list[] = {&inputs[0], &outputs[0], &nthreads};
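
The hunk above also separates the two cache keys: the compiled CUDA source is stored in the kernel pool under kernel_name (the generated code depends only on the kernel parameters), while the primitive emitter keys on the fuller hash, which additionally encodes nthreads for the launch grid. A rough two-level sketch of that layout, with hypothetical member names, follows.

// Two-level caching sketch (hypothetical names): many primitives with
// different launch sizes can share one compiled kernel.
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

struct EmitterSketch
{
    std::unordered_map<std::string, std::string> compiled_kernel_pool; // kernel_name -> source
    std::unordered_map<std::string, std::size_t> primitive_index;      // hash -> primitive id
    std::vector<std::size_t> primitive_nthreads;                       // per-primitive launch size

    std::size_t build_1d_max_pool(const std::string& kernel_name, std::size_t nthreads)
    {
        const std::string hash = kernel_name + "_n" + std::to_string(nthreads);
        auto hit = primitive_index.find(hash);
        if (hit != primitive_index.end())
        {
            return hit->second; // same kernel and same nthreads: reuse the primitive
        }
        // Compile (or fetch) the kernel source, keyed by kernel_name only.
        if (compiled_kernel_pool.find(kernel_name) == compiled_kernel_pool.end())
        {
            compiled_kernel_pool[kernel_name] = "/* generated max-pool CUDA source */";
        }
        // Register a new primitive that captures nthreads for its launch grid.
        std::size_t id = primitive_nthreads.size();
        primitive_nthreads.push_back(nthreads);
        primitive_index[hash] = id;
        return id;
    }
};
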
@@ -935,13 +933,14 @@ size_t
     // kernel_name is used to check if the cuda kernel has been previously compiled
     std::stringstream kernel_name;
     kernel_name << "ew_collective"
-                << "_" << op << "_" << join(dtypes, "_") << "_" << reduce_op
+                << "_" << op << "_" << join(dtypes, "_") << "_" << reduce_op << "_r"
+                << tensor_shape.size() << "_rt" << join(reduced_tensors, "_")
                 // multi-output op
                 << "_mo" << int(save_elementwise);
     // hash is used to check if the emitted primitive already exists
     std::stringstream ss;
-    ss << kernel_name.str() << "_s" << join(tensor_shape, "_");
+    ss << kernel_name.str() << "_s" << join(tensor_shape, "_") << "_ra" << join(axes, "_");
     auto hash = ss.str();
     // if the primitive exists, we are done
...
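
In the final hunk, the ew_collective hash additionally records the tensor rank ("_r"), the reduced-tensor list ("_rt"), and the reduction axes ("_ra") alongside the tensor shape, so two ops with identical shapes but different reductions no longer share a cached primitive. A small sketch of composing such a hash, using a stand-in join() helper rather than nGraph's and an illustrative field layout, could look like this.

// Sketch of the expanded ew_collective hash (stand-in join(), illustrative only).
#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Join values with a delimiter; a stand-in for ngraph's join() helper.
template <typename T>
std::string join(const std::vector<T>& values, const std::string& delim)
{
    std::ostringstream os;
    for (std::size_t i = 0; i < values.size(); ++i)
    {
        if (i != 0)
        {
            os << delim;
        }
        os << values[i];
    }
    return os.str();
}

// Compose an ew_collective-style hash that, like the fixed code, folds in the
// rank, reduced tensors, shape, and reduction axes.
std::string ew_collective_hash(const std::string& op,
                               const std::vector<std::string>& dtypes,
                               const std::string& reduce_op,
                               const std::vector<std::size_t>& tensor_shape,
                               const std::vector<std::size_t>& reduced_tensors,
                               const std::vector<std::size_t>& axes,
                               bool save_elementwise)
{
    std::stringstream ss;
    ss << "ew_collective"
       << "_" << op << "_" << join(dtypes, "_") << "_" << reduce_op << "_r"
       << tensor_shape.size() << "_rt" << join(reduced_tensors, "_")
       << "_mo" << int(save_elementwise)
       << "_s" << join(tensor_shape, "_") << "_ra" << join(axes, "_");
    return ss.str();
}

int main()
{
    // Same op, dtypes, and shape, but different reduction axes: the hashes
    // now differ, so each reduction gets its own cached primitive.
    auto h0 = ew_collective_hash("add", {"float", "float"}, "sum", {4, 8}, {1}, {0}, false);
    auto h1 = ew_collective_hash("add", {"float", "float"}, "sum", {4, 8}, {1}, {1}, false);
    std::cout << (h0 != h1) << "\n"; // prints 1
    return 0;
}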