Commit 78ff5784 authored by fenglei.tian

add multiply using cudnn

parent 1c74ad24
 #./build/test/unit-test --gtest_filter=GPU.ab
 ./build/test/unit-test --gtest_filter=GPU.maximum
+./build/test/unit-test --gtest_filter=GPU.minimum
+./build/test/unit-test --gtest_filter=GPU.multiple*
 #./build/test/unit-test --gtest_filter=GPU.abs
 #./build/test/unit-test --gtest_filter=GPU.dot*
@@ -63,8 +63,6 @@ void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
-    {
-    // TODO Assert arg0_shape[0] == arg1_shape[0]?
     writer << "{ // " << n->get_name() << "\n";
     writer.indent++;
     writer << "const float alpha = 1.0;\n";
@@ -84,7 +82,6 @@ void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
            << out[0].get_size() << ");\n";
     writer.indent--;
     writer << "}\n";
-    }
 }
 void runtime::gpu::GPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
@@ -260,27 +257,26 @@ void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
-    writer << "{ // " << n->get_name() << "\n";
-    writer.indent++;
+    writer << " // " << n->get_name() << "\n";
     writer << "int count = " << out[0].get_size() << ";\n";
     writer += R"(
 float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
 cudnnTensorDescriptor_t descriptor;
-(cudnnCreateTensorDescriptor(&descriptor));
-(cudnnSetTensor4dDescriptor(descriptor,
+cudnnCreateTensorDescriptor(&descriptor);
+cudnnSetTensor4dDescriptor(descriptor,
                             /*format=*/CUDNN_TENSOR_NHWC,
                             /*dataType=*/CUDNN_DATA_FLOAT,
                             /*batch_size=*/1,
                             /*channels=*/1,
                             /*image_height=*/1,
-                            /*image_width=*/count));
+                            /*image_width=*/count);
 cudnnOpTensorDescriptor_t opTensorDesc;
-(cudnnCreateOpTensorDescriptor(&opTensorDesc));
-(cudnnSetOpTensorDescriptor(opTensorDesc,
+cudnnCreateOpTensorDescriptor(&opTensorDesc);
+cudnnSetOpTensorDescriptor(opTensorDesc,
                             CUDNN_OP_TENSOR_MAX,
                             CUDNN_DATA_FLOAT,
-                            CUDNN_NOT_PROPAGATE_NAN));
+                            CUDNN_NOT_PROPAGATE_NAN);
 )";
     writer << "cudnnOpTensor(cudnn_handle,"
@@ -294,11 +290,6 @@ cudnnOpTensorDescriptor_t opTensorDesc;
            << "&beta,"
            << "descriptor,"
            << out[0].get_name() << ");\n";
-    writer += R"(
-)";
-    writer.indent--;
-    writer << "}\n";
 }
 void runtime::gpu::GPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
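For reference, the block emitted by EmitMaximum above expands at runtime to roughly the following host-side sequence. This is a minimal standalone sketch: the function name elementwise_max and the names a, b, c, count are illustrative and not part of this commit; cudnn_handle stands for the handle the generated code receives. The MIN and MUL emitters below follow the same pattern with a different op constant.

    #include <cudnn.h>

    // Sketch of the emitted sequence: elementwise max of two float buffers
    // already resident on the GPU. Descriptors are created per call and,
    // as in the emitted code, not destroyed here.
    void elementwise_max(cudnnHandle_t cudnn_handle,
                         const float* a, const float* b, float* c, int count)
    {
        const float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f;

        // Describe all three buffers as a 1 x 1 x 1 x count NHWC tensor.
        cudnnTensorDescriptor_t descriptor;
        cudnnCreateTensorDescriptor(&descriptor);
        cudnnSetTensor4dDescriptor(descriptor,
                                   CUDNN_TENSOR_NHWC,
                                   CUDNN_DATA_FLOAT,
                                   /*n=*/1, /*c=*/1, /*h=*/1, /*w=*/count);

        // Select the MAX op; MIN and MUL only change this constant.
        cudnnOpTensorDescriptor_t opTensorDesc;
        cudnnCreateOpTensorDescriptor(&opTensorDesc);
        cudnnSetOpTensorDescriptor(opTensorDesc,
                                   CUDNN_OP_TENSOR_MAX,
                                   CUDNN_DATA_FLOAT,
                                   CUDNN_NOT_PROPAGATE_NAN);

        // c = max(alpha1 * a, alpha2 * b) + beta * c
        cudnnOpTensor(cudnn_handle, opTensorDesc,
                      &alpha1, descriptor, a,
                      &alpha2, descriptor, b,
                      &beta, descriptor, c);
    }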
@@ -306,6 +297,40 @@ void runtime::gpu::GPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    writer << " // " << n->get_name() << "\n";
+    writer << "int count = " << out[0].get_size() << ";\n";
+    writer += R"(
+float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
+cudnnTensorDescriptor_t descriptor;
+cudnnCreateTensorDescriptor(&descriptor);
+cudnnSetTensor4dDescriptor(descriptor,
+                           /*format=*/CUDNN_TENSOR_NHWC,
+                           /*dataType=*/CUDNN_DATA_FLOAT,
+                           /*batch_size=*/1,
+                           /*channels=*/1,
+                           /*image_height=*/1,
+                           /*image_width=*/count);
+cudnnOpTensorDescriptor_t opTensorDesc;
+cudnnCreateOpTensorDescriptor(&opTensorDesc);
+cudnnSetOpTensorDescriptor(opTensorDesc,
+                           CUDNN_OP_TENSOR_MIN,
+                           CUDNN_DATA_FLOAT,
+                           CUDNN_NOT_PROPAGATE_NAN);
+)";
+    writer << "cudnnOpTensor(cudnn_handle,"
+           << "opTensorDesc,"
+           << "&alpha1,"
+           << "descriptor,"
+           << args[0].get_name() << ","
+           << "&alpha2,"
+           << "descriptor,"
+           << args[1].get_name() << ","
+           << "&beta,"
+           << "descriptor,"
+           << out[0].get_name() << ");\n";
 }
 void runtime::gpu::GPU_Emitter::EmitNegative(
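Note that each emitted block above creates a tensor descriptor and an op descriptor on every invocation and never releases them. If cleanup were added to the generated code, it would be the standard cuDNN destroy calls appended after the cudnnOpTensor call, for example (sketch only, not emitted by this commit):

    // Release the descriptors created earlier in the emitted block;
    // both functions are part of the public cuDNN API.
    cudnnDestroyOpTensorDescriptor(opTensorDesc);
    cudnnDestroyTensorDescriptor(descriptor);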
@@ -476,34 +501,40 @@ void runtime::gpu::GPU_Emitter::EmitMultiply(
                                              const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                              const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
-    const Shape& arg0_shape = args[0].get_shape();
-    const Shape& arg1_shape = args[1].get_shape();
-    // Until we have EW kernel gen, use cuBLAS
-    // From https://stackoverflow.com/questions/7621520/element-wise-vector-vector-multiplication-in-blas/7634831
-    // clang-format off
-    writer << "{ // " << n->get_name() << "\n";
-    writer.indent++;
-    writer << "static const float alpha = 1.0;\n";
-    writer << "static const float beta = 0.0;\n";
-    writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";;
-    writer << "cublasSsbmv("
-           << "cublas_handle,"
-           << "CUBLAS_FILL_MODE_LOWER," // Corresponds to FORTRAN "L"
-           << out[0].get_size() << "," // N = input size
-           << "0," // k = super-diagonal i.e. just use the diagonal of A
-           << "&alpha," // Alpha
-           << args[0].get_name() << "," // vec A (broadcast to a matrix)
-           << "1," // LDA = 1
-           << args[1].get_name() << "," // vector x
-           << "1," // Stride x
-           << "&beta," // beta
-           << out[0].get_name() << "," // y
-           << "1" // Stride y
-           << ");\n";
-    writer.indent--;
-    writer << "}\n";
-    // clang-format on
+    writer << " // " << n->get_name() << "\n";
+    writer << "int count = " << out[0].get_size() << ";\n";
+    writer += R"(
+float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
+cudnnTensorDescriptor_t descriptor;
+cudnnCreateTensorDescriptor(&descriptor);
+cudnnSetTensor4dDescriptor(descriptor,
+                           /*format=*/CUDNN_TENSOR_NHWC,
+                           /*dataType=*/CUDNN_DATA_FLOAT,
+                           /*batch_size=*/1,
+                           /*channels=*/1,
+                           /*image_height=*/1,
+                           /*image_width=*/count);
+cudnnOpTensorDescriptor_t opTensorDesc;
+cudnnCreateOpTensorDescriptor(&opTensorDesc);
+cudnnSetOpTensorDescriptor(opTensorDesc,
+                           CUDNN_OP_TENSOR_MUL,
+                           CUDNN_DATA_FLOAT,
+                           CUDNN_NOT_PROPAGATE_NAN);
+)";
+    writer << "cudnnOpTensor(cudnn_handle,"
+           << "opTensorDesc,"
+           << "&alpha1,"
+           << "descriptor,"
+           << args[0].get_name() << ","
+           << "&alpha2,"
+           << "descriptor,"
+           << args[1].get_name() << ","
+           << "&beta,"
+           << "descriptor,"
+           << out[0].get_name() << ");\n";
 }
 void runtime::gpu::GPU_Emitter::EmitExp(codegen::CodeWriter& writer,
...
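The outer parentheses stripped from the cuDNN calls in this commit look like the remains of a status-checking wrapper; as emitted, the return codes of the cuDNN calls are ignored. One possible wrapper the emitter could also write out is sketched below. The name CHECK_CUDNN is hypothetical and not part of this repository; the calls it uses (cudnnGetErrorString, the CUDNN_STATUS_SUCCESS constant) are standard cuDNN API.

    #include <cudnn.h>
    #include <cstdio>
    #include <cstdlib>

    // Hypothetical helper, not part of this commit: abort with a readable
    // message when a cuDNN call does not return CUDNN_STATUS_SUCCESS.
    #define CHECK_CUDNN(call)                                                  \
        do                                                                     \
        {                                                                      \
            cudnnStatus_t status = (call);                                     \
            if (status != CUDNN_STATUS_SUCCESS)                                \
            {                                                                  \
                std::fprintf(stderr, "cuDNN error %s at %s:%d\n",              \
                             cudnnGetErrorString(status), __FILE__, __LINE__); \
                std::abort();                                                  \
            }                                                                  \
        } while (0)

    // Usage in emitted code:
    //     CHECK_CUDNN(cudnnCreateTensorDescriptor(&descriptor));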