Commit 4b5f6f63 authored by oscar's avatar oscar

修改cuda函数

parent b3d8bc6f
......@@ -374,9 +374,73 @@ void map_bev_overlap(const int num_a, pybind11::array_t<float> boxes_a,const int
free(b_row_ptr);
}
// Compute bird's-eye-view (BEV) overlap between two sets of boxes on the GPU.
//
// boxes_a:     host buffer of num_a boxes, 7 floats per box, row-major
// boxes_b:     host buffer of num_b boxes, 7 floats per box, row-major
// ans_overlap: host output buffer of num_a * num_b floats; filled with the
//              pairwise overlap values computed by boxesoverlapLauncher
//              (exact [i][j] layout is defined by the kernel — TODO confirm)
//
// Note: stale diff residue (a PYBIND11_MODULE block inside the body and
// free() calls on pointers that only existed in commented-out ROS conversion
// code) has been removed; the module registration lives elsewhere.
void bev_overlap(const int num_a, float* boxes_a, const int num_b, float* boxes_b, float* ans_overlap) {
    // size_t sizing avoids int overflow when num_a * num_b is large.
    const size_t size_a   = (size_t)num_a * 7 * sizeof(float);
    const size_t size_b   = (size_t)num_b * 7 * sizeof(float);
    const size_t size_ans = (size_t)num_a * (size_t)num_b * sizeof(float);

    float* a_gpu   = nullptr;
    float* b_gpu   = nullptr;
    float* ans_gpu = nullptr;
    GPU_CHECK(cudaMalloc(&a_gpu, size_a));
    GPU_CHECK(cudaMalloc(&b_gpu, size_b));
    GPU_CHECK(cudaMalloc(&ans_gpu, size_ans));

    // Inputs are already row-major host arrays; copy straight to the device.
    GPU_CHECK(cudaMemcpy(a_gpu, boxes_a, size_a, cudaMemcpyHostToDevice));
    GPU_CHECK(cudaMemcpy(b_gpu, boxes_b, size_b, cudaMemcpyHostToDevice));

    boxesoverlapLauncher(num_a, a_gpu, num_b, b_gpu, ans_gpu, num_a * num_b, num_a, num_b);

    // Blocking D2H copy also synchronizes with the kernel before we read results.
    GPU_CHECK(cudaMemcpy(ans_overlap, ans_gpu, size_ans, cudaMemcpyDeviceToHost));

    GPU_CHECK(cudaFree(a_gpu));
    GPU_CHECK(cudaFree(b_gpu));
    GPU_CHECK(cudaFree(ans_gpu));
}
//PYBIND11_MODULE(juefx_iou, m)
//{
// m.def("bev_overlap", &map_bev_overlap);
//}
......@@ -309,30 +309,31 @@ void map_kalman_update_batch( pybind11::array_t<float> Z,
MAKE SURE ALL INPUTS ARE TWO-DIM NUMPY ARRAY
*/
void kalman_update_batch(float** Z,
float** X, // in-place update
float** P, // in-place update
float** HX,
void kalman_update_batch(float** Z,// measurement size = bs * no
float** X, // in-place update states size = bs * ns
float** P, // in-place update predict size = bs * ns * ns
float** HX, // H*X size = bs * no
const int bs,
const int ns,
const int no
) {
const int ns, //ns = 10
const int no // no = 7
)
{
//pybind11::buffer_info ZZ = Z.request();
//pybind11::buffer_info XX = X.request();
//pybind11::buffer_info PP = P.request();
//pybind11::buffer_info HXX = HX.request();
int size_ZZ = ZZ.shape[0] * ZZ.shape[1] * sizeof(float);
int size_XX = XX.shape[0] * XX.shape[1] * sizeof(float);
int size_PP = PP.shape[0] * PP.shape[1] * sizeof(float);
int size_HXX = HXX.shape[0] * HXX.shape[1] * sizeof(float);
int size_ZZ = bs * no * sizeof(float);
int size_XX = bs * ns * sizeof(float);
int size_PP = bs * ns * ns * sizeof(float);
int size_HXX = bs * no * sizeof(float);
// std::cout << "size_HXX: " << size_HXX <<"\n";
float* host_Z = reinterpret_cast<float*>(ZZ.ptr);
float* host_X = reinterpret_cast<float*>(XX.ptr);
float* host_P = reinterpret_cast<float*>(PP.ptr);
float* host_HX = reinterpret_cast<float*>(HXX.ptr);
//float* host_Z = reinterpret_cast<float*>(ZZ.ptr);
//float* host_X = reinterpret_cast<float*>(XX.ptr);
//float* host_P = reinterpret_cast<float*>(PP.ptr);
//float* host_HX = reinterpret_cast<float*>(HXX.ptr);
float* device_Z;
float* device_X;
......@@ -343,16 +344,20 @@ void kalman_update_batch(float** Z,
GPU_CHECK(cudaMalloc(&device_X, size_XX));
GPU_CHECK(cudaMalloc(&device_P, size_PP));
GPU_CHECK(cudaMalloc(&device_HX, size_HXX));
GPU_CHECK(cudaMemcpy(device_Z, host_Z, size_ZZ, cudaMemcpyHostToDevice));
GPU_CHECK(cudaMemcpy(device_X, host_X, size_XX, cudaMemcpyHostToDevice));
GPU_CHECK(cudaMemcpy(device_P, host_P, size_PP, cudaMemcpyHostToDevice));
GPU_CHECK(cudaMemcpy(device_HX, host_HX, size_HXX, cudaMemcpyHostToDevice));
for (int i = 0; i < bs; i++)
{
GPU_CHECK(cudaMemcpy(device_Z + i*no, Z[i], no * sizeof(float), cudaMemcpyHostToDevice));
GPU_CHECK(cudaMemcpy(device_X + i*ns, X[i], ns * sizeof(float), cudaMemcpyHostToDevice));
GPU_CHECK(cudaMemcpy(device_P + i*ns*ns, P[i], ns*ns * sizeof(float), cudaMemcpyHostToDevice));
GPU_CHECK(cudaMemcpy(device_HX + i*no, HX[i], no * sizeof(float), cudaMemcpyHostToDevice));
}
kalmanUpdateLauncher_batch(device_Z, device_X, device_P, device_HX, bs, ns, no);
GPU_CHECK(cudaMemcpy(host_X, device_X, size_XX, cudaMemcpyDeviceToHost));
GPU_CHECK(cudaMemcpy(host_P, device_P, size_PP, cudaMemcpyDeviceToHost));
for (int i = 0; i < bs; i++)
{
GPU_CHECK(cudaMemcpy(X[i], device_X + i*ns, ns * sizeof(float), cudaMemcpyDeviceToHost));
GPU_CHECK(cudaMemcpy(P[i], device_P + i*ns*ns, ns * ns * sizeof(float), cudaMemcpyDeviceToHost));
}
GPU_CHECK(cudaFree(device_Z));
GPU_CHECK(cudaFree(device_X));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment