Commit 59540421 authored by oscar's avatar oscar

提交修改cu文件

parent f6453873
#include <stdio.h>
#include <iostream>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
//#include <pybind11/pybind11.h>
//#include <pybind11/numpy.h>
//#include <pybind11/stl.h>
#include "common.h"
......@@ -309,70 +309,70 @@ void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b
}
// Pybind11 wrapper: uploads two box arrays to the GPU, runs
// boxesoverlapLauncher, and copies the pairwise overlap matrix back into
// ans_overlap (the numpy array is updated in place).
//
// boxes_a     : 2-D float32 array, one box per row (7 floats per box assumed
//               by the ROS transpose path — TODO confirm column count)
// boxes_b     : 2-D float32 array, same layout as boxes_a
// ans_overlap : 2-D float32 output array, shape (num_a, num_b) presumed —
//               verify against boxesoverlapLauncher
void map_bev_overlap(const int num_a, pybind11::array_t<float> boxes_a, const int num_b, pybind11::array_t<float> boxes_b, pybind11::array_t<float> ans_overlap){
    pybind11::buffer_info bx_a = boxes_a.request();
    pybind11::buffer_info bx_b = boxes_b.request();
    pybind11::buffer_info ans  = ans_overlap.request();

    // Byte sizes derived from the numpy shapes (rows * cols * sizeof(float)).
    const size_t size_a   = bx_a.shape[0] * bx_a.shape[1] * sizeof(float);
    const size_t size_b   = bx_b.shape[0] * bx_b.shape[1] * sizeof(float);
    const size_t size_ans = ans.shape[0]  * ans.shape[1]  * sizeof(float);

    float* a_gpu   = nullptr;
    float* b_gpu   = nullptr;
    float* ans_gpu = nullptr;
    GPU_CHECK(cudaMalloc(&a_gpu,   size_a));
    GPU_CHECK(cudaMalloc(&b_gpu,   size_b));
    GPU_CHECK(cudaMalloc(&ans_gpu, size_ans));

    // Host views of the numpy buffers; inputs are read-only.
    const float* a_ptr   = reinterpret_cast<const float*>(bx_a.ptr);
    const float* b_ptr   = reinterpret_cast<const float*>(bx_b.ptr);
    float*       ans_ptr = reinterpret_cast<float*>(ans.ptr);

    const int an = bx_a.shape[0];
    const int bn = bx_b.shape[0];

#ifdef ROS
    // Under ROS the input buffers are column-major; transpose them into
    // row-major (rows x 7) staging buffers before uploading to the device.
    float* a_row_ptr = (float*)malloc(size_a);
    for (int ii = 0; ii < an; ii++){
        for (int jj = 0; jj < 7; jj++){
            a_row_ptr[ii * 7 + jj] = a_ptr[jj * an + ii];
        }
    }
    float* b_row_ptr = (float*)malloc(size_b);
    for (int ii = 0; ii < bn; ii++){
        for (int jj = 0; jj < 7; jj++){
            b_row_ptr[ii * 7 + jj] = b_ptr[jj * bn + ii];
        }
    }
    GPU_CHECK(cudaMemcpy(a_gpu, a_row_ptr, size_a, cudaMemcpyHostToDevice));
    GPU_CHECK(cudaMemcpy(b_gpu, b_row_ptr, size_b, cudaMemcpyHostToDevice));
    // BUG FIX: these frees were previously placed after the #endif, where
    // a_row_ptr/b_row_ptr do not exist in non-ROS builds (compile error).
    // cudaMemcpy is synchronous here, so freeing immediately is safe.
    free(a_row_ptr);
    free(b_row_ptr);
#else
    GPU_CHECK(cudaMemcpy(a_gpu, a_ptr, size_a, cudaMemcpyHostToDevice));
    GPU_CHECK(cudaMemcpy(b_gpu, b_ptr, size_b, cudaMemcpyHostToDevice));
#endif

    boxesoverlapLauncher(num_a, a_gpu, num_b, b_gpu, ans_gpu, ans.shape[0] * ans.shape[1], bx_a.shape[0], bx_b.shape[0]);
    // Kernel launches report configuration errors asynchronously; surface
    // them here instead of at the next unrelated CUDA call.
    GPU_CHECK(cudaGetLastError());

    // Blocking D2H copy also synchronizes with the kernel above.
    GPU_CHECK(cudaMemcpy(ans_ptr, ans_gpu, size_ans, cudaMemcpyDeviceToHost));

    GPU_CHECK(cudaFree(a_gpu));
    GPU_CHECK(cudaFree(b_gpu));
    GPU_CHECK(cudaFree(ans_gpu));
}
//void map_bev_overlap(const int num_a, pybind11::array_t<float> boxes_a,const int num_b, pybind11::array_t<float> boxes_b, pybind11::array_t<float> ans_overlap){
//
// pybind11::buffer_info bx_a = boxes_a.request();
// pybind11::buffer_info bx_b = boxes_b.request();
// pybind11::buffer_info ans = ans_overlap.request();
//
// int size_a = bx_a.shape[0] * bx_a.shape[1] * sizeof(float);
// int size_b = bx_b.shape[0] * bx_b.shape[1] * sizeof(float);
// int size_ans = ans.shape[0] * ans.shape[1] * sizeof(float);
//
// float* a_gpu;
// float* b_gpu;
// float* ans_gpu;
//
// GPU_CHECK(cudaMalloc(&a_gpu, size_a));
// GPU_CHECK(cudaMalloc(&b_gpu, size_b));
// GPU_CHECK(cudaMalloc(&ans_gpu, size_ans));
//
//
////float* a_ptr = reinterpret_cast<float*>(bx_a.ptr);
//
// const float* a_ptr = reinterpret_cast<const float*>(bx_a.ptr);
// float* b_ptr = reinterpret_cast<float*>(bx_b.ptr);
// float* ans_ptr = reinterpret_cast<float*>(ans.ptr);
//
// int an = bx_a.shape[0];
// int bn = bx_b.shape[0];
//
// // A AND B POINTERS ARE COLUMN-BASED WHEN IN ROS, CONVERTING THIS TO ROW-BASED.
//#ifdef ROS
// float* a_row_ptr;
// a_row_ptr = (float*)malloc(size_a);
// for (int ii = 0; ii < an; ii++){
// for (int jj = 0; jj < 7; jj++){
// *(a_row_ptr + jj + ii * 7) = *(a_ptr + ii + jj * an);
// }
// }
//
// float* b_row_ptr;
// b_row_ptr = (float*)malloc(size_b);
// for (int ii = 0; ii < bn; ii++){
// for (int jj = 0; jj < 7; jj++){
// *(b_row_ptr + jj + ii * 7) = *(b_ptr + ii + jj * bn);
// }
// }
//
// GPU_CHECK(cudaMemcpy(a_gpu, a_row_ptr, size_a, cudaMemcpyHostToDevice));
// GPU_CHECK(cudaMemcpy(b_gpu, b_row_ptr, size_b, cudaMemcpyHostToDevice));
//#else
// GPU_CHECK(cudaMemcpy(a_gpu, a_ptr, size_a, cudaMemcpyHostToDevice));
// GPU_CHECK(cudaMemcpy(b_gpu, b_ptr, size_b, cudaMemcpyHostToDevice));
//#endif
//
// boxesoverlapLauncher(num_a, a_gpu, num_b, b_gpu, ans_gpu, ans.shape[0] * ans.shape[1], bx_a.shape[0], bx_b.shape[0]);
//
// GPU_CHECK(cudaMemcpy(ans_ptr, ans_gpu, size_ans, cudaMemcpyDeviceToHost));
//
// GPU_CHECK(cudaFree(a_gpu));
// GPU_CHECK(cudaFree(b_gpu));
// GPU_CHECK(cudaFree(ans_gpu));
//
// free(a_row_ptr);
// free(b_row_ptr);
//}
void bev_overlap(const int num_a, float* boxes_a, const int num_b, float* boxes_b, float* ans_overlap)
{
......
......@@ -6,9 +6,9 @@
#include <cuda_runtime.h>
#include <cublas.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
//#include <pybind11/pybind11.h>
//#include <pybind11/numpy.h>
//#include <pybind11/stl.h>
#include "kalman_batch_ops.cu"
#include "common.h"
......@@ -189,108 +189,108 @@ void kalmanUpdateLauncher_batch(float* d_Z, //(bs, no)
MAKE SURE ALL INPUTS ARE TWO-DIM NUMPY ARRAY
*/
// Pybind11 wrapper for the batched Kalman update on the GPU.
//
// Z  : measurements, copied host -> device (read-only)
// X  : states, updated IN PLACE (device result copied back into the numpy array)
// P  : covariances, updated IN PLACE (device result copied back)
// HX : predicted measurements, copied host -> device (read-only)
// bs : batch size; ns : state dimension; no : observation dimension
//
// NOTE(review): all four inputs must be two-dimensional float32 numpy arrays
// whose shapes are consistent with (bs, ns, no) — TODO confirm at call sites;
// the byte sizes below are derived from the actual array shapes, not bs/ns/no.
void map_kalman_update_batch( pybind11::array_t<float> Z,
                              pybind11::array_t<float> X,  // in-place update
                              pybind11::array_t<float> P,  // in-place update
                              pybind11::array_t<float> HX,
                              const int bs,
                              const int ns,
                              const int no
                              ){
    pybind11::buffer_info ZZ  = Z.request();
    pybind11::buffer_info XX  = X.request();
    pybind11::buffer_info PP  = P.request();
    pybind11::buffer_info HXX = HX.request();

    const size_t size_ZZ  = ZZ.shape[0]  * ZZ.shape[1]  * sizeof(float);
    const size_t size_XX  = XX.shape[0]  * XX.shape[1]  * sizeof(float);
    const size_t size_PP  = PP.shape[0]  * PP.shape[1]  * sizeof(float);
    const size_t size_HXX = HXX.shape[0] * HXX.shape[1] * sizeof(float);

    float* host_Z  = reinterpret_cast<float*>(ZZ.ptr);
    float* host_X  = reinterpret_cast<float*>(XX.ptr);
    float* host_P  = reinterpret_cast<float*>(PP.ptr);
    float* host_HX = reinterpret_cast<float*>(HXX.ptr);

    float* device_Z  = nullptr;
    float* device_X  = nullptr;
    float* device_P  = nullptr;
    float* device_HX = nullptr;
    GPU_CHECK(cudaMalloc(&device_Z,  size_ZZ));
    GPU_CHECK(cudaMalloc(&device_X,  size_XX));
    GPU_CHECK(cudaMalloc(&device_P,  size_PP));
    GPU_CHECK(cudaMalloc(&device_HX, size_HXX));

    GPU_CHECK(cudaMemcpy(device_Z,  host_Z,  size_ZZ,  cudaMemcpyHostToDevice));
    GPU_CHECK(cudaMemcpy(device_X,  host_X,  size_XX,  cudaMemcpyHostToDevice));
    GPU_CHECK(cudaMemcpy(device_P,  host_P,  size_PP,  cudaMemcpyHostToDevice));
    GPU_CHECK(cudaMemcpy(device_HX, host_HX, size_HXX, cudaMemcpyHostToDevice));

    kalmanUpdateLauncher_batch(device_Z, device_X, device_P, device_HX, bs, ns, no);
    // Surface kernel-launch errors immediately rather than at a later call.
    GPU_CHECK(cudaGetLastError());

    // Only X and P are produced by the update; copy them back in place.
    // These blocking copies also synchronize with the kernel above.
    GPU_CHECK(cudaMemcpy(host_X, device_X, size_XX, cudaMemcpyDeviceToHost));
    GPU_CHECK(cudaMemcpy(host_P, device_P, size_PP, cudaMemcpyDeviceToHost));

    GPU_CHECK(cudaFree(device_Z));
    GPU_CHECK(cudaFree(device_X));
    GPU_CHECK(cudaFree(device_P));
    GPU_CHECK(cudaFree(device_HX));

#ifdef DEBUG
    // BUG FIX: the previous DEBUG block referenced H, host_H and size_HH,
    // none of which exist in this function, so any -DDEBUG build failed to
    // compile. Dump the in-place-updated state matrix X instead.
    std::cout << "=== kalman update gpu: updated X (" << XX.shape[0]
              << " x " << XX.shape[1] << ") ===\n";
    for (int i = 0; i < XX.shape[0]; i++){
        for (int j = 0; j < XX.shape[1]; j++){
            std::cout << *(host_X + i * XX.shape[1] + j) << " ";
        }
        std::cout << "\n";
    }
#endif
    // ATTENTION: ORDER COULD BE CHANGED IN ROS!
}
//void map_kalman_update_batch( pybind11::array_t<float> Z,
// pybind11::array_t<float> X, // in-place update
// pybind11::array_t<float> P, // in-place update
// pybind11::array_t<float> HX,
// const int bs,
// const int ns,
// const int no
// ){
//
// pybind11::buffer_info ZZ = Z.request();
// pybind11::buffer_info XX = X.request();
// pybind11::buffer_info PP = P.request();
// pybind11::buffer_info HXX = HX.request();
//
// int size_ZZ = ZZ.shape[0] * ZZ.shape[1] * sizeof(float);
// int size_XX = XX.shape[0] * XX.shape[1] * sizeof(float);
// int size_PP = PP.shape[0] * PP.shape[1] * sizeof(float);
// int size_HXX = HXX.shape[0] * HXX.shape[1] * sizeof(float);
// // std::cout << "size_HXX: " << size_HXX <<"\n";
//
// float* host_Z = reinterpret_cast<float*>(ZZ.ptr);
// float* host_X = reinterpret_cast<float*>(XX.ptr);
// float* host_P = reinterpret_cast<float*>(PP.ptr);
// float* host_HX = reinterpret_cast<float*>(HXX.ptr);
//
// float* device_Z;
// float* device_X;
// float* device_P;
// float* device_HX;
//
// GPU_CHECK(cudaMalloc(&device_Z, size_ZZ));
// GPU_CHECK(cudaMalloc(&device_X, size_XX));
// GPU_CHECK(cudaMalloc(&device_P, size_PP));
// GPU_CHECK(cudaMalloc(&device_HX, size_HXX));
//
// GPU_CHECK(cudaMemcpy(device_Z, host_Z, size_ZZ, cudaMemcpyHostToDevice));
// GPU_CHECK(cudaMemcpy(device_X, host_X, size_XX, cudaMemcpyHostToDevice));
// GPU_CHECK(cudaMemcpy(device_P, host_P, size_PP, cudaMemcpyHostToDevice));
// GPU_CHECK(cudaMemcpy(device_HX, host_HX, size_HXX, cudaMemcpyHostToDevice));
//
// kalmanUpdateLauncher_batch(device_Z, device_X, device_P, device_HX, bs, ns, no);
//
// GPU_CHECK(cudaMemcpy(host_X, device_X, size_XX, cudaMemcpyDeviceToHost));
// GPU_CHECK(cudaMemcpy(host_P, device_P, size_PP, cudaMemcpyDeviceToHost));
//
// GPU_CHECK(cudaFree(device_Z));
// GPU_CHECK(cudaFree(device_X));
// GPU_CHECK(cudaFree(device_P));
// GPU_CHECK(cudaFree(device_HX));
//
//#ifdef DEBUG
// int c_row = no;
// int c_col = ns;
// std::cout << "################################### kalman update gpu host_h before reinterpret_cast: no * ns" << "\n";
// auto a = H.mutable_unchecked<2>();
// for (int i = 0; i < a.shape(0); i++){
// std::cout << "[";
// for (int j = 0; j < a.shape(1); j++){
//
// std::cout << a(i, j)<< ", ";
// }
// std::cout << "],\n";
// }
//
//
// std::cout << "++++++++++++++++++++++++++++++++++ kalman update gpu host_h shape: no * ns" << "\n";
// for (int i=0;i<c_row;i++){
// for (int j=0;j<c_col;j++){
//
// std::cout<< *(host_H + i * c_col + j) << " ";
// }
// std::cout <<"\n";
// }
//
//
// float* tmp;
// tmp = (float*)malloc(size_HH);
// for (int ii = 0; ii < c_row; ii++){
// for (int jj = 0; jj < c_col; jj++){
// *(tmp + jj + ii * c_col) = *(host_H + ii + jj * c_row);
// }
// }
//
//
//
// std::cout << "-------------------to rowMajor host_e_row: " << "\n";
// for (int i=0;i<c_row;i++){
// for (int j=0;j<c_col;j++){
//
// std::cout<< *(tmp + i * c_col + j) << " ";
// }
// std::cout <<"\n";
// }
// free(tmp);
//#endif
// // ATTENTION ORDER COULD BE CHANGED IN ROS !
//
//
//
//
//
//}
//PYBIND11_MODULE(juefx_kalman_multi_shared, m)
//PYBIND11_MODULE(juefx_kalman_multi_1, m)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment