Commit a40e44b1 authored by oscar's avatar oscar

Commit the original GPU code

parent af8b06c2
#ifndef COMMON_H
#define COMMON_H
// headers in STL
#include <stdio.h>
#include <stdlib.h>
// headers in CUDA
#include <cuda_runtime_api.h>
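
// DIVUP(m, n) computes ceil(m / n) for positive integers, e.g. DIVUP(1000, 256) == 4;
// it is used below to size kernel grids. GPU_CHECK wraps CUDA calls and aborts with
// file/line information on any error.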
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define GPU_CHECK(ans) \
    { \
        GPUAssert((ans), __FILE__, __LINE__); \
    }
inline void GPUAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort)
            exit(code);
    }
}
// Memory allocation helper
inline void cpuMalloc(float** data_ptr, size_t size)
{
    *data_ptr = (float*)malloc(size);
}
// Identity matrix generation
inline void Identity(float* data, int n)
{
    // ones on the diagonal of a row-major n x n matrix, zeros elsewhere
    for (int i = 0; i < n * n; i++)
    {
        data[i] = (i % (n + 1) == 0) ? 1.0f : 0.0f;
    }
}
#endif // COMMON_H
#include "common.h"
#include <cuda_runtime.h>
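
// Step 1: innovation Y = Z - HX, computed element-wise with one thread per entry.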
__global__ void MatSub(float* C, const float* A, const float* B, int bs, int no)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < bs * no)
        C[i] = A[i] - B[i];
}
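
// Step 2: as the indexing implies, the kernels assume an observation model
// H = [I_no | 0] and measurement noise R = I_no, so S = H * P * H^T + R reduces
// to the top-left no x no block of P plus the identity. Only the diagonal of S is
// stored here, because only the diagonal is read in step 3.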
__global__ void KalmanUpdateS2(float* d_S, float* d_P, const int bs, const int ns, const int no){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= bs) return;
    int p_stride = ns + 1; // diagonal stride of a row-major ns x ns matrix (11 for ns = 10)
    int s_stride = no + 1; // diagonal stride of a row-major no x no matrix (8 for no = 7)
    for (int i = 0; i < no; i++){
        d_S[tid * no * no + i * s_stride] = d_P[tid * ns * ns + i * p_stride] + 1;
    }
}
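
// Step 3: K = P * H^T * S^-1. With H = [I_no | 0] and a diagonal S, the useful
// entries of K are the diagonal of its top no x no block (from the top-left block
// of P) and the diagonal of its bottom (ns - no) x no block (from the bottom-left
// block of P); only those entries are written here and read in steps 4 and 5.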
__global__ void KalmanUpdateS3(float* d_K, float* d_P, float* d_S, const int bs, const int ns, const int no){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= bs) return;
    int p_stride = ns + 1; // diagonal stride of a row-major ns x ns matrix (11 for ns = 10)
    int k_stride = no + 1; // diagonal stride within a no-column block (8 for no = 7)
    int s_stride = no + 1;
    for (int i = 0; i < ns; i++){
        if (i < no){
            // K[i][i] = P[i][i] / S[i][i]
            d_K[tid * ns * no + i * k_stride] = d_P[tid * ns * ns + i * p_stride] / d_S[tid * no * no + i * s_stride];
        }
        else{
            // K[i][i - no] = P[i][i - no] / S[i - no][i - no]
            d_K[tid * ns * no + no * no + (i - no) * k_stride] =
                d_P[tid * ns * ns + ns * no + (i - no) * p_stride] / d_S[tid * no * no + (i - no) * s_stride];
        }
    }
}
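
// Step 4: X = X + K * Y, using only the gain entries stored in step 3, so each
// state component is corrected by exactly one observation component.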
__global__ void KalmanUpdateS4(float* d_X, float* d_K, float* d_Y, const int bs, const int ns, const int no){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= bs) return;
    int k_stride = no + 1; // diagonal stride within a no-column block (8 for no = 7)
    for (int i = 0; i < ns; i++){
        if (i < no){
            d_X[tid * ns + i] += d_K[tid * ns * no + i * k_stride] * d_Y[tid * no + i];
        }
        else{
            d_X[tid * ns + i] += d_K[tid * ns * no + no * no + (i - no) * k_stride] * d_Y[tid * no + i - no];
        }
    }
}
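
// Step 5: P = (I - K * H) * P. The loop runs from the bottom rows (i >= no) down to
// the top rows (i < no) because the bottom-row updates read top-row entries of P
// that must still hold their pre-update values; this fixed ordering is why step 5
// cannot be parallelized over i the way steps 2-4 could be.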
__global__ void KalmanUpdateS5(float* d_P, float* d_K, const int bs, const int ns, const int no){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= bs) return;
    int p_stride = ns + 1; // diagonal stride of a row-major ns x ns matrix (11 for ns = 10)
    int k_stride = no + 1; // diagonal stride within a no-column block (8 for no = 7)
    for (int i = ns - 1; i > -1; i--){
        if (i < no){
            float IKH_tl = 1 - d_K[tid * ns * no + i * k_stride];
            d_P[tid * ns * ns + i * p_stride] *= IKH_tl;
            // the top-right cross terms P[i][i + no] only exist for i < ns - no
            if (i < ns - no) d_P[tid * ns * ns + no + i * p_stride] *= IKH_tl;
        }
        else{
            float IKH_bl = 0 - d_K[tid * ns * no + no * no + (i - no) * k_stride];
            float IKH_br = 1;
            float P_bl = d_P[tid * ns * ns + ns * no + (i - no) * p_stride];
            float P_br = d_P[tid * ns * ns + i * p_stride];
            float P_tl = d_P[tid * ns * ns + (i - no) * p_stride];
            float P_tr = d_P[tid * ns * ns + no + (i - no) * p_stride];
            d_P[tid * ns * ns + ns * no + (i - no) * p_stride] = IKH_bl * P_tl + IKH_br * P_bl;
            d_P[tid * ns * ns + i * p_stride] = IKH_bl * P_tr + IKH_br * P_br;
        }
    }
}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
//#include <cutil_inline.h>
#include <sys/time.h>
#include <cuda_runtime.h>
#include <cublas.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include "kalman_batch_ops.cu"
#include "common.h"

//#define DEBUG
//#define INROS
/*
ns = dim_x = 10, length of the state
no = dim_z = 7, length of the observation
Z: measurement (bs, no)
X: estimate (bs, ns)
P: uncertainty covariance (bs, ns * ns)
MAKE SURE ALL INPUTS ARE TWO-DIM NUMPY ARRAYS
*/
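// For reference, the dense update that the kernels below specialize (per batch
// element, assuming H = [I_no | 0] and R = I_no as hard-coded in the kernels):
//   Y = Z - H * X
//   S = H * P * H^T + R      (only the diagonal of S is computed)
//   K = P * H^T * S^-1       (only the block diagonals of K are computed)
//   X = X + K * Y
//   P = (I - K * H) * P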
void kalmanUpdateLauncher_batch(float* d_Z,  // (bs, no)
                                float* d_X,  // (bs, ns)
                                float* d_P,  // (bs, ns * ns)
                                float* d_HX, // (bs, no)
                                int bs,
                                int ns,
                                int no){
    float* d_Y; // innovation / error (bs, no)
    float* d_S; // intermediate results (bs, no * no)
    float* d_K; // Kalman gain (bs, ns * no)
    GPU_CHECK(cudaMalloc(&d_Y, no * bs * sizeof(float)));
    GPU_CHECK(cudaMalloc(&d_S, no * no * bs * sizeof(float)));
    GPU_CHECK(cudaMalloc(&d_K, ns * no * bs * sizeof(float)));
    // Steps 2-4 could also be launched with ''int blocksPerGrid = bs; int threadsPerBlock = ns'',
    // which would eliminate the per-thread loops; step 5 cannot, because its rows
    // must be processed in a fixed execution order.
    int threadsPerBlock = 256;
    int blocksPerGridSub = DIVUP(bs * no, threadsPerBlock);
    // step 1
    MatSub<<<blocksPerGridSub, threadsPerBlock>>>(d_Y, d_Z, d_HX, bs, no); // (bs, no) - (bs, no)
#ifdef DEBUG
    cudaDeviceSynchronize();
    std::cout << "------------------------ step 1 d_Y (bs, no) : \n";
    float* tmp_y = (float*)malloc(no * bs * sizeof(float));
    GPU_CHECK(cudaMemcpy(tmp_y, d_Y, no * bs * sizeof(float), cudaMemcpyDeviceToHost));
    for (int i = 0; i < bs; i++){
        std::cout << "batch " << i << ":\n";
        std::cout << "[";
        for (int j = 0; j < no; j++){
            std::cout << tmp_y[i * no + j] << ", ";
        }
        std::cout << "],\n";
    }
    std::cout << "\n";
    free(tmp_y);
#endif
    // step 2 S = H * P * H^T + R
    int blocksPerGrid2 = DIVUP(bs * no, threadsPerBlock);
    KalmanUpdateS2<<<blocksPerGrid2, threadsPerBlock>>>(d_S, d_P, bs, ns, no); // (bs, no * no) + (bs, no * no)
#ifdef DEBUG
    cudaDeviceSynchronize();
    std::cout << "------------------------ step 2 d_S (bs, no * no) : \n";
    float* tmp_s = (float*)malloc(no * no * bs * sizeof(float));
    GPU_CHECK(cudaMemcpy(tmp_s, d_S, no * no * bs * sizeof(float), cudaMemcpyDeviceToHost));
    for (int i = 0; i < bs; i++){
        std::cout << "[";
        for (int j = 0; j < no; j++){
            std::cout << "[";
            for (int k = 0; k < no; k++){
                std::cout << tmp_s[i * no * no + j * no + k] << ", ";
            }
            std::cout << "],\n";
        }
        std::cout << "],\n";
    }
    std::cout << "\n";
    free(tmp_s);
#endif
    // step 3 K = P * H^T * S^-1
    int blocksPerGrid3 = DIVUP(bs * ns, threadsPerBlock);
    KalmanUpdateS3<<<blocksPerGrid3, threadsPerBlock>>>(d_K, d_P, d_S, bs, ns, no); // (bs, ns * no)
#ifdef DEBUG
    cudaDeviceSynchronize();
    std::cout << "------------------------ step 3 d_K (bs, ns * no) : \n";
    float* tmp_k = (float*)malloc(no * ns * bs * sizeof(float));
    GPU_CHECK(cudaMemcpy(tmp_k, d_K, ns * no * bs * sizeof(float), cudaMemcpyDeviceToHost));
    for (int i = 0; i < bs; i++){
        std::cout << "[";
        for (int j = 0; j < ns; j++){
            std::cout << "[";
            for (int k = 0; k < no; k++){
                std::cout << tmp_k[i * ns * no + j * no + k] << ", ";
            }
            std::cout << "],\n";
        }
        std::cout << "],\n";
    }
    std::cout << "\n";
    free(tmp_k);
#endif
    // step 4 X = X + K * Y
    int blocksPerGrid4 = DIVUP(bs * ns, threadsPerBlock);
    KalmanUpdateS4<<<blocksPerGrid4, threadsPerBlock>>>(d_X, d_K, d_Y, bs, ns, no); // (bs, ns) + (bs, ns)
#ifdef DEBUG
    cudaDeviceSynchronize();
    std::cout << "------------------------ step 4 result d_X (bs, ns) : \n";
    float* tmp_x = (float*)malloc(ns * bs * sizeof(float));
    GPU_CHECK(cudaMemcpy(tmp_x, d_X, ns * bs * sizeof(float), cudaMemcpyDeviceToHost));
    for (int i = 0; i < bs; i++){
        std::cout << "batch " << i << ":\n";
        std::cout << "[";
        for (int j = 0; j < ns; j++){
            std::cout << tmp_x[i * ns + j] << ", ";
        }
        std::cout << "],\n";
    }
    std::cout << "\n";
    free(tmp_x);
#endif
    // step 5 P = P - K * H * P
    int blocksPerGrid5 = DIVUP(bs, threadsPerBlock);
    KalmanUpdateS5<<<blocksPerGrid5, threadsPerBlock>>>(d_P, d_K, bs, ns, no); // (bs, ns * ns) - (bs, ns * ns)
#ifdef DEBUG
    cudaDeviceSynchronize();
    std::cout << "------------------------ step 5 result d_P (bs, ns * ns) : \n";
    float* tmp_res = (float*)malloc(ns * ns * bs * sizeof(float));
    GPU_CHECK(cudaMemcpy(tmp_res, d_P, ns * ns * bs * sizeof(float), cudaMemcpyDeviceToHost));
    for (int i = 0; i < bs; i++){
        std::cout << "batch " << i << ":\n";
        std::cout << "[";
        for (int j = 0; j < ns; j++){
            std::cout << "[";
            for (int k = 0; k < ns; k++){
                std::cout << tmp_res[i * ns * ns + j * ns + k] << ", ";
            }
            std::cout << "],\n";
        }
        std::cout << "],\n";
    }
    std::cout << "\n";
    free(tmp_res);
#endif
    GPU_CHECK(cudaFree(d_Y));
    GPU_CHECK(cudaFree(d_K));
    GPU_CHECK(cudaFree(d_S));
}
/*
ns = dim_x = 10, length of the state
no = dim_z = 7, length of the observation
Z: measurement (bs, 7)
X: estimate (bs, 10)
P: uncertainty covariance (bs, 100)
MAKE SURE ALL INPUTS ARE TWO-DIM NUMPY ARRAYS
*/
void map_kalman_update_batch(pybind11::array_t<float> Z,
                             pybind11::array_t<float> X, // in-place update
                             pybind11::array_t<float> P, // in-place update
                             pybind11::array_t<float> HX,
                             const int bs,
                             const int ns,
                             const int no){
    pybind11::buffer_info ZZ = Z.request();
    pybind11::buffer_info XX = X.request();
    pybind11::buffer_info PP = P.request();
    pybind11::buffer_info HXX = HX.request();
    int size_ZZ = ZZ.shape[0] * ZZ.shape[1] * sizeof(float);
    int size_XX = XX.shape[0] * XX.shape[1] * sizeof(float);
    int size_PP = PP.shape[0] * PP.shape[1] * sizeof(float);
    int size_HXX = HXX.shape[0] * HXX.shape[1] * sizeof(float);
    // std::cout << "size_HXX: " << size_HXX << "\n";
    float* host_Z = reinterpret_cast<float*>(ZZ.ptr);
    float* host_X = reinterpret_cast<float*>(XX.ptr);
    float* host_P = reinterpret_cast<float*>(PP.ptr);
    float* host_HX = reinterpret_cast<float*>(HXX.ptr);
    float* device_Z;
    float* device_X;
    float* device_P;
    float* device_HX;
    GPU_CHECK(cudaMalloc(&device_Z, size_ZZ));
    GPU_CHECK(cudaMalloc(&device_X, size_XX));
    GPU_CHECK(cudaMalloc(&device_P, size_PP));
    GPU_CHECK(cudaMalloc(&device_HX, size_HXX));
    GPU_CHECK(cudaMemcpy(device_Z, host_Z, size_ZZ, cudaMemcpyHostToDevice));
    GPU_CHECK(cudaMemcpy(device_X, host_X, size_XX, cudaMemcpyHostToDevice));
    GPU_CHECK(cudaMemcpy(device_P, host_P, size_PP, cudaMemcpyHostToDevice));
    GPU_CHECK(cudaMemcpy(device_HX, host_HX, size_HXX, cudaMemcpyHostToDevice));
    kalmanUpdateLauncher_batch(device_Z, device_X, device_P, device_HX, bs, ns, no);
    GPU_CHECK(cudaMemcpy(host_X, device_X, size_XX, cudaMemcpyDeviceToHost));
    GPU_CHECK(cudaMemcpy(host_P, device_P, size_PP, cudaMemcpyDeviceToHost));
    GPU_CHECK(cudaFree(device_Z));
    GPU_CHECK(cudaFree(device_X));
    GPU_CHECK(cudaFree(device_P));
    GPU_CHECK(cudaFree(device_HX));
#ifdef DEBUG
    // print the updated state copied back from the device
    std::cout << "################################### kalman update gpu host_X after update: bs * ns" << "\n";
    for (int i = 0; i < bs; i++){
        std::cout << "[";
        for (int j = 0; j < ns; j++){
            std::cout << *(host_X + i * ns + j) << ", ";
        }
        std::cout << "],\n";
    }
#endif
    // ATTENTION: ORDER COULD BE CHANGED IN ROS!
}
//PYBIND11_MODULE(juefx_kalman_multi_shared, m)
//PYBIND11_MODULE(juefx_kalman_multi_1, m)
PYBIND11_MODULE(juefx_kalman_batch, m)
{
    m.def("kalman_update_batch", &map_kalman_update_batch);
}
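
// A minimal usage sketch from Python (hypothetical values; assumes all arrays are
// C-contiguous float32, two-dimensional, and that X and P are updated in place):
//   import numpy as np
//   import juefx_kalman_batch as jkb
//   bs, ns, no = 4, 10, 7
//   Z  = np.random.rand(bs, no).astype(np.float32)
//   X  = np.zeros((bs, ns), dtype=np.float32)
//   P  = np.tile(np.eye(ns, dtype=np.float32).reshape(1, ns * ns), (bs, 1))
//   HX = np.zeros((bs, no), dtype=np.float32)
//   jkb.kalman_update_batch(Z, X, P, HX, bs, ns, no)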