提交更新

ffcf8fed · oscar · 8208ce05 · ffcf8fed · ffcf8fed · ffcf8fed
Commit ffcf8fed authored Mar 30, 2022 by oscar
15 changed files
--- a/src/BaseTracker/BaseTrack.cpp
+++ b/src/BaseTracker/BaseTrack.cpp
+#include "BaseTrack.h"
+
+
+// Stores the state and observation dimensions. The constructor body is empty:
+// derived classes are expected to assign the algorithm matrices themselves.
+BaseTrack::BaseTrack(unsigned int num_states, unsigned int num_obs)
+    : m_num_states(num_states),
+      m_num_obs(num_obs)
+{
+}
+
+// Seeds the filter state from an initial measurement.
+// The values of `data` are written to the leading entries of kf_->x_ and the
+// call is counted as one hit and one update. Nothing happens when kf_ is unset
+// or when `data` holds more than m_num_obs values.
+void BaseTrack::Init(const std::vector<float>& data)
+{
+    if (!kf_)
+        return;
+    const int count = data.size();
+    if (count > m_num_obs)
+        return;
+    Eigen::VectorXf initial = Eigen::VectorXf::Zero(count);
+    int slot = 0;
+    for (const float value : data)
+        initial(slot++) = value;
+    kf_->x_.head(count) << initial;
+    ++hit_streak_;
+    ++m_update_count;
+}
+// Runs the filter's prediction step and advances the coasting bookkeeping.
+// Does nothing when kf_ is unset.
+void BaseTrack::Predict()
+{
+    if (!kf_)
+        return;
+    kf_->Predict();
+
+    // A track that was already coasting before this call loses its hit streak.
+    const bool was_coasting = coast_cycles_ > 0;
+    if (was_coasting)
+        hit_streak_ = 0;
+
+    // One more prediction since the last measurement update.
+    ++coast_cycles_;
+}
+// Feeds one measurement to the Kalman filter and records the hit.
+// Does nothing when kf_ is unset.
+void BaseTrack::Update(const std::vector<float>& data)
+{
+    if (!kf_)
+        return;
+
+    // A measurement arrived: stop coasting, extend the hit streak, count the update.
+    coast_cycles_ = 0;
+    ++hit_streak_;
+    ++m_update_count;
+
+    // Pack the measurement into an Eigen vector of the same length.
+    // NOTE(review): the original comment lists center_x, center_y, area, ratio;
+    // the actual layout depends on the derived track type - confirm.
+    const int count = data.size();
+    Eigen::VectorXf measurement = Eigen::VectorXf::Zero(count);
+    int slot = 0;
+    for (const float value : data)
+        measurement(slot++) = value;
+    kf_->Update(measurement);
+}
+// Hook that lets a track correct a raw measurement before it is used.
+// The base implementation applies no correction and copies `data` to `out`
+// unchanged, so callers that index `out` afterwards always find the measurement.
+// Derived classes override this to apply their own correction.
+void BaseTrack::UpdateDataCheck(const std::vector<float>& data, std::vector<float>& out)
+{
+    out = data;
+}
+// Applies the same counter changes as Update() without touching the filter:
+// resets the coast counter and increments the hit streak and update count.
+// Does nothing when kf_ is unset.
+void BaseTrack::UpdateHit()
+{
+    if (!kf_)
+        return;
+    coast_cycles_ = 0;   // a measurement was received
+    ++hit_streak_;
+    ++m_update_count;
+}
+
+// Replaces the contents of `data` with the first m_num_states entries of kf_->x_.
+// Returns 0 on success, -1 (leaving `data` untouched) when kf_ is unset.
+int BaseTrack::GetStateData(std::vector<float>& data)
+{
+    if (!kf_)
+        return -1;
+    data.clear();
+    int idx = 0;
+    while (idx < m_num_states)
+    {
+        data.push_back(kf_->x_[idx]);
+        ++idx;
+    }
+    return 0;
+}
+// Replaces the contents of `data` with the first m_num_obs entries of kf_->x_predict_.
+// Returns 0 on success, -1 (leaving `data` untouched) when kf_ is unset or when
+// m_num_obs exceeds m_num_states.
+int BaseTrack::GetPredictData(std::vector<float>& data)
+{
+    if (!kf_)
+        return -1;
+    if (m_num_obs > m_num_states)
+        return -1;
+    data.clear();
+    int idx = 0;
+    while (idx < m_num_obs)
+    {
+        data.push_back(kf_->x_predict_[idx]);
+        ++idx;
+    }
+    return 0;
+}
+
+// Returns the filter's NIS_ value, or 0 when kf_ is unset.
+float BaseTrack::GetNIS() const
+{
+    if (kf_)
+        return kf_->NIS_;
+    return 0;
+}
+// Returns the stored matching score m_prob.
+float BaseTrack::GetProb() const
+{
+    const float score = m_prob;
+    return score;
+}
+
+// A track is lost once it has coasted for more than m_kMaxCoastCycles predictions.
+bool BaseTrack::IsLost()
+{
+    return m_kMaxCoastCycles < coast_cycles_;
+}
+// A track is valid once it has been updated at least m_updateValidCount times.
+bool BaseTrack::IsValid()
+{
+    return !(m_update_count < m_updateValidCount);
+}
+// Replaces the contents of `data` with the first m_num_obs entries of the current
+// state kf_->x_. This base implementation never writes `obj_type`.
+// Returns 0 on success, -1 (leaving `data` untouched) when kf_ is unset or when
+// m_num_obs exceeds m_num_states.
+int BaseTrack::GetIouData(std::vector<float>& data, int& obj_type)
+{
+    if (!kf_)
+        return -1;
+    if (m_num_obs > m_num_states)
+        return -1;
+    data.clear();
+    int idx = 0;
+    while (idx < m_num_obs)
+    {
+        data.push_back(kf_->x_[idx]);
+        ++idx;
+    }
+    return 0;
+}
+
+// Returns the filter's num_states_, or 0 when kf_ is unset.
+int BaseTrack::GetStatesNum()
+{
+    if (kf_)
+        return kf_->num_states_;
+    return 0;
+}
+// Returns the filter's num_obs_, or 0 when kf_ is unset.
+int BaseTrack::GetObsNum()
+{
+    if (kf_)
+        return kf_->num_obs_;
+    return 0;
+}
+
+// Returns a pointer to the storage of the state vector kf_->x_, or nullptr when
+// kf_ is unset. The pointer aliases the filter's own data.
+float* BaseTrack::GetStatesXPtr()
+{
+    if (kf_)
+        return kf_->x_.data();
+    return nullptr;
+}
+// Returns a pointer to the storage of kf_->P_, or nullptr when kf_ is unset.
+// NOTE(review): despite the name this exposes P_ (presumably the covariance
+// matrix), not x_predict_ - confirm against kalman_filter.h.
+float* BaseTrack::GetPredictPtr()
+{
+    if (kf_)
+        return kf_->P_.data();
+    return nullptr;
+}
+// Returns a pointer to the storage of kf_->R_, or nullptr when kf_ is unset.
+float* BaseTrack::GetRPtr()
+{
+    if (kf_)
+        return kf_->R_.data();
+    return nullptr;
+}
--- a/src/BaseTracker/BaseTrack.h
+++ b/src/BaseTracker/BaseTrack.h
+#pragma once
+
+#include <eigen3/Eigen/Dense>
+#include "kalman_filter.h"
+#include <vector>
+#include <memory>
+
+#define _PI_ 3.1415926
+
+
+// Abstract base class for one tracked object backed by a Kalman filter (kf_).
+// The base class only stores dimensions and bookkeeping counters; kf_ starts as
+// nullptr and is expected to be created by the derived class.
+class BaseTrack 
+{
+public:
+    // Constructor: stores the number of states and observations.
+    BaseTrack(unsigned int num_states, unsigned int num_obs);
+
+    // Virtual because this is a polymorphic base: a derived track destroyed
+    // through a BaseTrack pointer must run the derived destructor.
+    virtual ~BaseTrack() = default;
+
+    //virtual std::shared_ptr<mytracker::KalmanFilter> InitKF(unsigned int num_states, unsigned int num_obs) { return nullptr; };
+    // Seeds the filter state from an initial measurement.
+    virtual void Init(const std::vector<float>& data);
+    // Runs the filter prediction step and advances the coast counter.
+    virtual void Predict();
+    // Runs the filter update step with a measurement and records the hit.
+    virtual void Update(const std::vector<float>& data);
+    // Corrects the input data before it is used as a measurement.
+    virtual void UpdateDataCheck(const std::vector<float>& data, std::vector<float>& out);
+    // Records a hit in the counters without running the filter update.
+    virtual void UpdateHit();
+    // Copies the filter state into `data`; returns 0 on success, -1 on failure.
+    virtual int GetStateData(std::vector<float>& data);
+    // Copies the predicted state into `data`; returns 0 on success, -1 on failure.
+    virtual int GetPredictData(std::vector<float>& data);
+    virtual float GetNIS() const;
+    virtual float GetProb() const;
+    // Whether the track is lost; a track becomes lost when it is not updated.
+    virtual bool IsLost();
+    // Whether the track has been updated often enough to count as valid.
+    virtual bool IsValid();
+
+    // Copies the data used for IoU computation into `data`; returns 0 on success.
+    virtual int GetIouData(std::vector<float>& data,int& obj_type);
+    
+    virtual int GetIouDataOrder(std::vector<int>& order) = 0;
+    virtual int GetKFDataOrder(std::vector<int>& order) = 0;
+
+    // IoU between this track and the given detection.
+    virtual double CalculateIou(const std::vector<float>& data) = 0;
+
+    int GetStatesNum();
+    int GetObsNum();
+
+    // Raw pointers into the filter's x_, P_ and R_ storage (nullptr without a filter).
+    float* GetStatesXPtr();
+    float* GetPredictPtr();
+    float* GetRPtr();
+
+    void SetIouThreshold(float threshold) { m_iou_threshold = threshold; }
+    void SetMaxCoastCycles(int cycles) { m_kMaxCoastCycles = cycles; }
+    void SetValidUpdateCount(int count) { m_updateValidCount = count; }
+    virtual void SetValues(std::vector<float>& data) {}
+
+    // Predictions since the last update, and consecutive updates.
+    int coast_cycles_ = 0, hit_streak_ = 0;
+
+    int m_update_count = 0;   // number of data updates so far
+
+    int m_num_states = 0;
+    int m_num_obs = 0;
+
+    float m_prob = 0.0f;      // matching score taken from the IoU computation
+
+    float m_iou_threshold = 0.01f;   // minimum IoU accepted as a match
+    int m_kMaxCoastCycles = 2;       // coast cycles after which the track counts as lost
+    int m_updateValidCount = 3;      // updates required before the track counts as valid
+
+
+    std::shared_ptr<mytracker::KalmanFilter> kf_ = nullptr;
+};
--- a/src/BaseTracker/BaseTracker.h
+++ b/src/BaseTracker/BaseTracker.h
+#pragma once
+
+
+#include <iostream>
+//#include <eigen3/Eigen/Dense>
+//#include "kalman_filter.h"
+#include <map>
+//#include "BaseTrack.h"
+#include <memory>
+#include <vector>
+#include "Iou.h"
+#include "LogBase.h"
+#include <memory.h>
+#include "Component.h"
+#ifdef _KF_IOU_CUDA_
+#include "bev_overlap_online.h"
+#include "kalman_update_batch_online.h"
+#endif
+
+#ifdef _USING_NSIGHT_
+#include <nvToolsExt.h>
+#endif
+
+// Generic multi-object tracker that owns a map from track id to track.
+// T is the concrete track type; the member functions defined in this header call
+// Predict/Update/Init/IsLost/CalculateIou and related members on it.
+template<class T>
+class BaseTracker
+{
+public:
+    BaseTracker() {}
+
+    // Non-zero selects the batched GPU code paths; 0 keeps the CPU paths.
+    void SetGPU(int gpu) { m_isGPU = gpu; }
+    void SetIouThreshold(float threshold) { m_iou_threshold = threshold; }
+    void SetMaxCoastCycles(int cycles) { m_kMaxCoastCycles = cycles; }
+    void SetValidUpdateCount(int count) { m_updateValidCount = count; }
+    void SetValues(std::vector<float>& values) { m_values = values; }
+
+    // One tracking step on a single list of detections.
+    int Run(const std::vector<std::vector<float> >& detections,
+            int _no/*number of observations*/,
+            int _ns/*number of states*/,
+            std::vector<uint64_t>& detectionsId,
+            std::map<uint64_t, int>& updateId,
+            std::vector<uint64_t>& lostId);
+
+    // One tracking step on separate high-score and low-score detection lists.
+    int Run(const std::vector<std::vector<float> >& dets_high,
+            const std::vector<std::vector<float> >& dets_low,
+            int _no/*number of observations*/,
+            int _ns/*number of states*/,
+            std::map<uint64_t, int>& update_high_ids,
+            std::map<uint64_t, int>& update_low_ids,
+            std::vector<uint64_t>& lostId);
+
+    // Mutable access to the id -> track map.
+    std::map<uint64_t, std::shared_ptr<T> >& GetStates();
+
+
+    // Matches detections to tracks; fills `matched` (track id -> detection index)
+    // and `unmatched_det` (detection indices).
+    void AssociateDetectionsToTrackers(const std::vector<std::vector<float> >& detections,
+                                       int _no/*number of observations*/,
+                                       int _ns/*number of states*/,
+                                       std::map<uint64_t, std::shared_ptr<T> >& tracks,
+                                       std::map<uint64_t, int>& matched,
+                                       std::vector<int>& unmatched_det);
+
+    // Two-pass matching: high-score detections first, then low-score detections
+    // against the tracks the first pass left unmatched.
+    void AssociateDetectionsToTrackersEx(const std::vector<std::vector<float> >& dets_high,
+                                         const std::vector<std::vector<float> >& dets_low,
+                                         int _no/*number of observations*/,
+                                         int _ns/*number of states*/,
+                                         std::map<uint64_t, std::shared_ptr<T> >& tracks,
+                                         std::map<uint64_t, int>& high_matched,
+                                         std::map<uint64_t, int>& low_matched,
+                                         std::vector<int>& unmatched_det);
+
+public:
+
+    std::map<uint64_t, std::shared_ptr<T> > m_tracker;
+
+    uint64_t m_countId = 0;          // running counter used to generate object ids
+
+    int m_isGPU = 0;                 // GPU paths are off by default
+
+    float m_iou_threshold = 0.01;    // minimum IoU accepted as a match
+    int m_kMaxCoastCycles = 2;       // coast cycles after which a track counts as lost
+    int m_updateValidCount = 3;      // updates required before a track counts as valid
+
+    std::vector<float> m_values;     // configuration values for tuning matrix parameters
+};
+
+
+/**
+ * Advances the tracker by one frame using a single list of detections.
+ *
+ * Steps: predict every existing track, associate detections with tracks, update
+ * the matched tracks (one by one on the CPU path, or as one batch handed to
+ * kalman_update_batch when m_isGPU is non-zero), create a new track for every
+ * unmatched detection, and finally remove every track that reports IsLost().
+ * When `detections` is empty only the predict and remove steps run.
+ *
+ * @param detections    one measurement vector per detection
+ * @param _no           number of observations per measurement
+ * @param _ns           number of filter states per track
+ * @param detectionsId  resized to detections.size() (when non-empty); entry i
+ *                      receives the id of the track bound to detection i
+ * @param updateId      receives track id -> detection index for every updated
+ *                      or newly created track
+ * @param lostId        receives the ids of the tracks removed by this call
+ * @return always 0
+ */
+template<class T>
+int BaseTracker<T>::Run(const std::vector<std::vector<float> >& detections, int _no/*number of observations*/, int _ns/*number of states*/, std::vector<uint64_t>& detectionsId, std::map<uint64_t, int>& updateId, std::vector<uint64_t>& lostId)
+{
+    // Removes every track that reports IsLost() and records its id in lostId.
+    auto prune_lost_tracks = [this, &lostId]()
+    {
+        for (auto it = m_tracker.begin(); it != m_tracker.end();)
+        {
+            if (it->second->IsLost())
+            {
+                lostId.push_back(it->first);
+                it = m_tracker.erase(it);
+            }
+            else
+            {
+                ++it;
+            }
+        }
+    };
+
+    /*** Predict internal tracks from previous frame ***/
+#ifdef _USING_NSIGHT_
+    nvtxRangePush("Run Predict");
+#endif
+    for (auto& track : m_tracker)
+    {
+        track.second->Predict();
+    }
+#ifdef _USING_NSIGHT_
+    nvtxRangePop();
+#endif
+    if (detections.empty())
+    {
+        prune_lost_tracks();
+        return 0;
+    }
+#ifdef _USING_NSIGHT_
+    nvtxRangePush("Run AssociateDetectionsToTrackers");
+#endif
+    detectionsId.resize(detections.size());
+    // Map between track ID and the index of the associated detection
+    std::map<uint64_t, int> matched;
+    // Indices of the detections that were not associated with any track
+    std::vector<int> unmatched_det;
+
+    // return values - matched, unmatched_det
+    AssociateDetectionsToTrackers(detections, _no, _ns, m_tracker, matched, unmatched_det);
+#ifdef _USING_NSIGHT_
+    nvtxRangePop();
+#endif
+    /*** Update tracks with associated detections ***/
+#ifdef _USING_NSIGHT_
+    nvtxRangePush("Run Update");
+#endif
+    if (m_isGPU == 0)
+    {
+        for (const auto& match : matched)
+        {
+            const uint64_t id = match.first;
+            const int det_index = match.second;
+            m_tracker[id]->Update(detections[det_index]);
+            detectionsId[det_index] = id;
+            updateId[id] = det_index;
+        }
+    }
+    else
+    {
+        const int bs = matched.size();
+        if (bs > 0)
+        {
+            int ns = 0;
+            const int no = _no;
+            // Batch buffers, one slice per matched track. std::vector owns the
+            // memory and zero-initialises it.
+            std::vector<float> Z(static_cast<size_t>(bs) * no);
+            std::vector<float> X(static_cast<size_t>(bs) * _ns);
+            std::vector<float> P(static_cast<size_t>(bs) * _ns * _ns);
+            std::vector<float> R(static_cast<size_t>(no) * no);
+            std::vector<float> HX(static_cast<size_t>(bs) * no);
+            int bs_i = 0;
+            for (const auto& match : matched)
+            {
+                const uint64_t id = match.first;
+                const int det_index = match.second;
+                const std::shared_ptr<T>& track = m_tracker[id];
+                const std::vector<float>& detection = detections[det_index];
+
+                std::vector<float> cre_det;
+                track->UpdateDataCheck(detection, cre_det);
+                // UpdateDataCheck is not guaranteed to produce `no` values; use the
+                // raw detection in that case instead of reading past the end.
+                const std::vector<float>& measurement =
+                    (cre_det.size() >= static_cast<size_t>(no)) ? cre_det : detection;
+                float* ptr_Z = Z.data() + bs_i * no;
+                for (int i = 0; i < no && i < static_cast<int>(measurement.size()); i++)
+                {
+                    ptr_Z[i] = measurement[i];
+                }
+                memcpy(X.data() + bs_i * _ns, track->GetStatesXPtr(), _ns * sizeof(float));
+                memcpy(P.data() + bs_i * _ns * _ns, track->GetPredictPtr(), _ns * _ns * sizeof(float));
+                // HX takes the first `no` entries of the state vector.
+                float* ptr_HX = HX.data() + bs_i * no;
+                memcpy(ptr_HX, track->GetStatesXPtr(), no * sizeof(float));
+                if (ns == 0)
+                {
+                    // The state count and R are taken from the first track and
+                    // shared by the whole batch.
+                    ns = track->GetStatesNum();
+                    float* _r = track->GetRPtr();
+                    memcpy(R.data(), _r, no * no * sizeof(float));
+                }
+                bs_i++;
+                // On the GPU path Update() is not called, so refresh its counters here.
+                track->UpdateHit();
+                detectionsId[det_index] = id;
+                updateId[id] = det_index;
+            }
+#ifdef _KF_IOU_CUDA_
+#ifdef _USING_NSIGHT_
+            nvtxRangePush("kalman_update_batch");
+#endif
+            kalman_update_batch(Z.data(), X.data(), P.data(), R.data(), HX.data(), bs, ns, no);
+#ifdef _USING_NSIGHT_
+            nvtxRangePop();
+#endif
+#endif
+            // Write the batch results back into each matched track, in the same order.
+            bs_i = 0;
+            for (const auto& match : matched)
+            {
+                const std::shared_ptr<T>& track = m_tracker[match.first];
+                memcpy(track->GetStatesXPtr(), X.data() + bs_i * _ns, _ns * sizeof(float));
+                memcpy(track->GetPredictPtr(), P.data() + bs_i * _ns * _ns, _ns * _ns * sizeof(float));
+                bs_i++;
+            }
+        }
+    }
+#ifdef _USING_NSIGHT_
+    nvtxRangePop();
+#endif
+    /*** Create new tracks for unmatched detections ***/
+    for (const auto& det : unmatched_det)
+    {
+        std::shared_ptr<T> trackPtr = std::make_shared<T>();
+        trackPtr->Init(detections[det]);
+        trackPtr->SetIouThreshold(m_iou_threshold);
+        trackPtr->SetMaxCoastCycles(m_kMaxCoastCycles);
+        trackPtr->SetValidUpdateCount(m_updateValidCount);
+        trackPtr->SetValues(m_values);
+        // Register the new track under a freshly generated ID
+        const uint64_t newId = ++m_countId;
+        m_tracker[newId] = trackPtr;
+        detectionsId[det] = newId;
+        updateId[newId] = det;
+    }
+
+    /*** Delete lost tracks ***/
+    prune_lost_tracks();
+    return 0;
+}
+
+// Gives callers direct, mutable access to the id -> track map owned by this tracker.
+template<class T>
+std::map<uint64_t, std::shared_ptr<T> >& BaseTracker<T>::GetStates()
+{
+    auto& all_tracks = m_tracker;
+    return all_tracks;
+}
+
+/**
+ * Matches detections to tracks by IoU using Hungarian assignment.
+ *
+ * An IoU matrix (rows: detections, columns: tracks in map iteration order) is
+ * filled either per pair through T::CalculateIou (m_isGPU == 0) or in one batch
+ * through bev_overlap (m_isGPU != 0, effective only when _KF_IOU_CUDA_ is
+ * defined; otherwise every batched IoU stays 0). An assigned pair is accepted
+ * when its IoU reaches the track's m_iou_threshold.
+ *
+ * @param detections     one measurement vector per detection
+ * @param _no            number of observations per measurement
+ * @param _ns            number of states (not used here)
+ * @param tracks         id -> track map; m_prob of a matched track is set to its IoU
+ * @param matched        receives track id -> detection index
+ * @param unmatched_det  receives the indices of detections without an accepted match
+ */
+template<class T>
+void BaseTracker<T>::AssociateDetectionsToTrackers(const std::vector<std::vector<float> >& detections, int _no/*number of observations*/, int _ns/*number of states*/, std::map<uint64_t, std::shared_ptr<T> >& tracks, std::map<uint64_t, int>& matched, std::vector<int>& unmatched_det)
+{
+    const size_t det_count = detections.size();
+    const size_t trk_count = tracks.size();
+
+    if (tracks.empty())
+    {
+        // Nothing to match against: every detection is unmatched.
+        for (size_t i = 0; i < det_count; i++)
+        {
+            unmatched_det.push_back(static_cast<int>(i));
+        }
+        return;
+    }
+
+    // IoU and association matrices, det_count x trk_count, zero-initialised.
+    std::vector<std::vector<float>> iou_matrix(det_count, std::vector<float>(trk_count));
+    std::vector<std::vector<float>> association(det_count, std::vector<float>(trk_count));
+
+    if (m_isGPU == 0)
+    {
+        // row - detection, column - tracks
+        for (size_t i = 0; i < det_count; i++)
+        {
+            size_t j = 0;
+            for (const auto& trk : tracks)
+            {
+                iou_matrix[i][j] = trk.second->CalculateIou(detections[i]);
+                j++;
+            }
+        }
+    }
+    else
+    {
+        // Entry k of the three vectors below describes the same track, and
+        // tracker_col[k] is that track's column in iou_matrix. A track whose
+        // GetIouData fails is skipped in all three, so its column keeps IoU 0
+        // and the remaining columns stay aligned with `tracks`.
+        std::vector<std::vector<float> > tracker_states;
+        std::vector<int> tracker_type;
+        std::vector<size_t> tracker_col;
+        size_t col = 0;
+        for (auto& iter : tracks)
+        {
+            std::vector<float> measure;
+            int obj_type = 0;
+            if (iter.second->GetIouData(measure, obj_type) == 0)
+            {
+                tracker_states.emplace_back(measure);
+                tracker_type.push_back(obj_type);
+                tracker_col.push_back(col);
+            }
+            else
+            {
+                SDK_LOG(SDK_INFO, "GetMeasureData failed");
+            }
+            ++col;
+        }
+        const int measure_size = _no;
+        const int tra_size = tracker_states.size() > 0 ? tracker_states[0].size() : 0;
+        // NOTE(review): on this early return the detections end up neither in
+        // `matched` nor in `unmatched_det`; confirm that dropping them is intended.
+        if (tra_size == 0 || measure_size != tra_size)
+            return;
+
+        const size_t state_count = tracker_states.size();
+        // Flat row-major buffers; zero-initialised so IoU reads are defined even
+        // when bev_overlap is compiled out.
+        std::vector<float> detect_buf(det_count * measure_size, 0.0f);
+        std::vector<float> tracker_buf(state_count * tra_size, 0.0f);
+        std::vector<float> iou_buf(det_count * state_count, 0.0f);
+
+        std::vector<int> det_type;
+        for (size_t i = 0; i < det_count; i++)
+        {
+            std::vector<float> out;
+            int ob_type = 0;
+            T::MeasureIouData(detections[i], out, ob_type);
+            // Never write more than measure_size values into one detection row.
+            for (size_t j = 0; j < out.size() && j < static_cast<size_t>(measure_size); j++)
+            {
+                detect_buf[i * measure_size + j] = out[j];
+            }
+            det_type.push_back(ob_type);
+        }
+        for (size_t k = 0; k < state_count; k++)
+        {
+            for (size_t j = 0; j < static_cast<size_t>(tra_size) && j < tracker_states[k].size(); j++)
+            {
+                tracker_buf[k * tra_size + j] = tracker_states[k][j];
+            }
+        }
+#ifdef _KF_IOU_CUDA_
+#ifdef _USING_NSIGHT_
+        nvtxRangePush("bev_overlap");
+#endif
+        bev_overlap(detections.size(), detect_buf.data(), tracker_states.size(), tracker_buf.data(), iou_buf.data());
+#ifdef _USING_NSIGHT_
+        nvtxRangePop();
+#endif
+#endif
+        // Keep the IoU only for pairs of the same object type.
+        for (size_t i = 0; i < det_count; i++)
+        {
+            for (size_t k = 0; k < state_count; k++)
+            {
+                const bool same_type = det_type[i] == tracker_type[k];
+                iou_matrix[i][tracker_col[k]] = same_type ? iou_buf[i * state_count + k] : 0.0f;
+            }
+        }
+    }
+
+    // Find association
+    HungarianMatching(iou_matrix, detections.size(), tracks.size(), association);
+
+    for (size_t i = 0; i < det_count; i++)
+    {
+        bool matched_flag = false;
+        size_t j = 0;
+        for (auto& trk : tracks)
+        {
+            // A 0 entry marks the pair chosen by the assignment.
+            if (0 == association[i][j])
+            {
+                // Filter out matches with low IOU
+                if (iou_matrix[i][j] >= trk.second->m_iou_threshold)
+                {
+                    matched[trk.first] = i;
+                    trk.second->m_prob = iou_matrix[i][j];
+                    matched_flag = true;
+                }
+                // It builds 1 to 1 association, so we can break from here
+                break;
+            }
+            j++;
+        }
+        // if detection cannot match with any tracks
+        if (!matched_flag) {
+            unmatched_det.push_back(i);
+        }
+    }
+}
+
+/**
+ * Advances the tracker by one frame using separate high-score and low-score
+ * detection lists.
+ *
+ * Every track is predicted first. Unless both lists are empty, detections are
+ * associated through AssociateDetectionsToTrackersEx, matched tracks are
+ * updated, and each unmatched high-score detection starts a new track. Tracks
+ * that report IsLost() are removed at the end in either case.
+ *
+ * @param update_high_ids  receives track id -> index into dets_high (matched and new tracks)
+ * @param update_low_ids   receives track id -> index into dets_low (matched tracks)
+ * @param lostId           receives the ids of the removed tracks
+ * @return always 0
+ */
+template<class T>
+int BaseTracker<T>::Run(const std::vector<std::vector<float> >& dets_high, const std::vector<std::vector<float> >& dets_low, int _no/*number of observations*/, int _ns/*number of states*/, std::map<uint64_t, int>& update_high_ids, std::map<uint64_t, int>& update_low_ids, std::vector<uint64_t>& lostId)
+{
+#ifdef _USING_NSIGHT_
+    nvtxRangePush("Run Predict");
+#endif
+    for (auto it = m_tracker.begin(); it != m_tracker.end(); ++it)
+    {
+        it->second->Predict();
+    }
+#ifdef _USING_NSIGHT_
+    nvtxRangePop();
+#endif
+
+    const bool nothing_detected = dets_high.empty() && dets_low.empty();
+    if (!nothing_detected)
+    {
+#ifdef _USING_NSIGHT_
+        nvtxRangePush("Run AssociateDetectionsToTrackers");
+#endif
+        // track ID -> index of the associated detection, per score class
+        std::map<uint64_t, int> high_matched;
+        std::map<uint64_t, int> low_matched;
+        // indices (into dets_high) of detections without a track
+        std::vector<int> unmatched_det;
+
+        AssociateDetectionsToTrackersEx(dets_high, dets_low, _no, _ns, m_tracker, high_matched, low_matched, unmatched_det);
+#ifdef _USING_NSIGHT_
+        nvtxRangePop();
+#endif
+
+        /*** Update tracks with their associated detections ***/
+#ifdef _USING_NSIGHT_
+        nvtxRangePush("Run Update");
+#endif
+        for (auto it = high_matched.begin(); it != high_matched.end(); ++it)
+        {
+            const auto& id = it->first;
+            m_tracker[id]->Update(dets_high[it->second]);
+            update_high_ids[id] = it->second;
+        }
+        for (auto it = low_matched.begin(); it != low_matched.end(); ++it)
+        {
+            const auto& id = it->first;
+            m_tracker[id]->Update(dets_low[it->second]);
+            update_low_ids[id] = it->second;
+        }
+#ifdef _USING_NSIGHT_
+        nvtxRangePop();
+#endif
+
+        /*** Start a new track for every unmatched high-score detection ***/
+        for (const auto& det : unmatched_det)
+        {
+            std::shared_ptr<T> fresh = std::make_shared<T>();
+            fresh->Init(dets_high[det]);
+            fresh->SetIouThreshold(m_iou_threshold);
+            fresh->SetMaxCoastCycles(m_kMaxCoastCycles);
+            fresh->SetValidUpdateCount(m_updateValidCount);
+            fresh->SetValues(m_values);
+            // Register the track under a newly generated ID
+            uint64_t newId = ++m_countId;
+            m_tracker[newId] = fresh;
+            update_high_ids[newId] = det;
+        }
+    }
+
+    /*** Remove lost tracks ***/
+    auto it = m_tracker.begin();
+    while (it != m_tracker.end())
+    {
+        if (!it->second->IsLost())
+        {
+            it++;
+            continue;
+        }
+        lostId.push_back(it->first);
+        it = m_tracker.erase(it);
+    }
+    return 0;
+}
+
+/**
+ * Two-pass association of detections with tracks.
+ *
+ * Pass 1 matches the high-score detections against all tracks. Pass 2 matches
+ * the low-score detections against the tracks pass 1 left unmatched. Both
+ * passes use T::CalculateIou, Hungarian assignment and the per-track
+ * m_iou_threshold. Only high-score detections are reported as unmatched.
+ *
+ * @param high_matched   receives track id -> index into dets_high
+ * @param low_matched    receives track id -> index into dets_low
+ * @param unmatched_det  receives the indices of unmatched high-score detections
+ */
+template<class T>
+void BaseTracker<T>::AssociateDetectionsToTrackersEx(const std::vector<std::vector<float> >& dets_high, const std::vector<std::vector<float> >& dets_low, int _no/*number of observations*/, int _ns/*number of states*/, std::map<uint64_t, std::shared_ptr<T> >& tracks, std::map<uint64_t, int>& high_matched, std::map<uint64_t, int>& low_matched, std::vector<int>& unmatched_det)
+{
+    const size_t high_count = dets_high.size();
+    const size_t low_count = dets_low.size();
+
+    if (tracks.empty())
+    {
+        // No matching is done: every high-score detection is unmatched.
+        for (size_t idx = 0; idx < high_count; ++idx)
+        {
+            unmatched_det.push_back(static_cast<int>(idx));
+        }
+        return;
+    }
+
+    // ---- Pass 1: high-score detections (skipped when there are none) ----
+    if (!dets_high.empty())
+    {
+        const size_t track_count = tracks.size();
+        // rows: high-score detections, columns: tracks in map order
+        std::vector<std::vector<float>> iou_matrix_high(high_count, std::vector<float>(track_count));
+        std::vector<std::vector<float>> association_high(high_count, std::vector<float>(track_count));
+
+        for (size_t row = 0; row < high_count; ++row)
+        {
+            size_t col = 0;
+            for (const auto& trk : tracks)
+            {
+                iou_matrix_high[row][col++] = trk.second->CalculateIou(dets_high[row]);
+            }
+        }
+
+        HungarianMatching(iou_matrix_high, dets_high.size(), tracks.size(), association_high);
+
+        for (size_t row = 0; row < high_count; ++row)
+        {
+            bool accepted = false;
+            size_t col = 0;
+            for (auto& trk : tracks)
+            {
+                if (0 != association_high[row][col])
+                {
+                    ++col;
+                    continue;
+                }
+                // Assigned pair found; accept it only when the IoU is high enough.
+                if (iou_matrix_high[row][col] >= trk.second->m_iou_threshold)
+                {
+                    high_matched[trk.first] = row;
+                    trk.second->m_prob = iou_matrix_high[row][col];
+                    accepted = true;
+                }
+                // The assignment is one-to-one, so stop scanning this row.
+                break;
+            }
+            if (!accepted)
+            {
+                unmatched_det.push_back(row);
+            }
+        }
+    }
+
+    // ---- Pass 2: low-score detections against the tracks pass 1 left free ----
+    // high_matched is not modified below, so the free-track count is fixed here.
+    const size_t free_tracks = tracks.size() - high_matched.size();
+    if (low_count > 0 && free_tracks > 0)
+    {
+        // rows: low-score detections, columns: free tracks in map order
+        std::vector<std::vector<float>> iou_matrix_low(low_count, std::vector<float>(free_tracks));
+        std::vector<std::vector<float>> association_low(low_count, std::vector<float>(free_tracks));
+
+        for (size_t row = 0; row < low_count; ++row)
+        {
+            size_t col = 0;
+            for (const auto& trk : tracks)
+            {
+                if (high_matched.find(trk.first) != high_matched.end())
+                    continue;   // already taken by a high-score detection
+                iou_matrix_low[row][col++] = trk.second->CalculateIou(dets_low[row]);
+            }
+        }
+
+        HungarianMatching(iou_matrix_low, dets_low.size(), free_tracks, association_low);
+
+        for (size_t row = 0; row < low_count; ++row)
+        {
+            size_t col = 0;
+            for (auto& trk : tracks)
+            {
+                if (high_matched.find(trk.first) != high_matched.end())
+                    continue;   // not a column of the low-score matrices
+                if (0 == association_low[row][col])
+                {
+                    // Assigned pair found; accept it only when the IoU is high enough.
+                    if (iou_matrix_low[row][col] >= trk.second->m_iou_threshold)
+                    {
+                        low_matched[trk.first] = row;
+                        trk.second->m_prob = iou_matrix_low[row][col];
+                    }
+                    // The assignment is one-to-one, so stop scanning this row.
+                    break;
+                }
+                ++col;
+            }
+        }
+    }
+
+
+}
--- a/src/BaseTracker/Iou.cpp
+++ b/src/BaseTracker/Iou.cpp
+#include "Iou.h"
+//#include <iostream>
+//#include <vector>
+//#include <math.h>
+//#include <string.h>
+//#include <algorithm>
+//#include <opencv2/opencv.hpp>
+//#include "caffe/nms.h"
+//using namespace std;
+//using namespace cv;
+
+
+// Solves the detection/track assignment with the Kuhn-Munkres algorithm.
+//
+// iou_matrix  - nrows x ncols IoU values
+// association - must already be nrows x ncols; receives the solved matrix
+//               entry by entry (the callers in this project treat an entry
+//               equal to 0 as the chosen pairing)
+//
+// Munkres minimises cost, so every non-zero IoU is negated. A zero IoU is
+// given the cost 1.0 instead.
+void HungarianMatching(const std::vector<std::vector<float>>& iou_matrix,
+    size_t nrows, size_t ncols,
+    std::vector<std::vector<float>>& association)
+{
+    Matrix<float> cost(nrows, ncols);
+    for (size_t r = 0; r < nrows; ++r) {
+        for (size_t c = 0; c < ncols; ++c) {
+            const float iou = iou_matrix[r][c];
+            // TODO: figure out why we have to assign value to get correct result
+            cost(r, c) = (iou != 0) ? -iou : 1.0f;
+        }
+    }
+
+    // Apply Kuhn-Munkres algorithm to the cost matrix (solved in place).
+    Munkres<float> solver;
+    solver.solve(cost);
+
+    // Hand the solved matrix back to the caller.
+    for (size_t r = 0; r < nrows; ++r) {
+        std::vector<float>& out_row = association[r];
+        for (size_t c = 0; c < ncols; ++c) {
+            out_row[c] = cost(r, c);
+        }
+    }
+}
+
+
+// Minimal 2-D point with float coordinates.
+// Copy construction and assignment are the compiler-generated member-wise
+// operations; no hand-written copy constructor is needed for two floats.
+struct CVPoint2f
+{
+    CVPoint2f(float _x, float _y) : x(_x), y(_y)
+    {
+    }
+    float x;
+    float y;
+};
+
+
+// Tests whether point p lies inside (or on the border of) the polygon vpBoxA.
+//
+// For every edge (p1 -> p2) the line through it is written as a*x + b*y + c = 0
+// with b = 1, and the following vertex p3 defines the interior side. p is
+// rejected as soon as it lies strictly on the opposite side of any edge, which
+// is a valid inside test for convex polygons. A vertical edge (p1.x == p2.x)
+// is approximated with a slope denominator of 0.0001.
+//
+// Vertices are addressed cyclically, so no temporary containers are allocated.
+// An empty polygon yields true.
+bool bInBox(const std::vector<CVPoint2f> &vpBoxA, const CVPoint2f &p)
+{
+    const size_t n = vpBoxA.size();
+    for (size_t i = 0; i < n; i++)
+    {
+        const CVPoint2f& p1 = vpBoxA[i];
+        const CVPoint2f& p2 = vpBoxA[(i + 1) % n];   // end of the edge
+        const CVPoint2f& p3 = vpBoxA[(i + 2) % n];   // reference vertex on the inner side
+
+        double a;
+        if (p1.x - p2.x == 0)
+            a = -(p1.y - p2.y)/0.0001;
+        else
+            a = -(p1.y - p2.y)/(p1.x - p2.x);
+
+        const double b = 1;
+        const double c = -a * p2.x - p2.y;
+        const double d = a*p3.x + b*p3.y + c;           // side of the reference vertex
+        const double side = a * p.x + b * p.y + c;      // side of the query point
+
+        // Opposite signs: p is outside with respect to this edge.
+        if (side * d < 0)
+            return false;
+    }
+    return true;
+}
+
+// Computes the 2D intersection-over-union (IoU) of two convex polygons.
+//
+// vpBoxA / vpBoxB hold the polygon corners in order (either winding). Both
+// polygons must be convex, which holds for the rotated rectangles built by
+// CuboidIoU.
+//
+// The overlap polygon is obtained exactly by clipping A against every edge of B
+// (Sutherland-Hodgman) and areas come from the shoelace formula. This replaces
+// the former Monte-Carlo estimate, which drew 100000 rand() samples per call,
+// reseeded the global generator with srand(time(0)) every time (so results were
+// not reproducible), divided by zero for degenerate boxes and indexed B with
+// A's corner count.
+//
+// Returns a value in [0, 1]; 0 when either polygon has fewer than three
+// corners or no area, or when the polygons do not overlap.
+double InterSection_2D(const std::vector<CVPoint2f> &vpBoxA, const std::vector<CVPoint2f> &vpBoxB)
+{
+    struct Vertex
+    {
+        double x;
+        double y;
+    };
+    typedef std::vector<Vertex> Polygon;
+
+    // Fewer than three corners cannot enclose any area.
+    if (vpBoxA.size() < 3 || vpBoxB.size() < 3)
+        return 0.0;
+
+    // Shoelace formula; the result is positive for counter-clockwise winding.
+    const auto signedArea = [](const Polygon &poly) -> double {
+        double twice_area = 0.0;
+        for (size_t i = 0; i < poly.size(); ++i)
+        {
+            const Vertex &cur = poly[i];
+            const Vertex &nxt = poly[(i + 1) % poly.size()];
+            twice_area += cur.x * nxt.y - nxt.x * cur.y;
+        }
+        return 0.5 * twice_area;
+    };
+
+    Polygon subject;
+    subject.reserve(vpBoxA.size());
+    for (size_t i = 0; i < vpBoxA.size(); ++i)
+        subject.push_back(Vertex{vpBoxA[i].x, vpBoxA[i].y});
+
+    Polygon clip;
+    clip.reserve(vpBoxB.size());
+    for (size_t i = 0; i < vpBoxB.size(); ++i)
+        clip.push_back(Vertex{vpBoxB[i].x, vpBoxB[i].y});
+
+    const double signed_a = signedArea(subject);
+    const double signed_b = signedArea(clip);
+    const double area_a = signed_a < 0.0 ? -signed_a : signed_a;
+    const double area_b = signed_b < 0.0 ? -signed_b : signed_b;
+    if (area_a <= 0.0 || area_b <= 0.0)
+        return 0.0;
+
+    // +1 when the clip polygon is counter-clockwise, -1 when clockwise, so that
+    // "inside" is always the non-negative side of every clip edge.
+    const double orient = signed_b < 0.0 ? -1.0 : 1.0;
+
+    Polygon overlap = subject;
+    Polygon scratch;
+    for (size_t e = 0; e < clip.size() && !overlap.empty(); ++e)
+    {
+        const Vertex &c1 = clip[e];
+        const Vertex &c2 = clip[(e + 1) % clip.size()];
+        const auto side = [&](const Vertex &v) -> double {
+            return orient * ((c2.x - c1.x) * (v.y - c1.y) - (c2.y - c1.y) * (v.x - c1.x));
+        };
+
+        scratch.clear();
+        for (size_t i = 0; i < overlap.size(); ++i)
+        {
+            const Vertex &cur = overlap[i];
+            const Vertex &prev = overlap[(i + overlap.size() - 1) % overlap.size()];
+            const double s_cur = side(cur);
+            const double s_prev = side(prev);
+
+            // The edge prev->cur crosses the clip line: keep the crossing point.
+            // The signs differ here, so the denominator cannot be zero.
+            if ((s_cur >= 0.0) != (s_prev >= 0.0))
+            {
+                const double t = s_prev / (s_prev - s_cur);
+                scratch.push_back(Vertex{prev.x + t * (cur.x - prev.x), prev.y + t * (cur.y - prev.y)});
+            }
+            if (s_cur >= 0.0)
+                scratch.push_back(cur);
+        }
+        overlap.swap(scratch);
+    }
+
+    const double signed_overlap = signedArea(overlap);
+    const double overlap_area = signed_overlap < 0.0 ? -signed_overlap : signed_overlap;
+    const double union_area = area_a + area_b - overlap_area;
+    return union_area > 0.0 ? overlap_area / union_area : 0.0;
+}
+//double CuboidIoU(const Eigen::MatrixXd &truth_poses, const Eigen::MatrixXd &landmark_poses)
+// Approximates the 3D IoU of two oriented cuboids as
+// (bird's-eye-view IoU) * (IoU of the two z-intervals).
+//
+// Both pose vectors are read at these indices:
+//   [0], [1], [2]  centre x, y, z
+//   [5]            yaw, passed to cos()/sin()
+//   [6], [7], [8]  extents along x, y, z; they are added to and subtracted from
+//                  the centre, i.e. used as half sizes
+// Indices 3 and 4 are not read.
+//
+// Returns 0 when either vector has fewer than 9 elements, when the footprints
+// do not overlap, or when the z-intervals do not overlap.
+//
+// NOTE(review): the 2D IoU is printed to std::cout on every call; kept because
+// it is existing runtime output, but it is costly on a hot path.
+double CuboidIoU(const std::vector<float> &truth_poses, const std::vector<float> &landmark_poses)
+{
+    // Indices up to 8 are read below; refuse shorter inputs instead of reading
+    // past the end of the vector.
+    if (truth_poses.size() < 9 || landmark_poses.size() < 9)
+        return 0.0;
+
+    // Builds the four footprint corners of one pose by rotating the
+    // axis-aligned corner offsets by yaw around the centre.
+    const auto footprint = [](const std::vector<float> &pose) -> std::vector<CVPoint2f> {
+        const double cen_x = pose[0];
+        const double cen_y = pose[1];
+        const double yaw = pose[5];
+        const double len = pose[6];
+        const double wid = pose[7];
+        const double yaw_cos = cos(yaw);
+        const double yaw_sin = sin(yaw);
+
+        // Same corner order as before: (-,-), (-,+), (+,+), (+,-).
+        const double off_x[4] = {-len, -len, len, len};
+        const double off_y[4] = {-wid, wid, wid, -wid};
+
+        std::vector<CVPoint2f> corners;
+        corners.reserve(4);
+        for (int k = 0; k < 4; ++k)
+        {
+            const double xx = off_x[k] * yaw_cos - off_y[k] * yaw_sin + cen_x;
+            const double yy = off_x[k] * yaw_sin + off_y[k] * yaw_cos + cen_y;
+            corners.push_back(CVPoint2f(static_cast<float>(xx), static_cast<float>(yy)));
+        }
+        return corners;
+    };
+
+    const std::vector<CVPoint2f> vground_points = footprint(truth_poses);
+    const std::vector<CVPoint2f> vlandmark_points = footprint(landmark_poses);
+
+    double iou_2d = InterSection_2D(vlandmark_points, vground_points);
+    std::cout << " iou_2d = " << iou_2d << std::endl;
+    double iou_3d = 0;
+    if (iou_2d > 0) {
+        const double tru_minz = truth_poses[2] - truth_poses[8];
+        const double tru_maxz = truth_poses[2] + truth_poses[8];
+
+        const double land_minz = landmark_poses[2] - landmark_poses[8];
+        const double land_maxz = landmark_poses[2] + landmark_poses[8];
+
+        // Interval IoU: overlap length / total covered length. For partially
+        // overlapping intervals this equals the previous formula; when one
+        // interval contains the other the previous formula was wrong (it
+        // could even exceed 1).
+        const double overlap_top = tru_maxz < land_maxz ? tru_maxz : land_maxz;
+        const double overlap_bottom = tru_minz > land_minz ? tru_minz : land_minz;
+        const double span_top = tru_maxz > land_maxz ? tru_maxz : land_maxz;
+        const double span_bottom = tru_minz < land_minz ? tru_minz : land_minz;
+
+        const double overlap = overlap_top - overlap_bottom;
+        const double span = span_top - span_bottom;
+        if (overlap > 0 && span > 0) {
+            iou_3d = iou_2d * (overlap / span);
+        }
+    }
+
+    return iou_3d;
+}
+
+
+//void main(int argc, char **argv)
+//{
+//    Eigen::MatrixXd truth_poses(1,9);
+//    truth_poses<<-0.4875, -0.798913, -1.125, 0, 0, -1.27409, 0.240477, 0.235315, 0.375;
+//    Eigen::MatrixXd landmark_poses(1,9);
+//    landmark_poses<<-0.506616, -0.796303, -1.20499, 0, 0, -0.345443, 0.22,  0.22, 0.4 ;
+//    double iou_3d = CuboidIoU_once(truth_poses, landmark_poses);
+//    cout<<"the iou of two cuboid is: "<<iou_3d<<endl;
+//    return;
+//}
+
+
+
+
+
+//cv::Point2f center;
+
+////c++ 多边形求交集
+////const int maxn = 300;
+//const double eps = 1e-8;
+//int dcmp(double x)
+//{
+//    if(x > eps) return 1;
+//    return x < -eps ? -1 : 0;
+//}
+//struct MyPoint
+//{
+//    double x, y;
+//};
+//double cross(MyPoint a,MyPoint b,MyPoint c) ///叉积
+//{
+//    return (a.x-c.x)*(b.y-c.y)-(b.x-c.x)*(a.y-c.y);
+//}
+//MyPoint intersection(MyPoint a,MyPoint b,MyPoint c,MyPoint d)
+//{
+//    MyPoint p = a;
+//    double t =((a.x-c.x)*(c.y-d.y)-(a.y-c.y)*(c.x-d.x))/((a.x-b.x)*(c.y-d.y)-(a.y-b.y)*(c.x-d.x));
+//    p.x +=(b.x-a.x)*t;
+//    p.y +=(b.y-a.y)*t;
+//    return p;
+//}
+////计算多边形面积
+//double PolygonArea(MyPoint p[], int n)
+//{
+//    if(n < 3) return 0.0;
+//    double s = p[0].y * (p[n - 1].x - p[1].x);
+//    p[n] = p[0];
+//    for(int i = 1; i < n; ++ i)
+//        s += p[i].y * (p[i - 1].x - p[i + 1].x);
+//    return fabs(s * 0.5);
+//}
+//double CPIA(MyPoint a[], MyPoint b[], int na, int nb)//ConvexPolygonIntersectArea
+//{
+//    MyPoint p[20], tmp[20];
+//    int tn, sflag, eflag;
+//    a[na] = a[0], b[nb] = b[0];
+//    memcpy(p,b,sizeof(MyPoint)*(nb + 1));
+//    for(int i = 0; i < na && nb > 2; i++)
+//    {
+//        sflag = dcmp(cross(a[i + 1], p[0],a[i]));
+//        for(int j = tn = 0; j < nb; j++, sflag = eflag)
+//        {
+//            if(sflag>=0) tmp[tn++] = p[j];
+//            eflag = dcmp(cross(a[i + 1], p[j + 1],a[i]));
+//            if((sflag ^ eflag) == -2)
+//                tmp[tn++] = intersection(a[i], a[i + 1], p[j], p[j + 1]); ///求交点
+//        }
+//        memcpy(p, tmp, sizeof(MyPoint) * tn);
+//        nb = tn, p[nb] = p[0];
+//    }
+//    if(nb < 3) return 0.0;
+//    return PolygonArea(p, nb);
+//}
+//double SPIA(MyPoint a[], MyPoint b[], int na, int nb)///SimplePolygonIntersectArea 调用此函数
+//{
+//    int i, j;
+//    MyPoint t1[4], t2[4];
+//    double res = 0, num1, num2;
+//    a[na] = t1[0] = a[0], b[nb] = t2[0] = b[0];
+//
+//    for(i = 2; i < na; i++)
+//    {
+//        t1[1] = a[i-1], t1[2] = a[i];
+//        num1 = dcmp(cross(t1[1], t1[2],t1[0]));
+//        if(num1 < 0) swap(t1[1], t1[2]);
+//
+//        for(j = 2; j < nb; j++)
+//        {
+//
+//            t2[1] = b[j - 1], t2[2] = b[j];
+//            num2 = dcmp(cross(t2[1], t2[2],t2[0]));
+//            if(num2 < 0) swap(t2[1], t2[2]);
+//            res += CPIA(t1, t2, 3, 3) * num1 * num2;
+//        }
+//    }
+//    return res;
+//}
+////c++ 多边形求交集
+//
+//double IOU(const proposal_type & r1, const proposal_type & r2)
+//{
+//    double inter = intersection_area(r1,r2);
+//
+//    double o = inter / (calcularea(r1) + calcularea(r2) - inter);
+//
+//    return (o >= 0) ? o : 0;
+//}
+//
+//double intersection_area(const proposal_type & r1, const proposal_type & r2){
+//    MyPoint p1[10],p2[10];
+//
+//    p1[0].x=r1.x2;
+//    p1[0].y=r1.y2;
+//    p1[1].x=r1.x3;
+//    p1[1].y=r1.y3;
+//    p1[2].x=r1.x4;
+//    p1[2].y=r1.y4;
+//    p1[3].x=r1.x1;
+//    p1[3].y=r1.y1;
+//
+//    p2[0].x=r2.x2;
+//    p2[0].y=r2.y2;
+//    p2[1].x=r2.x3;
+//    p2[1].y=r2.y3;
+//    p2[2].x=r2.x4;
+//    p2[2].y=r2.y4;
+//    p2[3].x=r2.x1;
+//    p2[3].y=r2.y1;
+//    double area = SPIA(p1, p2, 4, 4);
+//    return area;
+//}
+//
+//double calcularea(const proposal_type & r){
+//    float d12=sqrt(pow(r.x2-r.x1,2)+pow(r.y2-r.y1,2));
+//    float d14=sqrt(pow(r.x4-r.x1,2)+pow(r.y4-r.y1,2));
+//    float d24=sqrt(pow(r.x2-r.x4,2)+pow(r.y2-r.y4,2));
+//    float d32=sqrt(pow(r.x2-r.x3,2)+pow(r.y2-r.y3,2));
+//    float d34=sqrt(pow(r.x3-r.x4,2)+pow(r.y3-r.y4,2));
+//    float p1=(d12+d14+d24)/2;
+//    float p2=(d24+d32+d34)/2;
+//    float s1=sqrt(p1*(p1-d12)*(p1-d14)*(p1-d24));
+//    float s2=sqrt(p2*(p2-d32)*(p2-d34)*(p2-d24));
+//    return s1+s2;
+//}
+//
+//bool cmpr(const proposal_type &a,const proposal_type &b){
+//    return a.score > b.score;
+//}
+//
+//void nms(vector<proposal_type>& proposals, const double nms_threshold)
+//{
+//
+//    //按分数排序
+//    sort(proposals.begin(),proposals.end(),cmpr);
+//    //标志，false代表留下，true代表扔掉
+//    vector<bool> del(proposals.size(), false);
+//    for(size_t i = 0; i < proposals.size(); i++){
+//        if(!del[i]){
+//            // std::cout<<scores[i]<<std::endl;
+//            for(size_t j = i+1; j < proposals.size()-1; j++){
+//                if(!del[j] && IOU(proposals[i], proposals[j]) > nms_threshold){
+//                    del[j] = true;//IOU大于阈值，扔掉
+//                }
+//            }
+//        }
+//    }
+///*for(int i=0;i<del.size();i++){
+//cout<<del[i]<<" ";
+//}*/
+//vector<proposal_type> new_proposals;
+///*for(const auto i : index){
+//    if(!del[i]) new_proposals.push_back(proposals[i]);
+//}*/
+//for (int i = 0; i < proposals.size(); i++) {
+//    if (!del[i]) new_proposals.push_back(proposals[i]);
+//}
+////cout<<new_proposals.size()<<endl;
+//proposals.clear();
+//vector<proposal_type>().swap(proposals);
+//proposals = new_proposals;
+////    cout<<proposals.size()<<endl;
+//    // scores.clear();
+//    //vector<float>().swap(scores);
+//}
+
--- a/src/BaseTracker/Iou.h
+++ b/src/BaseTracker/Iou.h
+#pragma once
+
+#include "munkres.h"
+#include "params.h"
+#include <vector>
+
+// Solves the assignment problem for an IoU matrix with the Munkres solver.
+//
+// iou_matrix  pairwise IoU scores, indexed [row][col].
+// nrows/ncols number of rows and columns to process.
+// association output; the implementation writes association[i][j] for every
+//             i < nrows, j < ncols with the solved matrix entry. It is indexed,
+//             not resized, in the visible part of the implementation, so it
+//             presumably must already be at least nrows x ncols -- TODO confirm.
+void HungarianMatching(const std::vector<std::vector<float>>& iou_matrix,
+    size_t nrows, size_t ncols,
+    std::vector<std::vector<float>>& association);
+
+// Returns an IoU score for two oriented cuboids: the IoU of their rotated 2D
+// footprints multiplied by a ratio derived from their z-intervals.
+// Each vector is read at indices 0,1,2 (centre x,y,z), 5 (yaw) and 6,7,8
+// (extents that are added to / subtracted from the centre), so it must hold at
+// least 9 values.
+double CuboidIoU(const std::vector<float>& truth_poses, const std::vector<float>& landmark_poses);
+
+
--- a/src/BaseTracker/Track2D.cpp
+++ b/src/BaseTracker/Track2D.cpp
+#include "Track2D.h"
+
+
+// Creates a 2D track backed by a Kalman filter with 8 state variables and
+// 4 observed variables, and fills in the filter matrices.
+Track2D::Track2D():BaseTrack(8, 4)
+{
+    auto filter = std::make_shared<mytracker::KalmanFilter>(8, 4);
+
+    // F_: identity plus an identity block in the upper-right corner, so row i
+    // (i < 4) combines state i with state i + 4.
+    // Presumably a constant-velocity transition model -- TODO confirm against KalmanFilter::Predict.
+    filter->F_ <<
+        1, 0, 0, 0, 1, 0, 0, 0,
+        0, 1, 0, 0, 0, 1, 0, 0,
+        0, 0, 1, 0, 0, 0, 1, 0,
+        0, 0, 0, 1, 0, 0, 0, 1,
+        0, 0, 0, 0, 1, 0, 0, 0,
+        0, 0, 0, 0, 0, 1, 0, 0,
+        0, 0, 0, 0, 0, 0, 1, 0,
+        0, 0, 0, 0, 0, 0, 0, 1;
+
+    // P_: diagonal, 10 for the first four states and 10000 for the last four.
+    filter->P_ <<
+        10, 0, 0, 0, 0, 0, 0, 0,
+        0, 10, 0, 0, 0, 0, 0, 0,
+        0, 0, 10, 0, 0, 0, 0, 0,
+        0, 0, 0, 10, 0, 0, 0, 0,
+        0, 0, 0, 0, 10000, 0, 0, 0,
+        0, 0, 0, 0, 0, 10000, 0, 0,
+        0, 0, 0, 0, 0, 0, 10000, 0,
+        0, 0, 0, 0, 0, 0, 0, 10000;
+
+    // H_: selects the first four state variables as the observation.
+    filter->H_ <<
+        1, 0, 0, 0, 0, 0, 0, 0,
+        0, 1, 0, 0, 0, 0, 0, 0,
+        0, 0, 1, 0, 0, 0, 0, 0,
+        0, 0, 0, 1, 0, 0, 0, 0;
+
+    // Q_: diagonal, 1 for the first four states, then 0.01, 0.01, 0.0001, 0.0001.
+    filter->Q_ <<
+        1, 0, 0, 0, 0, 0, 0, 0,
+        0, 1, 0, 0, 0, 0, 0, 0,
+        0, 0, 1, 0, 0, 0, 0, 0,
+        0, 0, 0, 1, 0, 0, 0, 0,
+        0, 0, 0, 0, 0.01, 0, 0, 0,
+        0, 0, 0, 0, 0, 0.01, 0, 0,
+        0, 0, 0, 0, 0, 0, 0.0001, 0,
+        0, 0, 0, 0, 0, 0, 0, 0.0001;
+
+    // R_: diagonal 1, 1, 10, 10 for the four observed variables.
+    filter->R_ <<
+        1, 0, 0, 0,
+        0, 1, 0, 0,
+        0, 0, 10, 0,
+        0, 0, 0, 10;
+
+    // Hand the fully configured filter to the base class member.
+    kf_ = filter;
+}
+
+
+// Returns the IoU between this track's current state and a measurement.
+//
+// Both the state (from GetStateData) and `data` are read as
+// [0] = x, [1] = y, [2] = width, [3] = height, with (x, y) used as the
+// minimum corner: the box spans x .. x + width and y .. y + height.
+//
+// Returns 0 when the state or `data` has fewer than four values (for example
+// when no Kalman filter is attached and GetStateData leaves `states` empty) or
+// when the union area is not positive, instead of reading out of bounds or
+// dividing by zero.
+double Track2D::CalculateIou(const std::vector<float>& data)
+{
+    std::vector<float> states;
+    GetStateData(states);
+    if (states.size() < 4 || data.size() < 4)
+        return 0.0;
+
+    // Overlap extent along each axis, clamped at zero for disjoint boxes.
+    auto w = std::max(std::min((states[0] + states[2]), (data[0] + data[2])) - std::max(states[0], data[0]), 0.0f);
+    auto h = std::max(std::min((states[1] + states[3]), (data[1] + data[3])) - std::max(states[1], data[1]), 0.0f);
+
+    auto intersection_area = w * h;
+    float union_area = states[2] * states[3] + data[2] * data[3] - intersection_area;
+    // Written as a negated comparison so that NaN is rejected as well.
+    if (!(union_area > 0.0f))
+        return 0.0;
+
+    auto iou = intersection_area / union_area;
+    return iou;
+}
--- a/src/BaseTracker/Track2D.h
+++ b/src/BaseTracker/Track2D.h
+#pragma once
+
+#include <vector>
+#include "BaseTrack.h"
+#ifdef _QICHECHENG_
+#include "jfxrosperceiver/det_tracking.h"
+#define jfx_common_msgs jfxrosperceiver
+#else
+#include "jfx_common_msgs/det_tracking.h"
+#endif
+
+// Shared-ownership handle to a det_tracking message. The message type comes
+// from jfxrosperceiver when _QICHECHENG_ is defined (via the macro alias above)
+// and from jfx_common_msgs otherwise.
+using trackOjbPtr = std::shared_ptr< jfx_common_msgs::det_tracking>;
+
+// Track for 2D boxes, built on BaseTrack with 8 state variables and 4 observed
+// variables (see the constructor in Track2D.cpp).
+class Track2D :public BaseTrack
+{
+public:
+    // Sets up the Kalman filter matrices for the 8-state / 4-observation model.
+    Track2D();
+    ~Track2D() {}
+
+    // Message object associated with this track; null until assigned.
+    trackOjbPtr m_obj = nullptr;
+
+    // Both order getters leave `order` untouched and return 0 here.
+    // NOTE(review): presumably overrides of BaseTrack virtuals -- confirm against BaseTrack.h.
+    virtual int GetIouDataOrder(std::vector<int>& order) { return 0; };
+    virtual int GetKFDataOrder(std::vector<int>& order) { return 0; };
+
+    // IoU between the current filter state and `data`; both are read as
+    // x, y, width, height at indices 0..3.
+    virtual double CalculateIou(const std::vector<float>& data);
+
+    // Intentionally empty: does not modify `out` or `obj_type`.
+    static void MeasureIouData(const std::vector<float>& input, std::vector<float>& out, int& obj_type) {}
+};
--- a/src/BaseTracker/kf_gpu/bev_overlap_online.cu
+++ b/src/BaseTracker/kf_gpu/bev_overlap_online.cu
+#include <stdio.h>
+#include <iostream>
+//#include <pybind11/pybind11.h>
+//#include <pybind11/numpy.h>
+//#include <pybind11/stl.h>
+
+#include "common.h"
+#include <cmath>
+#include <fstream>
+// Only tested by #ifdef lines inside commented-out code in the visible part of
+// this file; no active code here depends on it.
+#define ROS
+
+// Uncomment to enable the printf/std::cout diagnostics guarded by #ifdef DEBUG below.
+//#define DEBUG
+
+// Edge length of the square thread block used for the overlap kernel
+// (the launcher creates THREADS_PER_BLOCK x THREADS_PER_BLOCK threads per block).
+#define THREADS_PER_BLOCK 16
+
+
+
+
+// Tolerance used by intersection() to decide whether the denominator s5 - s1
+// is large enough to divide by.
+const float EPS = 1e-8;
+// Single-precision 2D point / vector for use in device code.
+struct Point {
+    float x, y;
+
+    // Leaves x and y uninitialised.
+    __device__ Point() {}
+
+    // Takes doubles and stores them converted to float.
+    __device__ Point(double _x, double _y) : x(_x), y(_y) {}
+
+    // Overwrites both coordinates.
+    __device__ void set(float _x, float _y){
+        x = _x;
+        y = _y;
+    }
+
+    // Component-wise sum.
+    __device__ Point operator +(const Point &b)const{
+        Point sum;
+        sum.set(x + b.x, y + b.y);
+        return sum;
+    }
+
+    // Component-wise difference.
+    __device__ Point operator -(const Point &b)const{
+        Point diff;
+        diff.set(x - b.x, y - b.y);
+        return diff;
+    }
+};
+
+// 2D cross product (z component) of vectors a and b: a.x * b.y - a.y * b.x.
+__device__ inline float cross(const Point &a, const Point &b){
+    return a.x * b.y - a.y * b.x;
+}
+
+// 2D cross product of the vectors (p1 - p0) and (p2 - p0); its sign tells on
+// which side of the line p0->p1 the point p2 lies.
+__device__ inline float cross(const Point &p1, const Point &p2, const Point &p0){
+    return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+// Returns 1 when the axis-aligned bounding box of segment p1-p2 overlaps (or
+// touches) the bounding box of segment q1-q2, otherwise 0.
+__device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){
+    const bool x_ranges_meet = min(p1.x,p2.x) <= max(q1.x,q2.x) &&
+                               min(q1.x,q2.x) <= max(p1.x,p2.x);
+    const bool y_ranges_meet = min(p1.y,p2.y) <= max(q1.y,q2.y) &&
+                               min(q1.y,q2.y) <= max(p1.y,p2.y);
+    return (x_ranges_meet && y_ranges_meet) ? 1 : 0;
+}
+
+// Returns non-zero when point p lies inside the rotated box footprint, with a
+// tolerance of MARGIN added to each half extent.
+// box layout: (7) [x, y, z, dx, dy, dz, heading]; only x, y, dx, dy and heading are read.
+__device__ inline int check_in_box2d(const float *box, const Point &p){
+    const float MARGIN = 1e-2;
+
+    const float center_x = box[0], center_y = box[1];
+    // Rotate the point by -heading so the test can be done in the box's own axes.
+    const float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);
+    const float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+    const float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+    // Negated comparison keeps the original outcome for NaN inputs.
+    if (!(fabs(rot_x) < box[3] / 2 + MARGIN))
+        return 0;
+    return fabs(rot_y) < box[4] / 2 + MARGIN;
+}
+
+// Intersects segment p0-p1 with segment q0-q1.
+// Returns 1 and stores the crossing point in `ans` when the segments cross
+// strictly (both sign tests below use > 0, so touching end points or collinear
+// segments are not reported); returns 0 and leaves `ans` untouched otherwise.
+__device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){
+    // fast exclusion: bounding boxes of the two segments do not meet
+    if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
+
+    // check cross standing: the end points of each segment must lie on
+    // opposite sides of the other segment
+    float s1 = cross(q0, p1, p0);
+    float s2 = cross(p1, q1, p0);
+    float s3 = cross(p0, q1, q0);
+    float s4 = cross(q1, p1, q0);
+
+    if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
+
+    // calculate intersection of two lines
+    float s5 = cross(q1, p1, p0);
+    if(fabs(s5 - s1) > EPS){
+        // weighted combination of q0 and q1 using the cross products s1 and s5
+        ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
+        ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
+
+    }
+    else{
+        // denominator too small: solve the two line equations a*x + b*y + c = 0 instead
+        float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
+        float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
+        float D = a0 * b1 - a1 * b0;
+
+        ans.x = (b0 * c1 - b1 * c0) / D;
+        ans.y = (a1 * c0 - a0 * c1) / D;
+    }
+
+    return 1;
+}
+
+// Rotates p in place around `center`, using the precomputed cosine and sine of
+// the rotation angle.
+__device__ inline void rotate_around_center(const Point &center, const float angle_cos, const float angle_sin, Point &p){
+    const float rotated_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
+    const float rotated_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+    p.x = rotated_x;
+    p.y = rotated_y;
+}
+
+// Returns non-zero when the polar angle of a around `center` is greater than
+// that of b; box_overlap uses it to order the overlap polygon's points by angle.
+__device__ inline int point_cmp(const Point &a, const Point &b, const Point &center){
+    return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x);
+}
+
+// Returns the area of the bird's-eye-view overlap of two rotated boxes.
+//
+// The overlap polygon's vertices are collected from (1) crossings between the
+// edges of the two boxes and (2) corners of one box that lie inside the other.
+// They are then sorted by angle around their centroid and the area is summed
+// as a triangle fan.
+__device__ inline float box_overlap(const float *box_a, const float *box_b){
+    // params box_a: [x, y, z, dx, dy, dz, heading]
+    // params box_b: [x, y, z, dx, dy, dz, heading]
+
+    float a_angle = box_a[6], b_angle = box_b[6];
+    float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
+    float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
+    float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
+    float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
+    float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
+
+    Point center_a(box_a[0], box_a[1]);
+    Point center_b(box_b[0], box_b[1]);
+
+#ifdef DEBUG
+    printf("kernel box_overlap a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", a_x1, a_y1, a_x2, a_y2, a_angle,
+           b_x1, b_y1, b_x2, b_y2, b_angle);
+    printf("kernel box_overlap center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y);
+#endif
+    // Five entries: the fifth repeats the first so edge k is corners[k] -> corners[k + 1].
+    Point box_a_corners[5];
+    box_a_corners[0].set(a_x1, a_y1);
+    box_a_corners[1].set(a_x2, a_y1);
+    box_a_corners[2].set(a_x2, a_y2);
+    box_a_corners[3].set(a_x1, a_y2);
+
+    Point box_b_corners[5];
+    box_b_corners[0].set(b_x1, b_y1);
+    box_b_corners[1].set(b_x2, b_y1);
+    box_b_corners[2].set(b_x2, b_y2);
+    box_b_corners[3].set(b_x1, b_y2);
+
+    // get oriented corners
+    float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
+    float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
+
+    for (int k = 0; k < 4; k++){
+#ifdef DEBUG
+        printf("kernel box_overlap before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+        rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
+        rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
+#ifdef DEBUG
+        printf("kernel box_overlap corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+    }
+
+    box_a_corners[4] = box_a_corners[0];
+    box_b_corners[4] = box_b_corners[0];
+
+    // get intersection of lines
+    // Sized for the hard upper bound of writes below: 4 x 4 edge pairs plus
+    // 2 x 4 corner tests = 24, so the array can never be overrun.
+    Point cross_points[24];
+    Point poly_center;
+    int cnt = 0, flag = 0;
+
+    poly_center.set(0, 0);
+    for (int i = 0; i < 4; i++){
+        for (int j = 0; j < 4; j++){
+            flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]);
+            if (flag){
+                poly_center = poly_center + cross_points[cnt];
+                cnt++;
+#ifdef DEBUG
+                // Box b's edge is indexed by j (the inner loop), not i.
+                printf("kernel box_overlap Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, %.3f)->(%.3f, %.3f) \n",
+                    cross_points[cnt - 1].x, cross_points[cnt - 1].y,
+                    box_a_corners[i].x, box_a_corners[i].y, box_a_corners[i + 1].x, box_a_corners[i + 1].y,
+                    box_b_corners[j].x, box_b_corners[j].y, box_b_corners[j + 1].x, box_b_corners[j + 1].y);
+#endif
+            }
+        }
+    }
+
+    // check corners
+    for (int k = 0; k < 4; k++){
+        if (check_in_box2d(box_a, box_b_corners[k])){
+            poly_center = poly_center + box_b_corners[k];
+            cross_points[cnt] = box_b_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("kernel box_overlap b corners in a: corner_b(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+        if (check_in_box2d(box_b, box_a_corners[k])){
+            poly_center = poly_center + box_a_corners[k];
+            cross_points[cnt] = box_a_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("kernel box_overlap a corners in b: corner_a(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+    }
+
+    // Fewer than three points cannot span an area (the fan sum below would be
+    // zero anyway); returning here also avoids dividing the centroid by zero.
+    if (cnt < 3){
+#ifdef DEBUG
+        printf("kernel box_overlap cnt=%d\n", cnt);
+#endif
+        return 0.0f;
+    }
+
+    poly_center.x /= cnt;
+    poly_center.y /= cnt;
+
+    // sort the points of polygon by angle around the centroid (bubble sort)
+    Point temp;
+    for (int j = 0; j < cnt - 1; j++){
+        for (int i = 0; i < cnt - j - 1; i++){
+            if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){
+                temp = cross_points[i];
+                cross_points[i] = cross_points[i + 1];
+                cross_points[i + 1] = temp;
+            }
+        }
+    }
+
+#ifdef DEBUG
+    printf("kernel box_overlap cnt=%d\n", cnt);
+    for (int i = 0; i < cnt; i++){
+        printf("kernel box_overlap All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y);
+    }
+#endif
+
+    // get the overlap areas: triangle fan anchored at cross_points[0]
+    float area = 0;
+    for (int k = 0; k < cnt - 1; k++){
+        area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]);
+    }
+
+    return fabs(area) / 2.0;
+}
+
+
+// Computes the BEV overlap area for every (box a, box b) pair.
+// One thread handles one pair: threadIdx/blockIdx .y selects the box in
+// boxes_a and .x the box in boxes_b. The result is stored at
+// ans_overlap[a_idx * num_b + b_idx].
+__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+    // Threads beyond the matrix edge have no pair to process.
+    if (a_idx >= num_a || b_idx >= num_b){
+        return;
+    }
+    const float * cur_box_a = boxes_a + a_idx * 7;
+    const float * cur_box_b = boxes_b + b_idx * 7;
+
+    // Cheap pre-check: skip the polygon work when the centres are farther apart
+    // than the larger dx of the two boxes.
+    // NOTE(review): only dx is considered, not dy -- confirm this heuristic is intended.
+    float bev_dist = pow(pow((cur_box_a[0] - cur_box_b[0]), 2) + pow((cur_box_a[1] - cur_box_b[1]), 2), 0.5);
+    float thr = (cur_box_a[3] > cur_box_b[3]) ? cur_box_a[3] : cur_box_b[3];
+    if (bev_dist > thr){
+        // Write an explicit zero: the output buffer is not guaranteed to be
+        // initialised, so leaving the entry untouched would hand back garbage.
+        ans_overlap[a_idx * num_b + b_idx] = 0.0f;
+        return;
+    }
+
+    float s_overlap = box_overlap(cur_box_a, cur_box_b);
+    ans_overlap[a_idx * num_b + b_idx] = s_overlap;
+}
+
+
+
+
+// Launches boxes_overlap_kernel over a num_a x num_b grid of box pairs.
+//
+// boxes_a, boxes_b and ans_overlap are device pointers. n, an and bn are used
+// only when DEBUG is defined, to copy back and print an boxes of a, bn boxes
+// of b and n result values.
+//
+// Does nothing when num_a or num_b is not positive: a grid with a zero
+// dimension is an invalid launch configuration.
+void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap, int n, int an, int bn){
+
+    if (num_a <= 0 || num_b <= 0){
+        return;
+    }
+
+    dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
+    dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
+
+    boxes_overlap_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b, ans_overlap);
+
+#ifdef DEBUG
+    std::cout << "**************************bev_overlapC.cu boxesoverlapLauncher    input box a: \n";
+    int size_ba = an* 7 * sizeof(float);
+    float* tmp_a;
+    tmp_a = (float*)malloc(size_ba);
+    GPU_CHECK(cudaMemcpy(tmp_a, boxes_a, size_ba, cudaMemcpyDeviceToHost));
+    for(int i =0; i < an;i++){
+        std::string sp;
+        for (int j=0;j < 7;j++){
+            sp = sp + std::to_string(tmp_a[7 * i + j]) + ", ";
+        }
+        std::cout << sp << "\n";
+    }
+    std::cout << "\n";
+    free(tmp_a);
+
+    std::cout << "**************************bev_overlapC.cu boxesoverlapLauncher   input box b: \n";
+    int size_bb = bn* 7 * sizeof(float);
+    float* tmp_b;
+    tmp_b = (float*)malloc(size_bb);
+    GPU_CHECK(cudaMemcpy(tmp_b, boxes_b, size_bb, cudaMemcpyDeviceToHost));
+    for(int i =0; i < bn;i++){
+        std::string sp;
+        for (int j=0;j < 7;j++){
+            sp = sp + std::to_string(tmp_b[7 * i + j]) + ", ";
+        }
+        std::cout << sp << "\n";
+    }
+    std::cout << "\n";
+    free(tmp_b);
+
+    std::cout << "**************************bev_overlapC.cu boxesoverlapLauncheri   output res: \n";
+    int size_ans = n * sizeof(float);
+    float* tmp_ans;
+    tmp_ans = (float*)malloc(size_ans);
+    GPU_CHECK(cudaMemcpy(tmp_ans, ans_overlap, size_ans, cudaMemcpyDeviceToHost));
+    for(int i =0; i < n;i++){
+        std::cout << tmp_ans[i] << " ";
+    }
+    std::cout << "\n";
+    free(tmp_ans);
+
+    cudaDeviceSynchronize();  // for using printf in kernel function
+#endif
+}
+
+
+//void map_bev_overlap(const int num_a, pybind11::array_t<float> boxes_a,const int num_b, pybind11::array_t<float> boxes_b, pybind11::array_t<float> ans_overlap){
+//
+//	pybind11::buffer_info bx_a = boxes_a.request();
+//	pybind11::buffer_info bx_b = boxes_b.request();
+//	pybind11::buffer_info ans = ans_overlap.request();
+//
+//	int size_a = bx_a.shape[0] * bx_a.shape[1] * sizeof(float);
+//	int size_b = bx_b.shape[0] * bx_b.shape[1] * sizeof(float);
+//	int size_ans = ans.shape[0] * ans.shape[1] * sizeof(float);
+//
+//	float* a_gpu;
+//	float* b_gpu;
+//	float* ans_gpu;
+//
+//	GPU_CHECK(cudaMalloc(&a_gpu, size_a));
+//	GPU_CHECK(cudaMalloc(&b_gpu, size_b));
+//	GPU_CHECK(cudaMalloc(&ans_gpu, size_ans));
+//
+//
+////float* a_ptr = reinterpret_cast<float*>(bx_a.ptr);
+//
+//	const float* a_ptr = reinterpret_cast<const float*>(bx_a.ptr);
+//	float* b_ptr = reinterpret_cast<float*>(bx_b.ptr);
+//	float* ans_ptr = reinterpret_cast<float*>(ans.ptr);
+//
+//	int an = bx_a.shape[0];
+//	int bn = bx_b.shape[0];
+//
+//	//   A AND B POINTERS ARE COLUMN-BASED WHEN IN ROS, CONVERTING THIS TO ROW-BASED.
+//#ifdef ROS	
+//	float* a_row_ptr;
+//	a_row_ptr = (float*)malloc(size_a);
+//	for (int ii = 0; ii < an; ii++){
+//		for (int jj = 0; jj < 7; jj++){
+//			*(a_row_ptr + jj + ii * 7) = *(a_ptr + ii + jj * an);
+//		}
+//	}
+//
+//	float* b_row_ptr;
+//	b_row_ptr = (float*)malloc(size_b);
+//	for (int ii = 0; ii < bn; ii++){
+//		for (int jj = 0; jj < 7; jj++){
+//			*(b_row_ptr + jj + ii * 7) = *(b_ptr + ii + jj * bn);
+//		}
+//	}
+//	
+//	GPU_CHECK(cudaMemcpy(a_gpu, a_row_ptr, size_a, cudaMemcpyHostToDevice));
+//	GPU_CHECK(cudaMemcpy(b_gpu, b_row_ptr, size_b, cudaMemcpyHostToDevice));
+//#else
+//	GPU_CHECK(cudaMemcpy(a_gpu, a_ptr, size_a, cudaMemcpyHostToDevice));
+//	GPU_CHECK(cudaMemcpy(b_gpu, b_ptr, size_b, cudaMemcpyHostToDevice));
+//#endif
+//
+//	boxesoverlapLauncher(num_a, a_gpu, num_b, b_gpu, ans_gpu, ans.shape[0] * ans.shape[1], bx_a.shape[0], bx_b.shape[0]);
+//
+//	GPU_CHECK(cudaMemcpy(ans_ptr, ans_gpu, size_ans, cudaMemcpyDeviceToHost));
+//
+//	GPU_CHECK(cudaFree(a_gpu));
+//	GPU_CHECK(cudaFree(b_gpu));
+//	GPU_CHECK(cudaFree(ans_gpu));
+//
+//    free(a_row_ptr);
+//    free(b_row_ptr);
+//}
+
+// Host entry point: computes the BEV overlap area of every pair of boxes.
+//
+// boxes_a      host pointer to num_a boxes, row-major, 7 floats per box
+//              [x, y, z, dx, dy, dz, heading]
+// boxes_b      host pointer to num_b boxes, same layout
+// ans_overlap  host pointer receiving num_a * num_b floats; the entry for
+//              (a, b) is at index a * num_b + b
+//
+// The boxes are copied to freshly allocated device buffers, the kernel is
+// launched, and the result is copied back; all device buffers are released
+// before returning. Any CUDA error terminates the process via GPU_CHECK.
+// Returns without touching ans_overlap when num_a or num_b is not positive.
+void bev_overlap(const int num_a, float* boxes_a, const int num_b, float* boxes_b, float* ans_overlap) 
+{
+    // Nothing to compute; also avoids zero-sized allocations and a launch
+    // with an empty grid.
+    if (num_a <= 0 || num_b <= 0)
+        return;
+
+    // Byte sizes are computed in size_t so large inputs cannot overflow int.
+    const size_t size_a = static_cast<size_t>(num_a) * 7 * sizeof(float);
+    const size_t size_b = static_cast<size_t>(num_b) * 7 * sizeof(float);
+    const size_t size_ans = static_cast<size_t>(num_a) * num_b * sizeof(float);
+
+    float* a_gpu;
+    float* b_gpu;
+    float* ans_gpu;
+
+    GPU_CHECK(cudaMalloc(&a_gpu, size_a));
+    GPU_CHECK(cudaMalloc(&b_gpu, size_b));
+    GPU_CHECK(cudaMalloc(&ans_gpu, size_ans));
+
+    // cudaMalloc does not initialise memory. Zero the result so every entry is
+    // defined, including pairs for which the kernel computes no overlap.
+    GPU_CHECK(cudaMemset(ans_gpu, 0, size_ans));
+
+    GPU_CHECK(cudaMemcpy(a_gpu, boxes_a, size_a, cudaMemcpyHostToDevice));
+    GPU_CHECK(cudaMemcpy(b_gpu, boxes_b, size_b, cudaMemcpyHostToDevice));
+
+    boxesoverlapLauncher(num_a, a_gpu, num_b, b_gpu, ans_gpu, num_a * num_b, num_a, num_b);
+
+    // This blocking copy also waits for the kernel to finish.
+    GPU_CHECK(cudaMemcpy(ans_overlap, ans_gpu, size_ans, cudaMemcpyDeviceToHost));
+
+    GPU_CHECK(cudaFree(a_gpu));
+    GPU_CHECK(cudaFree(b_gpu));
+    GPU_CHECK(cudaFree(ans_gpu));
+}
+
+//PYBIND11_MODULE(juefx_iou, m)
+//{
+//  m.def("bev_overlap", &map_bev_overlap);
+//}
+
--- a/src/BaseTracker/kf_gpu/bev_overlap_online.h
+++ b/src/BaseTracker/kf_gpu/bev_overlap_online.h
+#ifndef _BEV_OVERLAP_ONLINE_H_
+#define _BEV_OVERLAP_ONLINE_H_
+
+
+
+// Host entry point for the GPU box-overlap computation.
+//   num_a, boxes_a : number of boxes in set A and their host-side data
+//   num_b, boxes_b : number of boxes in set B and their host-side data
+//   ans_overlap    : host output buffer; the implementation copies
+//                    num_a * num_b floats into it
+// NOTE(review): the per-box layout (presumably 7 floats per box) is not
+// stated in this header - confirm against the .cu implementation.
+void bev_overlap(const int num_a, float* boxes_a, const int num_b, float* boxes_b, float* ans_overlap);
+
+
+#endif
\ No newline at end of file
--- a/src/BaseTracker/kf_gpu/common.h
+++ b/src/BaseTracker/kf_gpu/common.h
+#ifndef COMMON_H
+#define COMMON_H
+
+// headers in STL
+#include <stdio.h>
+
+// headers in CUDA
+//#include <cuda_runtime_api.h>
+
+
+// Integer division of m by n rounded up (for positive operands).
+// Both arguments are expanded more than once, so pass expressions without side effects.
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+#define GPU_CHECK(ans)                                                                                                 \
+  {                                                                                                                    \
+    GPUAssert((ans), __FILE__, __LINE__);                                                                              \
+  }
+// Reports a failed CUDA runtime call.
+//   code  : status returned by the CUDA call; cudaSuccess is a no-op
+//   file  : source file of the call site (printed in the message)
+//   line  : source line of the call site (printed in the message)
+//   abort : when true (default) the process exits with `code` as its status
+inline void GPUAssert(cudaError_t code, const char* file, int line, bool abort = true)
+{
+  if (code == cudaSuccess)
+    return;
+
+  fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+  if (abort)
+    exit(code);
+}
+
+//Memory Allocation Function
+//void cpuMalloc(float **data_ptr, size_t size)
+//{
+//	float *data;
+//	data = (float *) malloc( size);
+//	*data_ptr = data;
+//}
+
+
+//Identity Matrix Generation
+//void Identity(float *data, int n)
+//{
+//        for (int i = 0; i < (n*n); i=i+1)
+//                {
+//                if((i%(n+1))==0)
+//                        data[i] = 1;
+//                else
+//                        data[i] = 0;
+//                }
+//}
+
+
+
+#endif  // COMMON_H
--- a/src/BaseTracker/kf_gpu/kalman_batch_ops.cu
+++ b/src/BaseTracker/kf_gpu/kalman_batch_ops.cu
+#include "common.h"
+#include <cuda_runtime.h>
+
+
+// Element-wise difference C = A - B over the first bs * no floats.
+// Each thread handles the single element selected by its global index;
+// threads whose index falls past bs * no do nothing.
+__global__ void MatSub(float* C, const float* A, const float* B, int bs, int no)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    const int total = bs * no;
+    if (idx >= total)
+        return;
+
+    C[idx] = A[idx] - B[idx];
+}
+
+
+
+// __global__ void KalmanUpdateS2(float* d_S, float* d_P, const int bs, const int ns, const int no){
+
+// 	int tid = blockDim.x * blockIdx.x + threadIdx.x;
+// 	if (tid >= bs)	return;
+// 	int p_stride = 11; 					
+// 	int s_stride = 8; 					
+
+// 	for (int i = 0; i < no; i++){
+// 		d_S[tid * no * no + i * s_stride] = d_P[tid * ns * ns + i * p_stride] + 1;
+// 	}
+// }
+
+
+
+// Step 2 of the batched Kalman update: diagonal of S = H * P * H^T + R.
+//   d_S : output, (bs, no * no) row-major; only S[i][i] is written
+//   d_P : state covariance, (bs, ns * ns) row-major
+//   d_R : measurement noise, ONE no * no matrix shared by every batch entry
+//         (it is indexed without a batch offset); only its diagonal is read
+//   bs  : batch size, one thread per batch entry
+//   ns  : state length, no : observation length
+// S[i][i] is taken as P[i][i] + R[i][i], i.e. the kernel assumes the
+// observation is the first `no` components of the state.
+// The diagonal strides are derived from ns / no (11 and 8 for ns = 10, no = 7)
+// instead of being hard-coded, so other dimensions address the right entries.
+__global__ void KalmanUpdateS2(float* d_S, float* d_P, float* d_R, const int bs, const int ns, const int no){
+
+	int tid = blockDim.x * blockIdx.x + threadIdx.x;
+	if (tid >= bs)	return;
+	const int p_stride = ns + 1; 		// distance between diagonal entries of a row-major ns x ns matrix
+	const int s_stride = no + 1; 		// same for a no x no matrix
+
+	for (int i = 0; i < no; i++){
+		d_S[tid * no * no + i * s_stride] = d_P[tid * ns * ns + i * p_stride] + d_R[i * s_stride];
+	}
+}
+
+
+// Step 3 of the batched Kalman update: the entries of K = P * H^T * S^-1
+// that the later steps use, treating S as diagonal.
+//   d_K : output gain, (bs, ns * no) row-major; only the entries below are written
+//   d_P : state covariance, (bs, ns * ns) row-major
+//   d_S : innovation covariance, (bs, no * no); only its diagonal is read
+// With j = i - no for the rows past the observed block:
+//   i <  no : K[i][i]      = P[i][i]      / S[i][i]
+//   i >= no : K[no + j][j] = P[no + j][j] / S[j][j]
+// The second case reads S[j][j], so ns - no must not exceed no.
+// Strides are derived from ns / no (11 and 8 for ns = 10, no = 7).
+__global__ void KalmanUpdateS3(float* d_K, float* d_P, float* d_S, const int bs, const int ns, const int no){
+
+	int tid = blockDim.x * blockIdx.x + threadIdx.x;
+	if (tid >= bs) return;
+	const int p_stride = ns + 1; 		// diagonal step in an ns-wide row-major matrix
+	const int k_stride = no + 1;      	// diagonal step in a no-wide row-major matrix
+	const int s_stride = no + 1; 
+
+	for (int i = 0; i < ns; i++){
+		if (i < no){
+			d_K[tid * ns * no + i * k_stride] = d_P[tid * ns * ns + i * p_stride] * (1 / d_S[tid * no * no + i * s_stride]);
+		}
+		else{
+			d_K[tid * ns * no + no * no + (i - no) * k_stride] = d_P[tid * ns * ns + ns * no + (i - no) * p_stride] * (1 / d_S[tid * no * no + (i - no) * s_stride]);
+		}
+	}	
+}
+
+
+// Step 4 of the batched Kalman update: x = x + K * y, using only the gain
+// entries written by KalmanUpdateS3.
+//   d_X : state, (bs, ns), updated in place
+//   d_K : gain, (bs, ns * no) row-major
+//   d_Y : innovation, (bs, no)
+// With j = i - no:
+//   i <  no : x[i]      += K[i][i]      * y[i]
+//   i >= no : x[no + j] += K[no + j][j] * y[j]
+// The stride is derived from no (8 for no = 7) instead of being hard-coded.
+__global__ void KalmanUpdateS4(float* d_X, float* d_K, float* d_Y, const int bs, const int ns, const int no){
+
+	int tid = blockDim.x * blockIdx.x + threadIdx.x;
+	if (tid >= bs) return;  				
+	const int k_stride = no + 1; 		// diagonal step in a no-wide row-major matrix
+
+	for (int i = 0; i < ns; i++){
+		if (i < no){
+			d_X[tid * ns + i] += d_K[tid * ns * no + i * k_stride] * d_Y[tid * no + i];
+		}
+		else{
+			d_X[tid * ns + i] += d_K[tid * ns * no + no * no + (i - no) * k_stride] * d_Y[tid * no + i - no];
+		}
+	}	
+}
+
+
+// Step 5 of the batched Kalman update: P = (I - K * H) * P, applied in place
+// to the entries of P this filter maintains.
+//   d_P : state covariance, (bs, ns * ns) row-major, updated in place
+//   d_K : gain, (bs, ns * no) row-major
+// With j = i - no for the rows past the observed block:
+//   i >= no : P[no + j][j] = -K[no + j][j] * P[j][j]      + P[no + j][j]
+//             P[i][i]      = -K[no + j][j] * P[j][no + j] + P[i][i]
+//   i <  no : P[i][i] *= (1 - K[i][i]); for i < ns - no also P[i][no + i]
+// The loop runs from the last row down to the first because the i >= no rows
+// read P[j][j] and P[j][no + j] before the i < no rows rescale them.
+// Strides and the coupled-row count are derived from ns / no
+// (11, 8 and 3 for ns = 10, no = 7) instead of being hard-coded.
+__global__ void KalmanUpdateS5(float* d_P, float* d_K, const int bs, const int ns, const int no){
+
+	int tid = blockDim.x * blockIdx.x + threadIdx.x;
+	if (tid >= bs) return;
+	const int p_stride = ns + 1;      	// diagonal step in an ns-wide row-major matrix
+	const int k_stride = no + 1; 		// diagonal step in a no-wide row-major matrix
+	const int coupled = ns - no; 		// number of rows past the observed block
+
+	for (int i = ns - 1; i > -1; i--){
+		if (i < no){
+			float IKH_tl = 1 - d_K[tid * ns * no + i * k_stride];
+
+			d_P[tid * ns * ns + i * p_stride] *= IKH_tl;						
+			if (i < coupled) d_P[tid * ns * ns + no + i * p_stride] *= IKH_tl; 				
+		}
+		else{		
+			float IKH_bl = 0 - d_K[tid * ns * no + no * no + (i - no) * k_stride];
+			float IKH_br = 1;
+			float P_bl = d_P[tid * ns * ns + ns * no + (i - no) * p_stride];
+			float P_br = d_P[tid * ns * ns + i * p_stride];	
+			float P_tl = d_P[tid * ns * ns + (i - no) * p_stride];
+			float P_tr = d_P[tid * ns * ns + no + (i - no) * p_stride];
+
+			d_P[tid * ns * ns + ns * no + (i - no) * p_stride] = IKH_bl * P_tl + IKH_br * P_bl;	
+			d_P[tid * ns * ns + i * p_stride] = IKH_bl * P_tr + IKH_br * P_br;							
+		}
+	}	
+}
+
+
+
+
+
--- a/src/BaseTracker/kf_gpu/kalman_update_batch_online.cu
+++ b/src/BaseTracker/kf_gpu/kalman_update_batch_online.cu
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+//#include <cutil_inline.h>
+#include <sys/time.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+
+//#include <pybind11/pybind11.h>
+//#include <pybind11/numpy.h>
+//#include <pybind11/stl.h>
+
+#include "kalman_batch_ops.cu"
+#include "common.h"
+
+
+
+//#define DEBUG
+//#define INROS
+
+
+
+
+/*
+	Batched Kalman measurement update on buffers that already live on the device.
+
+	ns = dim_x                              len of state        (10 in the comments of this file)
+	no = dim_z                              len of observation  (7 in the comments of this file)
+
+	d_Z : measurement                                               (bs, no)
+	d_X : estimate, updated in place                                (bs, ns)
+	d_P : uncertainty covariance, row-major, updated in place       (bs, ns * ns)
+	d_R : measurement noise, ONE matrix shared by the whole batch   (no * no)
+	d_HX: predicted measurement H * x                               (bs, no)
+
+	Every pointer is passed straight to the kernels, so all of them must be
+	device pointers. Temporary buffers are allocated and freed inside the call.
+	Any CUDA failure is reported through GPU_CHECK.
+*/
+
+void kalmanUpdateLauncher_batch(float* d_Z, 					//(bs, no)
+				float* d_X, 					//(bs, ns)
+				float* d_P, 					//(bs, ns * ns)
+				float* d_R, 					//(no * no), shared by every batch entry
+				float* d_HX, 					//(bs, no)
+				int bs,
+				int ns, 
+				int no){
+
+	// An empty batch has nothing to update, and a grid of zero blocks is an
+	// invalid launch configuration.
+	if (bs <= 0) return;
+
+	float *d_Y;                     //error					(bs, no)					
+	float *d_S;                     //for intermediate calculations		(bs, no * no)
+	float *d_K;                     //Kalman gain				(bs, ns * no)
+
+	GPU_CHECK(cudaMalloc(&d_Y, no * bs * sizeof(float)));
+	GPU_CHECK(cudaMalloc(&d_S, no * no * bs * sizeof(float)));
+	GPU_CHECK(cudaMalloc(&d_K, ns * no * bs * sizeof(float)));
+	// d_S and d_K are not cleared: the kernels below write and read only the
+	// entries they need, so every other element (visible in the DEBUG dumps)
+	// holds uninitialized data.
+
+	// Step 1 uses one thread per element. Steps 2 - 5 use one thread per batch
+	// entry and loop over the rows inside the kernel; steps 2 - 4 could instead
+	// run with blocksPerGrid = bs and threadsPerBlock = ns to drop that loop,
+	// but step 5 cannot, because its rows must be processed in a fixed order.
+	int threadsPerBlock = 256;
+	int blocksPerGridSub = DIVUP(bs * no, threadsPerBlock);	
+	int blocksPerGridBatch = DIVUP(bs, threadsPerBlock);
+
+	// step 1	y = z - H * x
+	MatSub<<<blocksPerGridSub, threadsPerBlock>>>(d_Y, d_Z, d_HX, bs, no); 			// (bs, no) - (bs, no)
+	
+	// NOTE(review): the DEBUG blocks use std::cout but this file does not
+	// include <iostream>; add it before defining DEBUG.
+#ifdef DEBUG
+	cudaDeviceSynchronize();
+	std::cout << "------------------------  step 1 d_Y  (bs, no) :  \n";
+	float* tmp_y;
+	tmp_y = (float*)malloc(no * bs * sizeof(float));
+	GPU_CHECK(cudaMemcpy(tmp_y, d_Y, no * bs * sizeof(float), cudaMemcpyDeviceToHost));
+	for (int i=0;i<bs;i++){
+		std::cout << "batch " << i <<":\n";
+		std::cout << "[";
+		for (int j=0;j<no;j++){
+		   std::cout<< tmp_y[i * no + j] << ", ";
+		}
+		std::cout <<"],\n";
+	}
+	std::cout <<"\n";
+	free(tmp_y);
+#endif
+
+	// step 2	S = H * P * H^T + E, E also called R, measurement noise
+	KalmanUpdateS2<<<blocksPerGridBatch, threadsPerBlock>>>(d_S, d_P, d_R, bs, ns, no);		// (bs, no * no) + (no * no)
+
+#ifdef DEBUG
+	cudaDeviceSynchronize();
+	std::cout << "------------------------  step 2 d_S  (bs, no * no) :  \n";
+	float* tmp_s;
+	tmp_s = (float*)malloc(no * no * bs * sizeof(float));
+	GPU_CHECK(cudaMemcpy(tmp_s, d_S, no * no * bs * sizeof(float), cudaMemcpyDeviceToHost));
+	for (int i=0;i<bs;i++){
+		std::cout << "[";
+		for (int j=0;j<no;j++){
+			std::cout << "[";
+			for (int k=0;k<no;k++){
+				std::cout<< tmp_s[i * no*no + j * no + k] << ", ";
+			}
+			std::cout <<"],\n";
+		}
+		std::cout <<"],\n";
+	}
+	std::cout <<"\n";
+	free(tmp_s);
+#endif
+
+	// step 3	K = P * H^T * S^-1
+	KalmanUpdateS3<<<blocksPerGridBatch, threadsPerBlock>>>(d_K, d_P, d_S, bs, ns, no);	// (bs, ns * no) 
+
+#ifdef DEBUG
+	cudaDeviceSynchronize();
+	std::cout << "------------------------  step 3 d_K  (bs, ns * no) :  \n";
+	float* tmp_k;
+	tmp_k = (float*)malloc(no * ns * bs * sizeof(float));
+	GPU_CHECK(cudaMemcpy(tmp_k, d_K, ns * no * bs * sizeof(float), cudaMemcpyDeviceToHost));
+	for (int i=0;i<bs;i++){
+		std::cout << "[";
+		for (int j=0;j<ns;j++){
+			std::cout << "[";
+			for (int k=0;k<no;k++){
+				std::cout<< tmp_k[i * ns*no + j * no + k] << ", ";
+			}
+			std::cout <<"],\n";
+		}
+		std::cout <<"],\n";
+	}
+	std::cout <<"\n";
+	free(tmp_k);
+#endif
+
+	// step 4	x = x + K * y
+	KalmanUpdateS4<<<blocksPerGridBatch, threadsPerBlock>>>(d_X, d_K, d_Y, bs, ns, no);	// (bs, ns) + (bs, ns)
+
+#ifdef DEBUG	
+	cudaDeviceSynchronize();
+	std::cout << "------------------------  step 4 result d_X  (bs, ns) :  \n";
+	float* tmp_x;
+	tmp_x = (float*)malloc(ns * bs * sizeof(float));
+	GPU_CHECK(cudaMemcpy(tmp_x, d_X, ns * bs * sizeof(float), cudaMemcpyDeviceToHost));
+	for (int i=0;i<bs;i++){
+		std::cout << "batch " << i <<":\n";
+		std::cout << "[";
+		for (int j=0;j<ns;j++){
+		   std::cout<< tmp_x[i * ns + j] << ", ";
+		}
+		std::cout <<"],\n";
+	}
+	std::cout <<"\n";
+	free(tmp_x);
+#endif	
+
+	// step 5	P = P - K * H * P
+	KalmanUpdateS5<<<blocksPerGridBatch, threadsPerBlock>>>(d_P, d_K, bs, ns, no);		// (bs, ns * ns) - (bs, ns * ns)
+
+	// Kernel launches do not return a status; a failed launch is only visible
+	// through cudaGetLastError(), which keeps the most recent error until read.
+	GPU_CHECK(cudaGetLastError());
+
+#ifdef DEBUG
+	std::cout << "------------------------  step 5 result d_P  (bs, ns * ns) :  \n";
+	float* tmp_res;
+	tmp_res = (float*)malloc(ns * ns * bs * sizeof(float));
+	GPU_CHECK(cudaMemcpy(tmp_res, d_P, ns * ns * bs * sizeof(float), cudaMemcpyDeviceToHost));
+	for (int i=0;i<bs;i++){
+		std::cout << "batch " << i <<":\n";
+		std::cout << "[";
+		for (int j=0;j<ns;j++){
+			std::cout << "[";
+			for (int k=0;k<ns;k++){
+			   std::cout<< tmp_res[i * ns*ns + j * ns + k] << ", ";
+			}
+			std::cout <<"],\n";
+		}
+		std::cout <<"],\n";
+	}
+	std::cout <<"\n";
+	free(tmp_res);
+#endif
+
+	GPU_CHECK(cudaFree(d_Y));
+	GPU_CHECK(cudaFree(d_K));
+	GPU_CHECK(cudaFree(d_S));
+}
+
+
+ /*
+	ns = dim_x = 10                          len of state
+	no = dim_z = 7                          len of observation
+
+	Z: measurement                                                              (bs, 7)
+	X: estimate                                                                 (bs, 10)
+	P: uncertainty covariance                                                   (bs, 100)
+	R: measurement noise														(bs, 49)
+
+  MAKE SURE ALL INPUTS ARE TWO-DIM NUMPY ARRAY
+  */
+//void map_kalman_update_batch( 	pybind11::array_t<float> Z,
+//								pybind11::array_t<float> X,     // in-place update
+//								pybind11::array_t<float> P,     // in-place update
+//								pybind11::array_t<float> HX,
+//								const int bs,
+//								const int ns,
+//								const int no
+//                      		){
+//
+//	pybind11::buffer_info ZZ = Z.request();
+//	pybind11::buffer_info XX = X.request();
+//	pybind11::buffer_info PP = P.request();
+//	pybind11::buffer_info HXX = HX.request();
+//
+//	int size_ZZ = ZZ.shape[0] * ZZ.shape[1] * sizeof(float);
+//	int size_XX = XX.shape[0] * XX.shape[1] * sizeof(float);
+//	int size_PP = PP.shape[0] * PP.shape[1] * sizeof(float);
+//	int size_HXX = HXX.shape[0] * HXX.shape[1] * sizeof(float);
+//	// std::cout << "size_HXX: " << size_HXX <<"\n";
+//
+//	float* host_Z = reinterpret_cast<float*>(ZZ.ptr);
+//	float* host_X = reinterpret_cast<float*>(XX.ptr);
+//	float* host_P = reinterpret_cast<float*>(PP.ptr);
+//	float* host_HX = reinterpret_cast<float*>(HXX.ptr);
+//
+//	float* device_Z;
+//	float* device_X;
+//	float* device_P;
+//	float* device_HX;
+//
+//	GPU_CHECK(cudaMalloc(&device_Z, size_ZZ));
+//	GPU_CHECK(cudaMalloc(&device_X, size_XX));
+//	GPU_CHECK(cudaMalloc(&device_P, size_PP));
+//	GPU_CHECK(cudaMalloc(&device_HX, size_HXX));
+//
+//	GPU_CHECK(cudaMemcpy(device_Z, host_Z, size_ZZ, cudaMemcpyHostToDevice));
+//	GPU_CHECK(cudaMemcpy(device_X, host_X, size_XX, cudaMemcpyHostToDevice));
+//	GPU_CHECK(cudaMemcpy(device_P, host_P, size_PP, cudaMemcpyHostToDevice));
+//	GPU_CHECK(cudaMemcpy(device_HX, host_HX, size_HXX, cudaMemcpyHostToDevice));
+//
+//	kalmanUpdateLauncher_batch(device_Z, device_X, device_P, device_HX, bs, ns, no);
+//
+//	GPU_CHECK(cudaMemcpy(host_X, device_X, size_XX, cudaMemcpyDeviceToHost));
+//	GPU_CHECK(cudaMemcpy(host_P, device_P, size_PP, cudaMemcpyDeviceToHost));
+//
+//	GPU_CHECK(cudaFree(device_Z));
+//	GPU_CHECK(cudaFree(device_X));
+//	GPU_CHECK(cudaFree(device_P));
+//	GPU_CHECK(cudaFree(device_HX));
+//
+//#ifdef DEBUG
+//	int c_row = no;
+//	int c_col = ns;
+//	std::cout << "################################### kalman update gpu host_h before reinterpret_cast: no * ns" << "\n";
+//	auto a = H.mutable_unchecked<2>();
+//	for (int i = 0; i < a.shape(0); i++){
+//		std::cout << "[";
+//		for (int j = 0; j < a.shape(1); j++){
+//
+//		        std::cout << a(i, j)<< ", ";
+//		}
+//		std::cout << "],\n";
+//	}
+//
+//
+//  	std::cout << "++++++++++++++++++++++++++++++++++ kalman update gpu host_h shape: no * ns" << "\n";
+//        for (int i=0;i<c_row;i++){
+//           for (int j=0;j<c_col;j++){
+//
+//                std::cout<< *(host_H + i * c_col + j) << " ";
+//           }
+//           std::cout <<"\n";
+//   	}
+//
+//
+//	float* tmp;
+//        tmp = (float*)malloc(size_HH);
+//        for (int ii = 0; ii < c_row; ii++){
+//                for (int jj = 0; jj < c_col; jj++){
+//                        *(tmp + jj + ii * c_col) = *(host_H + ii + jj * c_row);
+//                }
+//        }
+//
+//
+//
+//  	std::cout << "-------------------to rowMajor host_e_row: " << "\n";
+//        for (int i=0;i<c_row;i++){
+//           for (int j=0;j<c_col;j++){
+//
+//                std::cout<< *(tmp + i * c_col + j) << " ";
+//           }
+//           std::cout <<"\n";
+//   	}
+//	   free(tmp);
+//#endif 
+//  //        ATTENTION ORDER COULD BE CHANGED IN ROS !
+//
+//
+//
+//
+//
+//}
+
+//PYBIND11_MODULE(juefx_kalman_multi_shared, m)
+//PYBIND11_MODULE(juefx_kalman_multi_1, m)
+//PYBIND11_MODULE(juefx_kalman_batch, m)
+//{
+//  m.def("kalman_update_batch", &map_kalman_update_batch);
+//}
+
+/*
+	Host entry point for the batched Kalman measurement update.
+	Copies the inputs to the device, runs kalmanUpdateLauncher_batch and copies
+	the updated X and P back into the caller's buffers.
+
+	ns = dim_x                              len of state        (10)
+	no = dim_z                              len of observation  (7)
+
+	Z : measurement                          bs * no floats, read only
+	X : estimate                             bs * ns floats, updated in place
+	P : uncertainty covariance               bs * ns * ns floats, updated in place
+	R : measurement noise covariance         no * no floats, shared by the whole batch
+	HX: predicted measurement H * X          bs * no floats, read only
+
+	All pointers are host pointers. Any CUDA failure is reported through GPU_CHECK.
+	The call does nothing when bs, ns or no is not positive.
+*/
+void kalman_update_batch(float* Z,// measurement size = bs * no
+	float* X,     // in-place update states  size = bs * ns
+	float* P,     // in-place update predict size = bs * ns * ns
+	float* R,		//R covariance matrix of observation noise no * no
+	float* HX,	   // H*X  size = bs * no
+	const int bs,
+	const int ns,  //ns = 10
+	const int no   // no = 7
+) 
+{
+	// Nothing to upload or update; this also avoids launching empty grids.
+	if (bs <= 0 || ns <= 0 || no <= 0)
+		return;
+
+	// Byte counts are computed in size_t so that a large batch cannot
+	// overflow int (bs * ns * ns * sizeof(float) grows quickly).
+	const size_t batch = static_cast<size_t>(bs);
+	const size_t size_ZZ = batch * no * sizeof(float);
+	const size_t size_XX = batch * ns * sizeof(float);
+	const size_t size_PP = batch * ns * ns * sizeof(float);
+	const size_t size_RR = static_cast<size_t>(no) * no * sizeof(float);
+	const size_t size_HXX = batch * no * sizeof(float);
+
+	float* device_Z;
+	float* device_X;
+	float* device_P;
+	float* device_R;
+	float* device_HX;
+
+	GPU_CHECK(cudaMalloc(&device_Z, size_ZZ));
+	GPU_CHECK(cudaMalloc(&device_X, size_XX));
+	GPU_CHECK(cudaMalloc(&device_P, size_PP));
+	GPU_CHECK(cudaMalloc(&device_R, size_RR));
+	GPU_CHECK(cudaMalloc(&device_HX, size_HXX));
+	GPU_CHECK(cudaMemcpy(device_Z, Z, size_ZZ, cudaMemcpyHostToDevice));
+	GPU_CHECK(cudaMemcpy(device_X, X, size_XX, cudaMemcpyHostToDevice));
+	GPU_CHECK(cudaMemcpy(device_P, P, size_PP, cudaMemcpyHostToDevice));
+	GPU_CHECK(cudaMemcpy(device_R, R, size_RR, cudaMemcpyHostToDevice));
+	GPU_CHECK(cudaMemcpy(device_HX, HX, size_HXX, cudaMemcpyHostToDevice));
+
+	kalmanUpdateLauncher_batch(device_Z, device_X, device_P, device_R, device_HX, bs, ns, no);
+
+	// Only X and P are modified by the update, so only they are copied back.
+	GPU_CHECK(cudaMemcpy(X, device_X, size_XX, cudaMemcpyDeviceToHost));
+	GPU_CHECK(cudaMemcpy(P, device_P, size_PP, cudaMemcpyDeviceToHost));
+
+	GPU_CHECK(cudaFree(device_Z));
+	GPU_CHECK(cudaFree(device_X));
+	GPU_CHECK(cudaFree(device_P));
+	GPU_CHECK(cudaFree(device_R));
+	GPU_CHECK(cudaFree(device_HX));
+
+}
+
--- a/src/BaseTracker/kf_gpu/kalman_update_batch_online.h
+++ b/src/BaseTracker/kf_gpu/kalman_update_batch_online.h
+#ifndef _KALMAN_UPDATE_BATCH_ONLINE_H_
+#define _KALMAN_UPDATE_BATCH_ONLINE_H_
+
+
+
+// Batched Kalman measurement update executed on the GPU.
+// All pointers are host buffers; the implementation copies them to the device,
+// runs the update and copies X and P back, so both are modified in place.
+// R is a single no * no matrix shared by every item of the batch.
+void kalman_update_batch(float* Z,// measurement, size = bs * no
+	float* X,     // state estimates, updated in place, size = bs * ns
+	float* P,     // state covariances, updated in place, size = bs * ns * ns
+	float* R,		// covariance matrix of the observation noise, size = no * no
+	float* HX,	   // predicted measurement H*X, size = bs * no
+	const int bs,  // batch size
+	const int ns,  // state length, ns = 10
+	const int no   // observation length, no = 7
+);
+
+
+#endif  
--- a/src/Component.cpp
+++ b/src/Component.cpp
+
+#include "Component.h"
+#include <sys/time.h>
+#include <unistd.h>
+
+// Returns the current time reported by gettimeofday() as microseconds
+// since the Unix epoch.
+uint64_t GetCurTime()
+{
+    struct timeval now;
+    gettimeofday(&now, NULL);
+
+    const uint64_t whole_seconds = now.tv_sec;
+    return whole_seconds * 1000 * 1000 + now.tv_usec;
+}
+
+// Serialises a row-major matrix stored in one contiguous array.
+//   data_ptr : at least row * col floats
+//   col      : number of columns (row length)
+//   row      : number of rows
+// Returns every element formatted with "%f" and followed by a comma, row by
+// row, with no separator between rows. Empty when row or col is not positive.
+std::string GetMatrixStr(const float* data_ptr, int col, int row)
+{
+    std::string joined;
+    char cell[128];
+    for (int r = 0; r < row; ++r)
+    {
+        for (int c = 0; c < col; ++c)
+        {
+            snprintf(cell, sizeof(cell), "%f,", data_ptr[r * col + c]);
+            joined += cell;
+        }
+    }
+    return joined;
+}
+
+// Serialises a nested vector of floats.
+// Elements data_ptr[i][j] are visited for i in [0, col) and j in [0, row),
+// each formatted with "%f" and followed by a comma.
+// NOTE(review): here `col` bounds the OUTER index and `row` the inner one,
+// the reverse of the pointer overloads - confirm callers pass the sizes
+// accordingly. No bounds checking is done against the vectors' real sizes.
+std::string GetMatrixStr(const std::vector<std::vector<float>>& data_ptr, int col, int row)
+{
+    std::string joined;
+    char cell[128];
+    for (int outer = 0; outer < col; ++outer)
+    {
+        for (int inner = 0; inner < row; ++inner)
+        {
+            snprintf(cell, sizeof(cell), "%f,", data_ptr[outer][inner]);
+            joined += cell;
+        }
+    }
+    return joined;
+}
+// Serialises a nested vector of doubles.
+// Elements data_ptr[i][j] are visited for i in [0, col) and j in [0, row),
+// each formatted with "%f" and followed by a comma.
+// NOTE(review): here `col` bounds the OUTER index and `row` the inner one,
+// the reverse of the pointer overloads - confirm callers pass the sizes
+// accordingly. No bounds checking is done against the vectors' real sizes.
+std::string GetMatrixStr(const std::vector<std::vector<double>>& data_ptr, int col, int row)
+{
+    std::string str;
+    char log[128];
+    for (int i = 0; i < col; i++)
+    {
+        for (int j = 0; j < row; j++)
+        {
+            const double value = data_ptr[i][j];
+            // "%f" prints every integer digit, so a large double (e.g. 1e300)
+            // needs over 300 characters; snprintf never writes past the
+            // buffer and reports the length that is actually required.
+            const int needed = snprintf(log, sizeof(log), "%f,", value);
+            if (needed < 0)
+                continue;  // formatting failed; nothing usable was produced
+            if (static_cast<size_t>(needed) < sizeof(log))
+            {
+                str.append(log, static_cast<size_t>(needed));
+                continue;
+            }
+            // Did not fit: format again into a buffer of the reported size.
+            std::vector<char> wide(static_cast<size_t>(needed) + 1);
+            snprintf(wide.data(), wide.size(), "%f,", value);
+            str.append(wide.data(), static_cast<size_t>(needed));
+        }
+    }
+    return str;
+}
+
+// Serialises a matrix stored as an array of row pointers.
+//   data_ptr : `row` pointers, each to at least `col` floats
+//   col      : number of columns read from every row
+//   row      : number of rows
+// Returns every element formatted with "%f" and followed by a comma, row by
+// row, with no separator between rows.
+std::string GetMatrixStr(float** data_ptr, int col, int row)
+{
+    std::string joined;
+    char cell[128];
+    for (int r = 0; r < row; ++r)
+    {
+        for (int c = 0; c < col; ++c)
+        {
+            snprintf(cell, sizeof(cell), "%f,", data_ptr[r][c]);
+            joined += cell;
+        }
+    }
+    return joined;
+}
+
+
+// Formats a timestamp as local time, e.g. "2018-09-19 16:01:37.517".
+//   timestamp : milliseconds since the Unix epoch
+// The millisecond part is always printed with three digits, so 5 ms reads
+// ".005" rather than ".5". If the time cannot be converted or formatted,
+// the whole seconds are printed as a plain number instead of a date.
+// NOTE(review): localtime() returns a pointer to shared static storage and is
+// not safe to call concurrently from several threads.
+std::string GetTimeStr(uint64_t timestamp)
+{
+    const time_t seconds = timestamp / 1000;
+    const int millis = static_cast<int>(timestamp % 1000);
+
+    char stamp[80] = {};
+    const struct tm* local = localtime(&seconds);
+    if (local == NULL || strftime(stamp, sizeof(stamp), "%Y-%m-%d %H:%M:%S", local) == 0)
+    {
+        // strftime leaves the buffer unspecified on failure, so overwrite it.
+        snprintf(stamp, sizeof(stamp), "%llu", static_cast<unsigned long long>(timestamp / 1000));
+    }
+
+    char fraction[16] = {};
+    snprintf(fraction, sizeof(fraction), ".%03d", millis);
+    return std::string(stamp) + fraction;
+}
+
+// Area of the region shared by two rotated rectangles.
+// The intersection vertices come from cv::rotatedRectangleIntersection; when
+// it yields none the area is 0. Otherwise the vertices are ordered through
+// their convex hull and the area of that contour is returned.
+double calcIntersectionArea(cv::RotatedRect rect1, cv::RotatedRect rect2)
+{
+    std::vector<cv::Point2f> corners;
+    cv::rotatedRectangleIntersection(rect1, rect2, corners);
+    if (corners.empty())
+        return 0.0;
+
+    // Find the intersection region and put its contour points in order.
+    std::vector<cv::Point2f> hull;
+    cv::convexHull(cv::Mat(corners), hull, true);
+    return cv::contourArea(hull);
+}
+
+// Ratio of the intersection area of two rotated rectangles to the SUM of
+// their individual areas (width * height of each).
+// Returns 0 when both rectangles have zero area, where the plain division
+// would be 0 / 0 and yield NaN.
+// NOTE(review): the denominator does not subtract the intersection, so this
+// is not the usual IoU and two identical rectangles score 0.5 - confirm that
+// callers' thresholds expect this scale.
+float calcIntersectionRate(cv::RotatedRect rect1, cv::RotatedRect rect2)
+{
+    const float area_sum = rect1.size.width * rect1.size.height + rect2.size.width * rect2.size.height;
+    if (area_sum == 0.0f)
+        return 0.0f;
+
+    const double area = calcIntersectionArea(rect1, rect2);
+    const float iou_2d = area / area_sum;
+    return iou_2d;
+}
--- a/src/Component.h
+++ b/src/Component.h
+#pragma once
+
+#include <string>
+#include <vector>
+#include <opencv2/opencv.hpp>
+
+// Current time from gettimeofday(), in microseconds since the Unix epoch.
+uint64_t GetCurTime();
+
+// Formats `timestamp`, given in MILLISECONDS since the Unix epoch, as local
+// time "YYYY-MM-DD HH:MM:SS" followed by '.' and the millisecond part.
+// NOTE(review): GetCurTime() returns microseconds, so its result has to be
+// divided by 1000 before it is passed here.
+std::string GetTimeStr(uint64_t timestamp);
+
+
+// Each overload returns the elements formatted with "%f", every one followed
+// by a comma, with no separator between rows.
+// Pointer overloads: `row` bounds the outer index, `col` the inner one.
+// Vector overloads: `col` bounds the OUTER index and `row` the inner one.
+// NOTE(review): the two conventions are opposite - confirm the call sites.
+std::string GetMatrixStr(const float* data_ptr, int col, int row);
+std::string GetMatrixStr(const std::vector<std::vector<float>>& data_ptr, int col, int row);
+std::string GetMatrixStr(const std::vector<std::vector<double>>& data_ptr, int col, int row);
+std::string GetMatrixStr(float** data_ptr, int col, int row);
+
+// Area of the intersection of two rotated rectangles; 0 when they do not intersect.
+double calcIntersectionArea(cv::RotatedRect rect1, cv::RotatedRect rect2);
+
+// Intersection area divided by the sum of both rectangles' areas
+// (the intersection is not subtracted from the denominator).
+float calcIntersectionRate(cv::RotatedRect rect1, cv::RotatedRect rect2);