Commit b83d4add authored by marina.kolpakova's avatar marina.kolpakova

memory optimization

parent 4d9c7c10
......@@ -41,9 +41,9 @@
//M*/
#include <opencv2/gpu/device/common.hpp>
// #include <icf.hpp>
#include <icf.hpp>
// #include <opencv2/gpu/device/saturate_cast.hpp>
// #include <stdio.h>
#include <stdio.h>
// #include <float.h>
// //#define LOG_CUDA_CASCADE
......@@ -93,6 +93,58 @@ namespace icf {
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
// File-scope 1D texture reference over float2 elements, fetched without
// value conversion (cudaReadModeElementType).
// NOTE(review): presumably intended for cascade node data; the only bind call
// visible in this file is commented out, so confirm it is bound before sampling.
texture<float2, cudaTextureType1D, cudaReadModeElementType> tnode;
// Experimental kernel that streams through the stage and node arrays and
// accumulates a dummy "confidence" value per thread.
//
// Launch layout: a 2D thread block covers (x, y) positions; blockIdx.z selects
// the entry of `levels` that the block works on.
//
//   levels  - device array of Level, indexed by blockIdx.z
//   octaves - device array of Octave, indexed by Level::octave
//   stages  - device array of floats, indexed by stage number
//   nodes   - device array of Node, read at index (stage number * 3)
//   objects - output image; only row 0, columns [0, blockDim.x) are written
__global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages,
    const Node* nodes,
    PtrStepSz<uchar4> objects)
{
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int x = blockIdx.x * blockDim.x + threadIdx.x;

    Level level = levels[blockIdx.z];

    // Threads that fall outside this level's work rectangle have nothing to do.
    if(x >= level.workRect.x || y >= level.workRect.y) return;

    Octave octave = octaves[level.octave];

    // First stage belonging to this octave.
    int st = octave.index * octave.stages;
    // NOTE(review): the trip count is hard-coded to 1000; the trailing comment
    // suggests octave.stages is the intended bound. Confirm that `stages` and
    // `nodes` are large enough for st + 1000 entries, otherwise this reads
    // out of bounds.
    const int stEnd = st + 1000;//octave.stages;

    float confidence = 0.f;

#pragma unroll 8
    for(; st < stEnd; ++st)
    {
        const int nId = st * 3;
        const Node node = nodes[nId];
        const float stage = stages[st];
        confidence += node.rect.x * stage;
    }

    // Zero-initialise the whole vector so that the y/z/w components written
    // below are defined (previously they were left uninitialised).
    uchar4 val = make_uchar4(0, 0, 0, 0);
    val.x = (int)confidence;

    // Only diagonal threads store a result. Several threads with the same
    // threadIdx.x may write the same cell; the store exists to keep the loop
    // above from being optimised away.
    if (x == y) objects(0, threadIdx.x) = val;
}
// Host-side launcher for test_kernel.
//
// Reinterprets the raw byte buffers as typed device arrays and launches the
// kernel over a fixed 160x120 area with 32x8 thread blocks and 47 z-slices
// (one per entry of `levels` that the kernel indexes with blockIdx.z).
// `features` is accepted but not used here.
// Launch and execution errors are reported through cudaSafeCall.
void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
    const PtrStepSzb& nodes, const PtrStepSzb& features,
    PtrStepSz<uchar4> objects)
{
    // Typed views over the device buffers.
    const Level*  levelPtr  = reinterpret_cast<const Level*>(levels.ptr());
    const Octave* octavePtr = reinterpret_cast<const Octave*>(octaves.ptr());
    const float*  stagePtr  = stages.ptr();
    const Node*   nodePtr   = reinterpret_cast<const Node*>(nodes.ptr());

    // Fixed processing area and launch geometry.
    const int frameWidth  = 160;
    const int frameHeight = 120;

    const dim3 threads(32, 8);
    const dim3 blocks(frameWidth / 32, frameHeight / 8, 47);

    // cudaSafeCall( cudaBindTexture(0, tnode, nodes.data, rgb.cols / size) );

    test_kernel<<<blocks, threads>>>(levelPtr, octavePtr, stagePtr, nodePtr, objects);

    // First call catches launch-configuration errors, second one catches
    // faults raised while the kernel was running.
    cudaSafeCall( cudaGetLastError());
    cudaSafeCall( cudaDeviceSynchronize());
}
}
}}}
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
......@@ -38,12 +38,12 @@
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
//M
// #include <opencv2/gpu/device/common.hpp>
#include <opencv2/gpu/device/common.hpp>
// #ifndef __OPENCV_ICF_HPP__
// #define __OPENCV_ICF_HPP__
#ifndef __OPENCV_ICF_HPP__
#define __OPENCV_ICF_HPP__
// #if defined __CUDACC__
// # define __device __device__ __forceinline__
......@@ -52,49 +52,62 @@
// #endif
// namespace cv { namespace gpu { namespace icf {
// using cv::gpu::PtrStepSzb;
// using cv::gpu::PtrStepSzf;
// typedef unsigned char uchar;
// struct __align__(16) Octave
// {
// ushort index;
// ushort stages;
// ushort shrinkage;
// ushort2 size;
// float scale;
// Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
// : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
// };
// struct __align__(8) Level //is actually 24 bytes
// {
// int octave;
// // float origScale; //not actually used
// float relScale;
// float shrScale; // used for marking detection
// float scaling[2]; // calculated according to Dollal paper
// // for 640x480 we can not get overflow
// uchar2 workRect;
// uchar2 objSize;
// Level(int idx, const Octave& oct, const float scale, const int w, const int h)
// : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
// {
// workRect.x = round(w / (float)oct.shrinkage);
// workRect.y = round(h / (float)oct.shrinkage);
// objSize.x = round(oct.size.x * relScale);
// objSize.y = round(oct.size.y * relScale);
// }
// };
namespace cv { namespace gpu { namespace device {
namespace icf {
// Plain descriptor of one cascade octave, aligned to 16 bytes.
struct __align__(16) Octave
{
    ushort  index;      // octave number; multiplied by `stages` to locate its first stage
    ushort  stages;     // number of stages per octave
    ushort  shrinkage;  // divisor applied to frame dimensions when building a Level
    ushort2 size;       // object size, scaled by Level::relScale
    float   scale;      // octave scale, used as divisor for Level::relScale

    // Stores every argument in the member of the same role.
    Octave(const ushort octaveIndex, const ushort stageCount, const ushort shrink,
           const ushort2 objectSize, const float octaveScale)
    {
        index     = octaveIndex;
        stages    = stageCount;
        shrinkage = shrink;
        size      = objectSize;
        scale     = octaveScale;
    }
};
// One scale level of the detector, aligned to 8 bytes (24 bytes of data).
struct __align__(8) Level
{
    int   octave;       // index of the octave this level belongs to
    float relScale;     // level scale divided by the octave scale
    float shrScale;     // relScale divided by the octave shrinkage; used for marking detection
    float scaling[2];   // not set by the constructor; original note: calculated according to the Dollar paper

    // Stored as bytes: the original author notes that 640x480 input cannot overflow them.
    uchar2 workRect;    // frame size after shrinkage
    uchar2 objSize;     // octave object size scaled by relScale

    // Derives the level parameters from octave `oct`, the absolute `scale`
    // of the level and the frame dimensions `w` x `h`.
    Level(int idx, const Octave& oct, const float scale, const int w, const int h)
    {
        const float shrink = (float)oct.shrinkage;

        octave   = idx;
        relScale = scale / oct.scale;
        shrScale = relScale / shrink;

        workRect.x = round(w / shrink);
        workRect.y = round(h / shrink);

        objSize.x = round(oct.size.x * relScale);
        objSize.y = round(oct.size.y * relScale);
    }
};
// Cascade node record: a packed 4-byte rectangle followed by a float
// threshold (8 bytes in total, 8-byte aligned).
struct __align__(8) Node
{
    // int feature;
    uchar4 rect;
    float threshold;

    // The threshold is taken as float so that fractional values are stored
    // unchanged; an int parameter here would truncate them before the
    // assignment to the float member.
    Node(const uchar4 c, const float t) : rect(c), threshold(t) {}
};
// Feature record: a channel index paired with a packed rectangle
// (8 bytes, 8-byte aligned).
struct __align__(8) Feature
{
    int    channel;
    uchar4 rect;

    // Copies both arguments into the corresponding members.
    Feature(const int channelIndex, const uchar4 region)
    {
        channel = channelIndex;
        rect    = region;
    }
};
}
}}}
// struct Cascade
// {
// Cascade() {}
......@@ -146,21 +159,6 @@
// static const float magnitudeScaling = 1.f ;// / sqrt(2);
// };
// struct __align__(8) Node
// {
// int feature;
// float threshold;
// Node(const int f, const float t) : feature(f), threshold(t) {}
// };
// struct __align__(8) Feature
// {
// int channel;
// uchar4 rect;
// Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
// };
// }}}
// #endif
\ No newline at end of file
#endif
\ No newline at end of file
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment