Commit 27cfe31b authored by chacha21's avatar chacha21

more ICV_HLINE specific cases

added ICV_HLINE custom implementations for element sizes up to 32
but timings show that it is not very relevant for sizes >= 12
parent 92a3dbe1
......@@ -1104,33 +1104,33 @@ static const int opencvOne = 1;
# endif
# endif
/*
static inline uint32_t opencvBigToHost32(const uchar* p){
#if OPENCV_BYTEORDER==4321
uint32_t x;
memcpy(&x,p,4);
static inline uint16_t opencvLittleToHost16(const uchar* p){
#if OPENCV_BYTEORDER==1234
uint16_t x;
memcpy(&x,p,sizeof(x));
return x;
#elif OPENCV_BYTEORDER==4321 && defined(__GNUC__)
uint16_t x;
memcpy(&x,p,sizeof(x));
return (p[0]<<8) | (p[1]>>8);
#elif OPENCV_BYTEORDER==4321 && defined(_MSC_VER) && _MSC_VER>=1300
uint16_t x;
memcpy(&x,p,sizeof(x));
return _byteswap_ushort(x);
#elif OPENCV_LITTLEENDIAN
return x;
#elif OPENCV_BYTEORDER==1234 && defined(__GNUC__)
uint32_t x;
memcpy(&x,p,4);
return __builtin_bswap32(x);
#elif OPENCV_BYTEORDER==1234 && defined(_MSC_VER) && _MSC_VER>=1300
uint32_t x;
memcpy(&x,p,4);
return _byteswap_ulong(x);
#else
return ((unsigned)p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
return (p[0]<<8) | (p[1]>>8);
#endif
}
static inline uint32_t opencvBigToHost32(uint32_t x){
#if OPENCV_BYTEORDER==4321
static inline uint16_t opencvLittleToHost16(uint16_t x){
#if OPENCV_LITTLEENDIAN
return x;
#else
return opencvBigToHost32((uchar*)&x);
return opencvLittleToHost16((uchar*)&x);
#endif
}
*/
static inline uint32_t opencvLittleToHost32(const uchar* p){
#if OPENCV_BYTEORDER==1234
......@@ -1338,6 +1338,26 @@ else \
}
*/
static inline void ICV_HLINE_X(uchar* ptr, int xl, int xr, const uchar* color, int pix_size)
{
uchar* hline_min_ptr = (uchar*)(ptr) + (xl)*(pix_size);
uchar* hline_end_ptr = (uchar*)(ptr) + (xr+1)*(pix_size);
uchar* hline_ptr = hline_min_ptr;
if (hline_min_ptr < hline_end_ptr)
{
memcpy(hline_ptr, color, pix_size);
hline_ptr += pix_size;
}//end if (hline_min_ptr < hline_end_ptr)
size_t sizeToCopy = pix_size;
while(hline_ptr < hline_end_ptr)
{
memcpy(hline_ptr, hline_min_ptr, sizeToCopy);
hline_ptr += sizeToCopy;
sizeToCopy = std::min(2*sizeToCopy, static_cast<size_t>(hline_end_ptr-hline_ptr));
}//end while(hline_ptr < hline_end_ptr)
}
//end ICV_HLINE_X()
static inline void ICV_HLINE_0(uchar* ptr, int xl, int xr, const uchar* color, int pix_size)
{
uchar* hline_ptr = (uchar*)(ptr) + (xl)*(pix_size);
......@@ -1345,7 +1365,7 @@ static inline void ICV_HLINE_0(uchar* ptr, int xl, int xr, const uchar* color, i
for( ; hline_ptr <= hline_max_ptr; hline_ptr += (pix_size))
{
int hline_j;
for( hline_j = 0; hline_j < (4); hline_j++ )
for( hline_j = 0; hline_j < (pix_size); hline_j++ )
{
hline_ptr[hline_j] = ((uchar*)color)[hline_j];
}
......@@ -1360,6 +1380,24 @@ static inline void ICV_HLINE_1(uchar* ptr, int xl, int xr, const uchar* color)
uchar hline_c = *(const uchar*)(color);
memset(hline_ptr, hline_c, (hline_max_ptr - hline_ptr) + 1);
}
//end ICV_HLINE_1()
static inline void ICV_HLINE_2(uchar* ptr, int xl, int xr, const uchar* color)
{
if (is_aligned(((uchar*)(ptr) + (xl)*2), 0x2))
{
uint16_t c = opencvLittleToHost16((uchar*)(color));
uint16_t* hline_ptr = (uint16_t*)(ptr) + xl;
uint16_t* hline_max_ptr = (uint16_t*)(ptr) + xr;
for( ; hline_ptr <= hline_max_ptr; )
*hline_ptr++ = c;
}
else
{
ICV_HLINE_X(ptr, xl, xr, color, 2);
}
}
//end ICV_HLINE_2()
static inline void ICV_HLINE_3(uchar* ptr, int xl, int xr, const uchar* color)
{
......@@ -1420,12 +1458,7 @@ static inline void ICV_HLINE_3(uchar* ptr, int xl, int xr, const uchar* color)
}
else
{
for( ; hline_ptr < hline_end ; )
{
*hline_ptr++ = ((uchar*)(color))[0];
*hline_ptr++ = ((uchar*)(color))[1];
*hline_ptr++ = ((uchar*)(color))[2];
}
ICV_HLINE_X(ptr, xl, xr, color, 3);
}
}
//end ICV_HLINE_3()
......@@ -1442,30 +1475,288 @@ static inline void ICV_HLINE_4(uchar* ptr, int xl, int xr, const uchar* color)
}
else
{
uchar* hline_ptr = (uchar*)(ptr) + (xl)*(4);
uchar* hline_max_ptr = (uchar*)(ptr) + (xr)*(4);
for( ; hline_ptr <= hline_max_ptr; hline_ptr += (4))
ICV_HLINE_X(ptr, xl, xr, color, 4);
}
}
//end ICV_HLINE_4()
static inline void ICV_HLINE_6(uchar* ptr, int xl, int xr, const uchar* color)
{
uchar* hline_ptr = (uchar*)(ptr) + (xl)*6;
uchar* hline_end = (uchar*)(ptr) + (xr+1)*6;
uchar* hbody24_start = std::min(hline_end, (uchar*)(24*(((uintptr_t)(hline_ptr)+23)/24)));
uchar* hbody24_end = std::min(hline_end, (uchar*)(24*(((uintptr_t)(hline_end))/24)));
uchar* hbody12_start = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_ptr)+11)/12)));
uchar* hbody12_end = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_end))/12)));
if (hbody24_start < hbody24_end)
{
int hline_j;
for( hline_j = 0; hline_j < (4); hline_j++ )
int offset = ((uintptr_t)(hbody24_start-hline_ptr))%6;
uint64_t c4[3];
uchar* ptrC4 = reinterpret_cast<uchar*>(&c4);
ptrC4[0] = ((uchar*)(color))[(offset++)%6];
ptrC4[1] = ((uchar*)(color))[(offset++)%6];
ptrC4[2] = ((uchar*)(color))[(offset++)%6];
ptrC4[3] = ((uchar*)(color))[(offset++)%6];
ptrC4[4] = ((uchar*)(color))[(offset++)%6];
ptrC4[5] = ((uchar*)(color))[(offset++)%6];
memcpy(&ptrC4[6], &ptrC4[0], 6);
memcpy(&ptrC4[12], &ptrC4[0], 12);
c4[0] = opencvLittleToHost64(c4[0]);
c4[1] = opencvLittleToHost64(c4[1]);
c4[2] = opencvLittleToHost64(c4[2]);
for(offset = 0 ; hline_ptr < hbody24_start; offset = (offset+1)%6)
*hline_ptr++ = ((uchar*)(color))[offset];
for(uint64_t* ptr64 = reinterpret_cast<uint64_t*>(hbody24_start), *ptr64End = reinterpret_cast<uint64_t*>(hbody24_end) ; ptr64<ptr64End ; )
{
hline_ptr[hline_j] = ((uchar*)color)[hline_j];
*ptr64++ = c4[0];
*ptr64++ = c4[1];
*ptr64++ = c4[2];
}
for(offset = ((uintptr_t)(hbody24_end-(uchar*)(ptr)))%6, hline_ptr = hbody24_end ; hline_ptr < hline_end ; offset = (offset+1)%6)
*hline_ptr++ = ((uchar*)(color))[offset];
}
else if (hbody12_start < hbody12_end)
{
int offset = ((uintptr_t)(hbody12_start-hline_ptr))%6;
uint32_t c4[3];
uchar* ptrC4 = reinterpret_cast<uchar*>(&c4);
ptrC4[0] = ((uchar*)(color))[(offset++)%6];
ptrC4[1] = ((uchar*)(color))[(offset++)%6];
ptrC4[2] = ((uchar*)(color))[(offset++)%6];
ptrC4[3] = ((uchar*)(color))[(offset++)%6];
ptrC4[4] = ((uchar*)(color))[(offset++)%6];
ptrC4[5] = ((uchar*)(color))[(offset++)%6];
memcpy(&ptrC4[6], &ptrC4[0], 6);
c4[0] = opencvLittleToHost32(c4[0]);
c4[1] = opencvLittleToHost32(c4[1]);
c4[2] = opencvLittleToHost32(c4[2]);
for(offset = 0 ; hline_ptr < hbody12_start; offset = (offset+1)%6)
*hline_ptr++ = ((uchar*)(color))[offset];
for(uint32_t* ptr32 = reinterpret_cast<uint32_t*>(hbody12_start), *ptr32End = reinterpret_cast<uint32_t*>(hbody12_end) ; ptr32<ptr32End ; )
{
*ptr32++ = c4[0];
*ptr32++ = c4[1];
*ptr32++ = c4[2];
}
for(offset = ((uintptr_t)(hbody12_end-(uchar*)(ptr)))%3, hline_ptr = hbody12_end ; hline_ptr < hline_end ; offset = (offset+1)%3)
*hline_ptr++ = ((uchar*)(color))[offset];
}
else
{
ICV_HLINE_X(ptr, xl, xr, color, 6);
}
}
//end ICV_HLINE_6()
static inline void ICV_HLINE_8(uchar* ptr, int xl, int xr, const uchar* color)
{
if (is_aligned(((uchar*)(ptr) + (xl)*8), 0x8))
{
uint64_t c = opencvLittleToHost64((uchar*)(color));
uint64_t* hline_ptr = (uint64_t*)((uchar*)(ptr) + (xl)*(8));
uint64_t* hline_max_ptr = (uint64_t*)((uchar*)(ptr) + (xr)*(8));
for( ; hline_ptr <= hline_max_ptr; )
*hline_ptr++ = c;
}
else if (is_aligned(((uchar*)(ptr) + (xl)*8), 0x4))
{
uint32_t c[2] = {opencvLittleToHost32((uchar*)(color)+0x00),
opencvLittleToHost32((uchar*)(color)+0x04)};
uint32_t* hline_ptr = (uint32_t*)((uchar*)(ptr) + (xl)*(8));
uint32_t* hline_max_ptr = (uint32_t*)((uchar*)(ptr) + (xr)*(8));
for( ; hline_ptr <= hline_max_ptr; )
{
*hline_ptr++ = c[0];
*hline_ptr++ = c[1];
}
}
else
{
ICV_HLINE_X(ptr, xl, xr, color, 8);
}
}
//end ICV_HLINE_4()
//end ICV_HLINE_8()
static inline void ICV_HLINE_12(uchar* ptr, int xl, int xr, const uchar* color)
{
if (is_aligned(((uchar*)(ptr) + (xl)*12), 0x4))
{
uint32_t c[3] = {opencvLittleToHost32((uchar*)(color)+0x00),
opencvLittleToHost32((uchar*)(color)+0x04),
opencvLittleToHost32((uchar*)(color)+0x08)};
uint32_t* hline_ptr = (uint32_t*)((uchar*)(ptr) + (xl)*(12));
uint32_t* hline_max_ptr = (uint32_t*)((uchar*)(ptr) + (xr)*(12));
for( ; hline_ptr <= hline_max_ptr; )
{
*hline_ptr++ = c[0];
*hline_ptr++ = c[1];
*hline_ptr++ = c[2];
}
}
else
{
ICV_HLINE_X(ptr, xl, xr, color, 12);
}
}
//end ICV_HLINE_12()
static inline void ICV_HLINE_16(uchar* ptr, int xl, int xr, const uchar* color)
{
if (is_aligned(((uchar*)(ptr) + (xl)*16), 0x8))
{
uint64_t c[2] = {opencvLittleToHost64((uchar*)(color)+0x00),
opencvLittleToHost64((uchar*)(color)+0x08)};
uint64_t* hline_ptr = (uint64_t*)((uchar*)(ptr) + (xl)*(16));
uint64_t* hline_max_ptr = (uint64_t*)((uchar*)(ptr) + (xr)*(16));
for( ; hline_ptr <= hline_max_ptr; )
{
*hline_ptr++ = c[0];
*hline_ptr++ = c[1];
}
}
else if (is_aligned(((uchar*)(ptr) + (xl)*16), 0x4))
{
uint32_t c[4] = {opencvLittleToHost32((uchar*)(color)+0x00),
opencvLittleToHost32((uchar*)(color)+0x04),
opencvLittleToHost32((uchar*)(color)+0x08),
opencvLittleToHost32((uchar*)(color)+0x0C)};
uint32_t* hline_ptr = (uint32_t*)((uchar*)(ptr) + (xl)*(16));
uint32_t* hline_max_ptr = (uint32_t*)((uchar*)(ptr) + (xr)*(16));
for( ; hline_ptr <= hline_max_ptr; )
{
*hline_ptr++ = c[0];
*hline_ptr++ = c[1];
*hline_ptr++ = c[2];
*hline_ptr++ = c[3];
}
}
else
{
ICV_HLINE_X(ptr, xl, xr, color, 16);
}
}
//end ICV_HLINE_16()
static inline void ICV_HLINE_24(uchar* ptr, int xl, int xr, const uchar* color)
{
if (is_aligned(((uchar*)(ptr) + (xl)*24), 0x8))
{
uint64_t c[3] = {opencvLittleToHost64((uchar*)(color)+0x00),
opencvLittleToHost64((uchar*)(color)+0x08),
opencvLittleToHost64((uchar*)(color)+0x10)};
uint64_t* hline_ptr = (uint64_t*)((uchar*)(ptr) + (xl)*(24));
uint64_t* hline_max_ptr = (uint64_t*)((uchar*)(ptr) + (xr)*(24));
for( ; hline_ptr <= hline_max_ptr; )
{
*hline_ptr++ = c[0];
*hline_ptr++ = c[1];
*hline_ptr++ = c[2];
}
}
else if (is_aligned(((uchar*)(ptr) + (xl)*24), 0x4))
{
uint32_t c[6] = {opencvLittleToHost32((uchar*)(color)+0x00),
opencvLittleToHost32((uchar*)(color)+0x04),
opencvLittleToHost32((uchar*)(color)+0x08),
opencvLittleToHost32((uchar*)(color)+0x0C),
opencvLittleToHost32((uchar*)(color)+0x10),
opencvLittleToHost32((uchar*)(color)+0x14)};
uint32_t* hline_ptr = (uint32_t*)((uchar*)(ptr) + (xl)*(24));
uint32_t* hline_max_ptr = (uint32_t*)((uchar*)(ptr) + (xr)*(24));
for( ; hline_ptr <= hline_max_ptr; )
{
*hline_ptr++ = c[0];
*hline_ptr++ = c[1];
*hline_ptr++ = c[2];
*hline_ptr++ = c[3];
*hline_ptr++ = c[4];
*hline_ptr++ = c[5];
}
}
else
{
ICV_HLINE_X(ptr, xl, xr, color, 24);
}
}
//end ICV_HLINE_24()
static inline void ICV_HLINE_32(uchar* ptr, int xl, int xr, const uchar* color)
{
if (is_aligned(((uchar*)(ptr) + (xl)*32), 0x8))
{
uint64_t c[4] = {opencvLittleToHost64((uchar*)(color)+0x00),
opencvLittleToHost64((uchar*)(color)+0x08),
opencvLittleToHost64((uchar*)(color)+0x10),
opencvLittleToHost64((uchar*)(color)+0x18)};
uint64_t* hline_ptr = (uint64_t*)((uchar*)(ptr) + (xl)*(32));
uint64_t* hline_max_ptr = (uint64_t*)((uchar*)(ptr) + (xr)*(32));
for( ; hline_ptr <= hline_max_ptr; )
{
*hline_ptr++ = c[0];
*hline_ptr++ = c[1];
*hline_ptr++ = c[2];
*hline_ptr++ = c[3];
}
}
else if (is_aligned(((uchar*)(ptr) + (xl)*2324), 0x4))
{
uint32_t c[8] = {opencvLittleToHost32((uchar*)(color)+0x00),
opencvLittleToHost32((uchar*)(color)+0x04),
opencvLittleToHost32((uchar*)(color)+0x08),
opencvLittleToHost32((uchar*)(color)+0x0C),
opencvLittleToHost32((uchar*)(color)+0x10),
opencvLittleToHost32((uchar*)(color)+0x14),
opencvLittleToHost32((uchar*)(color)+0x18),
opencvLittleToHost32((uchar*)(color)+0x1C)};
uint32_t* hline_ptr = (uint32_t*)((uchar*)(ptr) + (xl)*(32));
uint32_t* hline_max_ptr = (uint32_t*)((uchar*)(ptr) + (xr)*(32));
for( ; hline_ptr <= hline_max_ptr; )
{
*hline_ptr++ = c[0];
*hline_ptr++ = c[1];
*hline_ptr++ = c[2];
*hline_ptr++ = c[3];
*hline_ptr++ = c[4];
*hline_ptr++ = c[5];
*hline_ptr++ = c[6];
*hline_ptr++ = c[7];
}
}
else
{
ICV_HLINE_X(ptr, xl, xr, color, 32);
}
}
//end ICV_HLINE_32()
static const bool ICV_HLINE_OPTIMIZATION = true;
static inline void ICV_HLINE(uchar* ptr, int xl, int xr, const void* color, int pix_size)
{
if (pix_size == 1)
if (!ICV_HLINE_OPTIMIZATION)
ICV_HLINE_0(ptr, xl, xr, reinterpret_cast<const uchar*>(color), pix_size);
else if (pix_size == 1)
ICV_HLINE_1(ptr, xl, xr, reinterpret_cast<const uchar*>(color));
else if (pix_size == 2)
ICV_HLINE_2(ptr, xl, xr, reinterpret_cast<const uchar*>(color));
else if (pix_size == 3)
ICV_HLINE_3(ptr, xl, xr, reinterpret_cast<const uchar*>(color));
else if (pix_size == 4)
ICV_HLINE_4(ptr, xl, xr, reinterpret_cast<const uchar*>(color));
else if (pix_size == 6)
ICV_HLINE_6(ptr, xl, xr, reinterpret_cast<const uchar*>(color));
else if (pix_size == 8)
ICV_HLINE_8(ptr, xl, xr, reinterpret_cast<const uchar*>(color));
//timings do not show relevant improvement when element_size >= 12
/*else if (pix_size == 12)
ICV_HLINE_12(ptr, xl, xr, reinterpret_cast<const uchar*>(color));
else if (pix_size == 16)
ICV_HLINE_16(ptr, xl, xr, reinterpret_cast<const uchar*>(color));
else if (pix_size == 24)
ICV_HLINE_24(ptr, xl, xr, reinterpret_cast<const uchar*>(color));
else if (pix_size == 32)
ICV_HLINE_32(ptr, xl, xr, reinterpret_cast<const uchar*>(color));*/
else
ICV_HLINE_0(ptr, xl, xr, reinterpret_cast<const uchar*>(color), pix_size);
ICV_HLINE_X(ptr, xl, xr, reinterpret_cast<const uchar*>(color), pix_size);
}
//end ICV_HLINE()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment