Retina module is now parallelized thanks to the TBB library. Speed increase…

Retina module is now parallelized thanks to the TBB library. Speed increase expected on multicore platforms

Retina module is now parallelized thanks to the TBB library. Speed increase…
Retina module is now parallelized thanks to the TBB library. Speed increase expected on multicore platforms
424bc609 · noob · c78884c7 · 424bc609 · 424bc609 · 424bc609
Commit 424bc609 authored Aug 29, 2012 by noob
9 changed files
--- a/modules/contrib/src/basicretinafilter.cpp
+++ b/modules/contrib/src/basicretinafilter.cpp
--- a/modules/contrib/src/basicretinafilter.hpp
+++ b/modules/contrib/src/basicretinafilter.hpp
--- a/modules/contrib/src/magnoretinafilter.cpp
+++ b/modules/contrib/src/magnoretinafilter.cpp
@@ -153,6 +153,9 @@ void MagnoRetinaFilter::setCoefficientsTable(const float parasolCells_beta, cons
 void MagnoRetinaFilter::_amacrineCellsComputing(const float *OPL_ON, const float *OPL_OFF)
 {
+#ifdef HAVE_TBB
+        tbb::parallel_for(tbb::blocked_range<size_t>(0,_filterOutput.getNBpixels()), Parallel_amacrineCellsComputing(OPL_ON, OPL_OFF, &_previousInput_ON[0], &_previousInput_OFF[0], &_amacrinCellsTempOutput_ON[0], &_amacrinCellsTempOutput_OFF[0], _temporalCoefficient), tbb::auto_partitioner());
+#else
 	register const float *OPL_ON_PTR=OPL_ON;
 	register const float *OPL_OFF_PTR=OPL_OFF;
 	register float *previousInput_ON_PTR= &_previousInput_ON[0];
@@ -175,6 +178,7 @@ void MagnoRetinaFilter::_amacrineCellsComputing(const float *OPL_ON, const float
 		*(previousInput_OFF_PTR++)=*(OPL_OFF_PTR++);
 	}
+#endif
 }
 // launch filter that runs all the IPL filter

--- a/modules/contrib/src/magnoretinafilter.hpp
+++ b/modules/contrib/src/magnoretinafilter.hpp
@@ -190,10 +190,52 @@ private:
    // varialbles
    float _temporalCoefficient;
-    // amacrine cells filter : high pass temporal filter
+	// amacrine cells filter : high pass temporal filter
-    void _amacrineCellsComputing(const float *ONinput, const float *OFFinput);
+	void _amacrineCellsComputing(const float *ONinput, const float *OFFinput);
+#ifdef HAVE_TBB
+/******************************************************
+** If TBB is available, the main loops are parallelized using these functors
+** ==> main idea: parallelize the main filter loops; only the most used methods are parallelized so far... TODO : increase the number of parallelized methods as necessary
+** ==> functors names = Parallel_$$$ where $$$= the name of the serial method that is parallelised
+** ==> functors constructors can differ from the parameters used with their related serial functions
+*/
+    class Parallel_amacrineCellsComputing // TBB body functor: applies the amacrine cells temporal high pass filter to one sub-range of pixels
+    {
+    private:
+	const float *OPL_ON, *OPL_OFF; // read-only input buffers for the ON and OFF channels
+        float *previousInput_ON, *previousInput_OFF, *amacrinCellsTempOutput_ON, *amacrinCellsTempOutput_OFF; // buffers updated in place: stored previous inputs and filter outputs
+	const float temporalCoefficient; // factor applied to every filter result
+    public:
+        Parallel_amacrineCellsComputing(const float *OPL_ON_PTR, const float *OPL_OFF_PTR, float *previousInput_ON_PTR, float *previousInput_OFF_PTR, float *amacrinCellsTempOutput_ON_PTR, float *amacrinCellsTempOutput_OFF_PTR, float temporalCoefficientVal) // only stores the buffer base pointers and the coefficient; no buffer is copied
+        :OPL_ON(OPL_ON_PTR), OPL_OFF(OPL_OFF_PTR), previousInput_ON(previousInput_ON_PTR), previousInput_OFF(previousInput_OFF_PTR), amacrinCellsTempOutput_ON(amacrinCellsTempOutput_ON_PTR), amacrinCellsTempOutput_OFF(amacrinCellsTempOutput_OFF_PTR), temporalCoefficient(temporalCoefficientVal) {}
+        void operator()( const tbb::blocked_range<size_t>& r ) const { // processes the pixels with index in [r.begin(), r.end())
+	register const float *OPL_ON_PTR=OPL_ON+r.begin(); // every working pointer starts at the first pixel of this range
+	register const float *OPL_OFF_PTR=OPL_OFF+r.begin();
+	register float *previousInput_ON_PTR= previousInput_ON+r.begin();
+	register float *previousInput_OFF_PTR= previousInput_OFF+r.begin();
+	register float *amacrinCellsTempOutput_ON_PTR= amacrinCellsTempOutput_ON+r.begin();
+	register float *amacrinCellsTempOutput_OFF_PTR= amacrinCellsTempOutput_OFF+r.begin();
+	for (unsigned int IDpixel=r.begin() ; IDpixel!=r.end(); ++IDpixel) // NOTE(review): the size_t range bounds are narrowed to unsigned int here - confirm pixel counts always fit
+	{
+		/* Compute ON and OFF amacrin cells high pass temporal filter */
+		float magnoXonPixelResult = temporalCoefficient*(*amacrinCellsTempOutput_ON_PTR+ *OPL_ON_PTR-*previousInput_ON_PTR); // coefficient * (previous output + current input - previous input)
+		*(amacrinCellsTempOutput_ON_PTR++)=((float)(magnoXonPixelResult>0))*magnoXonPixelResult; // positive results are kept, non-positive results are multiplied by 0
+		float magnoXoffPixelResult = temporalCoefficient*(*amacrinCellsTempOutput_OFF_PTR+ *OPL_OFF_PTR-*previousInput_OFF_PTR); // same computation on the OFF channel
+		*(amacrinCellsTempOutput_OFF_PTR++)=((float)(magnoXoffPixelResult>0))*magnoXoffPixelResult;
+		/* prepare next loop */
+		*(previousInput_ON_PTR++)=*(OPL_ON_PTR++); // the current input becomes the stored previous input for the next call
+		*(previousInput_OFF_PTR++)=*(OPL_OFF_PTR++);
+	}
+        }
+    };
+#endif
 };
 }

--- a/modules/contrib/src/parvoretinafilter.cpp
+++ b/modules/contrib/src/parvoretinafilter.cpp
@@ -199,17 +199,20 @@ const std::valarray<float> &ParvoRetinaFilter::runFilter(const std::valarray<flo
 	return (*_parvocellularOutputONminusOFF);
 }
-void ParvoRetinaFilter::_OPL_OnOffWaysComputing()
+void ParvoRetinaFilter::_OPL_OnOffWaysComputing() // WARNING : this method requires many buffer accesses, parallelizing can increase bandwith & core efficacy
 {
 	// loop that makes the difference between photoreceptor cells output and horizontal cells
 	// positive part goes on the ON way, negative pat goes on the OFF way
-	register float *photoreceptorsOutput_PTR= &_photoreceptorsOutput[0];
-	register float *horizontalCellsOutput_PTR= &_horizontalCellsOutput[0];
-	register float *bipolarCellsON_PTR = &_bipolarCellsOutputON[0];
-	register float *bipolarCellsOFF_PTR = &_bipolarCellsOutputOFF[0];
-	register float *parvocellularOutputON_PTR= &_parvocellularOutputON[0];
-	register float *parvocellularOutputOFF_PTR= &_parvocellularOutputOFF[0];
+#ifdef HAVE_TBB
+        tbb::parallel_for(tbb::blocked_range<size_t>(0,_filterOutput.getNBpixels()), Parallel_OPL_OnOffWaysComputing(&_photoreceptorsOutput[0], &_horizontalCellsOutput[0], &_bipolarCellsOutputON[0], &_bipolarCellsOutputOFF[0], &_parvocellularOutputON[0], &_parvocellularOutputOFF[0]), tbb::auto_partitioner());
+#else
+	float *photoreceptorsOutput_PTR= &_photoreceptorsOutput[0];
+	float *horizontalCellsOutput_PTR= &_horizontalCellsOutput[0];
+	float *bipolarCellsON_PTR = &_bipolarCellsOutputON[0];
+	float *bipolarCellsOFF_PTR = &_bipolarCellsOutputOFF[0];
+	float *parvocellularOutputON_PTR= &_parvocellularOutputON[0];
+	float *parvocellularOutputOFF_PTR= &_parvocellularOutputOFF[0];
 	// compute bipolar cells response equal to photoreceptors minus horizontal cells response
 	// and copy the result on parvo cellular outputs... keeping time before their local contrast adaptation for final result
 	for (register unsigned int IDpixel=0 ; IDpixel<_filterOutput.getNBpixels() ; ++IDpixel)
@@ -222,6 +225,7 @@ void ParvoRetinaFilter::_OPL_OnOffWaysComputing()
 		*(parvocellularOutputON_PTR++)=*(bipolarCellsON_PTR++) = isPositive*pixelDifference;
 		*(parvocellularOutputOFF_PTR++)=*(bipolarCellsOFF_PTR++)= (isPositive-1.0f)*pixelDifference;
 	}
+#endif
 }
 }
--- a/modules/contrib/src/parvoretinafilter.hpp
+++ b/modules/contrib/src/parvoretinafilter.hpp
@@ -216,6 +216,45 @@ private:
 	// private functions
 	void _OPL_OnOffWaysComputing();
+#ifdef HAVE_TBB
+/******************************************************
+** If TBB is available, the main loops are parallelized using these functors
+** ==> main idea: parallelize the main filter loops; only the most used methods are parallelized so far... TODO : increase the number of parallelized methods as necessary
+** ==> functors names = Parallel_$$$ where $$$= the name of the serial method that is parallelised
+** ==> functors constructors can differ from the parameters used with their related serial functions
+*/
+    class Parallel_OPL_OnOffWaysComputing // TBB body functor: splits (photoreceptors - horizontal cells) into ON and OFF outputs for one sub-range of pixels
+    {
+    private:
+	float *photoreceptorsOutput, *horizontalCellsOutput, *bipolarCellsON, *bipolarCellsOFF, *parvocellularOutputON, *parvocellularOutputOFF; // base pointers: the first two buffers are only read, the other four are written
+    public:
+        Parallel_OPL_OnOffWaysComputing(float *photoreceptorsOutput_PTR, float *horizontalCellsOutput_PTR, float *bipolarCellsON_PTR, float *bipolarCellsOFF_PTR, float *parvocellularOutputON_PTR, float *parvocellularOutputOFF_PTR) // only stores the six buffer base pointers; no buffer is copied
+        :photoreceptorsOutput(photoreceptorsOutput_PTR), horizontalCellsOutput(horizontalCellsOutput_PTR), bipolarCellsON(bipolarCellsON_PTR), bipolarCellsOFF(bipolarCellsOFF_PTR), parvocellularOutputON(parvocellularOutputON_PTR), parvocellularOutputOFF(parvocellularOutputOFF_PTR) {}
+        void operator()( const tbb::blocked_range<size_t>& r ) const { // processes the pixels with index in [r.begin(), r.end())
+	    // compute bipolar cells response equal to photoreceptors minus horizontal cells response
+	    // and copy the result on parvo cellular outputs... keeping time before their local contrast adaptation for final result
+	    float *photoreceptorsOutput_PTR= photoreceptorsOutput+r.begin(); // every working pointer starts at the first pixel of this range
+	    float *horizontalCellsOutput_PTR= horizontalCellsOutput+r.begin();
+	    float *bipolarCellsON_PTR = bipolarCellsON+r.begin();
+	    float *bipolarCellsOFF_PTR = bipolarCellsOFF+r.begin();
+	    float *parvocellularOutputON_PTR= parvocellularOutputON+r.begin();
+	    float *parvocellularOutputOFF_PTR= parvocellularOutputOFF+r.begin();
+            for (register unsigned int IDpixel=r.begin() ; IDpixel!=r.end() ; ++IDpixel) // NOTE(review): the size_t range bounds are narrowed to unsigned int here - confirm pixel counts always fit
+	    {
+		float pixelDifference = *(photoreceptorsOutput_PTR++) -*(horizontalCellsOutput_PTR++);
+		// test condition to allow write pixelDifference in ON or OFF buffer and 0 in the over
+		float isPositive=(float) (pixelDifference>0.0f); // 1.0f when the difference is strictly positive, 0.0f otherwise
+		// ON and OFF channels writing step
+		*(parvocellularOutputON_PTR++)=*(bipolarCellsON_PTR++) = isPositive*pixelDifference; // the same value is stored in both ON buffers
+		*(parvocellularOutputOFF_PTR++)=*(bipolarCellsOFF_PTR++)= (isPositive-1.0f)*pixelDifference; // factor is -1 for non-positive differences (sign flipped) and 0 for positive ones
+	    }
+        }
+    };
+#endif
 };
 }
 #endif

--- a/modules/contrib/src/retinacolor.cpp
+++ b/modules/contrib/src/retinacolor.cpp
@@ -89,7 +89,7 @@ RetinaColor::RetinaColor(const unsigned int NBrows, const unsigned int NBcolumns
 _demultiplexedColorFrame(NBrows*NBcolumns*3),
 _chrominance(NBrows*NBcolumns*3),
 _colorLocalDensity(NBrows*NBcolumns*3),
- _imageGradient(NBrows*NBcolumns*3)
+ _imageGradient(NBrows*NBcolumns*2)
 {
 	// link to parent buffers (let's recycle !)
 	_luminance=&_filterOutput;
@@ -126,12 +126,12 @@ RetinaColor::~RetinaColor()
 void RetinaColor::clearAllBuffers()
 {
 	BasicRetinaFilter::clearAllBuffers();
-	_tempMultiplexedFrame=0;
+	_tempMultiplexedFrame=0.f; // float literal instead of the int 0 used before
-	_demultiplexedTempBuffer=0;
+	_demultiplexedTempBuffer=0.f;
-	_demultiplexedColorFrame=0;
+	_demultiplexedColorFrame=0.f;
-	_chrominance=0;
+	_chrominance=0.f;
-	_imageGradient=1;
+	_imageGradient=0.57f; // reset value changed from 1 to 0.57f - NOTE(review): 0.57f also appears in the _gain formula of _adaptiveSpatialLPfilter; presumably the same filter constant, confirm
 }
 /**
@@ -149,7 +149,7 @@ void RetinaColor::resize(const unsigned int NBrows, const unsigned int NBcolumns
 	_demultiplexedColorFrame.resize(NBrows*NBcolumns*3);
 	_chrominance.resize(NBrows*NBcolumns*3);
 	_colorLocalDensity.resize(NBrows*NBcolumns*3);
-	_imageGradient.resize(NBrows*NBcolumns*3);
+	_imageGradient.resize(NBrows*NBcolumns*2);
 	// link to parent buffers (let's recycle !)
 	_luminance=&_filterOutput;
@@ -325,15 +325,15 @@ void RetinaColor::runColorDemultiplexing(const std::valarray<float> &multiplexed
 	}else
 	{
-		register const float *multiplexedColorFramePTR1= get_data(multiplexedColorFrame);
+		register const float *multiplexedColorFramePTR= get_data(multiplexedColorFrame);
-		for (unsigned int indexc=0; indexc<_filterOutput.getNBpixels() ; ++indexc, ++chrominancePTR, ++colorLocalDensityPTR, ++luminance, ++multiplexedColorFramePTR1)
+		for (unsigned int indexc=0; indexc<_filterOutput.getNBpixels() ; ++indexc, ++chrominancePTR, ++colorLocalDensityPTR, ++luminance, ++multiplexedColorFramePTR)
 		{
 			// normalize by photoreceptors density
 			float Cr=*(chrominancePTR)*_colorLocalDensity[indexc];
 			float Cg=*(chrominancePTR+_filterOutput.getNBpixels())*_colorLocalDensity[indexc+_filterOutput.getNBpixels()];
 			float Cb=*(chrominancePTR+_filterOutput.getDoubleNBpixels())*_colorLocalDensity[indexc+_filterOutput.getDoubleNBpixels()];
 			*luminance=(Cr+Cg+Cb)*_pG;
-			_demultiplexedTempBuffer[_colorSampling[indexc]] = *multiplexedColorFramePTR1 - *luminance;
+			_demultiplexedTempBuffer[_colorSampling[indexc]] = *multiplexedColorFramePTR - *luminance;
 		}
@@ -349,8 +349,9 @@ void RetinaColor::runColorDemultiplexing(const std::valarray<float> &multiplexed
 		_adaptiveSpatialLPfilter(&_demultiplexedTempBuffer[0]+_filterOutput.getNBpixels(), &_demultiplexedColorFrame[0]+_filterOutput.getNBpixels());
 		_adaptiveSpatialLPfilter(&_demultiplexedTempBuffer[0]+_filterOutput.getDoubleNBpixels(), &_demultiplexedColorFrame[0]+_filterOutput.getDoubleNBpixels());
-		for (unsigned int index=0; index<_filterOutput.getNBpixels()*3 ; ++index) // cette boucle pourrait �tre supprimee en passant la densit� � la fonction de filtrage
+/*		for (unsigned int index=0; index<_filterOutput.getNBpixels()*3 ; ++index) // cette boucle pourrait �tre supprimee en passant la densit� � la fonction de filtrage
-			_demultiplexedColorFrame[index] /= _chrominance[index];
+			_demultiplexedColorFrame[index] /= _chrominance[index];*/
+		_demultiplexedColorFrame/=_chrominance; // more optimal ;o)
 		// compute and substract the residual luminance
 		for (unsigned int index=0; index<_filterOutput.getNBpixels() ; ++index)
@@ -432,6 +433,9 @@ void RetinaColor::clipRGBOutput_0_maxInputValue(float *inputOutputBuffer, const
 	if (inputOutputBuffer==NULL)
 		inputOutputBuffer= &_demultiplexedColorFrame[0];
+#ifdef HAVE_TBB // call the TemplateBuffer TBB clipping method
+        tbb::parallel_for(tbb::blocked_range<size_t>(0,_filterOutput.getNBpixels()*3), Parallel_clipBufferValues<float>(inputOutputBuffer, 0, maxInputValue), tbb::auto_partitioner());
+#else
 	register float *inputOutputBufferPTR=inputOutputBuffer;
 	for (register unsigned int jf = 0; jf < _filterOutput.getNBpixels()*3; ++jf, ++inputOutputBufferPTR)
 	{
@@ -440,6 +444,7 @@ void RetinaColor::clipRGBOutput_0_maxInputValue(float *inputOutputBuffer, const
 		else if (*inputOutputBufferPTR<0)
 			*inputOutputBufferPTR=0;
 	}
+#endif
 	//std::cout<<"RetinaColor::...normalizing RGB frame OK"<<std::endl;
 }
@@ -535,8 +540,8 @@ void RetinaColor::_applyRIFfilter(const float *sourceBuffer, float *destinationB
 void RetinaColor::_getNormalizedContoursImage(const float *inputFrame, float *outputFrame)
 {
-	float maxValue=0;
+	float maxValue=0.f;
-	float normalisationFactor=1.f/3;
+	float normalisationFactor=1.f/3.f;
 	for (unsigned int indexr=1 ; indexr<_filterOutput.getNBrows()-1; ++indexr)
 	{
 		for (unsigned int indexc=1 ; indexc<_filterOutput.getNBcolumns()-1; ++indexc)
@@ -564,19 +569,23 @@ void RetinaColor::_adaptiveSpatialLPfilter(const float *inputFrame, float *outpu
 	_gain = (1-0.57f)*(1-0.57f)*(1-0.06f)*(1-0.06f);
 	// launch the serie of 1D directional filters in order to compute the 2D low pass filter
+	// -> horizontal filters work with the first layer of imageGradient
 	_adaptiveHorizontalCausalFilter_addInput(inputFrame, outputFrame, 0, _filterOutput.getNBrows());
-	_adaptiveHorizontalAnticausalFilter(outputFrame, 0, _filterOutput.getNBrows());
+	_horizontalAnticausalFilter_Irregular(outputFrame, 0, _filterOutput.getNBrows(), &_imageGradient[0]);
-	_adaptiveVerticalCausalFilter(outputFrame, 0, _filterOutput.getNBcolumns());
+	// -> vertical filters work with the second layer of imageGradient
+	_verticalCausalFilter_Irregular(outputFrame, 0, _filterOutput.getNBcolumns(), &_imageGradient[0]+_filterOutput.getNBpixels());
 	_adaptiveVerticalAnticausalFilter_multGain(outputFrame, 0, _filterOutput.getNBcolumns());
 }
-//  horizontal causal filter which adds the input inside
+//  horizontal causal filter which adds the input inside... replaces the parent _horizontalCausalFilter_Irregular_addInput by avoiding a product for each pixel
 void RetinaColor::_adaptiveHorizontalCausalFilter_addInput(const float *inputFrame, float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd)
 {
+#ifdef HAVE_TBB
+        tbb::parallel_for(tbb::blocked_range<size_t>(IDrowStart,IDrowEnd), Parallel_adaptiveHorizontalCausalFilter_addInput(inputFrame, outputFrame, &_imageGradient[0], _filterOutput.getNBcolumns()), tbb::auto_partitioner());
+#else
 	register float* outputPTR=outputFrame+IDrowStart*_filterOutput.getNBcolumns();
 	register const float* inputPTR=inputFrame+IDrowStart*_filterOutput.getNBcolumns();
-	register float *imageGradientPTR= &_imageGradient[0]+IDrowStart*_filterOutput.getNBcolumns();
+	register const float *imageGradientPTR= &_imageGradient[0]+IDrowStart*_filterOutput.getNBcolumns();
 	for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
 	{
 		register float result=0;
@@ -589,51 +598,17 @@ void RetinaColor::_adaptiveHorizontalCausalFilter_addInput(const float *inputFra
 		}
 		//        std::cout<<" "<<std::endl;
 	}
+#endif
 }
-//  horizontal anticausal filter  (basic way, no add on)
+//  vertical anticausal filter which multiplies the output by _gain... replaces the parent _verticalAnticausalFilter_multGain by avoiding a product for each pixel and taking into account the second layer of the _imageGradient buffer
-void RetinaColor::_adaptiveHorizontalAnticausalFilter(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd)
-{
-	register float* outputPTR=outputFrame+IDrowEnd*(_filterOutput.getNBcolumns())-1;
-	register float *imageGradientPTR= &_imageGradient[0]+IDrowEnd*(_filterOutput.getNBcolumns())-1;
-	for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-	{
-		register float result=0;
-		for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-		{
-			result = *(outputPTR)+  (*imageGradientPTR)* result;
-			*(outputPTR--) = result;
-			--imageGradientPTR;
-		}
-	}
-}
-//  vertical anticausal filter
-void RetinaColor::_adaptiveVerticalCausalFilter(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd)
-{
-	for (unsigned int IDcolumn=IDcolumnStart; IDcolumn<IDcolumnEnd; ++IDcolumn)
-	{
-		register float result=0;
-		register float *outputPTR=outputFrame+IDcolumn;
-		register float *imageGradientPTR= &_imageGradient[0]+IDcolumn;
-		for (unsigned int index=0; index<_filterOutput.getNBrows(); ++index)
-		{
-			result = *(outputPTR) + (*(imageGradientPTR+_filterOutput.getNBpixels())) * result;
-			*(outputPTR) = result;
-			outputPTR+=_filterOutput.getNBcolumns();
-			imageGradientPTR+=_filterOutput.getNBcolumns();
-		}
-	}
-}
-//  vertical anticausal filter which multiplies the output by _gain
 void RetinaColor::_adaptiveVerticalAnticausalFilter_multGain(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd)
 {
+#ifdef HAVE_TBB
+        tbb::parallel_for(tbb::blocked_range<size_t>(IDcolumnStart,IDcolumnEnd), Parallel_adaptiveVerticalAnticausalFilter_multGain(outputFrame, &_imageGradient[0]+_filterOutput.getNBpixels(), _filterOutput.getNBrows(), _filterOutput.getNBcolumns(), _gain), tbb::auto_partitioner());
+#else
 	float* outputOffset=outputFrame+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns();
-	float* gradOffset= &_imageGradient[0]+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns();
+	float* gradOffset= &_imageGradient[0]+_filterOutput.getNBpixels()*2-_filterOutput.getNBcolumns();
 	for (unsigned int IDcolumn=IDcolumnStart; IDcolumn<IDcolumnEnd; ++IDcolumn)
 	{
@@ -642,12 +617,13 @@ void RetinaColor::_adaptiveVerticalAnticausalFilter_multGain(float *outputFrame,
 		register float *imageGradientPTR=gradOffset+IDcolumn;
 		for (unsigned int index=0; index<_filterOutput.getNBrows(); ++index)
 		{
-			result = *(outputPTR) + (*(imageGradientPTR+_filterOutput.getNBpixels())) * result;
+			result = *(outputPTR) + (*(imageGradientPTR)) * result;
 			*(outputPTR) = _gain*result;
 			outputPTR-=_filterOutput.getNBcolumns();
 			imageGradientPTR-=_filterOutput.getNBcolumns();
 		}
 	}
+#endif
 }
 ///////////////////////////

--- a/modules/contrib/src/retinacolor.hpp
+++ b/modules/contrib/src/retinacolor.hpp
@@ -248,9 +248,7 @@ protected:
 	void _getNormalizedContoursImage(const float *inputFrame, float *outputFrame);
 	// -> special adaptive filters dedicated to low pass filtering on the chrominance (skeeps filtering on the edges)
 	void _adaptiveSpatialLPfilter(const float *inputFrame,  float *outputFrame);
-	void _adaptiveHorizontalCausalFilter_addInput(const float *inputFrame, float *outputFrame, const unsigned int IDrowStart, const unsigned int IDrowEnd);
+	void _adaptiveHorizontalCausalFilter_addInput(const float *inputFrame, float *outputFrame, const unsigned int IDrowStart, const unsigned int IDrowEnd); // TBB parallelized
-	void _adaptiveHorizontalAnticausalFilter(float *outputFrame, const unsigned int IDrowStart, const unsigned int IDrowEnd);
-	void _adaptiveVerticalCausalFilter(float *outputFrame, const unsigned int IDcolumnStart, const unsigned int IDcolumnEnd);
 	void _adaptiveVerticalAnticausalFilter_multGain(float *outputFrame, const unsigned int IDcolumnStart, const unsigned int IDcolumnEnd);
 	void _computeGradient(const float *luminance);
 	void _normalizeOutputs_0_maxOutputValue(void);
@@ -258,6 +256,84 @@ protected:
 	// color space transform
 	void _applyImageColorSpaceConversion(const std::valarray<float> &inputFrame, std::valarray<float> &outputFrame, const float *transformTable);
+#ifdef HAVE_TBB
+/******************************************************
+** If TBB is available, the main loops are parallelized using these functors
+** ==> main idea: parallelize the main filter loops; only the most used methods are parallelized so far... TODO : increase the number of parallelized methods as necessary
+** ==> functors names = Parallel_$$$ where $$$= the name of the serial method that is parallelised
+** ==> functors constructors can differ from the parameters used with their related serial functions
+*/
+/* Template :
+    class 
+    {
+    private:
+    public:
+         Parallel_()
+         : {}
+         void operator()( const tbb::blocked_range<size_t>& r ) const {
+        }
+    };
+*/
+    class Parallel_adaptiveHorizontalCausalFilter_addInput // TBB body functor: runs the horizontal causal recursive filter (input added at each step) over one sub-range of rows
+    {
+    private:
+	float *outputFrame; // buffer that receives the filter result
+	const float *inputFrame, *imageGradient; // read-only input image and per-pixel filter coefficients
+	const unsigned int nbColumns; // row length in pixels, also used as the stride between rows
+    public:
+         Parallel_adaptiveHorizontalCausalFilter_addInput(const float *inputImg, float *bufferToProcess, const float *imageGrad, const unsigned int nbCols) // only stores the buffer base pointers and the row length; no buffer is copied
+         :outputFrame(bufferToProcess), inputFrame(inputImg), imageGradient(imageGrad), nbColumns(nbCols) {};
+         void operator()( const tbb::blocked_range<size_t>& r ) const { // r is a range of row indices [r.begin(), r.end())
+            register float* outputPTR=outputFrame+r.begin()*nbColumns; // every working pointer starts at the first pixel of row r.begin()
+	    register const float* inputPTR=inputFrame+r.begin()*nbColumns;
+	    register const float *imageGradientPTR= imageGradient+r.begin()*nbColumns;
+	    for (unsigned int IDrow=r.begin(); IDrow!=r.end(); ++IDrow) // NOTE(review): the size_t range bounds are narrowed to unsigned int here - confirm row counts always fit
+	    {
+		register float result=0; // the recursion restarts from 0 on every row, so rows do not depend on each other
+		for (unsigned int index=0; index<nbColumns; ++index)
+		{
+			result = *(inputPTR++) + (*imageGradientPTR++)* result; // current input + gradient coefficient * previous result of the same row
+			*(outputPTR++) = result;
+		}
+	    }
+        }
+    };
+    class Parallel_adaptiveVerticalAnticausalFilter_multGain // TBB body functor: runs the vertical anticausal recursive filter, with output scaled by a gain, over one sub-range of columns
+    {
+    private:
+        float *outputFrame; // buffer filtered in place
+	const float *imageGradient; // read-only per-pixel filter coefficients
+        const unsigned int nbRows, nbColumns; // image dimensions; nbColumns is also the stride between rows
+        const float filterParam_gain; // factor applied to every stored output value
+    public:        
+        Parallel_adaptiveVerticalAnticausalFilter_multGain(float *bufferToProcess, const float *imageGrad, const unsigned int nbRws, const unsigned int nbCols, const float  gain) // only stores the buffer base pointers, the dimensions and the gain; no buffer is copied
+        :outputFrame(bufferToProcess), imageGradient(imageGrad), nbRows(nbRws), nbColumns(nbCols), filterParam_gain(gain){}
+        void operator()( const tbb::blocked_range<size_t>& r ) const { // r is a range of column indices [r.begin(), r.end())
+            float* offset=outputFrame+nbColumns*nbRows-nbColumns; // first pixel of the last row of the output buffer
+            const float* gradOffset= imageGradient+nbColumns*nbRows-nbColumns; // first pixel of the last row of the gradient buffer
+    	    for (unsigned int IDcolumn=r.begin(); IDcolumn!=r.end(); ++IDcolumn) // NOTE(review): the size_t range bounds are narrowed to unsigned int here - confirm column counts always fit
+	    {
+		register float result=0; // the recursion restarts from 0 on every column, so columns do not depend on each other
+		register float *outputPTR=offset+IDcolumn; // start on the last row of this column
+		register const float *imageGradientPTR=gradOffset+IDcolumn;
+		for (unsigned int index=0; index<nbRows; ++index)
+		{
+			result = *(outputPTR) + *(imageGradientPTR) * result; // current value + gradient coefficient * previous (unscaled) result
+			*(outputPTR) = filterParam_gain*result; // only the stored value is scaled; the unscaled result feeds the next step
+			outputPTR-=nbColumns; // move up one row
+			imageGradientPTR-=nbColumns;
+		}
+	    }
+        }
+    };
+#endif
 };
 }

--- a/modules/contrib/src/templatebuffer.hpp
+++ b/modules/contrib/src/templatebuffer.hpp
@@ -70,6 +70,38 @@
 #include <iostream>
 #include <cmath>
+//// If TBB is used
+// ==> then include required includes 
+#ifdef HAVE_TBB
+#include "tbb/parallel_for.h"
+#include "tbb/blocked_range.h"
+// ==> declare useful generic tools
+template <class type> // element type of the buffer to clip
+class Parallel_clipBufferValues // TBB body functor: clamps every element of one sub-range of a buffer between minValue and maxValue
+{
+private:
+    type *bufferToClip; // base pointer of the buffer modified in place
+    const type minValue, maxValue; // lower and upper clipping bounds
+public:
+    Parallel_clipBufferValues(type* bufferToProcess, const type min, const type max) // only stores the buffer base pointer and the two bounds; the buffer is not copied
+    : bufferToClip(bufferToProcess), minValue(min), maxValue(max){}
+    void operator()( const tbb::blocked_range<size_t>& r ) const { // processes the elements with index in [r.begin(), r.end())
+	register type *inputOutputBufferPTR=bufferToClip+r.begin(); // start at the first element of this range
+        for (register unsigned int jf = r.begin(); jf != r.end(); ++jf, ++inputOutputBufferPTR) // NOTE(review): the size_t range bounds are narrowed to unsigned int here - confirm buffer sizes always fit
+	{
+	    if (*inputOutputBufferPTR>maxValue) // the upper bound is tested first
+		*inputOutputBufferPTR=maxValue;
+	    else if (*inputOutputBufferPTR<minValue)
+		*inputOutputBufferPTR=minValue;
+	}
+    }
+};
+#endif
 //#define __TEMPLATEBUFFERDEBUG //define TEMPLATEBUFFERDEBUG in order to display debug information
 namespace cv
@@ -351,21 +383,25 @@ public:
            }
        }
-        std::cout<<"Tdebug"<<std::endl;
+		std::cout<<"Tdebug"<<std::endl;
-        std::cout<<"deltaL="<<deltaL<<", deltaH="<<deltaH<<std::endl;
+		std::cout<<"deltaL="<<deltaL<<", deltaH="<<deltaH<<std::endl;
-        std::cout<<"this->max()"<<this->max()<<"maxThreshold="<<maxThreshold<<"updatedHighValue="<<updatedHighValue<<std::endl;
+		std::cout<<"this->max()"<<this->max()<<"maxThreshold="<<maxThreshold<<"updatedHighValue="<<updatedHighValue<<std::endl;
-        std::cout<<"this->min()"<<this->min()<<"minThreshold="<<minThreshold<<"updatedLowValue="<<updatedLowValue<<std::endl;
+		std::cout<<"this->min()"<<this->min()<<"minThreshold="<<minThreshold<<"updatedLowValue="<<updatedLowValue<<std::endl;
-        // clipping values outside than the updated thresholds
+		// clipping values outside than the updated thresholds
-        bufferPTR=this->Buffer();
+                bufferPTR=this->Buffer();
-        for (unsigned int i=0;i<this->size();++i, ++bufferPTR)
+#ifdef HAVE_TBB // call the TemplateBuffer TBB clipping method
-        {
+                tbb::parallel_for(tbb::blocked_range<size_t>(0,this->size()), Parallel_clipBufferValues<type>(bufferPTR, updatedLowValue, updatedHighValue), tbb::auto_partitioner());
-            if (*bufferPTR<updatedLowValue)
+#else
-                *bufferPTR=updatedLowValue;
-            else if (*bufferPTR>updatedHighValue)
+		for (unsigned int i=0;i<this->size();++i, ++bufferPTR)
-                *bufferPTR=updatedHighValue;
+		{
-        }
+			if (*bufferPTR<updatedLowValue)
+				*bufferPTR=updatedLowValue;
-        normalizeGrayOutput_0_maxOutputValue(this->Buffer(), this->size(), maxOutputValue);
+			else if (*bufferPTR>updatedHighValue)
+				*bufferPTR=updatedHighValue;
+		}
+#endif
+		normalizeGrayOutput_0_maxOutputValue(this->Buffer(), this->size(), maxOutputValue);
    }