//---------------------------------------------------------------------------



#include "imagetools.h"
#include <numeric>
#include <cmath>
#include <cassert>
#include <algorithm>
#include <iostream>

using namespace std;

//
// Sets up the storage of a convolution image for a w x h kernel over a source
// image of size sz.
//
// The buffer is laid out as cw*ch consecutive blocks of blockStep bytes;
// each block consists of `rows` rows of rowStep bytes. After construction
// block[k] points at the start of block k inside `image`, and row[r] holds the
// byte offset of row r inside a block.
TConvolution::TConvolution(CvSize sz, int w, int h)
    :srcSz(sz)
{
    cw = w;
    ch = h;

    // Bytes per row of one block.
    rowStep = (2*sz.width) / cw;
    // NOTE(review): for an odd rowStep this clears bit 0 and then adds 1 again,
    // so the value ends up unchanged. Possibly meant to round to an even
    // value -- confirm the intent before touching it.
    if (rowStep & 0x01)
        rowStep = (rowStep & 0xFFFFFFFE) + 1;

    // Rows per block, rounded up.
    rows = int(ceil(sz.height/(2.0f*ch)));
    blockStep = (rowStep * rows);

    // Single allocation that backs all cw*ch blocks.
    unsigned size = (blockStep * cw * ch);
    image = new signed char[size];

    // Table of block start addresses.
    block = new signed char*[cw*ch];
    signed char * blockStart = image;
    for (unsigned k = 0; k < cw*ch; ++k, blockStart += blockStep)
        block[k] = blockStart;

    // Table of row offsets inside a block.
    row = new int[rows];
    for (unsigned r = 0; r < rows; ++r)
        row[r] = r * rowStep;

    //cout << "Convolution " << cw << "x" << ch << endl;
    //cout << "rowstep=" << rowStep << endl;
    //cout << "blockstep=" << blockStep << endl << endl;
}


//---------------------------------------------------------------------------

// Computes the integral image (summed-area table) of `image` into `int1`.
//
// The source is read as one unsigned byte per pixel; the destination is
// written as one `unsigned` per pixel and its row pitch is taken as
// widthStep/4 elements (i.e. 4-byte elements are assumed).
// int1(x, y) = sum of image(i, j) for all i <= x, j <= y.
// The top-left element is written unconditionally, so both images must hold
// at least one pixel.
void integrate(IplImage * image, IplImage * int1)
{
    const int srcStride = image->widthStep;      // source row pitch in bytes
    const int sumStride = int1->widthStep / 4;   // destination row pitch in elements

    unsigned char * srcRow = (unsigned char*)image->imageData;
    unsigned * sumRow = (unsigned*)int1->imageData;

    // Row 0 has nothing above it: a plain running sum along the row.
    unsigned running = srcRow[0];
    sumRow[0] = running;
    for (int col = 1; col < image->width; ++col)
    {
        running += srcRow[col];
        sumRow[col] = running;
    }

    // Every further row: running sum of the current row plus the integral
    // value directly above.
    for (int line = 1; line < image->height; ++line)
    {
        unsigned * above = sumRow;
        srcRow += srcStride;
        sumRow += sumStride;

        running = 0;
        for (int col = 0; col < image->width; ++col)
        {
            running += srcRow[col];
            sumRow[col] = running + above[col];
        }
    }
}


// Computes two integral images of `image` in a single pass:
//   int1(x, y) = sum of image(i, j)    for all i <= x, j <= y
//   int2(x, y) = sum of image(i, j)^2  for all i <= x, j <= y
//
// The source is read as one unsigned byte per pixel; both destinations are
// written as one `unsigned` per pixel with a row pitch of widthStep/4
// elements (4-byte elements are assumed). The top-left element is written
// unconditionally, so the images must hold at least one pixel.
void integrate(IplImage * image, IplImage * int1, IplImage * int2)
{
    const int srcStride = image->widthStep;      // source row pitch in bytes
    const int sumStride = int1->widthStep / 4;   // pitch of the sum table in elements
    const int sqStride  = int2->widthStep / 4;   // pitch of the squared-sum table in elements

    unsigned char * srcRow = (unsigned char*)image->imageData;
    unsigned * sumRow = (unsigned*)int1->imageData;
    unsigned * sqRow  = (unsigned*)int2->imageData;

    // Row 0: plain running sums along the row.
    unsigned rowSum = srcRow[0];
    unsigned rowSq  = srcRow[0] * srcRow[0];
    sumRow[0] = rowSum;
    sqRow[0]  = rowSq;
    for (int col = 1; col < image->width; ++col)
    {
        rowSum += srcRow[col];
        rowSq  += srcRow[col] * srcRow[col];
        sumRow[col] = rowSum;
        sqRow[col]  = rowSq;
    }

    // Remaining rows: running sums of the current row plus the table value
    // directly above.
    for (int line = 1; line < image->height; ++line)
    {
        unsigned * sumAbove = sumRow;
        unsigned * sqAbove  = sqRow;
        srcRow += srcStride;
        sumRow += sumStride;
        sqRow  += sqStride;

        rowSum = 0;
        rowSq  = 0;
        for (int col = 0; col < image->width; ++col)
        {
            rowSum += srcRow[col];
            rowSq  += srcRow[col] * srcRow[col];
            sumRow[col] = rowSum + sumAbove[col];
            sqRow[col]  = rowSq + sqAbove[col];
        }
    }
}


///////////////////////////////////////////////////////////////////////////////

// Averages four w x h pixel regions arranged as a 2x2 grid and stores the
// four means, shifted by -128, into v[0..3].
//
//   data      - top-left pixel of the first region
//   w, h      - size of one region in pixels (w*h must be non-zero)
//   widthStep - distance in bytes between consecutive image rows
//   blockStep - distance in bytes from the upper to the lower pair of regions
//   v         - receives: [0] upper-left, [1] upper-right (data + w),
//               [2] lower-left (data + blockStep), [3] lower-right
static void meanRegions(unsigned char * data, int w, int h, unsigned widthStep, unsigned blockStep, signed char * v)
{
    unsigned char * const corner[4] = {
        data,
        data + w,
        data + blockStep,
        data + blockStep + w
    };
    int total[4] = {0, 0, 0, 0};

    // Sum up every region, one image row at a time.
    for (int k = 0; k < 4; ++k)
    {
        const unsigned char * line = corner[k];
        for (int y = 0; y < h; ++y, line += widthStep)
        {
            for (int x = 0; x < w; ++x)
                total[k] += line[x];
        }
    }

    // Integer mean, re-centred from 0..255 to -128..127.
    const int area = w * h;
    for (int k = 0; k < 4; ++k)
        v[k] = (total[k] / area) - 128;
}


// Fills the convolution image `conv` from `image` for a conv->cw x conv->ch
// kernel.
//
// One block of conv->image is produced per kernel shift (u, v), in the order
// v outer / u inner. Inside a block, every step of 2*cw columns and 2*ch rows
// of the source yields four bytes (see meanRegions); consecutive output rows
// are conv->rowStep bytes apart and consecutive blocks conv->blockStep bytes
// apart.
//
// NOTE(review): the column loop bound does not take the shift u into
// account, so for u > 0 the last group of a row appears to read up to u
// pixels beyond image->width -- confirm whether the row padding covers this.
void convolution(IplImage * image, TConvolution * conv)
{
    const int kw = conv->cw;
    const int kh = conv->ch;

    // Start of the block that belongs to the current (u, v) shift
    signed char * blockBase = (signed char*)conv->image;

    for (int v = 0; v < kh; ++v)
    {
        for (int u = 0; u < kw; ++u)
        {
            signed char * outRow = blockBase;

            for (int y = v; y <= image->height - 2*kh; y += 2*kh)
            {
                // source address of pixel (u, y)
                unsigned char * src = (unsigned char*)(image->imageData + y * image->widthStep) + u;
                // destination address of the first result in this row
                signed char * out = outRow;

                for (int x = 0; x <= image->width - 2*kw; x += 2*kw)
                {
                    meanRegions(src, kw, kh, image->widthStep, kh*image->widthStep, out);
                    out += 4;
                    src += 2*kw;
                }

                outRow += conv->rowStep;
            }

            blockBase += conv->blockStep;
        }
    }
}


// Fills `conv` with the 1x1 convolution of `image`: every 2x2 group of source
// pixels becomes four consecutive bytes
//   [0] upper-left, [1] upper-right, [2] lower-left, [3] lower-right,
// each shifted by -128. Output rows are conv->rowStep bytes apart.
void convolve1x1(IplImage * image, TConvolution * conv)
{
    // destination row, advanced once per pair of source rows
    signed char * outRow = conv->image;

    for (int y = 0; y <= image->height - 2; y += 2)
    {
        // source address of the upper row of the pair
        unsigned char * upper = (unsigned char*)(image->imageData + y * image->widthStep);
        // source address of the row below it
        unsigned char * lower = upper + image->widthStep;
        // destination address
        signed char * out = outRow;

        for (int x = 0; x <= image->width - 2; x += 2)
        {
            // compute and store the four values
            out[0] = upper[x] - 128;
            out[1] = upper[x + 1] - 128;
            out[2] = lower[x] - 128;
            out[3] = lower[x + 1] - 128;
            out += 4;
        }

        outRow += conv->rowStep;
    }
}


// DATA(u,v): the byte at column u, row v relative to the pointer `px`, using
// the per-row byte offsets stored in `widthStepTab`. Both `px` and
// `widthStepTab` must be in scope where the macro is expanded. Within this
// part of the file the macro is only used by the disabled function below.
//#define DATA(u,v) (*(px + (v) * image->widthStep + (u)))
#define DATA(u,v) (*((px) + (widthStepTab[v]) + (u)))
// convolution12 is compiled out (#if 0). It walks the image in 4x4 pixel
// blocks (reading a 5x5 neighbourhood through DATA) and derives the 1x1, 2x1,
// 1x2 and 2x2 convolution images from it.
// NOTE(review): the code does not compile as written -- the assignment to
// *(tmpdst2x2[3] + 0) contains an empty `*()` operand -- and looks unfinished;
// treat it as a reference sketch only.
#if 0
void convolution12(IplImage * image, TConvolution * conv11, TConvolution * conv22, TConvolution * conv12, TConvolution * conv21)
{
    /*
    assert(conv11->cw == conv11->ch == 1 && "conv11 must hold 1x1 convolution image");
    assert(conv22->cw == conv22->ch == 2 && "conv22 must hold 2x2 convolution image");
    assert(conv12->cw == 1 && conv12->ch == 2 && "conv12 must hold 1x2 convolution image");
    assert(conv21->cw == 2 && conv21->ch == 1 && "conv21 must hold 2x1 convolution image");
    */

    int widthStepTab[5] = 
    {
        0,
        1*image->widthStep,
        2*image->widthStep,
        3*image->widthStep,
        4*image->widthStep,
    };

    signed char * dst1x1[2] = {
        conv11->image,
        conv11->image+conv11->rowStep,
    };
    
    signed char * dst2x2[4] = {
        conv22->image,
        conv22->image + conv22->blockStep,
        conv22->image + 2 * conv22->blockStep,
        conv22->image + 3 * conv22->blockStep,
    };

    signed char * dst2x1[4] = {
        conv21->image,
        conv21->image + conv21->rowStep,
        conv21->image + conv21->blockStep,
        conv21->image + conv21->rowStep + conv21->blockStep,
    };

    signed char * dst1x2[2] = {
        conv12->image,
        conv12->image + conv12->blockStep,
    };

    for (unsigned char * base = (unsigned char*)image->imageData;
        base < (unsigned char*)(image->imageData+((image->height-5)*image->widthStep));
        base += 4*image->widthStep)
    {
        signed char * tmpdst1x1[2];
        copy(dst1x1, dst1x1+2, tmpdst1x1);        
        signed char * tmpdst2x2[4];
        copy(dst2x2, dst2x2+4, tmpdst2x2);
        signed char * tmpdst1x2[2];
        copy(dst1x2, dst1x2+2, tmpdst1x2);
        signed char * tmpdst2x1[4];
        copy(dst2x1, dst2x1+4, tmpdst2x1);

        for (unsigned char * px = base; px < base+image->width-5; px+=4)
        {
            // 1x1 results
            *(tmpdst1x1[0] + 0) = DATA(0, 0) - 128;
            *(tmpdst1x1[0] + 1) = DATA(1, 0) - 128;
            *(tmpdst1x1[0] + 2) = DATA(0, 1) - 128;
            *(tmpdst1x1[0] + 3) = DATA(1, 1) - 128;

            *(tmpdst1x1[0] + 4) = DATA(2, 0) - 128; 
            *(tmpdst1x1[0] + 5) = DATA(3, 0) - 128;
            *(tmpdst1x1[0] + 6) = DATA(2, 1) - 128;
            *(tmpdst1x1[0] + 7) = DATA(3, 1) - 128;

            *(tmpdst1x1[1] + 0) = DATA(0, 2) - 128;
            *(tmpdst1x1[1] + 1) = DATA(1, 2) - 128;
            *(tmpdst1x1[1] + 2) = DATA(0, 3) - 128;
            *(tmpdst1x1[1] + 3) = DATA(1, 3) - 128;

            *(tmpdst1x1[1] + 4) = DATA(2, 2) - 128;
            *(tmpdst1x1[1] + 5) = DATA(3, 2) - 128;
            *(tmpdst1x1[1] + 6) = DATA(2, 3) - 128; 
            *(tmpdst1x1[1] + 7) = DATA(3, 3) - 128;

            // 2x1 results
            /*
            *(tmpdst2x1[0] + 0) = // ((DATA(0, 0) + DATA(1, 0)) >> 1)-128;
                                  (*(tmpdst1x1[0]+0) + *(tmpdst1x1[0]+1)) >> 1;
            *(tmpdst2x1[0] + 1) = // ((DATA(2, 0) + DATA(3, 0)) >> 1)-128;
                                  (*(tmpdst1x1[0]+4) + *(tmpdst1x1[0]+5)) >> 1;
            *(tmpdst2x1[0] + 2) = // ((DATA(0, 1) + DATA(1, 1)) >> 1)-128;
                                  (*(tmpdst1x1[0]+2) + *(tmpdst1x1[0]+3)) >> 1;
            *(tmpdst2x1[0] + 3) = // ((DATA(2, 1) + DATA(3, 1)) >> 1)-128;
                                  (*(tmpdst1x1[0]+6) + *(tmpdst1x1[0]+7)) >> 1;
            
            *(tmpdst2x1[1] + 0) = // ((DATA(0, 2) + DATA(1, 2)) >> 1)-128;
                                  (*(tmpdst1x1[1]+0) + *(tmpdst1x1[1]+1)) >> 1;
            *(tmpdst2x1[1] + 1) = // ((DATA(2, 2) + DATA(3, 2)) >> 1)-128;
                                  (*(tmpdst1x1[1]+4) + *(tmpdst1x1[1]+5)) >> 1;
            *(tmpdst2x1[1] + 2) = // ((DATA(0, 3) + DATA(1, 3)) >> 1)-128;
                                  (*(tmpdst1x1[1]+2) + *(tmpdst1x1[1]+3)) >> 1;
            *(tmpdst2x1[1] + 3) = // ((DATA(2, 3) + DATA(3, 3)) >> 1)-128;
                                  (*(tmpdst1x1[1]+6) + *(tmpdst1x1[1]+7)) >> 1;

            *(tmpdst2x1[2] + 0) = //((DATA(1, 0) + DATA(2, 0)) >> 1)-128;
                                  (*(tmpdst1x1[0]+1) + *(tmpdst1x1[0]+4)) >> 1;
            *(tmpdst2x1[2] + 1) = //((DATA(3, 0) + DATA(4, 0)) >> 1)-128;
                                  (*(tmpdst1x1[0]+5) + (DATA(4, 0) - 128)) >> 1;
            *(tmpdst2x1[2] + 2) = //((DATA(1, 1) + DATA(2, 1)) >> 1)-128;
                                  (*(tmpdst1x1[0]+3) + *(tmpdst1x1[0]+6)) >> 1;
            *(tmpdst2x1[2] + 3) = //((DATA(3, 1) + DATA(4, 1)) >> 1)-128;
                                  (*(tmpdst1x1[0]+7) + (DATA(4, 1) - 128)) >> 1;
            
            *(tmpdst2x1[3] + 0) = //((DATA(1, 2) + DATA(2, 2)) >> 1)-128;
                                  (*(tmpdst1x1[1]+1) + *(tmpdst1x1[1]+4)) >> 1;
            *(tmpdst2x1[3] + 1) = //((DATA(3, 2) + DATA(4, 2)) >> 1)-128;
                                  (*(tmpdst1x1[1]+5) + (DATA(4, 2) - 128)) >> 1;
            *(tmpdst2x1[3] + 2) = //((DATA(1, 3) + DATA(2, 3)) >> 1)-128;
                                  (*(tmpdst1x1[1]+3) + *(tmpdst1x1[1]+6)) >> 1;
            *(tmpdst2x1[3] + 3) = //((DATA(3, 3) + DATA(4, 3)) >> 1)-128;
                                  (*(tmpdst1x1[1]+7) + *(DATA(4, 3) - 128) >> 1;
            */
            // Optimized  version of above commented code
            *(tmpdst2x1[0] + 0) = (*(tmpdst1x1[0]+0) + *(tmpdst1x1[0]+1)) >> 1;
            *(tmpdst2x1[2] + 0) = (*(tmpdst1x1[0]+1) + *(tmpdst1x1[0]+4)) >> 1;
            *(tmpdst2x1[0] + 1) = (*(tmpdst1x1[0]+4) + *(tmpdst1x1[0]+5)) >> 1;
            *(tmpdst2x1[2] + 1) = (*(tmpdst1x1[0]+5) + (DATA(4, 0) - 128)) >> 1;
            
            *(tmpdst2x1[0] + 2) = (*(tmpdst1x1[0]+2) + *(tmpdst1x1[0]+3)) >> 1;
            *(tmpdst2x1[2] + 2) = (*(tmpdst1x1[0]+3) + *(tmpdst1x1[0]+6)) >> 1;
            *(tmpdst2x1[0] + 3) = (*(tmpdst1x1[0]+6) + *(tmpdst1x1[0]+7)) >> 1;
            *(tmpdst2x1[2] + 3) = (*(tmpdst1x1[0]+7) + (DATA(4, 1) - 128)) >> 1;
            
            *(tmpdst2x1[1] + 0) = (*(tmpdst1x1[1]+0) + *(tmpdst1x1[1]+1)) >> 1;
            *(tmpdst2x1[3] + 0) = (*(tmpdst1x1[1]+1) + *(tmpdst1x1[1]+4)) >> 1;
            *(tmpdst2x1[1] + 1) = (*(tmpdst1x1[1]+4) + *(tmpdst1x1[1]+5)) >> 1;
            *(tmpdst2x1[3] + 1) = (*(tmpdst1x1[1]+5) + (DATA(4, 2) - 128)) >> 1;
            
            *(tmpdst2x1[1] + 2) = (*(tmpdst1x1[1]+2) + *(tmpdst1x1[1]+3)) >> 1;
            *(tmpdst2x1[3] + 2) = (*(tmpdst1x1[1]+3) + *(tmpdst1x1[1]+6)) >> 1;
            *(tmpdst2x1[1] + 3) = (*(tmpdst1x1[1]+6) + *(tmpdst1x1[1]+7)) >> 1;
            *(tmpdst2x1[3] + 3) = (*(tmpdst1x1[1]+7) + (DATA(4, 3) - 128)) >> 1;
 
            /*
            // 1x2 results
            *(tmpdst1x2[0] + 0) = // ((DATA(0,0) + DATA(0,1)) >> 1) - 128;
                                  (*(tmpdst1x1[0]+0) + *(tmpdst1x1[0]+2)) >> 1;
            *(tmpdst1x2[0] + 1) = // ((DATA(1,0) + DATA(1,1)) >> 1) - 128;
                                  (*(tmpdst1x1[0]+1) + *(tmpdst1x1[0]+3)) >> 1;
            *(tmpdst1x2[0] + 2) = // ((DATA(0,2) + DATA(0,3)) >> 1) - 128;
                                  (*(tmpdst1x1[1]+0) + *(tmpdst1x1[1]+2)) >> 1;
            *(tmpdst1x2[0] + 3) = // ((DATA(1,2) + DATA(1,3)) >> 1) - 128;
                                  (*(tmpdst1x1[1]+1) + *(tmpdst1x1[1]+3)) >> 1;
            *(tmpdst1x2[0] + 4) = // ((DATA(2,0) + DATA(2,1)) >> 1) - 128;
                                  (*(tmpdst1x1[0]+4) + *(tmpdst1x1[0]+6)) >> 1;
            *(tmpdst1x2[0] + 5) = // ((DATA(3,0) + DATA(3,1)) >> 1) - 128;
                                  (*(tmpdst1x1[0]+5) + *(tmpdst1x1[0]+7)) >> 1;
            *(tmpdst1x2[0] + 6) = // ((DATA(2,2) + DATA(2,3)) >> 1) - 128;
                                  (*(tmpdst1x1[1]+4) + *(tmpdst1x1[1]+6)) >> 1;
            *(tmpdst1x2[0] + 7) = // ((DATA(3,2) + DATA(3,3)) >> 1) - 128;
                                  (*(tmpdst1x1[1]+5) + *(tmpdst1x1[1]+7)) >> 1;
            
            *(tmpdst1x2[1] + 0) = // ((DATA(0,1) + DATA(0,2)) >> 1) - 128;
                                  (*(tmpdst1x1[0]+2) + *(tmpdst1x1[1]+0)) >> 1;
            *(tmpdst1x2[1] + 1) = // ((DATA(1,1) + DATA(1,2)) >> 1) - 128;
                                  (*(tmpdst1x1[0]+3) + *(tmpdst1x1[1]+1)) >> 1;
            *(tmpdst1x2[1] + 2) = // ((DATA(0,3) + DATA(0,4)) >> 1) - 128;
                                  (*(tmpdst1x1[1]+2) + (DATA(0, 4) - 128) >> 1;
            *(tmpdst1x2[1] + 3) = // ((DATA(1,3) + DATA(1,4)) >> 1) - 128;
                                  (*(tmpdst1x1[1]+3) + (DATA(1, 4) - 128) >> 1;
            *(tmpdst1x2[1] + 4) = // ((DATA(2,1) + DATA(2,2)) >> 1) - 128;
                                  (*(tmpdst1x1[0]+6) + *(tmpdst1x1[1]+4)) >> 1;
            *(tmpdst1x2[1] + 5) = // ((DATA(3,1) + DATA(3,2)) >> 1) - 128;
                                  (*(tmpdst1x1[0]+7) + *(tmpdst1x1[1]+5)) >> 1;
            *(tmpdst1x2[1] + 6) = // ((DATA(2,3) + DATA(2,4)) >> 1) - 128;
                                  (*(tmpdst1x1[1]+6) + (DATA(2, 4) - 128) >> 1;
            *(tmpdst1x2[1] + 7) = // ((DATA(3,3) + DATA(3,4)) >> 1) - 128;
                                  (*(tmpdst1x1[1]+7) + (DATA(3, 4) - 128) >> 1;
            */
 
            // 1x2 results
            *(tmpdst1x2[0] + 0) = (*(tmpdst1x1[0]+0) + *(tmpdst1x1[0]+2)) >> 1;
            *(tmpdst1x2[0] + 1) = (*(tmpdst1x1[0]+1) + *(tmpdst1x1[0]+3)) >> 1;
            *(tmpdst1x2[0] + 2) = (*(tmpdst1x1[1]+0) + *(tmpdst1x1[1]+2)) >> 1;
            *(tmpdst1x2[0] + 3) = (*(tmpdst1x1[1]+1) + *(tmpdst1x1[1]+3)) >> 1;
            *(tmpdst1x2[0] + 4) = (*(tmpdst1x1[0]+4) + *(tmpdst1x1[0]+6)) >> 1;
            *(tmpdst1x2[0] + 5) = (*(tmpdst1x1[0]+5) + *(tmpdst1x1[0]+7)) >> 1;
            *(tmpdst1x2[0] + 6) = (*(tmpdst1x1[1]+4) + *(tmpdst1x1[1]+6)) >> 1;
            *(tmpdst1x2[0] + 7) = (*(tmpdst1x1[1]+5) + *(tmpdst1x1[1]+7)) >> 1;
            
            *(tmpdst1x2[1] + 0) = (*(tmpdst1x1[0]+2) + *(tmpdst1x1[1]+0)) >> 1;
            *(tmpdst1x2[1] + 1) = (*(tmpdst1x1[0]+3) + *(tmpdst1x1[1]+1)) >> 1;
            *(tmpdst1x2[1] + 2) = (*(tmpdst1x1[1]+2) + (DATA(0, 4) - 128)) >> 1;
            *(tmpdst1x2[1] + 3) = (*(tmpdst1x1[1]+3) + (DATA(1, 4) - 128)) >> 1;
            *(tmpdst1x2[1] + 4) = (*(tmpdst1x1[0]+6) + *(tmpdst1x1[1]+4)) >> 1;
            *(tmpdst1x2[1] + 5) = (*(tmpdst1x1[0]+7) + *(tmpdst1x1[1]+5)) >> 1;
            *(tmpdst1x2[1] + 6) = (*(tmpdst1x1[1]+6) + (DATA(2, 4) - 128)) >> 1;
            *(tmpdst1x2[1] + 7) = (*(tmpdst1x1[1]+7) + (DATA(3, 4) - 128)) >> 1;
           
            
            // 2x2 results using 2x1
            *(tmpdst2x2[0] + 0) = //((DATA(0,0) + DATA(1,0) + DATA(0,1) + DATA(1,1)) >> 2) - 128;
                                  (*(tmpdst2x1[0] + 0) + *(tmpdst2x1[0] + 2)) >> 1;
            *(tmpdst2x2[0] + 1) = //((DATA(2,0) + DATA(3,0) + DATA(2,1) + DATA(3,1)) >> 2) - 128;
                                  (*(tmpdst2x1[0] + 1) + *(tmpdst2x1[0] + 3)) >> 1;
            *(tmpdst2x2[0] + 2) = //((DATA(0,2) + DATA(1,2) + DATA(0,3) + DATA(1,3)) >> 2) - 128;
                                  (*(tmpdst2x1[1] + 0) + *(tmpdst2x1[1] + 2)) >> 1;
            *(tmpdst2x2[0] + 3) = //((DATA(2,2) + DATA(3,2) + DATA(2,3) + DATA(3,3)) >> 2) - 128;
                                  (*(tmpdst2x1[1] + 1) + *(tmpdst2x1[1] + 3)) >> 1;

            *(tmpdst2x2[1] + 0) = //((DATA(1,0) + DATA(2,0) + DATA(1,1) + DATA(2,1)) >> 2) - 128;
                                  (*(tmpdst2x1[2] + 0) + *(tmpdst2x1[2] + 2)) >> 1;
            *(tmpdst2x2[1] + 1) = //((DATA(3,0) + DATA(4,0) + DATA(3,1) + DATA(4,1)) >> 2) - 128;
                                  (*(tmpdst2x1[2] + 1) + *(tmpdst2x1[2] + 3)) >> 1;
            *(tmpdst2x2[1] + 2) = //((DATA(1,2) + DATA(2,2) + DATA(1,3) + DATA(2,3)) >> 2) - 128;
                                  (*(tmpdst2x1[3] + 0) + *(tmpdst2x1[2] + 2)) >> 1;
            *(tmpdst2x2[1] + 3) = //((DATA(3,2) + DATA(4,2) + DATA(3,3) + DATA(4,3)) >> 2) - 128;
                                  (*(tmpdst2x1[3] + 1) + *(tmpdst2x1[3] + 3)) >> 1;

            *(tmpdst2x2[2] + 0) = ((DATA(0,1) + DATA(1,1) + DATA(0,2) + DATA(1,2)) >> 2) - 128;
                                  //(*(tmpdst2x1[0] + 2) + *(tmpdst2x1[1] + 0)) >> 1;
            *(tmpdst2x2[2] + 1) = ((DATA(2,1) + DATA(3,1) + DATA(2,2) + DATA(3,2)) >> 2) - 128;
                                  //(*(tmpdst2x1[0] + 3) + *(tmpdst2x1[1] + 1)) >> 1;
            *(tmpdst2x2[2] + 2) = ((DATA(0,3) + DATA(1,3) + DATA(0,4) + DATA(1,4)) >> 2) - 128;
                                  //(*(tmpdst2x1[0] + 3) + *(tmpdst2x1[1] + 1)) >> 1;
            *(tmpdst2x2[2] + 3) = ((DATA(2,3) + DATA(3,3) + DATA(2,4) + DATA(3,4)) >> 2) - 128;
                                  //(*(tmpdst2x1[0] + 3) + *(tmpdst2x1[1] + 1)) >> 1;

            *(tmpdst2x2[3] + 0) = // ((DATA(1,1) + DATA(2,1) + DATA(1,2) + DATA(2,2)) >> 2) - 128;
                                  (*(tmpdst1x2[1] + 1) + *()) >> 1;
            *(tmpdst2x2[3] + 1) = ((DATA(3,1) + DATA(4,1) + DATA(3,2) + DATA(4,2)) >> 2) - 128;
            *(tmpdst2x2[3] + 2) = ((DATA(1,3) + DATA(2,3) + DATA(1,4) + DATA(2,4)) >> 2) - 128;
            *(tmpdst2x2[3] + 3) = ((DATA(3,3) + DATA(4,3) + DATA(3,4) + DATA(4,4)) >> 2) - 128;
            
            // Update pointers
            tmpdst1x1[0]+=8, tmpdst1x1[1]+=8;
            tmpdst2x2[0]+=4, tmpdst2x2[1]+=4, tmpdst2x2[2]+=4, tmpdst2x2[3]+=4;
            tmpdst2x1[0]+=4, tmpdst2x1[1]+=4, tmpdst2x1[2]+=4, tmpdst2x1[3]+=4;
            tmpdst1x2[0]+=8, tmpdst1x2[1]+=8;
        }

        // Update pointers
        dst1x1[0] += 2*conv11->rowStep, dst1x1[1] += 2*conv11->rowStep;
        dst2x2[0] += conv22->rowStep, dst2x2[1] += conv22->rowStep, dst2x2[2] += conv22->rowStep, dst2x2[3] += conv22->rowStep;
        dst2x1[0] += 2*conv21->rowStep, dst2x1[1] += 2*conv21->rowStep, dst2x1[2] += 2*conv21->rowStep, dst2x1[3] += 2*conv21->rowStep;
        dst1x2[0] += conv12->rowStep, dst1x2[1] += conv12->rowStep;
    }
}
#endif
// Forward declaration of the 4x4 block kernel used by convolve12all; the
// definition follows convolve12all further down in this file.
// `base`/`step` address the source block and its row pitch; every baseNxM
// pointer addresses the output position of this block in the corresponding
// convolution image, with rowStep*/blockStep* giving that image's row and
// block pitch.
inline void convolve4x4Block(
        unsigned char * base, int step,
        signed char * base1x1, int rowStep1x1,
        signed char * base2x1, int rowStep2x1, int blockStep2x1,
        signed char * base1x2, int rowStep1x2, int blockStep1x2,
        signed char * base2x2, int rowStep2x2, int blockStep2x2
        );

// Computes the 1x1, 2x1, 1x2 and 2x2 convolution images of `image` in one
// sweep over 4x4 pixel tiles.
//
// conv must hold at least four entries, used as
//   conv[0] = 1x1, conv[1] = 2x1, conv[2] = 1x2, conv[3] = 2x2.
// Per tile the outputs advance by 8 bytes (1x1, 1x2) or 4 bytes (2x1, 2x2);
// per band of four source rows they advance by two output rows (1x1, 2x1) or
// one output row (1x2, 2x2).
//
// NOTE(review): convolve4x4Block also reads the column and row directly after
// the 4x4 tile, and the loops below visit every tile that starts inside the
// image. The last tile row/column therefore appears to read (and, for sizes
// that are not multiples of 4, write) outside the image / conv buffers --
// confirm that callers provide suitably padded buffers.
void convolve12all(IplImage * image, vector<TConvolution*> & conv)
{
    TConvolution * const c1x1 = conv[0];
    TConvolution * const c2x1 = conv[1];
    TConvolution * const c1x2 = conv[2];
    TConvolution * const c2x2 = conv[3];

    unsigned char * const imgEnd =
        (unsigned char *)image->imageData + (image->height * image->widthStep);

    // Output positions of the current band of four source rows
    signed char * band1x1 = c1x1->image;
    signed char * band2x1 = c2x1->image;
    signed char * band1x2 = c1x2->image;
    signed char * band2x2 = c2x2->image;

    for (unsigned char * band = (unsigned char *) image->imageData;
         band < imgEnd;
         band += image->widthStep * 4)
    {
        // Output positions of the current tile
        signed char * out1x1 = band1x1;
        signed char * out2x1 = band2x1;
        signed char * out1x2 = band1x2;
        signed char * out2x2 = band2x2;

        unsigned char * const bandEnd = band + image->width;
        for (unsigned char * tile = band; tile < bandEnd; tile += 4)
        {
            convolve4x4Block(
                tile, image->widthStep,
                out1x1, c1x1->rowStep,
                out2x1, c2x1->rowStep, c2x1->blockStep,
                out1x2, c1x2->rowStep, c1x2->blockStep,
                out2x2, c2x2->rowStep, c2x2->blockStep
            );

            out1x1 += 8;
            out2x1 += 4;
            out1x2 += 8;
            out2x2 += 4;
        }

        band1x1 += 2 * c1x1->rowStep;
        band2x1 += 2 * c2x1->rowStep;
        band1x2 += c1x2->rowStep;
        band2x2 += c2x2->rowStep;
    }
}

// Computes, for one 4x4 tile of source pixels, the corresponding entries of
// the 1x1, 2x1, 1x2 and 2x2 convolution images (all values are pixel means
// shifted by -128; averages are computed as (a + b) >> 1 on the already
// shifted values).
//
//   base, step          - top-left pixel of the tile and source row pitch.
//                         The kernel also reads column 4 of rows 0..4 and
//                         row 4 of columns 0..4, i.e. a 5x5 neighbourhood
//                         must be readable.
//   base1x1, rowStep1x1 - 1x1 output: 8 bytes in each of two rows
//                         (source rows 0-1 and 2-3).
//   base2x1, rowStep2x1, blockStep2x1
//                       - 2x1 output: 4 bytes in each of two rows, for the
//                         unshifted block and the block shifted by one column
//                         (blockStep2x1 further on).
//   base1x2, rowStep1x2, blockStep1x2
//                       - 1x2 output: 8 bytes for the unshifted block and for
//                         the block shifted by one row (blockStep1x2 further
//                         on). rowStep1x2 is not used.
//   base2x2, rowStep2x2, blockStep2x2
//                       - 2x2 output: 4 bytes in each of the four shift
//                         blocks (0,0), (1,0), (0,1), (1,1), consecutive
//                         blocks being blockStep2x2 apart. rowStep2x2 is not
//                         used.
//
// Layout of a group of four bytes: [0] upper-left, [1] upper-right,
// [2] lower-left, [3] lower-right cell of the covered 2x2 grid of kernels.
inline void convolve4x4Block(
        unsigned char * base, int step,
        signed char * base1x1, int rowStep1x1,
        signed char * base2x1, int rowStep2x1, int blockStep2x1,
        signed char * base1x2, int rowStep1x2, int blockStep1x2,
        signed char * base2x2, int rowStep2x2, int blockStep2x2
        )
{
    // ---- 1x1: re-centred source pixels ------------------------------------
    signed char * const c0 = base1x1;                // source rows 0 and 1
    signed char * const c1 = base1x1 + rowStep1x1;   // source rows 2 and 3

    for (int pair = 0; pair < 2; ++pair)
    {
        const unsigned char * upper = base + 2 * pair * step;
        const unsigned char * lower = upper + step;
        signed char * out = base1x1 + pair * rowStep1x1;

        out[0] = upper[0] - 128;
        out[1] = upper[1] - 128;
        out[2] = lower[0] - 128;
        out[3] = lower[1] - 128;

        out[4] = upper[2] - 128;
        out[5] = upper[3] - 128;
        out[6] = lower[2] - 128;
        out[7] = lower[3] - 128;
    }

    // Column 4 (right of the tile) and row 4 (below the tile)
    const unsigned char * const col4 = base + 4;
    const unsigned char * const row4 = base + 4 * step;

    // ---- 2x1: horizontal pairs --------------------------------------------
    signed char * const h00 = base2x1;                              // shift 0, rows 0-1
    signed char * const h01 = base2x1 + rowStep2x1;                 // shift 0, rows 2-3
    signed char * const h10 = base2x1 + blockStep2x1;               // shift 1, rows 0-1
    signed char * const h11 = base2x1 + blockStep2x1 + rowStep2x1;  // shift 1, rows 2-3

    h00[0] = (c0[0] + c0[1]) >> 1;
    h00[1] = (c0[4] + c0[5]) >> 1;
    h00[2] = (c0[2] + c0[3]) >> 1;
    h00[3] = (c0[6] + c0[7]) >> 1;

    h01[0] = (c1[0] + c1[1]) >> 1;
    h01[1] = (c1[4] + c1[5]) >> 1;
    h01[2] = (c1[2] + c1[3]) >> 1;
    h01[3] = (c1[6] + c1[7]) >> 1;

    // Shifted by one column: the right-hand pair needs column 4.
    h10[0] = (c0[1] + c0[4]) >> 1;
    h10[1] = (c0[5] + (col4[0] - 128)) >> 1;
    h10[2] = (c0[3] + c0[6]) >> 1;
    h10[3] = (c0[7] + (col4[step] - 128)) >> 1;

    h11[0] = (c1[1] + c1[4]) >> 1;
    h11[1] = (c1[5] + (col4[2 * step] - 128)) >> 1;
    h11[2] = (c1[3] + c1[6]) >> 1;
    h11[3] = (c1[7] + (col4[3 * step] - 128)) >> 1;

    // ---- 1x2: vertical pairs ----------------------------------------------
    signed char * const v0 = base1x2;                  // shift 0
    signed char * const v1 = base1x2 + blockStep1x2;   // shifted by one row

    v0[0] = (c0[0] + c0[2]) >> 1;
    v0[1] = (c0[1] + c0[3]) >> 1;
    v0[2] = (c1[0] + c1[2]) >> 1;
    v0[3] = (c1[1] + c1[3]) >> 1;

    v0[4] = (c0[4] + c0[6]) >> 1;
    v0[5] = (c0[5] + c0[7]) >> 1;
    v0[6] = (c1[4] + c1[6]) >> 1;
    v0[7] = (c1[5] + c1[7]) >> 1;

    // Shifted by one row: the lower pair needs row 4.
    v1[0] = (c0[2] + c1[0]) >> 1;
    v1[1] = (c0[3] + c1[1]) >> 1;
    v1[2] = (c1[2] + (row4[0] - 128)) >> 1;
    // Fix: this value belongs at index 3; it used to overwrite index 2,
    // leaving index 3 unwritten.
    v1[3] = (c1[3] + (row4[1] - 128)) >> 1;

    v1[4] = (c0[6] + c1[4]) >> 1;
    v1[5] = (c0[7] + c1[5]) >> 1;
    v1[6] = (c1[6] + (row4[2] - 128)) >> 1;
    v1[7] = (c1[7] + (row4[3] - 128)) >> 1;

    // ---- 2x2: built from the 1x2 and 2x1 results ---------------------------
    // Fix: the shifted 1x2 block (v1) is located with blockStep1x2; the
    // previous code used blockStep2x1 here.
    signed char * const q00 = base2x2;
    signed char * const q01 = base2x2 + blockStep2x2;
    signed char * const q10 = base2x2 + 2 * blockStep2x2;
    signed char * const q11 = base2x2 + 3 * blockStep2x2;

    // block 0: no shift
    q00[0] = (v0[0] + v0[1]) >> 1;
    q00[1] = (v0[4] + v0[5]) >> 1;
    q00[2] = (v0[2] + v0[3]) >> 1;
    q00[3] = (v0[6] + v0[7]) >> 1;

    // block 1: shifted by one column
    q01[0] = (h10[0] + h10[2]) >> 1;
    q01[1] = (h10[1] + h10[3]) >> 1;
    q01[2] = (h11[0] + h11[2]) >> 1;
    q01[3] = (h11[1] + h11[3]) >> 1;

    // block 2: shifted by one row
    q10[0] = (v1[0] + v1[1]) >> 1;
    q10[1] = (v1[4] + v1[5]) >> 1;
    q10[2] = (v1[2] + v1[3]) >> 1;
    q10[3] = (v1[6] + v1[7]) >> 1;

    // block 3: shifted by one column and one row
    // Mean of pixels (3,4) and (4,4), needed for the lower-right cell.
    signed char tmp = ((row4[3] + row4[4]) >> 1) - 128;
    q11[0] = (v1[1] + v1[4]) >> 1;
    // Fix: columns 3-4 of rows 1 and 2 (h10[3] and h11[1]); the previous code
    // combined rows 2 and 3 instead.
    q11[1] = (h10[3] + h11[1]) >> 1;
    q11[2] = (v1[3] + v1[6]) >> 1;
    q11[3] = (h11[3] + tmp) >> 1;
}


/*
// calc 4x2 and 2x4 from 2x2
void convolution24(TConvolution * conv0, TConvolution * conv1, TConvolution * conv2)
{


    signed char * src2x2[4] = {
        conv0->image,
        conv0->image+conv0->blockStep,
        conv0->image+2*conv0->blockStep,
        conv0->image+3*conv0->blockStep,
    };

    signed char * dst2x4[8] = {
        conv1->image, // shift 0x0
        conv1->image+conv1->blockStep, // 1,0
        conv1->image+2*conv1->blockStep, // 2,0
        conv1->image+3*conv1->blockStep, // 3,0
        conv1->image+4*conv1->blockStep, // 0,1
        conv1->image+5*conv1->blockStep, // 1,1
        conv1->image+6*conv1->blockStep, // 2,1
        conv1->image+7*conv1->blockStep, // 3,1
    };



    while (src2x2[0] < conv0->image+conv0->rows*conv0->rowStep)
    {
        while ()
        {
        }

    
    for (int i = 0; i < 4; ++i)
    {
        signed char * src = src2x2[i];
        signed char * dst0 = dst2x4[0];
        signed char * dst1 = dst2x4[2];

        *(dst0 + 0) = (src[0] + src[1]) >> 1;
        *(dst0 + 1) = (src[4] + src[5]) >> 1;
        *(dst0 + 2) = (src[2] + src[3]) >> 1;
        *(dst0 + 3) = (src[6] + src[7]) >> 1;
    
        *(dst1 + 0) = (src[1] + src[4]) >> 1;
        *(dst1 + 1) = (src[5] + src[8]) >> 1;
        *(dst1 + 2) = (src[3] + src[6]) >> 1;
        *(dst1 + 3) = (src[7] + src[10]) >> 1;
    
        src += conv0->rowStep;
        *(dst0 + conv0->rowStep + 0) = (src[0] + src[1]) >> 1;
        *(dst0 + conv0->rowStep + 1) = (src[4] + src[5]) >> 1;
        *(dst0 + conv0->rowStep + 2) = (src[2] + src[3]) >> 1;
        *(dst0 + conv0->rowStep + 3) = (src[6] + src[7]) >> 1;

        *(dst1 + conv0->rowStep + 0) = (src[1] + src[4]) >> 1;
        *(dst1 + conv0->rowStep + 1) = (src[5] + src[8]) >> 1;
        *(dst1 + conv0->rowStep + 2) = (src[3] + src[6]) >> 1;
        *(dst1 + conv0->rowStep + 3) = (src[7] + src[10]) >> 1;
    }



        // update pointers
    }

}
*/

// Reduces one row of five pixels to four: columns 0, 1 and 4 are copied,
// columns 2 and 3 are averaged.
static inline void downscaleRow5to4(const unsigned char * src, unsigned char * dst)
{
    // Plain byte copies: src/dst are not guaranteed to be 2-byte aligned
    // (blocks start every 5 source bytes), so no wider loads/stores are used.
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = (src[2] + src[3]) >> 1;
    dst[3] = src[4];
}

// 5x5 -> 4x4 downscale kernel.
//
// Source rows 0, 1 and 4 map to destination rows 0, 1 and 3; source rows 2
// and 3 are averaged into destination row 2. Within each row, columns 0, 1
// and 4 map to columns 0, 1 and 3 and columns 2 and 3 are averaged into
// column 2.
//
//   src, srcStep - top-left pixel of the 5x5 source block and source row pitch
//   dst, dstStep - top-left pixel of the 4x4 destination block and its row pitch
static inline void downscaleBlock(unsigned char * src, int srcStep, unsigned char * dst, int dstStep)
{
    // Destination rows 0 and 1 come straight from source rows 0 and 1.
    for (int r = 0; r < 2; ++r)
    {
        downscaleRow5to4(src, dst);
        src += srcStep;
        dst += dstStep;
    }

    // Destination row 2: vertical average of source rows 2 and 3.
    const unsigned char * below = src + srcStep;
    dst[0] = (src[0] + below[0]) >> 1;
    dst[1] = (src[1] + below[1]) >> 1;
    dst[2] = (src[2] + src[3] + below[2] + below[3]) >> 2;
    dst[3] = (src[4] + below[4]) >> 1;

    // Two source rows were consumed but only one destination row was
    // produced. (Advancing dst by two rows, as before, left destination row 3
    // unwritten and stored its data one row below the 4x4 block.)
    src += 2 * srcStep;
    dst += dstStep;

    // Destination row 3 comes from source row 4.
    downscaleRow5to4(src, dst);
}


// 
// Downscales `src` to 4/5 of its size into `dst`, block by block: every 5x5
// block of source pixels is reduced to a 4x4 block by downscaleBlock. Both
// images are accessed as one byte per pixel.
//
// NOTE(review): both loop limits are exclusive (start < size - 5), so a block
// starting exactly at height-5 / width-5 is not processed even though it
// would still fit into the source -- confirm whether this margin is intended.
void downscale5to4(IplImage * src, IplImage * dst)
{
    unsigned char * const srcFirst = (unsigned char *) src->imageData;
    unsigned char * const srcLimit = srcFirst + (src->height - 5) * src->widthStep;

    // Destination position of the current band of blocks
    unsigned char * dstBand = (unsigned char *) dst->imageData;

    for (unsigned char * srcBand = srcFirst;
         srcBand < srcLimit;
         srcBand += 5 * src->widthStep, dstBand += 4 * dst->widthStep)
    {
        unsigned char * const bandLimit = srcBand + src->width - 5;
        unsigned char * out = dstBand;

        for (unsigned char * in = srcBand; in < bandLimit; in += 5, out += 4)
            downscaleBlock(in, src->widthStep, out, dst->widthStep);
    }
}

