//-----------------------------------------------------------------------------
// scale.h
// Fixed step image scaling, optimized via SSE intrinsics
// Jiri Havel, DCGM FIT BUT, Brno
// $Id$
//-----------------------------------------------------------------------------

#ifndef _SCALE_H_
#define _SCALE_H_

#include <cv.h>

#include <emmintrin.h>

// Block operation that reduces an 8x8 block of 8-bit samples to a 7x7 block.
// Every output sample is a weighted sum of a 2x2 neighbourhood of the source:
// doLine() blends horizontally neighbouring samples, doBlock() then blends
// two vertically neighbouring blended lines.
class Scale8to7
{
private :

	// Two weight tables (a, b). Each one can be read either as a whole SSE
	// register (v) or lane by lane as eight 16-bit values (a[]).
	// The constructor that fills the tables is defined outside this header.
	// NOTE(review): doBlock() drops 8 bits after weighting twice, so the
	// weights presumably satisfy a[i] + b[i] == 16 - confirm in the constructor.
	struct Coeficients
	{
		union
		{
			__m128i v;
			short a[8];
		} a, b;
		Coeficients();
	};
	const static Coeficients c;

	// Horizontal pass over one source line.
	// Reads 8 bytes at src (no alignment requirement), widens them to unsigned
	// 16-bit lanes and returns, for every lane i,
	//   pixel[i]*c.a[i] + pixel[i+1]*c.b[i]
	// Lane 7 has no right neighbour; a zero is shifted in for it.
	static inline __m128i doLine(const char *src)
	{
		const __m128i pixels = _mm_unpacklo_epi8(
			_mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)),
			_mm_setzero_si128()
		);
		// Shifting the register by 2 bytes moves every 16-bit lane one position
		// down, i.e. lane i now holds the right neighbour of pixel i.
		const __m128i rightNeighbours = _mm_srli_si128(pixels, 2);
		return _mm_add_epi16(
			_mm_mullo_epi16(pixels, c.a.v),
			_mm_mullo_epi16(rightNeighbours, c.b.v)
		);
	}

public :

	// Block geometry, in samples.
	enum
	{
		srcWidth = 8,
		srcHeight = 8,
		dstWidth = 7,
		dstHeight = 7,
	};

	// Scales one block.
	//   src      - first sample of the source block; srcHeight lines of
	//              srcWidth bytes each are read
	//   srcPitch - distance in bytes between two source lines
	//   dst      - first sample of the destination block; dstHeight lines of
	//              exactly dstWidth bytes each are written
	//   dstPitch - distance in bytes between two destination lines
	static inline void doBlock(const char *src, const int srcPitch, char *dst, const int dstPitch)
	{
		__m128i upper = doLine(src);
		for(int r = 0; r < dstHeight; ++r)
		{
			const __m128i lower = doLine(src + (r+1)*srcPitch);

			// Vertical pass: blend the two horizontally filtered lines with the
			// weights of output row r.
			const __m128i weighted = _mm_add_epi16(
				_mm_mullo_epi16(upper, _mm_set1_epi16(c.a.a[r])),
				_mm_mullo_epi16(lower, _mm_set1_epi16(c.b.a[r]))
			);

			// The arithmetic shift leaves every lane within [-128, 127], so the
			// signed saturating pack never clamps and each output byte is simply
			// the high byte of the 16-bit sum.
			const __m128i packed = _mm_packs_epi16(
				_mm_srai_epi16(weighted, 8),
				_mm_setzero_si128()
			);

			// The packed register carries 8 bytes but only dstWidth of them are
			// valid output. Stage them locally and copy exactly dstWidth bytes,
			// so that nothing outside of this block's columns is overwritten.
			char staged[srcWidth];
			_mm_storel_epi64(reinterpret_cast<__m128i*>(staged), packed);
			char *dstLine = dst + r*dstPitch;
			for(int i = 0; i < dstWidth; ++i)
			{
				dstLine[i] = staged[i];
			}

			upper = lower;
		}
	}
};

// Block operation that halves the resolution in both directions:
// a 16x2 block of 8-bit samples is reduced to 8x1 samples.
class ScaleHalf
{
public :

	// Block geometry, in samples.
	enum
	{
		srcWidth = 16,
		srcHeight = 2,
		dstWidth = 8,
		dstHeight = 1,
	};

	// Scales one block.
	//   src      - first sample of the source block; two lines of 16 bytes are
	//              read. No particular alignment is required.
	//   srcPitch - distance in bytes between the two source lines
	//   dst      - receives dstWidth (8) bytes
	//   dstPitch - not used, the block produces a single output line
	// Each output sample is the rounded-up average of two vertical averages
	// (themselves rounded up), which can differ by one from the exact mean of
	// the four source samples.
	static inline void doBlock(const char *src, const int srcPitch, char *dst, const int dstPitch)
	{
		(void)dstPitch;

		// Unaligned loads: the row start and the pitch are controlled by the
		// caller and are not guaranteed to be multiples of 16.
		const __m128i top    = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src           ));
		const __m128i bottom = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + srcPitch));

		// Vertical average of the two lines, byte by byte.
		const __m128i vertical = _mm_avg_epu8(top, bottom);

		// Shifting by one byte puts sample 2k next to sample 2k+1, so after the
		// average every odd byte holds the mean of one horizontal pair.
		const __m128i horizontal = _mm_avg_epu8(
			vertical,
			_mm_slli_si128(vertical, 1)
		);

		// Keep the odd bytes (high byte of every 16-bit lane) and pack them
		// into the low 8 bytes of the register.
		_mm_storel_epi64(
			reinterpret_cast<__m128i*>(dst),
			_mm_packus_epi16(
				_mm_srli_epi16(horizontal, 8),
				_mm_setzero_si128()
			)
		);
	}
};

// Number of Op source blocks needed to cover an image of the given size.
// A partially covered block at the right or bottom edge counts as a whole one.
template <class Op>
inline CvSize sizeToSrcBlocks(const CvSize & size)
{
	int blocksAcross = size.width / Op::srcWidth;
	if(size.width % Op::srcWidth)
	{
		++blocksAcross;
	}

	int blocksDown = size.height / Op::srcHeight;
	if(size.height % Op::srcHeight)
	{
		++blocksDown;
	}

	return cvSize(blocksAcross, blocksDown);
}

// Size of the output that Op produces for an input of size 'in':
// the number of source blocks covering the input times the size of one
// destination block.
template <class Op>
inline CvSize getDstSize(const CvSize &in)
{
	const CvSize blockCount = sizeToSrcBlocks<Op>(in);
	const int outWidth  = blockCount.width  * Op::dstWidth;
	const int outHeight = blockCount.height * Op::dstHeight;
	return cvSize(outWidth, outHeight);
}

// Rounds both dimensions of 'size' up to the next multiple of the matching
// dimension of 'alignment'. Dimensions that already are a multiple are
// returned unchanged.
inline CvSize alignSize(const CvSize & size, const CvSize & alignment)
{
	int alignedWidth = size.width;
	if(size.width % alignment.width)
	{
		alignedWidth = (size.width / alignment.width + 1) * alignment.width;
	}

	int alignedHeight = size.height;
	if(size.height % alignment.height)
	{
		alignedHeight = (size.height / alignment.height + 1) * alignment.height;
	}

	return cvSize(alignedWidth, alignedHeight);
}

// Rounds 'size' up to a whole number of Op source blocks.
template <class Op>
inline CvSize alignFor(const CvSize & size)
{
	const CvSize srcBlock = cvSize(Op::srcWidth, Op::srcHeight);
	return alignSize(size, srcBlock);
}

template <class Op>
void doBlockOp(const IplImage *src, IplImage *dst)
{
	CvSize blocks = sizeToSrcBlocks<Op>(cvGetSize(src));
	assert(blocks.width *Op::srcWidth  <= src->widthStep);
	assert(blocks.width *Op::dstWidth  <= dst->widthStep);
	assert(blocks.height*Op::srcHeight <= src->height);
	assert(blocks.height*Op::dstHeight <= dst->height);
	for(int ri = 0; ri < blocks.height; ++ri)
	{
		char *srcRow = src->imageData + ri*Op::srcHeight*src->widthStep;
		char *dstRow = dst->imageData + ri*Op::dstHeight*dst->widthStep;
		for(int ci = 0; ci < blocks.width; ++ci)
		{
			Op::doBlock(srcRow + ci*Op::srcWidth, src->widthStep, dstRow + ci*Op::dstWidth, dst->widthStep);
		}
	}
}

#endif

