/* ------------------------------------------------------------------------- */
/*   rivatv-convert.c video image conversion routines                  	     */
/* ------------------------------------------------------------------------- */
/*   Copyright (C) 2002, 2003, 2004 Stefan Jahn <stefan@lkcc.org>
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *   
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *   
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.		     */
/* ------------------------------------------------------------------------- */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/init.h>
#if (defined (__i386__) || defined (__x86_64__))  && RIVATV_ENABLE_ASM
#include <asm/i387.h>
#endif
#include "rivatv.h"

#ifndef RIVATV_DISABLE_CONVERSION

/* Clamp val into the closed range [min, max].  Every argument is fully
   parenthesized so that expression arguments (e.g. "a & b") keep their
   meaning; note that val may still be evaluated up to three times, so do
   not pass expressions with side effects. */
#define __minmax(val, min, max) \
	(((val) < (min)) ? (min) : (((val) > (max)) ? (max) : (val)))

/* Number of fractional bits of the fixed point arithmetic used by the
   UYVY -> RGB conversion routines. */
#define PRECISION_BITS 10

/* Chroma weights scaled by 2^PRECISION_BITS.  The floating point
   expressions are constant and are truncated to integers by the cast. */
#define RED_U_SCALE   ((unsigned long long) (1.732 * (1ULL << PRECISION_BITS)))
#define GREEN_U_SCALE ((unsigned long long) (0.336 * (1ULL << PRECISION_BITS)))
#define GREEN_V_SCALE ((unsigned long long) (0.698 * (1ULL << PRECISION_BITS)))
#define BLUE_V_SCALE  ((unsigned long long) (1.371 * (1ULL << PRECISION_BITS)))

/* The same weights as plain int, used by the C fallback code. */
#define RED_U_SCALE_I	((int) RED_U_SCALE)
#define GREEN_U_SCALE_I ((int) GREEN_U_SCALE)
#define GREEN_V_SCALE_I ((int) GREEN_V_SCALE)
#define BLUE_V_SCALE_I	((int) BLUE_V_SCALE)

/* 64 bit constants for the MMX code.  The asm routines load the address of
   this table into %edx and address the entries by byte displacement. */
static unsigned long long uyvy_constants[] = {

	/* displacement 0: selects the low byte of every 16 bit word */
	0x00FF00FF00FF00FFULL,
	/* displacement 8: 128 in every 16 bit word (chroma bias) */
	0x0080008000800080ULL,
	/* displacement 16: RED_U_SCALE in both 32 bit halves */
	(RED_U_SCALE   << 32) | RED_U_SCALE,
	/* displacement 24: BLUE_V_SCALE in both 32 bit halves */
	(BLUE_V_SCALE  << 32) | BLUE_V_SCALE,
	/* displacement 32: GREEN_V_SCALE (high half), GREEN_U_SCALE (low half) */
	(GREEN_V_SCALE << 32) | GREEN_U_SCALE,
	/* displacement 40: selects bytes 0 and 3 of the low 32 bits */
	0x00000000FF0000FFULL,
};

/* Byte displacements of the table entries above.  NOTE(review): the asm
   in the part of the file reviewed here spells the displacements out as
   literals (0, 8, 16, 24, 32, 40) instead of using these names. */
#define offset_00FF00FF00FF00FF (8 * 0)
#define offset_0080008000800080 (8 * 1)
#define offset_RED_UV_SCALE	(8 * 2)
#define offset_BLUE_UV_SCALE	(8 * 3)
#define offset_GREEN_UV_SCALE	(8 * 4)
#define offset_00000000FF0000FF (8 * 5)

#endif /* RIVATV_DISABLE_CONVERSION */

/* ------------------------------------------------------------------------- *
 *
 * MMX optimizations
 *
 * ------------------------------------------------------------------------- */

/* indicates whether CPU can process MMX code; set to 1 by
   rivatv_detect_mmx() and tested by the conversion and copy routines */
int isMMX = 0;

/* indicates whether MMX code is requested or not; when this is 0
   rivatv_detect_mmx() leaves isMMX untouched */
int mmx = 1;

#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM

/* kernel_fpu_begin() is not exported (properly).  Made a local
 * version to work around this. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 5, 68)

/* Store the FPU state into the thread structure of tsk and clear the
 * PF_USEDFPU flag of that task.  fxsave (followed by fnclex) is used when
 * cpu_has_fxsr is set, fnsave (followed by fwait) otherwise. */
static inline void __save_init_fpu( struct task_struct *tsk )
{
	if ( cpu_has_fxsr ) {
                asm volatile( "fxsave %0 ; fnclex"
                              : "=m" (tsk->thread.i387.fxsave) );
        } else {
                asm volatile( "fnsave %0 ; fwait"
                              : "=m" (tsk->thread.i387.fsave) );
        }
        /* the state now lives in memory, the task no longer owns the FPU */
        tsk->flags &= ~PF_USEDFPU;
}

/* Local kernel_fpu_begin() for kernels older than 2.5.68 (see the
 * surrounding version check).  If the current task has PF_USEDFPU set its
 * FPU state is saved with __save_init_fpu(); otherwise only clts() is
 * executed. */
void kernel_fpu_begin(void)
{
        struct task_struct *tsk = current;

        if (tsk->flags & PF_USEDFPU) {
                /* save the state of the current task, then we are done */
                __save_init_fpu(tsk);
                return;
        }
        clts();
}

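/* For Red Hat kernels (RIVATV_ISREDHAT) kernel_fpu_end() is meant to be
 * redefined as stts().  Note that the version test below asks for a kernel
 * that is older than 2.4.20 and at the same time newer than 2.5.0; no
 * version satisfies both, so the redefinition currently never happens. */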
#if defined(RIVATV_ISREDHAT)
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 4, 20) && LINUX_VERSION_CODE > KERNEL_VERSION(2, 5, 0)
#define kernel_fpu_end() stts()
#endif
#endif

#endif
#endif /* x86 */

/* Enable the MMX code paths (isMMX = 1) when MMX use is requested through
   `mmx' and the boot CPU advertises the MMX feature bit. */
void rivatv_detect_mmx (void)
{
	/* MMX use has been switched off */
	if (!mmx)
		return;

	/* the boot CPU does not advertise MMX */
	if (!test_bit (X86_FEATURE_MMX, boot_cpu_data.x86_capability))
		return;

	PRINTK_INFO ("MMX processor extension enabled\n");
	isMMX = 1;
}

#ifndef RIVATV_DISABLE_CONVERSION

/* Shared first part of the MMX UYVY -> RGB loops.  The macro expands to a
 * run of string literals that is pasted into an asm statement; it defines
 * the loop label "1:".
 *
 * Expected by the instructions below (set up by the users of the macro):
 *   %esi - current read position in the UYVY data, 8 bytes are read
 *   %edx - address of uyvy_constants, read at displacements 0 to 32
 *   %4   - immediate shift count used to scale the Y values up and the
 *	    results down again
 *
 * At the end mm0, mm1 and mm2 each hold four signed 16 bit values
 * (green, red and blue); clamping to bytes is left to the code following
 * the macro.  mm3 to mm7 are used as scratch registers. */
#define MMX_PREPARE_PIXELS							\
	/* Load 2 Pixels at once */						\
	"1: movd      (%%esi), %%mm0	\n" /* mm0 = ________Y1V0Y0U0 */	\
	"   movd      4(%%esi), %%mm1	\n" /* mm1 = ________Y3V2Y2U2 */	\
										\
	/* Create Y-Value Register -> MM2,MM3 */				\
	"   pxor      %%mm6, %%mm6	\n" /* mm6 = ________________ */	\
	"   pxor      %%mm7, %%mm7	\n" /* mm7 = ________________ */	\
	"   movq      %%mm0, %%mm2	\n" /* mm2 = ________Y1V0Y0U0 */	\
	"   movq      %%mm1, %%mm3	\n" /* mm3 = ________Y3V2Y2U2 */	\
	"   psrlw     $8, %%mm2		\n" /* mm2 = __________Y1__Y0 */	\
	"   psrlw     $8, %%mm3		\n" /* mm3 = __________Y3__Y2 */	\
	"   punpcklwd %%mm6, %%mm2	\n" /* mm2 = ______Y1______Y0 */	\
	"   punpcklwd %%mm7, %%mm3	\n" /* mm3 = ______Y3______Y2 */	\
	"   psllq     %4, %%mm2		\n" /* mm2 = scaled ______Y1______Y0 */ \
	"   psllq     %4, %%mm3		\n" /* mm3 = scaled ______Y3______Y2 */ \
										\
	/* Create UV-Value Register -> MM4,MM5 */				\
	"   movq      %%mm0, %%mm4	\n" /* mm4 = ________Y1V0Y0U0 */	\
	"   punpckldq %%mm1, %%mm4	\n" /* mm4 = Y3V2Y2U2Y1V0Y0U0 */	\
	"   pand      0(%%edx), %%mm4	\n" /* mm4 = __V2__U2__V0__U0 */	\
	"   psubw     8(%%edx), %%mm4	\n" /* mm4 = __V2__U2__V0__U0 - 128 */	\
	"   movq      %%mm4, %%mm5	\n" /* mm5 = __V2__U2__V0__U0 - 128 */	\
	"   punpckhwd %%mm4, %%mm5	\n" /* mm5 = __V2__V2__U2__U2 */	\
	"   punpcklwd %%mm4, %%mm4	\n" /* mm4 = __V0__V0__U0__U0 */	\
	"   psrad     $16, %%mm5	\n" /* mm5 = ______V2______U2 */	\
	"   psrad     $16, %%mm4	\n" /* mm4 = ______V0______U0 */	\
										\
	/* Create Green-Values */						\
	"   movq      %%mm5, %%mm0	\n" /* mm0 = ______V2______U2 */	\
	"   movq      %%mm4, %%mm1	\n" /* mm1 = ______V0______U0 */	\
	"   pmaddwd   32(%%edx), %%mm0	\n" /* multiply by the green V/U weights */ \
	"   pmaddwd   32(%%edx), %%mm1	\n" /* multiply by the green V/U weights */ \
	"   movq      %%mm0, %%mm6	\n" /* mm6 = scaled ______V2______U2 */ \
	"   movq      %%mm1, %%mm7	\n" /* mm7 = scaled ______V0______U0 */ \
	"   psrlq     $32, %%mm0	\n" /* mm0 = ______________V2 */	\
	"   psrlq     $32, %%mm1	\n" /* mm1 = ______________V0 */	\
	"   paddd     %%mm0, %%mm6	\n" /* mm6 = ___________U2+V2 */	\
	"   paddd     %%mm1, %%mm7	\n" /* mm7 = ___________U0+V0 */	\
	"   punpckldq %%mm6, %%mm6	\n" /* mm6 = ___U2+V2___U2+V2 */	\
	"   punpckldq %%mm7, %%mm7	\n" /* mm7 = ___U0+V0___U0+V0 */	\
	"   movq      %%mm2, %%mm0	\n" /* mm0 = ______Y1______Y0 */	\
	"   movq      %%mm3, %%mm1	\n" /* mm1 = ______Y3______Y2 */	\
	"   psubd     %%mm7, %%mm0	\n" /* mm0 = scaled ______G1______G0 */ \
	"   psubd     %%mm6, %%mm1	\n" /* mm1 = scaled ______G3______G2 */ \
	"   psrad     %4, %%mm0		\n" /* mm0 = ______G1______G0 */	\
	"   psrad     %4, %%mm1		\n" /* mm1 = ______G3______G2 */	\
	"   packssdw  %%mm1, %%mm0	\n" /* mm0 = __G3__G2__G1__G0 */	\
										\
	/* Create Red-Values */							\
	"   movq      %%mm4, %%mm6	\n" /* mm6 = ______V0______U0 */	\
	"   movq      %%mm5, %%mm7	\n" /* mm7 = ______V2______U2 */	\
	"   punpckldq %%mm7, %%mm6	\n" /* mm6 = ______U2______U0 */	\
	"   pmaddwd   16(%%edx), %%mm6	\n" /* multiply by the red U weight */	\
	"   movq      %%mm6, %%mm1	\n"					\
	"   movq      %%mm6, %%mm7	\n"					\
	"   punpckldq %%mm6, %%mm1	\n" /* mm1 = ______U0______U0 */	\
	"   punpckhdq %%mm6, %%mm7	\n" /* mm7 = ______U2______U2 */	\
	"   paddd     %%mm3, %%mm7	\n" /* mm7 = ______R3______R2 */	\
	"   paddd     %%mm2, %%mm1	\n" /* mm1 = ______R1______R0 */	\
	"   psrad     %4, %%mm7		\n" /* drop the fraction bits */	\
	"   psrad     %4, %%mm1		\n" /* drop the fraction bits */	\
	"   packssdw  %%mm7, %%mm1	\n" /* mm1 = __R3__R2__R1__R0 */	\
										\
	/* Create Blue-Values */						\
	"   punpckhdq %%mm5, %%mm4	\n" /* mm4 = ______V2______V0 */	\
	"   pmaddwd   24(%%edx), %%mm4	\n" /* multiply by the blue V weight */ \
	"   movq      %%mm4, %%mm5	\n"					\
	"   movq      %%mm4, %%mm6	\n"					\
	"   punpckldq %%mm5, %%mm4	\n" /* mm4 = ______V0______V0 */	\
	"   punpckhdq %%mm5, %%mm6	\n" /* mm6 = ______V2______V2 */	\
	"   paddd     %%mm4, %%mm2	\n" /* mm2 = ______B1______B0 */	\
	"   paddd     %%mm6, %%mm3	\n" /* mm3 = ______B3______B2 */	\
	"   psrad     %4, %%mm2		\n" /* drop the fraction bits */	\
	"   psrad     %4, %%mm3		\n" /* drop the fraction bits */	\
	"   packssdw  %%mm3, %%mm2	\n" /* mm2 = __B3__B2__B1__B0 */	\
										\
	/* Current state: Blue->MM2, Red->MM1, Green->MM0 */


/* Convert the video decoder's native UYVY color format into RGB24.  The
   routine takes 8 bytes of UYVY (4 pixels) and encodes them into 12 bytes
   of RGB per pass.

   uyvy_buf - source pixels in UYVY byte order
   rgb_buf  - destination, receives 3 bytes per converted pixel
   width    - number of pixels; only complete groups of 4 are converted,
	      values below 4 (including negative ones) convert nothing

   Per pixel the first byte written is the value the code calls "r"
   (derived from Y and U), followed by "g" and "b". */
static void rivatv_UYVY_to_RGB24 (char *uyvy_buf, char *rgb_buf, int width)
{
#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/* The asm loop tests its counter only after the first pass; entered
	   with a zero count it would wrap around and overrun both buffers.
	   NOTE(review): the asm moves the pointer operands with 32 bit movl,
	   confirm that RIVATV_ENABLE_ASM is never set for x86_64 builds. */
	if (isMMX && width >= 4) {
		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %1, %%esi		\n"
		"   movl      %2, %%edi		\n"
		"   movl      %3, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		"   movl      %0, %%edx		\n"

		MMX_PREPARE_PIXELS

		/* Store saturated RGB-Values in RGB-Buffer */
		"   packuswb  %%mm2, %%mm2	\n" /* mm2 = B3B2B1B0B3B2B1B0 */
		"   packuswb  %%mm0, %%mm1	\n" /* mm1 = G3G2G1G0R3R2R1R0 */
		"   punpcklwd %%mm2, %%mm2	\n" /* mm2 = B3B2B3B2B1B0B1B0 */
		"   movq      %%mm2, %%mm0	\n" /* mm0 = B3B2B3B2B1B0B1B0 */
		"   punpcklbw %%mm2, %%mm2	\n" /* mm2 = B1B1B0B0B1B1B0B0 */
		"   punpckhbw %%mm0, %%mm0	\n" /* mm0 = B3B3B2B2B3B3B2B2 */
		"   movq      %%mm1, %%mm4	\n" /* mm4 = G3G2G1G0R3R2R1R0 */
		"   movq      %%mm1, %%mm5	\n" /* mm5 = G3G2G1G0R3R2R1R0 */
		"   psrlq     $32, %%mm4	\n" /* mm4 = ________G3G2G1G0 */
		"   punpcklbw %%mm4, %%mm5	\n" /* mm5 = G3R3G2R2G1R1G0R0 */
		"   movq      %%mm5, %%mm6	\n" /* mm6 = G3R3G2R2G1R1G0R0 */
		"   movq      %%mm5, %%mm7	\n" /* mm7 = G3R3G2R2G1R1G0R0 */
		"   punpcklwd %%mm2, %%mm6	\n" /* mm6 = B1B1G1R1B0B0G0R0 */
		"   punpckhwd %%mm0, %%mm7	\n" /* mm7 = B3B3G3R3B2B2G2R2 */
		"   movq      %%mm6, %%mm0	\n" /* mm0 = B1B1G1R1B0B0G0R0 */
		"   movq      %%mm6, %%mm1	\n" /* mm1 = B1B1G1R1B0B0G0R0 */
		"   psllq     $8, %%mm0		\n" /* mm0 = B1G1R1B0B0G0R0__ */
		"   psllq     $8, %%mm1		\n" /* mm1 = B1G1R1B0B0G0R0__ */
		"   psrlq     $40, %%mm0	\n" /* mm0 = __________B1G1R1 */
		"   punpckldq %%mm0, %%mm1	\n" /* mm1 = __B1G1R1B0G0R0__ */
		"   movq      %%mm7, %%mm0	\n" /* mm0 = B3B3G3R3B2B2G2R2 */
		"   movq      %%mm7, %%mm2	\n" /* mm2 = B3B3G3R3B2B2G2R2 */
		"   psrlq     $8, %%mm1		\n" /* mm1 = ____B1G1R1B0G0R0 */
		"   psllq     $48, %%mm0	\n" /* mm0 = G2R2____________ */
		"   por	      %%mm0, %%mm1	\n" /* mm1 = G2R2B1G1R1B0G0R0 */
		"   psrlq     $24, %%mm2	\n" /* mm2 = ______B3B3G3R3B2 */

		/* Finally */
		"   movq      %%mm1, (%%edi)	\n"
		"   movd      %%mm2, 8(%%edi)	\n"
		"   addl      $12, %%edi	\n"
		"   addl      $8, %%esi		\n"
		"   decl      %%ecx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (uyvy_constants),
					  "g" (uyvy_buf),	"g" (rgb_buf),
					  "r" (width),		"i" (PRECISION_BITS)
		/* clobber registers; "memory" because the buffers are read and
		   written behind the compiler's back */
					: "cc", "memory", "ecx", "edx", "esi", "edi");
		kernel_fpu_end ();
	} else
#endif
	{
		unsigned char *uyvy = (unsigned char *) uyvy_buf;
		unsigned char *rgb = (unsigned char *) rgb_buf;
		int r, g, b, n;
		int y[4], u[4], v[4];

		width >>= 2;
		/* "> 0" keeps a negative width from looping */
		while (width-- > 0) {
			/* U and V are shared by two neighbouring pixels */
			u[0] = u[1] = *uyvy++ - 128;
			y[0] = *uyvy++ << PRECISION_BITS;
			v[0] = v[1] = *uyvy++ - 128;
			y[1] = *uyvy++ << PRECISION_BITS;
			u[2] = u[3] = *uyvy++ - 128;
			y[2] = *uyvy++ << PRECISION_BITS;
			v[2] = v[3] = *uyvy++ - 128;
			y[3] = *uyvy++ << PRECISION_BITS;

			for (n = 0; n < 4; n++) {
				r = (u[n] * RED_U_SCALE_I + y[n]) >> PRECISION_BITS;
				r = __minmax (r, 0, 255);
				*rgb++ = r;

				g = (y[n] - v[n] * GREEN_V_SCALE_I - u[n] * GREEN_U_SCALE_I) >> PRECISION_BITS;
				g = __minmax (g, 0, 255);
				*rgb++ = g;

				b = (v[n] * BLUE_V_SCALE_I + y[n]) >> PRECISION_BITS;
				b = __minmax (b, 0, 255);
				*rgb++ = b;
			}
		}
	}
}

/* Convert the video decoder's native UYVY color format into RGB555.  The
   routine takes 8 bytes of UYVY (4 pixels) and encodes them into 8 bytes
   of RGB per pass.

   uyvy_buf - source pixels in UYVY byte order
   rgb_buf  - destination, receives one 16 bit value per converted pixel
   width    - number of pixels; only complete groups of 4 are converted,
	      values below 4 (including negative ones) convert nothing

   Each 16 bit value is (b5 << 10) | (g5 << 5) | r5, built from the upper
   5 bits of the values the code calls b, g and r. */
static void rivatv_UYVY_to_RGB555 (char *uyvy_buf, char *rgb_buf, int width)
{
#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/* The asm loop tests its counter only after the first pass; entered
	   with a zero count it would wrap around and overrun both buffers. */
	if (isMMX && width >= 4) {
		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %1, %%esi		\n"
		"   movl      %2, %%edi		\n"
		"   movl      %3, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		"   movl      %0, %%edx		\n"

		MMX_PREPARE_PIXELS

		/* Store saturated RGB-Values */
		"   pxor      %%mm3, %%mm3	\n"
		"   packuswb  %%mm3, %%mm2	\n" /* mm2 = ________B3B2B1B0 */
		"   packuswb  %%mm3, %%mm1	\n" /* mm1 = ________R3R2R1R0 */
		"   packuswb  %%mm3, %%mm0	\n" /* mm0 = ________G3G2G1G0 */
		"   punpcklbw %%mm3, %%mm2	\n" /* mm2 = __B3__B2__B1__B0 */
		"   punpcklbw %%mm3, %%mm1	\n" /* mm1 = __R3__R2__R1__R0 */
		"   punpcklbw %%mm3, %%mm0	\n" /* mm0 = __G3__G2__G1__G0 */
		"   psrlw     $3, %%mm2		\n" /* keep the upper 5 bits */
		"   psrlw     $3, %%mm1		\n"
		"   psrlw     $3, %%mm0		\n"
		"   psllw     $10, %%mm2	\n" /* blue  -> bits 10..14 */
		"   psllw     $5, %%mm0		\n" /* green -> bits 5..9 */
		"   por	      %%mm2, %%mm1	\n"
		"   por	      %%mm0, %%mm1	\n" /* mm1 = RGB3RGB2RGB1RGB0 */

		/* Finally */
		"   movq      %%mm1, (%%edi)	\n"
		"   addl      $8, %%edi		\n"
		"   addl      $8, %%esi		\n"
		"   decl      %%ecx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (uyvy_constants),
					  "g" (uyvy_buf),	"g" (rgb_buf),
					  "r" (width),		"i" (PRECISION_BITS)
		/* clobber registers; "memory" because the buffers are read and
		   written behind the compiler's back */
					: "cc", "memory", "ecx", "edx", "esi", "edi");
		kernel_fpu_end ();
	} else
#endif
	{
		unsigned char *uyvy = (unsigned char *) uyvy_buf;
		unsigned short *rgb = (unsigned short *) rgb_buf;
		int r, g, b, n;
		int y[4], u[4], v[4];

		width >>= 2;
		/* "> 0" keeps a negative width from looping */
		while (width-- > 0) {
			/* U and V are shared by two neighbouring pixels */
			u[0] = u[1] = *uyvy++ - 128;
			y[0] = *uyvy++ << PRECISION_BITS;
			v[0] = v[1] = *uyvy++ - 128;
			y[1] = *uyvy++ << PRECISION_BITS;
			u[2] = u[3] = *uyvy++ - 128;
			y[2] = *uyvy++ << PRECISION_BITS;
			v[2] = v[3] = *uyvy++ - 128;
			y[3] = *uyvy++ << PRECISION_BITS;

			for (n = 0; n < 4; n++) {
				r = (u[n] * RED_U_SCALE_I + y[n]) >> PRECISION_BITS;
				r = __minmax (r, 0, 255);

				g = (y[n] - v[n] * GREEN_V_SCALE_I - u[n] * GREEN_U_SCALE_I) >> PRECISION_BITS;
				g = __minmax (g, 0, 255);

				b = (v[n] * BLUE_V_SCALE_I + y[n]) >> PRECISION_BITS;
				b = __minmax (b, 0, 255);

				*rgb++ = ((b & 0xF8) << 7) | ((g & 0xF8) << 2) | ((r & 0xF8) >> 3);
			}
		}
	}
}

/* Convert the video decoder's native UYVY color format into RGB565.  The
   routine takes 8 bytes of UYVY (4 pixels) and encodes them into 8 bytes
   of RGB per pass.

   uyvy_buf - source pixels in UYVY byte order
   rgb_buf  - destination, receives one 16 bit value per converted pixel
   width    - number of pixels; only complete groups of 4 are converted,
	      values below 4 (including negative ones) convert nothing

   Each 16 bit value is (b5 << 11) | (g6 << 5) | r5, built from the upper
   5, 6 and 5 bits of the values the code calls b, g and r. */
static void rivatv_UYVY_to_RGB565 (char *uyvy_buf, char *rgb_buf, int width)
{
#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/* The asm loop tests its counter only after the first pass; entered
	   with a zero count it would wrap around and overrun both buffers. */
	if (isMMX && width >= 4) {
		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %1, %%esi		\n"
		"   movl      %2, %%edi		\n"
		"   movl      %3, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		"   movl      %0, %%edx		\n"

		MMX_PREPARE_PIXELS

		/* Store saturated RGB-Values */
		"   pxor      %%mm3, %%mm3	\n"
		"   packuswb  %%mm3, %%mm2	\n" /* mm2 = ________B3B2B1B0 */
		"   packuswb  %%mm3, %%mm1	\n" /* mm1 = ________R3R2R1R0 */
		"   packuswb  %%mm3, %%mm0	\n" /* mm0 = ________G3G2G1G0 */
		"   punpcklbw %%mm3, %%mm2	\n" /* mm2 = __B3__B2__B1__B0 */
		"   punpcklbw %%mm3, %%mm1	\n" /* mm1 = __R3__R2__R1__R0 */
		"   punpcklbw %%mm3, %%mm0	\n" /* mm0 = __G3__G2__G1__G0 */
		"   psrlw     $3, %%mm2		\n" /* blue:  upper 5 bits */
		"   psrlw     $3, %%mm1		\n" /* red:   upper 5 bits */
		"   psrlw     $2, %%mm0		\n" /* green: upper 6 bits */
		"   psllw     $11, %%mm2	\n" /* blue  -> bits 11..15 */
		"   psllw     $5, %%mm0		\n" /* green -> bits 5..10 */
		"   por	      %%mm2, %%mm1	\n"
		"   por	      %%mm0, %%mm1	\n" /* mm1 = RGB3RGB2RGB1RGB0 */

		/* Finally */
		"   movq      %%mm1, (%%edi)	\n"
		"   addl      $8, %%edi		\n"
		"   addl      $8, %%esi		\n"
		"   decl      %%ecx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (uyvy_constants),
					  "g" (uyvy_buf),	"g" (rgb_buf),
					  "r" (width),		"i" (PRECISION_BITS)
		/* clobber registers; "memory" because the buffers are read and
		   written behind the compiler's back */
					: "cc", "memory", "ecx", "edx", "esi", "edi");
		kernel_fpu_end ();
	} else
#endif
	{
		unsigned char *uyvy = (unsigned char *) uyvy_buf;
		unsigned short *rgb = (unsigned short *) rgb_buf;
		int r, g, b, n;
		int y[4], u[4], v[4];

		width >>= 2;
		/* "> 0" keeps a negative width from looping */
		while (width-- > 0) {
			/* U and V are shared by two neighbouring pixels */
			u[0] = u[1] = *uyvy++ - 128;
			y[0] = *uyvy++ << PRECISION_BITS;
			v[0] = v[1] = *uyvy++ - 128;
			y[1] = *uyvy++ << PRECISION_BITS;
			u[2] = u[3] = *uyvy++ - 128;
			y[2] = *uyvy++ << PRECISION_BITS;
			v[2] = v[3] = *uyvy++ - 128;
			y[3] = *uyvy++ << PRECISION_BITS;

			for (n = 0; n < 4; n++) {
				r = (u[n] * RED_U_SCALE_I + y[n]) >> PRECISION_BITS;
				r = __minmax (r, 0, 255);

				g = (y[n] - v[n] * GREEN_V_SCALE_I - u[n] * GREEN_U_SCALE_I) >> PRECISION_BITS;
				g = __minmax (g, 0, 255);

				b = (v[n] * BLUE_V_SCALE_I + y[n]) >> PRECISION_BITS;
				b = __minmax (b, 0, 255);

				*rgb++ = ((b & 0xF8) << 8) | ((g & 0xFC) << 3) | ((r & 0xF8) >> 3);
			}
		}
	}
}

/* Convert the video decoder's native UYVY color format into RGB32.  The
   routine takes 8 bytes of UYVY (4 pixels) and encodes them into 16 bytes
   of RGB per pass.

   uyvy_buf - source pixels in UYVY byte order
   rgb_buf  - destination, receives one 32 bit value per converted pixel
   width    - number of pixels; only complete groups of 4 are converted,
	      values below 4 (including negative ones) convert nothing

   NOTE(review): the two code paths disagree on the top byte of every
   pixel.  The C code stores 0xFF there, the MMX code (according to its
   register comments) a second copy of the blue value. */
static void rivatv_UYVY_to_RGB32 (char *uyvy_buf, char *rgb_buf, int width)
{
#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/* The asm loop tests its counter only after the first pass; entered
	   with a zero count it would wrap around and overrun both buffers. */
	if (isMMX && width >= 4) {
		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %1, %%esi		\n"
		"   movl      %2, %%edi		\n"
		"   movl      %3, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		"   movl      %0, %%edx		\n"

		MMX_PREPARE_PIXELS

		/* Store saturated RGB-Values in RGB-Buffer */
		"   packuswb  %%mm2, %%mm2	\n" /* mm2 = B3B2B1B0B3B2B1B0 */
		"   packuswb  %%mm0, %%mm1	\n" /* mm1 = G3G2G1G0R3R2R1R0 */
		"   punpcklwd %%mm2, %%mm2	\n" /* mm2 = B3B2B3B2B1B0B1B0 */
		"   movq      %%mm2, %%mm0	\n" /* mm0 = B3B2B3B2B1B0B1B0 */
		"   punpcklbw %%mm2, %%mm2	\n" /* mm2 = B1B1B0B0B1B1B0B0 */
		"   punpckhbw %%mm0, %%mm0	\n" /* mm0 = B3B3B2B2B3B3B2B2 */
		"   movq      %%mm1, %%mm4	\n" /* mm4 = G3G2G1G0R3R2R1R0 */
		"   movq      %%mm1, %%mm5	\n" /* mm5 = G3G2G1G0R3R2R1R0 */
		"   psrlq     $32, %%mm4	\n" /* mm4 = ________G3G2G1G0 */
		"   punpcklbw %%mm4, %%mm5	\n" /* mm5 = G3R3G2R2G1R1G0R0 */
		"   movq      %%mm5, %%mm6	\n" /* mm6 = G3R3G2R2G1R1G0R0 */
		"   movq      %%mm5, %%mm7	\n" /* mm7 = G3R3G2R2G1R1G0R0 */
		"   punpcklwd %%mm2, %%mm6	\n" /* mm6 = B1B1G1R1B0B0G0R0 */
		"   punpckhwd %%mm0, %%mm7	\n" /* mm7 = B3B3G3R3B2B2G2R2 */

		/* Finally */
		"   movq      %%mm6, (%%edi)	\n"
		"   movq      %%mm7, 8(%%edi)	\n"
		"   addl      $16, %%edi	\n"
		"   addl      $8, %%esi		\n"
		"   decl      %%ecx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (uyvy_constants),
					  "g" (uyvy_buf),	"g" (rgb_buf),
					  "r" (width),		"i" (PRECISION_BITS)
		/* clobber registers; "memory" because the buffers are read and
		   written behind the compiler's back */
					: "cc", "memory", "ecx", "edx", "esi", "edi");
		kernel_fpu_end ();
	} else
#endif
	{
		unsigned char *uyvy = (unsigned char *) uyvy_buf;
		u32 *rgb = (u32 *) rgb_buf;
		int r, g, b, n;
		int y[4], u[4], v[4];

		width >>= 2;
		/* "> 0" keeps a negative width from looping */
		while (width-- > 0) {
			/* U and V are shared by two neighbouring pixels */
			u[0] = u[1] = *uyvy++ - 128;
			y[0] = *uyvy++ << PRECISION_BITS;
			v[0] = v[1] = *uyvy++ - 128;
			y[1] = *uyvy++ << PRECISION_BITS;
			u[2] = u[3] = *uyvy++ - 128;
			y[2] = *uyvy++ << PRECISION_BITS;
			v[2] = v[3] = *uyvy++ - 128;
			y[3] = *uyvy++ << PRECISION_BITS;

			for (n = 0; n < 4; n++) {
				r = (u[n] * RED_U_SCALE_I + y[n]) >> PRECISION_BITS;
				r = __minmax (r, 0, 255);

				g = (y[n] - v[n] * GREEN_V_SCALE_I - u[n] * GREEN_U_SCALE_I) >> PRECISION_BITS;
				g = __minmax (g, 0, 255);

				b = (v[n] * BLUE_V_SCALE_I + y[n]) >> PRECISION_BITS;
				b = __minmax (b, 0, 255);

				*rgb++ = 0xFF000000 | (b << 16) | (g << 8) | r;
			}
		}
	}
}

/* Convert the video decoder's native UYVY color format into YUV422, i.e.
   swap the two bytes of every 16 bit unit (U0 Y0 V0 Y1 -> Y0 U0 Y1 V0).

   uyvy_buf - source pixels in UYVY byte order
   yuv_buf  - destination, same size as the source
   width    - number of pixels; pixels are converted in pairs, an odd last
	      pixel and non-positive widths are ignored

   The MMX loop handles blocks of 8 pixels; whatever is left over is
   converted by the byte-wise C loop, which needs neither aligned buffers
   nor a particular host byte order. */
static void rivatv_UYVY_to_YUV422 (char *uyvy_buf, char *yuv_buf, int width)
{
	const unsigned char *uyvy = (const unsigned char *) uyvy_buf;
	unsigned char *yuv = (unsigned char *) yuv_buf;
	unsigned char u, y0, v, y1;
	int pairs;

#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/* The asm loop tests its counter only after the first pass; entered
	   with a zero count it would wrap around and overrun both buffers. */
	if (isMMX && width >= 8) {
		int done = width & ~7;

		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %0, %%esi		\n"
		"   movl      %1, %%edi		\n"
		"   movl      %2, %%edx		\n"
		"   shrl      $3, %%edx		\n"

		/* Process 8 pixels at once */
		"1: movq      (%%esi), %%mm0	\n" /* mm0 = Y3V2Y2U2Y1V0Y0U0 */
		"   movq      8(%%esi), %%mm2	\n" /* mm2 = Y7V6Y6U6Y5V4Y4U4 */
		"   movq      %%mm0, %%mm1	\n" /* mm1 = Y3V2Y2U2Y1V0Y0U0 */
		"   movq      %%mm2, %%mm3	\n" /* mm3 = Y7V6Y6U6Y5V4Y4U4 */
		"   psllw     $8, %%mm0		\n" /* mm0 = V2__U2__V0__U0__ */
		"   psrlw     $8, %%mm1		\n" /* mm1 = __Y3__Y2__Y1__Y0 */
		"   psllw     $8, %%mm2		\n" /* mm2 = V6__U6__V4__U4__ */
		"   psrlw     $8, %%mm3		\n" /* mm3 = __Y7__Y6__Y5__Y4 */
		"   por	      %%mm1, %%mm0	\n" /* mm0 = V2Y3U2Y2V0Y1U0Y0 */
		"   por	      %%mm3, %%mm2	\n" /* mm2 = V6Y7U6Y6V4Y5U4Y4 */
		"   movq      %%mm0, (%%edi)	\n"
		"   movq      %%mm2, 8(%%edi)	\n"
		"   addl      $16, %%esi	\n"
		"   addl      $16, %%edi	\n"
		"   decl      %%edx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (uyvy_buf), "g" (yuv_buf), "g" (width)
		/* clobber registers; "memory" because the buffers are read and
		   written behind the compiler's back */
					: "cc", "memory", "edx", "esi", "edi");
		kernel_fpu_end ();

		/* 2 bytes per pixel on both sides; continue behind the part
		   the MMX loop has finished */
		uyvy += done << 1;
		yuv += done << 1;
		width -= done;
	}
#endif

	for (pairs = width >> 1; pairs > 0; pairs--) {
		/* read the whole group before writing so that converting a
		   buffer in place keeps working */
		u = uyvy[0];
		y0 = uyvy[1];
		v = uyvy[2];
		y1 = uyvy[3];

		yuv[0] = y0;
		yuv[1] = u;
		yuv[2] = y1;
		yuv[3] = v;

		uyvy += 4;
		yuv += 4;
	}
}

/* Convert the video decoders native UYVY color format into YUYV. */
#define rivatv_UYVY_to_YUYV rivatv_UYVY_to_YUV422

/* Convert the video decoder's native UYVY color format into YUV411.

   Every 4 source pixels (U0 Y0 V0 Y1 U2 Y2 V2 Y3) become 6 bytes:
   avg(U0,U2) Y0 Y1 avg(V0,V2) Y2 Y3.

   uyvy_buf - source pixels in UYVY byte order
   yuv_buf  - destination, receives 6 bytes per 4 converted pixels
   width    - number of pixels; only complete groups of 4 are converted,
	      values below 4 (including negative ones) convert nothing

   The MMX loop handles blocks of 8 pixels; whatever is left over is
   converted by the C loop. */
static void rivatv_UYVY_to_YUV411 (char *uyvy_buf, char *yuv_buf, int width)
{
	const unsigned char *uyvy = (const unsigned char *) uyvy_buf;
	unsigned char *yuv = (unsigned char *) yuv_buf;

#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/* The asm loop tests its counter only after the first pass; entered
	   with a zero count it would wrap around and overrun both buffers. */
	if (isMMX && width >= 8) {
		int done = width & ~7;

		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %0, %%esi		\n"
		"   movl      %1, %%edi		\n"
		"   movl      %3, %%edx		\n"
		"   movl      %2, %%ecx		\n"
		"   shrl      $3, %%ecx		\n"

		/* Process 8 pixels at once */
		"1: movq (%%esi), %%mm0		\n" /* mm0 = Y3V2Y2U2Y1V0Y0U0 */
		"   movq 8(%%esi), %%mm1	\n" /* mm1 = Y7V6Y6U6Y5V4Y4U4 */

		/* Process Y values */
		"   movq %%mm0, %%mm2		\n" /* mm2 = Y3V2Y2U2Y1V0Y0U0 */
		"   movq %%mm1, %%mm3		\n" /* mm3 = Y7V6Y6U6Y5V4Y4U4 */
		"   psrlw $8, %%mm2		\n" /* mm2 = __Y3__Y2__Y1__Y0 */
		"   psrlw $8, %%mm3		\n" /* mm3 = __Y7__Y6__Y5__Y4 */
		"   pxor %%mm6, %%mm6		\n" /* mm6 = ________________ */
		"   pxor %%mm7, %%mm7		\n" /* mm7 = ________________ */
		"   packuswb %%mm6, %%mm2	\n" /* mm2 = ________Y3Y2Y1Y0 */
		"   packuswb %%mm7, %%mm3	\n" /* mm3 = ________Y7Y6Y5Y4 */
		"   punpcklwd %%mm6, %%mm2	\n" /* mm2 = ____Y3Y2____Y1Y0 */
		"   punpcklwd %%mm7, %%mm3	\n" /* mm3 = ____Y7Y6____Y5Y4 */
		"   movq %%mm2, %%mm4		\n" /* mm4 = ____Y3Y2____Y1Y0 */
		"   movq %%mm3, %%mm5		\n" /* mm5 = ____Y7Y6____Y5Y4 */
		"   psllq $8, %%mm4		\n" /* mm4 = __Y3Y2____Y1Y0__ */
		"   psllq $8, %%mm5		\n" /* mm5 = __Y7Y6____Y5Y4__ */
		"   punpckldq %%mm6, %%mm4	\n" /* mm4 = __________Y1Y0__ */
		"   punpckldq %%mm7, %%mm5	\n" /* mm5 = __________Y5Y4__ */
		"   punpckhdq %%mm2, %%mm6	\n" /* mm6 = ____Y3Y2________ */
		"   punpckhdq %%mm3, %%mm7	\n" /* mm7 = ____Y7Y6________ */
		"   por %%mm6, %%mm4		\n" /* mm4 = ____Y3Y2__Y1Y0__ */
		"   por %%mm7, %%mm5		\n" /* mm5 = ____Y7Y6__Y5Y4__ */

		/* Average U0+U2, V0+V2, U4+U6 and V4+V6 */
		"   pand 0(%%edx), %%mm0	\n" /* mm0 = __V2__U2__V0__U0 */
		"   pand 0(%%edx), %%mm1	\n" /* mm1 = __V6__U6__V4__U4 */
		"   movq %%mm0, %%mm6		\n" /* mm6 = __V2__U2__V0__U0 */
		"   movq %%mm1, %%mm7		\n" /* mm7 = __V6__U6__V4__U4 */
		"   psrlq $32, %%mm6		\n" /* mm6 = __________V2__U2 */
		"   psrlq $32, %%mm7		\n" /* mm7 = __________V6__U6 */
		"   paddw %%mm6, %%mm0		\n" /* mm0 = __________V0__U0 */
		"   paddw %%mm7, %%mm1		\n" /* mm1 = __________V4__U4 */
		"   psrlw $1, %%mm0		\n" /* mm0 = __________V0__U0 */
		"   psrlw $1, %%mm1		\n" /* mm1 = __________V4__U4 */

		/* Put alltogether */
		"   movq %%mm0, %%mm6		\n" /* mm6 = __________V0__U0 */
		"   movq %%mm1, %%mm7		\n" /* mm7 = __________V4__U4 */
		"   psllq $8, %%mm6		\n" /* mm6 = ________V0__U0__ */
		"   psllq $8, %%mm7		\n" /* mm7 = ________V4__U4__ */
		"   por %%mm6, %%mm0		\n" /* mm0 = ________V0V0U0U0 */
		"   por %%mm7, %%mm1		\n" /* mm1 = ________V4V4U4U4 */
		"   pand 40(%%edx), %%mm0	\n" /* mm0 = ________V0____U0 */
		"   pand 40(%%edx), %%mm1	\n" /* mm1 = ________V4____U4 */
		"   por %%mm4, %%mm0		\n" /* mm0 = ____Y3Y2V0Y1Y0U0 */
		"   por %%mm5, %%mm1		\n" /* mm1 = ____Y7Y6V4Y5Y4U4 */

		/* Prepare pixels for storage */
		"   movq %%mm1, %%mm2		\n" /* mm2 = ____Y7Y6V4Y5Y4U4 */
		"   movq %%mm1, %%mm3		\n" /* mm3 = ____Y7Y6V4Y5Y4U4 */
		"   psllq $48, %%mm2		\n" /* mm2 = Y4U4____________ */
		"   psrlq $16, %%mm3		\n" /* mm3 = ________Y7Y6V4Y5 */
		"   por %%mm2, %%mm0		\n" /* mm0 = Y4U4Y3Y2V0Y1Y0U0 */

		"   movd %%mm3, 8(%%edi)	\n"
		"   movq %%mm0, (%%edi)		\n"
		"   add $16, %%esi		\n"
		"   add $12, %%edi		\n"
		"   dec %%ecx			\n"
		"   jnz 1b			\n"
		"   emms			\n"

		/* output */		:
		/* input */		: "g" (uyvy_buf), "g" (yuv_buf), "g" (width), "g" (uyvy_constants)
		/* clobber registers; "memory" because the buffers are read and
		   written behind the compiler's back */
					: "cc", "memory", "ecx", "edx", "esi", "edi");
		kernel_fpu_end ();

		/* 16 source bytes became 12 output bytes per block of 8
		   pixels; continue behind the finished part */
		uyvy += done << 1;
		yuv += (done >> 3) * 12;
		width -= done;
	}
#endif

	/* does not get any faster with good old i386 asm code */
	width >>= 2;
	/* "> 0" keeps a negative width from looping */
	while (width-- > 0) {
		*yuv++ = (uyvy[0] + uyvy[4]) >> 1;
		*yuv++ = uyvy[1];
		*yuv++ = uyvy[3];
		*yuv++ = (uyvy[2] + uyvy[6]) >> 1;
		*yuv++ = uyvy[5];
		*yuv++ = uyvy[7];
		uyvy += 8;
	}
}

/* Convert the video decoder's native UYVY color format into GREY by
   keeping only the Y byte of every pixel.

   uyvy_buf - source pixels in UYVY byte order
   grey_buf - destination, receives one byte per converted pixel
   width    - number of pixels; only complete groups of 4 are converted,
	      values below 4 (including negative ones) convert nothing

   The MMX loop handles blocks of 8 pixels; whatever is left over is
   converted by the C loop.  The C loop works on single bytes, so its
   result does not depend on the byte order of the host or on the
   alignment of the buffers. */
static void rivatv_UYVY_to_GREY (char *uyvy_buf, char *grey_buf, int width)
{
	const unsigned char *uyvy = (const unsigned char *) uyvy_buf;
	unsigned char *grey = (unsigned char *) grey_buf;
	int quads;

#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/* The asm loop tests its counter only after the first pass; entered
	   with a zero count it would wrap around and overrun both buffers. */
	if (isMMX && width >= 8) {
		int done = width & ~7;

		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %0, %%esi		\n"
		"   movl      %1, %%edi		\n"
		"   movl      %2, %%edx		\n"
		"   shrl      $3, %%edx		\n"
		"1: movq      (%%esi), %%mm0	\n" /* mm0 = Y3V2Y2U2Y1V0Y0U0 */
		"   movq      8(%%esi), %%mm1	\n" /* mm1 = Y7V6Y6U6Y5V4Y4U4 */
		"   psrlw     $8, %%mm0		\n" /* mm0 = __Y3__Y2__Y1__Y0 */
		"   psrlw     $8, %%mm1		\n" /* mm1 = __Y7__Y6__Y5__Y4 */
		"   packuswb  %%mm1, %%mm0	\n" /* mm0 = Y7Y6Y5Y4Y3Y2Y1Y0 */
		"   movq      %%mm0, (%%edi)	\n"
		"   addl      $16, %%esi	\n"
		"   addl      $8, %%edi		\n"
		"   decl      %%edx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (uyvy_buf), "g" (grey_buf), "g" (width)
		/* clobber registers; "memory" because the buffers are read and
		   written behind the compiler's back */
					: "cc", "memory", "edx", "esi", "edi");
		kernel_fpu_end ();

		/* 2 source bytes and 1 output byte per pixel; continue
		   behind the finished part */
		uyvy += done << 1;
		grey += done;
		width -= done;
	}
#endif

	for (quads = width >> 2; quads > 0; quads--) {
		/* the Y values sit at the odd byte positions */
		grey[0] = uyvy[1];
		grey[1] = uyvy[3];
		grey[2] = uyvy[5];
		grey[3] = uyvy[7];
		uyvy += 8;
		grey += 4;
	}
}

#endif /* RIVATV_DISABLE_CONVERSION */

/* Plain copy for 16 bit pixel formats.

   src   - source pixels
   dst   - destination, receives width * 2 bytes
   width - number of pixels; nothing is copied when it is not positive

   The MMX loop moves 8 bytes (4 pixels) per pass; the bytes it does not
   reach are copied with memcpy() so that both code paths transfer
   exactly width * 2 bytes. */
static void rivatv_copy_BPP2 (char *src, char *dst, int width)
{
	int bytes, done;

	if (width <= 0)
		return;

	bytes = width << 1;
	done = 0;

#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/* The asm loop tests its counter only after the first pass; entered
	   with a zero count it would wrap around and overrun both buffers. */
	if (isMMX && width >= 4) {
		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %0, %%esi		\n"
		"   movl      %1, %%edi		\n"
		"   movl      %2, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		"1: movq      (%%esi), %%mm0	\n"
		"   movq      %%mm0, (%%edi)	\n"
		"   addl      $8, %%esi		\n"
		"   addl      $8, %%edi		\n"
		"   decl      %%ecx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (src), "g" (dst), "g" (width)
		/* clobber registers; "memory" because the buffers are read and
		   written behind the compiler's back */
					: "cc", "memory", "ecx", "esi", "edi");
		kernel_fpu_end ();

		/* number of bytes the loop above has transferred */
		done = (width >> 2) << 3;
	}
#endif

	if (bytes > done)
		memcpy (dst + done, src + done, bytes - done);
}

/* Plain copy for 24 bit pixel formats.

   src   - source pixels
   dst   - destination, receives width * 3 bytes
   width - number of pixels; nothing is copied when it is not positive

   The MMX loop moves 8 bytes per pass and runs width / 4 + width / 8
   times, which covers all width * 3 bytes only when width is a multiple
   of 8.  The bytes it does not reach are copied with memcpy() so that
   both code paths transfer exactly width * 3 bytes. */
static void rivatv_copy_BPP3 (char *src, char *dst, int width)
{
	int bytes, done;

	if (width <= 0)
		return;

	bytes = width * 3;
	done = 0;

#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/* The asm loop tests its counter only after the first pass; entered
	   with a zero count it would wrap around and overrun both buffers. */
	if (isMMX && width >= 4) {
		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %0, %%esi		\n"
		"   movl      %1, %%edi		\n"
		"   movl      %2, %%ecx		\n"
		"   movl      %%ecx, %%eax	\n"
		"   shrl      $2, %%ecx		\n"
		"   shrl      $3, %%eax		\n"
		"   addl      %%eax, %%ecx	\n" /* ecx = width/4 + width/8 */
		"1: movq      (%%esi), %%mm0	\n"
		"   movq      %%mm0, (%%edi)	\n"
		"   addl      $8, %%esi		\n"
		"   addl      $8, %%edi		\n"
		"   decl      %%ecx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (src), "g" (dst), "g" (width)
		/* clobber registers; "memory" because the buffers are read and
		   written behind the compiler's back */
					: "cc", "memory", "eax", "ecx", "esi", "edi");
		kernel_fpu_end ();

		/* number of bytes the loop above has transferred; never more
		   than width * 3 */
		done = ((width >> 2) + (width >> 3)) << 3;
	}
#endif

	if (bytes > done)
		memcpy (dst + done, src + done, bytes - done);
}

#ifndef RIVATV_DISABLE_CONVERSION

/* Convert the video decoder's native UYVY color format into YUV422P
   (planar): all Y bytes first, then all U bytes, then all V bytes.

   src    - source pixels in UYVY byte order, width * height pixels
   dst    - destination; the Y plane (width * height bytes) starts at dst,
	    the U plane follows it and the V plane follows the U plane,
	    each chroma plane being width * height / 2 bytes long
   width  - pixels per line
   height - number of lines

   The MMX loop handles blocks of 16 pixels; whatever is left over is
   converted pixel pair by pixel pair in the C loop. */
static void rivatv_UYVY_to_YUV422P (char *src, char *dst, int width, int height)
{
	unsigned char *dstY, *dstU, *dstV;
	unsigned int size = width * height;

	dstY = (unsigned char *) dst;
	dstU = dstY + size;
	dstV = dstU + (size >> 1);
	/* from here on size counts pixel pairs (4 source bytes each) */
	size >>= 1;

#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/* The asm loop tests its counter only after the first pass; entered
	   with a zero count it would wrap around and overrun the buffers. */
	if (isMMX && size >= 8) {
		unsigned int done = size & ~7U;

		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %0, %%esi		\n"
		"   movl      %1, %%edi		\n"
		"   movl      %2, %%ebx		\n"
		"   movl      %3, %%edx		\n"
		"   movl      %4, %%ecx		\n"
		"   shrl      $3, %%ecx		\n"
		/* Process 16 pixels at once. */
		"1: movq      (%%esi), %%mm0	\n" /* mm0 = Y03V02Y02U02Y01V00Y00U00 */
		"   movq      8(%%esi), %%mm1	\n" /* mm1 = Y07V06Y06U06Y05V04Y04U04 */
		"   movq      16(%%esi), %%mm4	\n" /* mm4 = Y11V10Y10U10Y09V08Y08U08 */
		"   movq      24(%%esi), %%mm5	\n" /* mm5 = Y15V14Y14U14Y13V12Y12U12 */
		"   movq      %%mm0, %%mm2	\n" /* mm2 = Y03V02Y02U02Y01V00Y00U00 */
		"   movq      %%mm1, %%mm3	\n" /* mm3 = Y07V06Y06U06Y05V04Y04U04 */
		"   movq      %%mm4, %%mm6	\n" /* mm6 = Y11V10Y10U10Y09V08Y08U08 */
		"   movq      %%mm5, %%mm7	\n" /* mm7 = Y15V14Y14U14Y13V12Y12U12 */
		"   psrlw     $8, %%mm0		\n" /* mm0 = ___Y03___Y02___Y01___Y00 */
		"   psrlw     $8, %%mm1		\n" /* mm1 = ___Y07___Y06___Y05___Y04 */
		"   psrlw     $8, %%mm4		\n" /* mm4 = ___Y11___Y10___Y09___Y08 */
		"   psrlw     $8, %%mm5		\n" /* mm5 = ___Y15___Y14___Y13___Y12 */
		"   psllw     $8, %%mm2		\n" /* mm2 = V02___U02___V00___U00___ */
		"   psllw     $8, %%mm3		\n" /* mm3 = V06___U06___V04___U04___ */
		"   psllw     $8, %%mm6		\n" /* mm6 = V10___U10___V08___U08___ */
		"   psllw     $8, %%mm7		\n" /* mm7 = V14___U14___V12___U12___ */
		"   psrlw     $8, %%mm2		\n" /* mm2 = ___V02___U02___V00___U00 */
		"   psrlw     $8, %%mm3		\n" /* mm3 = ___V06___U06___V04___U04 */
		"   psrlw     $8, %%mm6		\n" /* mm6 = ___V10___U10___V08___U08 */
		"   psrlw     $8, %%mm7		\n" /* mm7 = ___V14___U14___V12___U12 */
		"   packuswb  %%mm3, %%mm2	\n" /* mm2 = V06U06V04U04V02U02V00U00 */
		"   packuswb  %%mm7, %%mm6	\n" /* mm6 = V14U14V12U12V10U10V08U08 */
		"   packuswb  %%mm1, %%mm0	\n" /* mm0 = Y07Y06Y05Y04Y03Y02Y01Y00 */
		"   packuswb  %%mm5, %%mm4	\n" /* mm4 = Y15Y14Y13Y12Y11Y10Y09Y08 */
		"   movq      %%mm2, %%mm3	\n" /* mm3 = V06U06V04U04V02U02V00U00 */
		"   movq      %%mm6, %%mm7	\n" /* mm7 = V14U14V12U12V10U10V08U08 */
		/* Store Y pixels. */
		"   movq      %%mm0, (%%edi)	\n"
		"   movq      %%mm4, 8(%%edi)	\n"
		"   psllw     $8, %%mm2		\n" /* mm2 = U06___U04___U02___U00___ */
		"   psllw     $8, %%mm6		\n" /* mm6 = U14___U12___U10___U08___ */
		"   psrlw     $8, %%mm3		\n" /* mm3 = ___V06___V04___V02___V00 */
		"   psrlw     $8, %%mm7		\n" /* mm7 = ___V14___V12___V10___V08 */
		"   psrlw     $8, %%mm2		\n" /* mm2 = ___U06___U04___U02___U00 */
		"   psrlw     $8, %%mm6		\n" /* mm6 = ___U14___U12___U10___U08 */
		"   packuswb  %%mm7, %%mm3	\n" /* mm3 = V14V12V10V08V06V04V02V00 */
		"   packuswb  %%mm6, %%mm2	\n" /* mm2 = U14U12U10U08U06U04U02U00 */
		/* Store V pixels. */
		"   movq      %%mm3, (%%edx)	\n"
		/* Store U pixels. */
		"   movq      %%mm2, (%%ebx)	\n"
		"   addl      $16, %%edi	\n"
		"   addl      $32, %%esi	\n"
		"   addl      $8, %%ebx		\n"
		"   addl      $8, %%edx		\n"
		"   decl      %%ecx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (src), "g" (dstY), "g" (dstU), "g" (dstV), "g" (size)
		/* clobber registers; "memory" because the buffers are read and
		   written behind the compiler's back */
					: "cc", "memory", "ebx", "ecx", "edx", "esi", "edi");
		kernel_fpu_end ();

		/* step over the pairs the MMX loop has finished: 4 source
		   bytes, 2 Y bytes, 1 U byte and 1 V byte per pair */
		src += done << 2;
		dstY += done << 1;
		dstU += done;
		dstV += done;
		size -= done;
	}
#endif

	while (size--) {
		*dstU++ = *src++;
		*dstY++ = *src++;
		*dstV++ = *src++;
		*dstY++ = *src++;
	}
}

/*
 * Convert the video decoder's native packed UYVY format into planar YUV411P.
 *
 * src:    packed UYVY input, two bytes per pixel (byte order U Y V Y for
 *         every pair of pixels).
 * dst:    output buffer; receives width * height luma bytes, directly
 *         followed by the U plane and then the V plane, each holding
 *         (width * height) / 4 bytes.
 * width,
 * height: image dimensions in pixels.  Nothing is written when either one
 *         is zero or negative.
 *
 * One U and one V sample is kept for every group of four pixels; the chroma
 * of the second pixel pair of each group is dropped, not averaged.
 */
static void rivatv_UYVY_to_YUV411P (char *src, char *dst, int width, int height)
{
	unsigned char *dstY, *dstU, *dstV;
	unsigned int size;

	/* A negative dimension would turn into a huge unsigned pixel count. */
	if (width <= 0 || height <= 0)
		return;

	size = (unsigned int) width * (unsigned int) height;
	dstY = (unsigned char *) dst;
	dstU = dstY + size;
	dstV = dstU + (size >> 2);
	size >>= 2;	/* from here on: number of 4-pixel groups */

#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/*
	 * The MMX loop converts 16 pixels (four groups) per pass and always
	 * executes at least one pass, so it is only used when the group count
	 * is a non-zero multiple of four.  Any other size takes the scalar
	 * path, which would otherwise be overrun (count of zero) or leave the
	 * tail of the image unconverted.
	 *
	 * NOTE(review): the pointers are loaded with 32-bit movl into
	 * %esi/%edi/%ebx/%edx; this looks unsuitable for __x86_64__ builds --
	 * confirm before enabling the asm there.
	 */
	if (isMMX && size != 0 && (size & 3) == 0) {
		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %0, %%esi		\n"
		"   movl      %1, %%edi		\n"
		"   movl      %2, %%ebx		\n"
		"   movl      %3, %%edx		\n"
		"   movl      %4, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		/* Process 16 pixels at once. */
		"1: movq      (%%esi), %%mm0	\n" /* mm0 = Y03V02Y02U02Y01V00Y00U00 */
		"   movq      8(%%esi), %%mm1	\n" /* mm1 = Y07V06Y06U06Y05V04Y04U04 */
		"   movq      16(%%esi), %%mm4	\n" /* mm4 = Y11V10Y10U10Y09V08Y08U08 */
		"   movq      24(%%esi), %%mm5	\n" /* mm5 = Y15V14Y14U14Y13V12Y12U12 */
		"   movq      %%mm0, %%mm2	\n" /* mm2 = Y03V02Y02U02Y01V00Y00U00 */
		"   movq      %%mm1, %%mm3	\n" /* mm3 = Y07V06Y06U06Y05V04Y04U04 */
		"   movq      %%mm4, %%mm6	\n" /* mm6 = Y11V10Y10U10Y09V08Y08U08 */
		"   movq      %%mm5, %%mm7	\n" /* mm7 = Y15V14Y14U14Y13V12Y12U12 */
		"   psrlw     $8, %%mm0		\n" /* mm0 = ___Y03___Y02___Y01___Y00 */
		"   psrlw     $8, %%mm1		\n" /* mm1 = ___Y07___Y06___Y05___Y04 */
		"   psrlw     $8, %%mm4		\n" /* mm4 = ___Y11___Y10___Y09___Y08 */
		"   psrlw     $8, %%mm5		\n" /* mm5 = ___Y15___Y14___Y13___Y12 */
		"   punpcklbw %%mm3, %%mm2	\n" /* mm2 = Y05Y01V04V00Y04Y00U04U00 */
		"   punpcklbw %%mm7, %%mm6	\n" /* mm6 = Y13Y09V12V08Y12Y08U12U08 */
		"   movq      %%mm2, %%mm3	\n" /* mm3 = Y05Y01V04V00Y04Y00U04U00 */
		"   movq      %%mm6, %%mm7	\n" /* mm7 = Y13Y09V12V08Y12Y08U12U08 */
		"   psrlq     $32, %%mm3	\n" /* mm3 = ____________Y05Y01V04V00 */
		"   psrlq     $32, %%mm7	\n" /* mm7 = ____________Y13Y09V12V08 */
		"   punpcklwd %%mm6, %%mm2	\n" /* mm2 = Y12Y08Y04Y00U12U08U04U00 */
		"   punpcklwd %%mm7, %%mm3	\n" /* mm3 = Y13Y09Y05Y01V12V08V04V00 */
		"   packuswb  %%mm1, %%mm0	\n" /* mm0 = Y07Y06Y05Y04Y03Y02Y01Y00 */
		"   packuswb  %%mm5, %%mm4	\n" /* mm4 = Y15Y14Y13Y12Y11Y10Y09Y08 */
		"   movq      %%mm0, (%%edi)	\n" /* store Y pixels */
		"   movq      %%mm4, 8(%%edi)	\n"
		"   movd      %%mm3, (%%edx)	\n" /* store V pixels */
		"   movd      %%mm2, (%%ebx)	\n" /* store U pixels */
		"   addl      $16, %%edi	\n"
		"   addl      $32, %%esi	\n"
		"   addl      $4, %%ebx		\n"
		"   addl      $4, %%edx		\n"
		"   decl      %%ecx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (src), "g" (dstY), "g" (dstU), "g" (dstV), "g" (size)
		/* The asm writes the destination planes, hence "memory". */
		/* clobber registers */ : "cc", "memory", "ebx", "ecx", "edx", "esi", "edi");
		kernel_fpu_end ();
	} else
#endif
	{
		/* Scalar path: one pass per group of four pixels (8 source bytes). */
		while (size--) {
			*dstU++ = src[0];
			*dstV++ = src[2];
			*dstY++ = src[1];
			*dstY++ = src[3];
			*dstY++ = src[5];
			*dstY++ = src[7];
			src += 8;
		}
	}
}

/*
 * Convert the video decoder's native packed UYVY format into planar YUV420P.
 *
 * src:    packed UYVY input, two bytes per pixel (byte order U Y V Y for
 *         every pair of pixels), lines stored back to back.
 * dst:    output buffer; receives width * height luma bytes, directly
 *         followed by the U plane and then the V plane, each holding
 *         (width * height) / 4 bytes.
 * width,
 * height: image dimensions in pixels.  Nothing is written when either one
 *         is zero or negative.
 *
 * Lines are handled in pairs.  Chroma is taken from the even line of each
 * pair only; the chroma of the odd line is skipped, no averaging is done.
 */
static void rivatv_UYVY_to_YUV420P (char *src, char *dst, int width, int height)
{
	unsigned char *dstY, *dstU, *dstV;
	unsigned int c, size;

	/* A negative dimension would turn into a huge unsigned pixel count. */
	if (width <= 0 || height <= 0)
		return;

	size = (unsigned int) width * (unsigned int) height;
	dstY = (unsigned char *) dst;
	dstU = dstY + size;
	dstV = dstU + (size >> 2);
	height >>= 1;	/* number of line pairs */
	width >>= 1;	/* number of pixel pairs per line */

#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/*
	 * The MMX loops convert 8 pixels (four pixel pairs) per pass and always
	 * execute at least one pass, both per line and per line pair.  They are
	 * therefore only used when there is at least one line pair and the
	 * pixel-pair count is a non-zero multiple of four; everything else
	 * takes the scalar path.
	 *
	 * NOTE(review): the pointers are loaded with 32-bit movl into
	 * %esi/%edi/%ebx/%eax; this looks unsuitable for __x86_64__ builds --
	 * confirm before enabling the asm there.
	 */
	if (isMMX && height != 0 && width != 0 && (width & 3) == 0) {
		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %0, %%esi		\n"
		"   movl      %1, %%edi		\n"
		"   movl      %2, %%ebx		\n"
		"   movl      %3, %%eax		\n"
		"   movl      %5, %%edx		\n"
		/* Process 8 pixels at once. */
		"1: movl      %4, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		/* Even lines here. */
		"2: movq      (%%esi), %%mm0	\n" /* mm0 = Y3V2Y2U2Y1V0Y0U0 */
		"   movq      8(%%esi), %%mm1	\n" /* mm1 = Y7V6Y6U6Y5V4Y4U4 */
		"   movq      %%mm0, %%mm2	\n" /* mm2 = Y3V2Y2U2Y1V0Y0U0 */
		"   movq      %%mm1, %%mm3	\n" /* mm3 = Y7V6Y6U6Y5V4Y4U4 */
		"   psrlw     $8, %%mm0		\n" /* mm0 = __Y3__Y2__Y1__Y0 */
		"   psrlw     $8, %%mm1		\n" /* mm1 = __Y7__Y6__Y5__Y4 */
		"   psllw     $8, %%mm2		\n" /* mm2 = V2__U2__V0__U0__ */
		"   psllw     $8, %%mm3		\n" /* mm3 = V6__U6__V4__U4__ */
		"   psrlw     $8, %%mm2		\n" /* mm2 = __V2__U2__V0__U0 */
		"   psrlw     $8, %%mm3		\n" /* mm3 = __V6__U6__V4__U4 */
		"   packuswb  %%mm1, %%mm0	\n" /* mm0 = Y7Y6Y5Y4Y3Y2Y1Y0 */
		"   packuswb  %%mm3, %%mm2	\n" /* mm2 = V6U6V4U4V2U2V0U0 */
		"   movq      %%mm0, (%%edi)	\n" /* store Y pixels */
		"   movq      %%mm2, %%mm3	\n" /* mm3 = V6U6V4U4V2U2V0U0 */
		"   psllw     $8, %%mm2		\n" /* mm2 = U6__U4__U2__U0__ */
		"   psrlw     $8, %%mm3		\n" /* mm3 = __V6__V4__V2__V0 */
		"   psrlw     $8, %%mm2		\n" /* mm2 = __U6__U4__U2__U0 */
		"   packuswb  %%mm3, %%mm3	\n" /* mm3 = V6V4V2V0V6V4V2V0 */
		"   packuswb  %%mm2, %%mm2	\n" /* mm2 = U6U4U2U0U6U4U2U0 */
		"   movd      %%mm3, (%%eax)	\n" /* store V pixels */
		"   movd      %%mm2, (%%ebx)	\n" /* store U pixels */
		"   addl      $16, %%esi	\n"
		"   addl      $8, %%edi		\n"
		"   addl      $4, %%eax		\n"
		"   addl      $4, %%ebx		\n"
		"   decl      %%ecx		\n"
		"   jnz	      2b		\n"
		"   movl      %4, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		/* Odd lines here. */
		"3: movq      (%%esi), %%mm0	\n" /* mm0 = Y3V2Y2U2Y1V0Y0U0 */
		"   movq      8(%%esi), %%mm1	\n" /* mm1 = Y7V6Y6U6Y5V4Y4U4 */
		"   psrlw     $8, %%mm0		\n" /* mm0 = __Y3__Y2__Y1__Y0 */
		"   psrlw     $8, %%mm1		\n" /* mm1 = __Y7__Y6__Y5__Y4 */
		"   packuswb  %%mm1, %%mm0	\n" /* mm0 = Y7Y6Y5Y4Y3Y2Y1Y0 */
		"   movq      %%mm0, (%%edi)	\n" /* store Y pixels */
		"   addl      $16, %%esi	\n"
		"   addl      $8, %%edi		\n"
		"   decl      %%ecx		\n"
		"   jnz	      3b		\n"
		"   decl      %%edx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (src), "g" (dstY), "g" (dstU), "g" (dstV), "g" (width), "g" (height)
		/* The asm writes the destination planes, hence "memory". */
		/* clobber registers */ : "cc", "memory", "eax", "ebx", "ecx", "edx", "esi", "edi");
		kernel_fpu_end ();
	} else
#endif
	{
		/* Scalar path: one pass of the outer loop per pair of lines. */
		while (height--) {
			/* even line: keep chroma and luma */
			for (c = width; c; c--) {
				*dstU++ = src[0];
				*dstY++ = src[1];
				*dstV++ = src[2];
				*dstY++ = src[3];
				src += 4;
			}
			/* odd line: luma only, its chroma is discarded */
			for (c = width; c; c--) {
				*dstY++ = src[1];
				*dstY++ = src[3];
				src += 4;
			}
		}
	}
}

/*
 * Convert the video decoder's native packed UYVY format into planar YUV410P.
 *
 * src:    packed UYVY input, two bytes per pixel (byte order U Y V Y for
 *         every pair of pixels), lines stored back to back.
 * dst:    output buffer; receives width * height luma bytes, directly
 *         followed by the U plane and then the V plane, each holding
 *         (width * height) / 16 bytes.
 * width,
 * height: image dimensions in pixels.  Nothing is written when either one
 *         is zero or negative.
 *
 * Lines are handled in groups of four.  One U and one V sample is kept per
 * group of four pixels of the first line of each line group; all other
 * chroma is dropped, not averaged.
 */
static void rivatv_UYVY_to_YUV410P (char *src, char *dst, int width, int height)
{
	unsigned char *dstY, *dstU, *dstV;
	unsigned int c, size;

	/* A negative dimension would turn into a huge unsigned pixel count. */
	if (width <= 0 || height <= 0)
		return;

	size = (unsigned int) width * (unsigned int) height;
	dstY = (unsigned char *) dst;
	dstU = dstY + size;
	dstV = dstU + (size >> 4);
	height >>= 2;	/* number of 4-line groups */
	width >>= 2;	/* number of 4-pixel groups per line */

#if (defined (__i386__) || defined (__x86_64__)) && RIVATV_ENABLE_ASM
	/*
	 * The MMX loops convert 16 pixels (four pixel groups) per pass and
	 * always execute at least one pass, both per line and per line group.
	 * They are therefore only used when there is at least one line group
	 * and the pixel-group count is a non-zero multiple of four; everything
	 * else takes the scalar path.
	 *
	 * NOTE(review): the pointers are loaded with 32-bit movl into
	 * %esi/%edi/%ebx/%eax; this looks unsuitable for __x86_64__ builds --
	 * confirm before enabling the asm there.
	 */
	if (isMMX && height != 0 && width != 0 && (width & 3) == 0) {
		kernel_fpu_begin ();
		__asm__ __volatile__(
		"   movl      %0, %%esi		\n"
		"   movl      %1, %%edi		\n"
		"   movl      %2, %%ebx		\n"
		"   movl      %3, %%eax		\n"
		"   movl      %5, %%edx		\n"
		/* Process 16 pixels at once. */
		"1: movl      %4, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		/* First line here. */
		"2: movq      (%%esi), %%mm0	\n" /* mm0 = Y03V02Y02U02Y01V00Y00U00 */
		"   movq      8(%%esi), %%mm1	\n" /* mm1 = Y07V06Y06U06Y05V04Y04U04 */
		"   movq      16(%%esi), %%mm2	\n" /* mm2 = Y11V10Y10U10Y09V08Y08U08 */
		"   movq      24(%%esi), %%mm3	\n" /* mm3 = Y15V14Y14U14Y13V12Y12U12 */
		"   movq      %%mm0, %%mm4	\n" /* mm4 = Y03V02Y02U02Y01V00Y00U00 */
		"   movq      %%mm1, %%mm5	\n" /* mm5 = Y07V06Y06U06Y05V04Y04U04 */
		"   movq      %%mm2, %%mm6	\n" /* mm6 = Y11V10Y10U10Y09V08Y08U08 */
		"   movq      %%mm3, %%mm7	\n" /* mm7 = Y15V14Y14U14Y13V12Y12U12 */
		"   psrlw     $8, %%mm0		\n" /* mm0 = ___Y03___Y02___Y01___Y00 */
		"   psrlw     $8, %%mm1		\n" /* mm1 = ___Y07___Y06___Y05___Y04 */
		"   psrlw     $8, %%mm2		\n" /* mm2 = ___Y11___Y10___Y09___Y08 */
		"   psrlw     $8, %%mm3		\n" /* mm3 = ___Y15___Y14___Y13___Y12 */
		"   punpcklbw %%mm5, %%mm4	\n" /* mm4 = Y05Y01V04V00Y04Y00U04U00 */
		"   punpcklbw %%mm7, %%mm6	\n" /* mm6 = Y13Y09V12V08Y12Y08U12U08 */
		"   movq      %%mm4, %%mm5	\n" /* mm5 = Y05Y01V04V00Y04Y00U04U00 */
		"   movq      %%mm6, %%mm7	\n" /* mm7 = Y13Y09V12V08Y12Y08U12U08 */
		"   psrlq     $32, %%mm5	\n" /* mm5 = ____________Y05Y01V04V00 */
		"   psrlq     $32, %%mm7	\n" /* mm7 = ____________Y13Y09V12V08 */
		"   punpcklwd %%mm6, %%mm4	\n" /* mm4 = Y12Y08Y04Y00U12U08U04U00 */
		"   punpcklwd %%mm7, %%mm5	\n" /* mm5 = Y13Y09Y05Y01V12V08V04V00 */
		"   packuswb  %%mm1, %%mm0	\n" /* mm0 = Y07Y06Y05Y04Y03Y02Y01Y00 */
		"   packuswb  %%mm3, %%mm2	\n" /* mm2 = Y15Y14Y13Y12Y11Y10Y09Y08 */
		"   movq      %%mm0, (%%edi)	\n" /* store Y pixels */
		"   movq      %%mm2, 8(%%edi)	\n"
		"   movd      %%mm5, (%%eax)	\n" /* store V pixels */
		"   movd      %%mm4, (%%ebx)	\n" /* store U pixels */
		"   addl      $32, %%esi	\n"
		"   addl      $16, %%edi	\n"
		"   addl      $4, %%eax		\n"
		"   addl      $4, %%ebx		\n"
		"   decl      %%ecx		\n"
		"   jnz	      2b		\n"
		"   movl      %4, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		/* Second line here. */
		"3: movq      (%%esi), %%mm0	\n" /* mm0 = Y03V02Y02U02Y01V00Y00U00 */
		"   movq      8(%%esi), %%mm1	\n" /* mm1 = Y07V06Y06U06Y05V04Y04U04 */
		"   movq      16(%%esi), %%mm2	\n" /* mm2 = Y11V10Y10U10Y09V08Y08U08 */
		"   movq      24(%%esi), %%mm3	\n" /* mm3 = Y15V14Y14U14Y13V12Y12U12 */
		"   psrlw     $8, %%mm0		\n" /* mm0 = ___Y03___Y02___Y01___Y00 */
		"   psrlw     $8, %%mm1		\n" /* mm1 = ___Y07___Y06___Y05___Y04 */
		"   psrlw     $8, %%mm2		\n" /* mm2 = ___Y11___Y10___Y09___Y08 */
		"   psrlw     $8, %%mm3		\n" /* mm3 = ___Y15___Y14___Y13___Y12 */
		"   packuswb  %%mm1, %%mm0	\n" /* mm0 = Y07Y06Y05Y04Y03Y02Y01Y00 */
		"   packuswb  %%mm3, %%mm2	\n" /* mm2 = Y15Y14Y13Y12Y11Y10Y09Y08 */
		"   movq      %%mm0, (%%edi)	\n" /* store Y pixels */
		"   movq      %%mm2, 8(%%edi)	\n"
		"   addl      $32, %%esi	\n"
		"   addl      $16, %%edi	\n"
		"   decl      %%ecx		\n"
		"   jnz	      3b		\n"
		"   movl      %4, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		/* Third line here. */
		"4: movq      (%%esi), %%mm0	\n" /* mm0 = Y03V02Y02U02Y01V00Y00U00 */
		"   movq      8(%%esi), %%mm1	\n" /* mm1 = Y07V06Y06U06Y05V04Y04U04 */
		"   movq      16(%%esi), %%mm2	\n" /* mm2 = Y11V10Y10U10Y09V08Y08U08 */
		"   movq      24(%%esi), %%mm3	\n" /* mm3 = Y15V14Y14U14Y13V12Y12U12 */
		"   psrlw     $8, %%mm0		\n" /* mm0 = ___Y03___Y02___Y01___Y00 */
		"   psrlw     $8, %%mm1		\n" /* mm1 = ___Y07___Y06___Y05___Y04 */
		"   psrlw     $8, %%mm2		\n" /* mm2 = ___Y11___Y10___Y09___Y08 */
		"   psrlw     $8, %%mm3		\n" /* mm3 = ___Y15___Y14___Y13___Y12 */
		"   packuswb  %%mm1, %%mm0	\n" /* mm0 = Y07Y06Y05Y04Y03Y02Y01Y00 */
		"   packuswb  %%mm3, %%mm2	\n" /* mm2 = Y15Y14Y13Y12Y11Y10Y09Y08 */
		"   movq      %%mm0, (%%edi)	\n" /* store Y pixels */
		"   movq      %%mm2, 8(%%edi)	\n"
		"   addl      $32, %%esi	\n"
		"   addl      $16, %%edi	\n"
		"   decl      %%ecx		\n"
		"   jnz	      4b		\n"
		"   movl      %4, %%ecx		\n"
		"   shrl      $2, %%ecx		\n"
		/* Fourth line here. */
		"5: movq      (%%esi), %%mm0	\n" /* mm0 = Y03V02Y02U02Y01V00Y00U00 */
		"   movq      8(%%esi), %%mm1	\n" /* mm1 = Y07V06Y06U06Y05V04Y04U04 */
		"   movq      16(%%esi), %%mm2	\n" /* mm2 = Y11V10Y10U10Y09V08Y08U08 */
		"   movq      24(%%esi), %%mm3	\n" /* mm3 = Y15V14Y14U14Y13V12Y12U12 */
		"   psrlw     $8, %%mm0		\n" /* mm0 = ___Y03___Y02___Y01___Y00 */
		"   psrlw     $8, %%mm1		\n" /* mm1 = ___Y07___Y06___Y05___Y04 */
		"   psrlw     $8, %%mm2		\n" /* mm2 = ___Y11___Y10___Y09___Y08 */
		"   psrlw     $8, %%mm3		\n" /* mm3 = ___Y15___Y14___Y13___Y12 */
		"   packuswb  %%mm1, %%mm0	\n" /* mm0 = Y07Y06Y05Y04Y03Y02Y01Y00 */
		"   packuswb  %%mm3, %%mm2	\n" /* mm2 = Y15Y14Y13Y12Y11Y10Y09Y08 */
		"   movq      %%mm0, (%%edi)	\n" /* store Y pixels */
		"   movq      %%mm2, 8(%%edi)	\n"
		"   addl      $32, %%esi	\n"
		"   addl      $16, %%edi	\n"
		"   decl      %%ecx		\n"
		"   jnz	      5b		\n"
		"   decl      %%edx		\n"
		"   jnz	      1b		\n"
		"   emms			\n"
		/* output */		:
		/* input */		: "g" (src), "g" (dstY), "g" (dstU), "g" (dstV), "g" (width), "g" (height)
		/* The asm writes the destination planes, hence "memory". */
		/* clobber registers */ : "cc", "memory", "eax", "ebx", "ecx", "edx", "esi", "edi");
		kernel_fpu_end ();
	} else
#endif
	{
		/* Scalar path: one pass of the outer loop per group of four lines. */
		while (height--) {
			/* first line: keep one U/V sample and four Y per pixel group */
			for (c = width; c; c--) {
				*dstU++ = src[0];
				*dstV++ = src[2];
				*dstY++ = src[1];
				*dstY++ = src[3];
				*dstY++ = src[5];
				*dstY++ = src[7];
				src += 8;
			}
			/*
			 * second to fourth line: luma only.  The three lines
			 * are contiguous in both buffers, so a single loop
			 * over three lines' worth of pixel groups covers them.
			 */
			for (c = 3 * (unsigned int) width; c; c--) {
				*dstY++ = src[1];
				*dstY++ = src[3];
				*dstY++ = src[5];
				*dstY++ = src[7];
				src += 8;
			}
		}
	}
}

#endif /* RIVATV_DISABLE_CONVERSION */

/*
 * When conversion support is compiled out, the UYVY converter functions are
 * not defined.  Their names are mapped to NULL here so that the conversion
 * matrix below, which references them, still compiles and simply holds no
 * converter in those slots.
 */
#ifdef RIVATV_DISABLE_CONVERSION
#define rivatv_UYVY_to_GREY    NULL
#define rivatv_UYVY_to_RGB565  NULL
#define rivatv_UYVY_to_RGB32   NULL
#define rivatv_UYVY_to_RGB555  NULL
#define rivatv_UYVY_to_RGB24   NULL
#define rivatv_UYVY_to_YUV422  NULL
#define rivatv_UYVY_to_YUYV    NULL
#define rivatv_UYVY_to_YUV411  NULL
#define rivatv_UYVY_to_YUV422P NULL
#define rivatv_UYVY_to_YUV411P NULL
#define rivatv_UYVY_to_YUV420P NULL
#define rivatv_UYVY_to_YUV410P NULL
#endif /* RIVATV_DISABLE_CONVERSION */

/*
 * Image conversion matrix.
 *
 * One row per VIDEO_PALETTE_* number (0..16, the source format).  Each row
 * holds two numbers (presumably bits per pixel and a planar-format flag --
 * see struct rivatv_conversion to confirm), the format name, and an array of
 * converter functions indexed by the destination palette number.  Slots
 * that are not listed stay NULL, i.e. no conversion is available.
 */
struct rivatv_conversion rivatv_convert[17] = {
	[0]  = {  0, 0, "<invalid>", { } },
	[1]  = {  8, 0, "GREY", { } },		/* VIDEO_PALETTE_GREY    */
	[2]  = {  8, 0, "HI240", { } },		/* VIDEO_PALETTE_HI240   */
	[3]  = { 16, 0, "RGB565", {		/* VIDEO_PALETTE_RGB565  */
			[3]  = rivatv_copy_BPP2
	       } },
	[4]  = { 24, 0, "RGB24", {		/* VIDEO_PALETTE_RGB24   */
			[4]  = rivatv_copy_BPP3
	       } },
	[5]  = { 32, 0, "RGB32", { } },		/* VIDEO_PALETTE_RGB32   */
	[6]  = { 16, 0, "RGB555", { } },	/* VIDEO_PALETTE_RGB555  */
	[7]  = { 16, 0, "YUV422", {		/* VIDEO_PALETTE_YUV422  */
			[7]  = rivatv_copy_BPP2,
			[8]  = rivatv_copy_BPP2
	       } },
	[8]  = { 16, 0, "YUYV", {		/* VIDEO_PALETTE_YUYV    */
			[7]  = rivatv_copy_BPP2,
			[8]  = rivatv_copy_BPP2
	       } },
	[9]  = { 16, 0, "UYVY", {		/* VIDEO_PALETTE_UYVY    */
			[1]  = rivatv_UYVY_to_GREY,
			[3]  = rivatv_UYVY_to_RGB565,
			[4]  = rivatv_UYVY_to_RGB24,
			[5]  = rivatv_UYVY_to_RGB32,
			[6]  = rivatv_UYVY_to_RGB555,
			[7]  = rivatv_UYVY_to_YUV422,
			[8]  = rivatv_UYVY_to_YUYV,
			[9]  = rivatv_copy_BPP2,
			[11] = rivatv_UYVY_to_YUV411,
			[13] = rivatv_UYVY_to_YUV422P,
			[14] = rivatv_UYVY_to_YUV411P,
			[15] = rivatv_UYVY_to_YUV420P,
			[16] = rivatv_UYVY_to_YUV410P
	       } },
	[10] = { 12, 0, "YUV420", { } },	/* VIDEO_PALETTE_YUV420  */
	[11] = { 12, 0, "YUV411", { } },	/* VIDEO_PALETTE_YUV411  */
	[12] = {  0, 0, "RAW", { } },		/* VIDEO_PALETTE_RAW     */
	[13] = { 16, 1, "YUV422P", { } },	/* VIDEO_PALETTE_YUV422P */
	[14] = { 12, 1, "YUV411P", { } },	/* VIDEO_PALETTE_YUV411P */
	[15] = { 12, 1, "YUV420P", { } },	/* VIDEO_PALETTE_YUV420P */
	[16] = {  9, 1, "YUV410P", { } }	/* VIDEO_PALETTE_YUV410P */
};
