author | Michele Calgaro <[email protected]> | 2020-09-11 14:38:47 +0900 |
---|---|---|
committer | Michele Calgaro <[email protected]> | 2020-09-11 14:38:47 +0900 |
commit | 884c8093d63402a1ad0b502244b791e3c6782be3 (patch) | |
tree | a600d4ab0d431a2bdfe4c15b70df43c14fbd8dd0 /debian/transcode/transcode-1.1.7/aclib | |
parent | 14e1aa2006796f147f3f4811fb908a6b01e79253 (diff) | |
download | extra-dependencies-884c8093d63402a1ad0b502244b791e3c6782be3.tar.gz extra-dependencies-884c8093d63402a1ad0b502244b791e3c6782be3.zip |
Added debian extra dependency packages.
Signed-off-by: Michele Calgaro <[email protected]>
Diffstat (limited to 'debian/transcode/transcode-1.1.7/aclib')
18 files changed, 8672 insertions, 0 deletions
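The headers added below (ac.h) expose a small runtime-dispatch API: ac_init() selects and installs accelerated implementations, ac_cpuinfo() reports what the CPU supports, and ac_memcpy()/ac_average() are the accelerated entry points. A minimal caller sketch, assuming the aclib/ directory from this patch is on the include path (hypothetical example, not part of the patch itself):

```c
/* Hypothetical caller -- illustrates the ac.h interface added in this commit. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include "aclib/ac.h"   /* assumed include path relative to the transcode tree */

int main(void)
{
    /* Ask for every acceleration; ac_init() masks the request against
     * ac_cpuinfo() and returns 1 on success, 0 on failure. */
    if (!ac_init(AC_ALL)) {
        fprintf(stderr, "aclib initialization failed\n");
        return 1;
    }
    printf("accel: %s\n", ac_flagstotext(ac_cpuinfo()));

    /* Accelerated memcpy and byte-wise averaging, as declared in ac.h. */
    uint8_t a[64], b[64], avg[64], copy[64];
    memset(a, 100, sizeof(a));
    memset(b, 200, sizeof(b));
    ac_memcpy(copy, a, sizeof(copy));
    ac_average(a, b, avg, sizeof(avg));  /* avg[i] == (100 + 200 + 1) / 2 == 150 */
    return 0;
}
```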
diff --git a/debian/transcode/transcode-1.1.7/aclib/Makefile.am b/debian/transcode/transcode-1.1.7/aclib/Makefile.am new file mode 100644 index 00000000..54951ce6 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/Makefile.am @@ -0,0 +1,27 @@ +# # Process this file with automake to produce Makefile.in. + +AM_CPPFLAGS = \ + $(PTHREAD_CFLAGS) \ + -I$(top_srcdir) + +noinst_LTLIBRARIES = libac.la + +libac_la_SOURCES = \ + accore.c \ + average.c \ + imgconvert.c \ + img_rgb_packed.c \ + img_yuv_mixed.c \ + img_yuv_packed.c \ + img_yuv_planar.c \ + img_yuv_rgb.c \ + memcpy.c \ + rescale.c + +EXTRA_DIST = \ + ac.h \ + ac_internal.h \ + imgconvert.h \ + img_internal.h \ + img_x86_common.h \ + rgb-yuv-conv.pl diff --git a/debian/transcode/transcode-1.1.7/aclib/Makefile.in b/debian/transcode/transcode-1.1.7/aclib/Makefile.in new file mode 100644 index 00000000..8f3a132a --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/Makefile.in @@ -0,0 +1,610 @@ +# Makefile.in generated by automake 1.11.1 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, +# Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# # Process this file with automake to produce Makefile.in. + +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +subdir = aclib +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \ + $(top_srcdir)/configure.in +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libac_la_LIBADD = +am_libac_la_OBJECTS = accore.lo average.lo imgconvert.lo \ + img_rgb_packed.lo img_yuv_mixed.lo img_yuv_packed.lo \ + img_yuv_planar.lo img_yuv_rgb.lo memcpy.lo rescale.lo +libac_la_OBJECTS = $(am_libac_la_OBJECTS) +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/autotools/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(CCLD) 
$(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +SOURCES = $(libac_la_SOURCES) +DIST_SOURCES = $(libac_la_SOURCES) +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +A52_CFLAGS = @A52_CFLAGS@ +A52_LIBS = @A52_LIBS@ +ACLIB_LIBS = @ACLIB_LIBS@ +ACLOCAL = @ACLOCAL@ +ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@ +AMTAR = @AMTAR@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AVILIB_LIBS = @AVILIB_LIBS@ +AWK = @AWK@ +BSDAV_CFLAGS = @BSDAV_CFLAGS@ +BSDAV_LIBS = @BSDAV_LIBS@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXXCPP = @CXXCPP@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLDARWIN_CFLAGS = @DLDARWIN_CFLAGS@ +DLDARWIN_LIBS = @DLDARWIN_LIBS@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FAAC_CFLAGS = @FAAC_CFLAGS@ +FAAC_LIBS = @FAAC_LIBS@ +FGREP = @FGREP@ +FREETYPE2_CFLAGS = @FREETYPE2_CFLAGS@ +FREETYPE2_LIBS = @FREETYPE2_LIBS@ +GREP = @GREP@ +IBP_LIBS = @IBP_LIBS@ +ICONV_CFLAGS = @ICONV_CFLAGS@ +ICONV_LIBS = @ICONV_LIBS@ +IMAGEMAGICK_CFLAGS = @IMAGEMAGICK_CFLAGS@ +IMAGEMAGICK_LIBS = @IMAGEMAGICK_LIBS@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LAME_CFLAGS = @LAME_CFLAGS@ +LAME_LIBS = @LAME_LIBS@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBAVCODEC_CFLAGS = @LIBAVCODEC_CFLAGS@ +LIBAVCODEC_LIBS = @LIBAVCODEC_LIBS@ +LIBAVFORMAT_CFLAGS = @LIBAVFORMAT_CFLAGS@ +LIBAVFORMAT_LIBS = @LIBAVFORMAT_LIBS@ +LIBDVDREAD_CFLAGS = @LIBDVDREAD_CFLAGS@ +LIBDVDREAD_LIBS = @LIBDVDREAD_LIBS@ +LIBDV_CFLAGS = @LIBDV_CFLAGS@ +LIBDV_LIBS = @LIBDV_LIBS@ +LIBJPEG_CFLAGS = @LIBJPEG_CFLAGS@ +LIBJPEG_LIBS = @LIBJPEG_LIBS@ +LIBMPEG2CONVERT_CFLAGS = @LIBMPEG2CONVERT_CFLAGS@ +LIBMPEG2CONVERT_LIBS = @LIBMPEG2CONVERT_LIBS@ +LIBMPEG2_CFLAGS = @LIBMPEG2_CFLAGS@ +LIBMPEG2_LIBS = @LIBMPEG2_LIBS@ +LIBOBJS = @LIBOBJS@ +LIBPOSTPROC_CFLAGS = @LIBPOSTPROC_CFLAGS@ +LIBPOSTPROC_LIBS = @LIBPOSTPROC_LIBS@ +LIBQUICKTIME_CFLAGS = @LIBQUICKTIME_CFLAGS@ +LIBQUICKTIME_LIBS = @LIBQUICKTIME_LIBS@ +LIBS = @LIBS@ +LIBTCAUDIO_LIBS = @LIBTCAUDIO_LIBS@ +LIBTCVIDEO_LIBS = @LIBTCVIDEO_LIBS@ +LIBTC_LIBS = @LIBTC_LIBS@ +LIBTOOL = @LIBTOOL@ +LIBV4L2_CFLAGS = @LIBV4L2_CFLAGS@ +LIBV4L2_LIBS = @LIBV4L2_LIBS@ +LIBV4LCONVERT_CFLAGS = @LIBV4LCONVERT_CFLAGS@ +LIBV4LCONVERT_LIBS = @LIBV4LCONVERT_LIBS@ +LIBXML2_CFLAGS = @LIBXML2_CFLAGS@ +LIBXML2_LIBS = @LIBXML2_LIBS@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LZO_CFLAGS = @LZO_CFLAGS@ +LZO_LIBS = @LZO_LIBS@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MJPEGTOOLS_CFLAGS = @MJPEGTOOLS_CFLAGS@ +MJPEGTOOLS_LIBS = @MJPEGTOOLS_LIBS@ +MKDIR_P = @MKDIR_P@ +MOD_PATH = @MOD_PATH@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OGG_CFLAGS = @OGG_CFLAGS@ +OGG_LIBS = @OGG_LIBS@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PATH_TO_AWK = @PATH_TO_AWK@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +PROF_PATH 
= @PROF_PATH@ +PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ +PTHREAD_LIBS = @PTHREAD_LIBS@ +PVM3_CFLAGS = @PVM3_CFLAGS@ +PVM3_LIBS = @PVM3_LIBS@ +PVM3_PVMGS = @PVM3_PVMGS@ +RANLIB = @RANLIB@ +SDL_CFLAGS = @SDL_CFLAGS@ +SDL_LIBS = @SDL_LIBS@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +SIMD_FLAGS = @SIMD_FLAGS@ +STRIP = @STRIP@ +THEORA_CFLAGS = @THEORA_CFLAGS@ +THEORA_LIBS = @THEORA_LIBS@ +USE_DLDARWIN = @USE_DLDARWIN@ +VERSION = @VERSION@ +VORBIS_CFLAGS = @VORBIS_CFLAGS@ +VORBIS_LIBS = @VORBIS_LIBS@ +WAVLIB_LIBS = @WAVLIB_LIBS@ +X264_CFLAGS = @X264_CFLAGS@ +X264_LIBS = @X264_LIBS@ +XIO_CFLAGS = @XIO_CFLAGS@ +XIO_LIBS = @XIO_LIBS@ +XMKMF = @XMKMF@ +XVID_CFLAGS = @XVID_CFLAGS@ +XVID_LIBS = @XVID_LIBS@ +X_CFLAGS = @X_CFLAGS@ +X_EXTRA_LIBS = @X_EXTRA_LIBS@ +X_LIBS = @X_LIBS@ +X_PRE_LIBS = @X_PRE_LIBS@ +a52_config = @a52_config@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +bsdav_config = @bsdav_config@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +faac_config = @faac_config@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +iconv_config = @iconv_config@ +imagemagick_config = @imagemagick_config@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +lame_config = @lame_config@ +libdir = @libdir@ +libdvdread_config = @libdvdread_config@ +libexecdir = @libexecdir@ +libjpeg_config = @libjpeg_config@ +libjpegmmx_config = @libjpegmmx_config@ +localedir = @localedir@ +localstatedir = @localstatedir@ +lzo_config = @lzo_config@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +pvm3_config = @pvm3_config@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +x_includes = @x_includes@ +x_libraries = @x_libraries@ +xvid_config = @xvid_config@ +AM_CPPFLAGS = \ + $(PTHREAD_CFLAGS) \ + -I$(top_srcdir) + +noinst_LTLIBRARIES = libac.la +libac_la_SOURCES = \ + accore.c \ + average.c \ + imgconvert.c \ + img_rgb_packed.c \ + img_yuv_mixed.c \ + img_yuv_packed.c \ + img_yuv_planar.c \ + img_yuv_rgb.c \ + memcpy.c \ + rescale.c + +EXTRA_DIST = \ + ac.h \ + ac_internal.h \ + imgconvert.h \ + img_internal.h \ + img_x86_common.h \ + rgb-yuv-conv.pl + +all: all-am + +.SUFFIXES: +.SUFFIXES: .c .lo .o .obj +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu aclib/Makefile'; 
\ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu aclib/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; for p in $$list; do \ + dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ + test "$$dir" != "$$p" || dir=.; \ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +libac.la: $(libac_la_OBJECTS) $(libac_la_DEPENDENCIES) + $(LINK) $(libac_la_OBJECTS) $(libac_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/accore.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/average.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_rgb_packed.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_mixed.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_packed.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_planar.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_rgb.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/imgconvert.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcpy.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rescale.Plo@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if 
test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + set x; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . 
= "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-noinstLTLIBRARIES ctags distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ + pdf pdf-am ps ps-am tags uninstall uninstall-am + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/debian/transcode/transcode-1.1.7/aclib/ac.h b/debian/transcode/transcode-1.1.7/aclib/ac.h new file mode 100644 index 00000000..d2a542b2 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/ac.h @@ -0,0 +1,107 @@ +/* + * ac.h -- main aclib include + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#ifndef ACLIB_AC_H +#define ACLIB_AC_H + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <stddef.h> +#include <stdint.h> +#include <sys/types.h> + +/*************************************************************************/ + +/* CPU acceleration support flags, for use with ac_init(): */ + +#define AC_IA32ASM 0x0001 /* x86-32: standard assembly (no MMX) */ +#define AC_AMD64ASM 0x0002 /* x86-64: standard assembly (no MMX) */ +#define AC_CMOVE 0x0004 /* x86: CMOVcc instruction */ +#define AC_MMX 0x0008 /* x86: MMX instructions */ +#define AC_MMXEXT 0x0010 /* x86: MMX extended instructions (AMD) */ +#define AC_3DNOW 0x0020 /* x86: 3DNow! instructions (AMD) */ +#define AC_3DNOWEXT 0x0040 /* x86: 3DNow! 
instructions (AMD) */ +#define AC_SSE 0x0080 /* x86: SSE instructions */ +#define AC_SSE2 0x0100 /* x86: SSE2 instructions */ +#define AC_SSE3 0x0200 /* x86: SSE3 instructions */ +#define AC_SSSE3 0x0400 /* x86: SSSE3 instructions */ +#define AC_SSE41 0x0800 /* x86: SSE4.1 instructions */ +#define AC_SSE42 0x1000 /* x86: SSE4.2 instructions (Intel) */ +#define AC_SSE4A 0x2000 /* x86: SSE4a instructions (AMD) */ +#define AC_SSE5 0x4000 /* x86: SSE5 instructions (AMD) */ + +#define AC_NONE 0 /* No acceleration (vanilla C functions) */ +#define AC_ALL (~0) /* All available acceleration */ + + +/* Endianness flag: */ +#define AC_LITTLE_ENDIAN 1 +#define AC_BIG_ENDIAN 2 + +/*************************************************************************/ + +/* Library initialization function--MUST be called before any other aclib + * functions are used! `accel' selects the accelerations to enable: + * AC_NONE, AC_ALL, or a combination of the other AC_* flags above. The + * value will always be masked to the acceleration options available on the + * actual CPU, as returned by ac_cpuinfo(). Returns 1 on success, 0 on + * failure. This function can be called multiple times to change the set + * of acceleration features to be used. */ +extern int ac_init(int accel); + +/* Returns the set of acceleration features supported by this CPU. */ +extern int ac_cpuinfo(void); + +/* Returns the endianness of this CPU (AC_BIG_ENDIAN or AC_LITTLE_ENDIAN). */ +extern int ac_endian(void); + +/* Utility routine to convert a set of flags to a descriptive string. The + * string is stored in a static buffer overwritten each call. */ +extern const char *ac_flagstotext(int accel); + +/* Utility routine to parse a comma-separate descriptive string to the + corrisponding flag. The reverse of ac_flagstotext. + Returns 1 on success, 0 on failure */ +extern int ac_parseflags(const char *text, int *accel); + +/*************************************************************************/ + +/* Acceleration-enabled functions: */ + +/* Optimized memcpy(). The copy direction is guaranteed to be ascending + * (so ac_memcpy(ptr, ptr+1, size) will work). */ +extern void *ac_memcpy(void *dest, const void *src, size_t size); + +/* Average of two sets of data */ +extern void ac_average(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes); + +/* Weighted average of two sets of data (weight1+weight2 should be 65536) */ +extern void ac_rescale(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, + uint32_t weight1, uint32_t weight2); + +/* Image format manipulation is available in aclib/imgconvert.h */ + +/*************************************************************************/ + +#endif /* ACLIB_AC_H */ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/ac_internal.h b/debian/transcode/transcode-1.1.7/aclib/ac_internal.h new file mode 100644 index 00000000..67a9c59f --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/ac_internal.h @@ -0,0 +1,42 @@ +/* + * ac_internal.h -- internal include file for aclib functions + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. 
+ */ + +#ifndef ACLIB_AC_INTERNAL_H +#define ACLIB_AC_INTERNAL_H + + +/* Compiler hint that a condition is unlikely */ +#ifdef __GNUC__ +# define UNLIKELY(x) (__builtin_expect((x) != 0, 0)) +#else +# define UNLIKELY(x) (x) +#endif + +/* Are _all_ of the given acceleration flags (`test') available? */ +#define HAS_ACCEL(accel,test) (((accel) & (test)) == (test)) + +/* Initialization subfunctions */ +extern int ac_average_init(int accel); +extern int ac_imgconvert_init(int accel); +extern int ac_memcpy_init(int accel); +extern int ac_rescale_init(int accel); + + +#endif /* ACLIB_AC_INTERNAL_H */ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/accore.c b/debian/transcode/transcode-1.1.7/aclib/accore.c new file mode 100644 index 00000000..ec7ea2dd --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/accore.c @@ -0,0 +1,320 @@ +/* + * accore.c -- core aclib functions + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "ac_internal.h" +#include "imgconvert.h" + +#include <stdio.h> +#include <string.h> + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static int cpuinfo_x86(void); +#endif + +/*************************************************************************/ + +/* Library initialization function. Determines CPU features, then calls + * all initialization subfunctions with appropriate flags. Returns 1 on + * success, 0 on failure. This function can be called multiple times to + * change the set of acceleration features to be used. */ + +int ac_init(int accel) +{ + accel &= ac_cpuinfo(); + if (!ac_average_init(accel) + || !ac_imgconvert_init(accel) + || !ac_memcpy_init(accel) + || !ac_rescale_init(accel) + ) { + return 0; + } + return 1; +} + +/*************************************************************************/ + +/* Returns the set of acceleration features supported by this CPU. */ + +int ac_cpuinfo(void) +{ +#if defined(ARCH_X86) || defined(ARCH_X86_64) + return cpuinfo_x86(); +#else + return 0; +#endif +} + +/*************************************************************************/ + +/* Returns the endianness of this CPU (AC_BIG_ENDIAN or AC_LITTLE_ENDIAN). */ + +int ac_endian(void) +{ + volatile int test; + + test = 1; + if (*((uint8_t *)&test)) + return AC_LITTLE_ENDIAN; + else + return AC_BIG_ENDIAN; +} + +/*************************************************************************/ + +/* Utility routine to convert a set of flags to a descriptive string. The + * string is stored in a static buffer overwritten each call. `filter' + * selects whether to filter out flags not supported by the CPU. */ + +const char *ac_flagstotext(int accel) +{ + static char retbuf[1000]; + if (!accel) + return "none"; + snprintf(retbuf, sizeof(retbuf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + accel & AC_SSE5 ? " sse5" : "", + accel & AC_SSE4A ? " sse4a" : "", + accel & AC_SSE42 ? " sse42" : "", + accel & AC_SSE41 ? " sse41" : "", + accel & AC_SSSE3 ? " ssse3" : "", + accel & AC_SSE3 ? " sse3" : "", + accel & AC_SSE2 ? " sse2" : "", + accel & AC_SSE ? " sse" : "", + accel & AC_3DNOWEXT ? " 3dnowext" : "", + accel & AC_3DNOW ? 
" 3dnow" : "", + accel & AC_MMXEXT ? " mmxext" : "", + accel & AC_MMX ? " mmx" : "", + accel & AC_CMOVE ? " cmove" : "", + accel & (AC_IA32ASM|AC_AMD64ASM) ? " asm" : ""); + return *retbuf ? retbuf+1 : retbuf; /* skip initial space */ +} + +/* Utility routine to parse a comma-separate descriptive string to the + corrisponding flag. The reverse of ac_flagstotext. + Returns 1 on success, 0 on failure */ + +#define AC_FLAG_LEN 16 + +int ac_parseflags(const char *text, int *accel) +{ + int parsed = 1, done = 0; + if (!text || !accel) + return 0; +#if defined(ARCH_X86) || defined(ARCH_X86_64) + *accel = 0; + + while (parsed && !done) { + char buf[AC_FLAG_LEN + 1] = { '\0' }; + const char *comma = strchr(text, ','); + if (!comma) { + strncpy(buf, text, AC_FLAG_LEN); + done = 1; + } else { + /* parse the remaining and exit*/ + size_t len = (comma - text); + if (len > AC_FLAG_LEN) + len = AC_FLAG_LEN; + strncpy(buf, text, len); + } +//fprintf(stderr, "(%s) buf=[%s]\n", __func__, buf); + if (strcasecmp(buf, "C") == 0) // dummy for "no accel" + *accel |= 0; +#ifdef ARCH_X86 + else if (strcasecmp(buf, "asm" ) == 0) + *accel |= AC_IA32ASM; +#endif +#ifdef ARCH_X86_64 + else if (strcasecmp(buf, "asm" ) == 0) + *accel |= AC_AMD64ASM; +#endif + else if (strcasecmp(buf, "mmx" ) == 0) + *accel |= AC_MMX; + else if (strcasecmp(buf, "mmxext" ) == 0) + *accel |= AC_MMXEXT; + else if (strcasecmp(buf, "3dnow" ) == 0) + *accel |= AC_3DNOW; + else if (strcasecmp(buf, "3dnowext") == 0) + *accel |= AC_3DNOWEXT; + else if (strcasecmp(buf, "sse" ) == 0) + *accel |= AC_SSE; + else if (strcasecmp(buf, "sse2" ) == 0) + *accel |= AC_SSE2; + else if (strcasecmp(buf, "sse3" ) == 0) + *accel |= AC_SSE3; + else if (strcasecmp(buf, "ssse3" ) == 0) + *accel |= AC_SSSE3; + else if (strcasecmp(buf, "sse41" ) == 0) + *accel |= AC_SSE41; + else if (strcasecmp(buf, "sse42" ) == 0) + *accel |= AC_SSE42; + else if (strcasecmp(buf, "sse4a" ) == 0) + *accel |= AC_SSE4A; + else if (strcasecmp(buf, "sse5" ) == 0) + *accel |= AC_SSE5; + else + parsed = 0; + text = comma + 1; + } +#endif + return parsed; +} + +#undef AC_FLAG_LEN + +/*************************************************************************/ +/*************************************************************************/ + +/* Private functions to return acceleration flags corresponding to available + * CPU features for various CPUs. Currently only x86 is supported. */ + +/*************************************************************************/ + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + +#ifdef ARCH_X86_64 +# define EAX "%%rax" +# define EBX "%%rbx" +# define ESI "%%rsi" +# define PUSHF "pushfq" +# define POPF "popfq" +#else +# define EAX "%%eax" +# define EBX "%%ebx" +# define ESI "%%esi" +# define PUSHF "pushfl" +# define POPF "popfl" +#endif + +/* Macro to execute the CPUID instruction with EAX = func. Results are + * placed in ret_a (EAX), ret_b (EBX), ret_c (ECX), and ret_d (EDX), which + * must be lvalues. Note that we save and restore EBX (RBX on x86-64) + * because it is the PIC register. */ +#define CPUID(func,ret_a,ret_b,ret_c,ret_d) \ + asm("mov "EBX", "ESI"; cpuid; xchg "EBX", "ESI \ + : "=a" (ret_a), "=S" (ret_b), "=c" (ret_c), "=d" (ret_d) \ + : "a" (func)) + +/* Various CPUID flags. The second word of the macro name indicates the + * function (1: function 1, X1: function 0x80000001) and register (D: EDX) + * to which the value belongs. 
*/ +#define CPUID_1D_CMOVE (1UL<<15) +#define CPUID_1D_MMX (1UL<<23) +#define CPUID_1D_SSE (1UL<<25) +#define CPUID_1D_SSE2 (1UL<<26) +#define CPUID_1C_SSE3 (1UL<< 0) +#define CPUID_1C_SSSE3 (1UL<< 9) +#define CPUID_1C_SSE41 (1UL<<19) +#define CPUID_1C_SSE42 (1UL<<20) +#define CPUID_X1D_AMD_MMXEXT (1UL<<22) /* AMD only */ +#define CPUID_X1D_AMD_3DNOW (1UL<<31) /* AMD only */ +#define CPUID_X1D_AMD_3DNOWEXT (1UL<<30) /* AMD only */ +#define CPUID_X1D_CYRIX_MMXEXT (1UL<<24) /* Cyrix only */ +#define CPUID_X1C_AMD_SSE4A (1UL<< 6) /* AMD only */ +#define CPUID_X1C_AMD_SSE5 (1UL<<11) /* AMD only */ + +static int cpuinfo_x86(void) +{ + uint32_t eax, ebx, ecx, edx; + uint32_t cpuid_max, cpuid_ext_max; /* Maximum CPUID function numbers */ + union { + char string[13]; + struct { uint32_t ebx, edx, ecx; } regs; + } cpu_vendor; /* 12-byte CPU vendor string + trailing null */ + uint32_t cpuid_1D, cpuid_1C, cpuid_X1C, cpuid_X1D; + int accel; + + /* First see if the CPUID instruction is even available. We try to + * toggle bit 21 (ID) of the flags register; if the bit changes, then + * CPUID is available. */ + asm(PUSHF" \n\ + pop "EAX" \n\ + mov %%eax, %%edx \n\ + xor $0x200000, %%eax \n\ + push "EAX" \n\ + "POPF" \n\ + "PUSHF" \n\ + pop "EAX" \n\ + xor %%edx, %%eax" + : "=a" (eax) : : "edx"); + if (!eax) + return 0; + + /* Determine the maximum function number available, and save the vendor + * string */ + CPUID(0, cpuid_max, ebx, ecx, edx); + cpu_vendor.regs.ebx = ebx; + cpu_vendor.regs.ecx = ecx; + cpu_vendor.regs.edx = edx; + cpu_vendor.string[12] = 0; + cpuid_ext_max = 0; /* FIXME: how do early CPUs respond to 0x80000000? */ + CPUID(0x80000000, cpuid_ext_max, ebx, ecx, edx); + + /* Read available features */ + cpuid_1D = cpuid_1C = cpuid_X1C = cpuid_X1D = 0; + if (cpuid_max >= 1) + CPUID(1, eax, ebx, cpuid_1C, cpuid_1D); + if (cpuid_ext_max >= 0x80000001) + CPUID(0x80000001, eax, ebx, cpuid_X1C, cpuid_X1D); + + /* Convert to acceleration flags */ +#ifdef ARCH_X86_64 + accel = AC_AMD64ASM; /* but not IA32! (register size issues) */ +#else + accel = AC_IA32ASM; +#endif + if (cpuid_1D & CPUID_1D_CMOVE) + accel |= AC_CMOVE; + if (cpuid_1D & CPUID_1D_MMX) + accel |= AC_MMX; + if (cpuid_1D & CPUID_1D_SSE) + accel |= AC_SSE; + if (cpuid_1D & CPUID_1D_SSE2) + accel |= AC_SSE2; + if (cpuid_1C & CPUID_1C_SSE3) + accel |= AC_SSE3; + if (cpuid_1C & CPUID_1C_SSSE3) + accel |= AC_SSSE3; + if (cpuid_1C & CPUID_1C_SSE41) + accel |= AC_SSE41; + if (cpuid_1C & CPUID_1C_SSE42) + accel |= AC_SSE42; + if (strcmp(cpu_vendor.string, "AuthenticAMD") == 0) { + if (cpuid_X1D & CPUID_X1D_AMD_MMXEXT) + accel |= AC_MMXEXT; + if (cpuid_X1D & CPUID_X1D_AMD_3DNOW) + accel |= AC_3DNOW; + if (cpuid_X1D & CPUID_X1D_AMD_3DNOWEXT) + accel |= AC_3DNOWEXT; + if (cpuid_X1C & CPUID_X1C_AMD_SSE4A) + accel |= AC_SSE4A; + if (cpuid_X1C & CPUID_X1C_AMD_SSE5) + accel |= AC_SSE5; + } else if (strcmp(cpu_vendor.string, "CyrixInstead") == 0) { + if (cpuid_X1D & CPUID_X1D_CYRIX_MMXEXT) + accel |= AC_MMXEXT; + } + + /* And return */ + return accel; +} + +#endif /* ARCH_X86 || ARCH_X86_64 */ + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . 
*)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/average.c b/debian/transcode/transcode-1.1.7/aclib/average.c new file mode 100644 index 00000000..517102e6 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/average.c @@ -0,0 +1,243 @@ +/* + * average.c -- average two sets of byte data + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "ac_internal.h" + +static void average(const uint8_t *, const uint8_t *, uint8_t *, int); +static void (*average_ptr)(const uint8_t *, const uint8_t *, uint8_t *, int) + = average; + +/*************************************************************************/ + +/* External interface */ + +void ac_average(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes) +{ + (*average_ptr)(src1, src2, dest, bytes); +} + +/*************************************************************************/ +/*************************************************************************/ + +/* Vanilla C version */ + +static void average(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes) +{ + int i; + for (i = 0; i < bytes; i++) + dest[i] = (src1[i]+src2[i]+1) / 2; +} + +/*************************************************************************/ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */ + +static void average_mmx(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes) +{ + if (bytes >= 8) { + asm("\ + pxor %%mm7, %%mm7 \n\ + movq %%mm7, %%mm6 \n\ + pcmpeqw %%mm5, %%mm5 \n\ + psubw %%mm5, %%mm6 # Put 0x0001*4 in MM6 \n\ + 0: \n\ + movq -8(%%esi,%%eax), %%mm0 \n\ + movq %%mm0, %%mm1 \n\ + punpcklbw %%mm7, %%mm0 \n\ + punpckhbw %%mm7, %%mm1 \n\ + movq -8(%%edx,%%eax), %%mm2 \n\ + movq %%mm2, %%mm3 \n\ + punpcklbw %%mm7, %%mm2 \n\ + punpckhbw %%mm7, %%mm3 \n\ + paddw %%mm2, %%mm0 \n\ + paddw %%mm6, %%mm0 \n\ + psrlw $1, %%mm0 \n\ + paddw %%mm3, %%mm1 \n\ + paddw %%mm6, %%mm1 \n\ + psrlw $1, %%mm1 \n\ + packuswb %%mm1, %%mm0 \n\ + movq %%mm0, -8(%%edi,%%eax) \n\ + subl $8, %%eax \n\ + jnz 0b \n\ + emms" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "a" (bytes & ~7)); + } + if (UNLIKELY(bytes & 7)) { + average(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7), + bytes & 7); + } +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE) && defined(ARCH_X86) + +/* SSE has PAVGB */ + +static void average_sse(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes) +{ + if (bytes >= 8) { + asm("\ + testl $~0x1F, %%eax \n\ + jz 1f \n\ + 0: \n\ + movq -32(%%esi,%%eax), %%mm0 \n\ + movq -24(%%esi,%%eax), %%mm1 \n\ + movq -16(%%esi,%%eax), %%mm2 \n\ + movq -8(%%esi,%%eax), %%mm3 \n\ + movq -32(%%edx,%%eax), %%mm4 \n\ + pavgb %%mm4, %%mm0 \n\ + movq -24(%%edx,%%eax), %%mm5 \n\ + pavgb %%mm5, %%mm1 \n\ + movq -16(%%edx,%%eax), %%mm6 \n\ + pavgb %%mm6, %%mm2 \n\ + movq -8(%%edx,%%eax), %%mm7 \n\ + pavgb %%mm7, %%mm3 \n\ + movntq %%mm0, -32(%%edi,%%eax) \n\ + movntq %%mm1, -24(%%edi,%%eax) \n\ + movntq %%mm2, -16(%%edi,%%eax) \n\ + movntq %%mm3, -8(%%edi,%%eax) \n\ + subl $32, %%eax \n\ + testl $~0x1F, %%eax \n\ + jnz 0b \n\ + testl %%eax, %%eax \n\ + jz 
2f \n\ + 1: \n\ + movq -8(%%esi,%%eax), %%mm0 \n\ + movq -8(%%edx,%%eax), %%mm1 \n\ + pavgb %%mm1, %%mm0 \n\ + movntq %%mm0, -8(%%edi,%%eax) \n\ + subl $8, %%eax \n\ + jnz 1b \n\ + 2: \n\ + emms \n\ + sfence" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "a" (bytes & ~7)); + } + if (UNLIKELY(bytes & 7)) { + average(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7), + bytes & 7); + } +} + +#endif /* HAVE_ASM_SSE && ARCH_X86 */ + +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE2) + +#if defined(ARCH_X86_64) +# define EAX "%%rax" +# define EDX "%%rdx" +# define ESI "%%rsi" +# define EDI "%%rdi" +#else +# define EAX "%%eax" +# define EDX "%%edx" +# define ESI "%%esi" +# define EDI "%%edi" +#endif + +static void average_sse2(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes) +{ + if (bytes >= 8) { + asm("\ + testl $~0x3F, %%eax \n\ + jz 1f \n\ + 0: \n\ + movdqu -64("ESI","EAX"), %%xmm0 \n\ + movdqu -48("ESI","EAX"), %%xmm1 \n\ + movdqu -32("ESI","EAX"), %%xmm2 \n\ + movdqu -16("ESI","EAX"), %%xmm3 \n\ + movdqu -64("EDX","EAX"), %%xmm4 \n\ + pavgb %%xmm4, %%xmm0 \n\ + movdqu -48("EDX","EAX"), %%xmm5 \n\ + pavgb %%xmm5, %%xmm1 \n\ + movdqu -32("EDX","EAX"), %%xmm6 \n\ + pavgb %%xmm6, %%xmm2 \n\ + movdqu -16("EDX","EAX"), %%xmm7 \n\ + pavgb %%xmm7, %%xmm3 \n\ + # Note that movntdq requires 16-byte alignment, which we're \n\ + # not guaranteed \n\ + movdqu %%xmm0, -64("EDI","EAX") \n\ + movdqu %%xmm1, -48("EDI","EAX") \n\ + movdqu %%xmm2, -32("EDI","EAX") \n\ + movdqu %%xmm3, -16("EDI","EAX") \n\ + subl $64, %%eax \n\ + testl $~0x3F, %%eax \n\ + jnz 0b \n\ + testl %%eax, %%eax \n\ + jz 2f \n\ + 1: \n\ + movq -8("ESI","EAX"), %%mm0 \n\ + movq -8("EDX","EAX"), %%mm1 \n\ + pavgb %%mm1, %%mm0 \n\ + movq %%mm0, -8("EDI","EAX") \n\ + subl $8, %%eax \n\ + jnz 1b \n\ + 2: \n\ + emms" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "a" (bytes & ~7)); + } + if (UNLIKELY(bytes & 7)) { + average(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7), + bytes & 7); + } +} + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization routine. */ + +int ac_average_init(int accel) +{ + average_ptr = average; + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + if (HAS_ACCEL(accel, AC_MMX)) + average_ptr = average_mmx; +#endif +#if defined(HAVE_ASM_SSE) && defined(ARCH_X86) + if (HAS_ACCEL(accel, AC_SSE)) + average_ptr = average_sse; +#endif +#if defined(HAVE_ASM_SSE2) + if (HAS_ACCEL(accel, AC_SSE2)) + average_ptr = average_sse2; +#endif + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_internal.h b/debian/transcode/transcode-1.1.7/aclib/img_internal.h new file mode 100644 index 00000000..153a2fb6 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_internal.h @@ -0,0 +1,40 @@ +/* + * img_internal.h - imgconvert internal use header + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. 
+ * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#ifndef ACLIB_IMG_INTERNAL_H +#define ACLIB_IMG_INTERNAL_H + +/* Type of a conversion function */ +typedef int (*ConversionFunc)(uint8_t **src, uint8_t **dest, + int width, int height); + +/* Function to register a conversion */ +extern int register_conversion(ImageFormat srcfmt, ImageFormat destfmt, + ConversionFunc function); + +/* Initialization routines */ +extern int ac_imgconvert_init(int accel); +extern int ac_imgconvert_init_yuv_planar(int accel); +extern int ac_imgconvert_init_yuv_packed(int accel); +extern int ac_imgconvert_init_yuv_mixed(int accel); +extern int ac_imgconvert_init_yuv_rgb(int accel); +extern int ac_imgconvert_init_rgb_packed(int accel); + +#endif /* ACLIB_IMG_INTERNAL_H */ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_rgb_packed.c b/debian/transcode/transcode-1.1.7/aclib/img_rgb_packed.c new file mode 100644 index 00000000..e6d5bf35 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_rgb_packed.c @@ -0,0 +1,1106 @@ +/* + * img_rgb_packed.c - RGB packed image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "imgconvert.h" +#include "img_internal.h" + +/*************************************************************************/ +/*************************************************************************/ + +/* Standard C implementations */ + +/*************************************************************************/ + +/* Identity transformations, all work when src==dest */ + +static int rgb_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height*3); + return 1; +} + +static int rgba_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height*4); + return 1; +} + +static int gray8_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + return 1; +} + +/*************************************************************************/ + +/* Conversions between various 32-bit formats, all usable when src==dest */ + +/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */ +static int rgba_swapall(uint8_t **src, uint8_t **dest, int width, int height) +{ + uint32_t *srcp = (uint32_t *)src[0]; + uint32_t *destp = (uint32_t *)dest[0]; + int i; + for (i = 0; i < width*height; i++) { + /* This shortcut works regardless of CPU endianness */ + destp[i] = srcp[i] >> 24 + | (srcp[i] & 0x00FF0000) >> 8 + | (srcp[i] & 0x0000FF00) << 8 + | srcp[i] << 24; + } + return 1; +} + +/* RGBA<->BGRA: swap bytes 0 and 2 */ +static int rgba_swap02(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + uint8_t tmp = src[0][i*4+2]; + dest[0][i*4+2] = src[0][i*4 ]; + dest[0][i*4 ] = tmp; + dest[0][i*4+1] = src[0][i*4+1]; + dest[0][i*4+3] = src[0][i*4+3]; + } + return 1; +} + +/* ARGB<->ABGR: swap bytes 1 and 3 */ +static int 
rgba_swap13(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + uint8_t tmp = src[0][i*4+3]; + dest[0][i*4+3] = src[0][i*4+1]; + dest[0][i*4+1] = tmp; + dest[0][i*4 ] = src[0][i*4 ]; + dest[0][i*4+2] = src[0][i*4+2]; + } + return 1; +} + +/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */ +static int rgba_alpha30(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + uint8_t tmp = src[0][i*4+3]; + dest[0][i*4+3] = src[0][i*4+2]; + dest[0][i*4+2] = src[0][i*4+1]; + dest[0][i*4+1] = src[0][i*4 ]; + dest[0][i*4 ] = tmp; + } + return 1; +} + +/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */ +static int rgba_alpha03(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + uint8_t tmp = src[0][i*4 ]; + dest[0][i*4 ] = src[0][i*4+1]; + dest[0][i*4+1] = src[0][i*4+2]; + dest[0][i*4+2] = src[0][i*4+3]; + dest[0][i*4+3] = tmp; + } + return 1; +} + +/*************************************************************************/ + +static int rgb24_bgr24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i*3+2]; + dest[0][i*3+1] = src[0][i*3+1]; + dest[0][i*3+2] = src[0][i*3 ]; + } + return 1; +} + +static int rgb24_rgba32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = src[0][i*3 ]; + dest[0][i*4+1] = src[0][i*3+1]; + dest[0][i*4+2] = src[0][i*3+2]; + dest[0][i*4+3] = 0; + } + return 1; +} + +static int rgb24_abgr32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = 0; + dest[0][i*4+1] = src[0][i*3+2]; + dest[0][i*4+2] = src[0][i*3+1]; + dest[0][i*4+3] = src[0][i*3 ]; + } + return 1; +} + +static int rgb24_argb32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = 0; + dest[0][i*4+1] = src[0][i*3 ]; + dest[0][i*4+2] = src[0][i*3+1]; + dest[0][i*4+3] = src[0][i*3+2]; + } + return 1; +} + +static int rgb24_bgra32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = src[0][i*3+2]; + dest[0][i*4+1] = src[0][i*3+1]; + dest[0][i*4+2] = src[0][i*3 ]; + dest[0][i*4+3] = 0; + } + return 1; +} + +static int rgb24_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*3 ]; + int g = src[0][i*3+1]; + int b = src[0][i*3+2]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +static int bgr24_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*3+2]; + int g = src[0][i*3+1]; + int b = src[0][i*3 ]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +/*************************************************************************/ + +static int rgba32_rgb24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i*4 ]; + dest[0][i*3+1] = src[0][i*4+1]; + dest[0][i*3+2] = src[0][i*4+2]; + } + return 1; +} + +static int bgra32_rgb24(uint8_t **src, uint8_t **dest, int width, 
int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i*4+2]; + dest[0][i*3+1] = src[0][i*4+1]; + dest[0][i*3+2] = src[0][i*4 ]; + } + return 1; +} + +static int rgba32_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*4 ]; + int g = src[0][i*4+1]; + int b = src[0][i*4+2]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +static int bgra32_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*4+2]; + int g = src[0][i*4+1]; + int b = src[0][i*4 ]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +/*************************************************************************/ + +static int argb32_rgb24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i*4+1]; + dest[0][i*3+1] = src[0][i*4+2]; + dest[0][i*3+2] = src[0][i*4+3]; + } + return 1; +} + +static int abgr32_rgb24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i*4+3]; + dest[0][i*3+1] = src[0][i*4+2]; + dest[0][i*3+2] = src[0][i*4+1]; + } + return 1; +} + +static int argb32_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*4+1]; + int g = src[0][i*4+2]; + int b = src[0][i*4+3]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +static int abgr32_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*4+3]; + int g = src[0][i*4+2]; + int b = src[0][i*4+1]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +/*************************************************************************/ + +static int gray8_rgb24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i]; + dest[0][i*3+1] = src[0][i]; + dest[0][i*3+2] = src[0][i]; + } + return 1; +} + +static int gray8_rgba32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = src[0][i]; + dest[0][i*4+1] = src[0][i]; + dest[0][i*4+2] = src[0][i]; + dest[0][i*4+3] = 0; + } + return 1; +} + +static int gray8_argb32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = 0; + dest[0][i*4+1] = src[0][i]; + dest[0][i*4+2] = src[0][i]; + dest[0][i*4+3] = src[0][i]; + } + return 1; +} + +/*************************************************************************/ +/*************************************************************************/ + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + +#define DEFINE_MASK_DATA +#include "img_x86_common.h" + +/*************************************************************************/ + +/* Basic assembly routines */ + +/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */ +static int rgba_swapall_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_REV32_X86(width*height); + 
return 1; +} + +/* RGBA<->BGRA: swap bytes 0 and 2 */ +static int rgba_swap02_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_02_X86(width*height); + return 1; +} + +/* ARGB<->ABGR: swap bytes 1 and 3 */ +static int rgba_swap13_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_X86(width*height); + return 1; +} + +/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */ +static int rgba_alpha30_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_X86(width*height); + return 1; +} + +/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */ +static int rgba_alpha03_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_X86(width*height); + return 1; +} + +/*************************************************************************/ + +/* MMX routines */ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */ + +/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */ +static int rgba_swapall_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_REV32_MMX(width*height); + return 1; +} + +/* RGBA<->BGRA: swap bytes 0 and 2 */ +static int rgba_swap02_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_02_MMX(width*height); + return 1; +} + +/* ARGB<->ABGR: swap bytes 1 and 3 */ +static int rgba_swap13_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_MMX(width*height); + return 1; +} + +/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */ +static int rgba_alpha30_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_MMX(width*height); + return 1; +} + +/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */ +static int rgba_alpha03_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_MMX(width*height); + return 1; +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + +/*************************************************************************/ + +/* SSE2 routines */ + +#if defined(HAVE_ASM_SSE2) + +static const struct { uint32_t n[4]; } __attribute__((aligned(16))) rgb_bgr_data = {{ + 0xFF0000FF, 0x00FF0000, 0x0000FF00, 0x00000000 +}}; + +#define SHIFT_RBSWAP \ + "movdqa %%xmm6, %%xmm2 # XMM2: low bytes mask \n\ + pand %%xmm0, %%xmm2 # XMM2: R/B bytes \n\ + pshuflw $0xB1, %%xmm2, %%xmm2 # XMM2: swap R and B (low quad) \n\ + pand %%xmm7, %%xmm0 # XMM0: G bytes \n\ + pshufhw $0xB1, %%xmm2, %%xmm2 # XMM2: swap R and B (high quad)\n\ + por %%xmm2, %%xmm0 # XMM0: data now in BGRA32 \n" + +#define SHIFT_AFIRST \ + "pslldq $1, %%xmm0 # XMM0: move A first \n" + +#define SHIFT_ALAST \ + "psrldq $1, %%xmm0 # XMM0: move A last \n" + +#define RGB24TO32(ROFS,GOFS,BOFS,AOFS,SHIFT) \ + asm("pcmpeqd %%xmm5, %%xmm5 \n\ + movdqa %%xmm5, %%xmm6 \n\ + psrldq $13, %%xmm5 # XMM5: 24-bit mask \n\ + movdqa %%xmm6, %%xmm7 \n\ + psrlw $8, %%xmm6 # XMM6: low bytes mask \n\ + psllw $8, %%xmm7 # XMM7: high bytes mask \n"\ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "lea ("ECX","ECX",2),"EDX" \n\ + movb -3("ESI","EDX"), %%al \n\ + movb %%al, ("#ROFS"-4)("EDI","ECX",4) \n\ + movb -2("ESI","EDX"), %%al \n\ + movb %%al, ("#GOFS"-4)("EDI","ECX",4) \n\ + movb -1("ESI","EDX"), %%al \n\ + movb %%al, ("#BOFS"-4)("EDI","ECX",4) \n\ + movb $0, ("#AOFS"-4)("EDI","ECX",4)", \ + /* main_loop */ \ + "lea ("ECX","ECX",2),"EDX" \n\ + # We can't just movdqu, because we might run over the edge \n\ + movd -12("ESI","EDX"), %%xmm1 \n\ + 
movq -8("ESI","EDX"), %%xmm0 \n\ + pshufd $0xD3, %%xmm0, %%xmm0 # shift left by 4 bytes \n\ + por %%xmm1, %%xmm0 # XMM0: original RGB24 data \n\ + pshufd $0xF3, %%xmm5, %%xmm2 # XMM2: pixel 1 mask \n\ + movdqa %%xmm5, %%xmm1 # XMM1: pixel 0 mask \n\ + pshufd $0xCF, %%xmm5, %%xmm3 # XMM3: pixel 2 mask \n\ + pand %%xmm0, %%xmm1 # XMM1: pixel 0 \n\ + pslldq $1, %%xmm0 \n\ + pand %%xmm0, %%xmm2 # XMM2: pixel 1 \n\ + pshufd $0x3F, %%xmm5, %%xmm4 # XMM4: pixel 3 mask \n\ + por %%xmm2, %%xmm1 # XMM1: pixels 0 and 1 \n\ + pslldq $1, %%xmm0 \n\ + pand %%xmm0, %%xmm3 # XMM3: pixel 2 \n\ + por %%xmm3, %%xmm1 # XMM1: pixels 0, 1, and 2 \n\ + pslldq $1, %%xmm0 \n\ + pand %%xmm4, %%xmm0 # XMM0: pixel 3 \n\ + por %%xmm1, %%xmm0 # XMM0: RGBA32 data \n\ + "SHIFT" # shift bytes to target position\n\ + movdqu %%xmm0, -16("EDI","ECX",4)", \ + /* emms */ "emms") \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), \ + "d" (&rgb_bgr_data), "m" (rgb_bgr_data) \ + : "eax"); + +#define RGB32TO24(ROFS,GOFS,BOFS,AOFS,SHIFT) \ + asm("pcmpeqd %%xmm5, %%xmm5 \n\ + movdqa %%xmm5, %%xmm6 \n\ + psrldq $13, %%xmm5 # 24-bit mask \n\ + movdqa %%xmm6, %%xmm7 \n\ + psrlw $8, %%xmm6 # low bytes mask \n\ + psllw $8, %%xmm7 # high bytes mask \n"\ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "lea ("ECX","ECX",2),"EDX" \n\ + movb ("#ROFS"-4)("ESI","ECX",4), %%al \n\ + movb %%al, -3("EDI","EDX") \n\ + movb ("#GOFS"-4)("ESI","ECX",4), %%al \n\ + movb %%al, -2("EDI","EDX") \n\ + movb ("#BOFS"-4)("ESI","ECX",4), %%al \n\ + movb %%al, -1("EDI","EDX")", \ + /* main_loop */ \ + "lea ("ECX","ECX",2),"EDX" \n\ + movdqu -16("ESI","ECX",4), %%xmm0 \n\ + "SHIFT" # shift source data to RGBA \n\ + pshufd $0xF3, %%xmm5, %%xmm1 # XMM1: pixel 1 mask \n\ + pshufd $0xCF, %%xmm5, %%xmm2 # XMM2: pixel 2 mask \n\ + pshufd $0x3F, %%xmm5, %%xmm3 # XMM3: pixel 3 mask \n\ + pand %%xmm0, %%xmm3 # XMM3: pixel 3 \n\ + psrldq $1, %%xmm3 \n\ + pand %%xmm0, %%xmm2 # XMM2: pixel 2 \n\ + por %%xmm3, %%xmm2 # XMM2: pixels 2 and 3 \n\ + psrldq $1, %%xmm2 \n\ + pand %%xmm0, %%xmm1 # XMM1: pixel 1 \n\ + pand %%xmm5, %%xmm0 # XMM0: pixel 0 \n\ + por %%xmm2, %%xmm1 # XMM1: pixels 1, 2, and 3 \n\ + psrldq $1, %%xmm1 \n\ + por %%xmm1, %%xmm0 # XMM0: RGB24 data \n\ + # We can't just movdqu, because we might run over the edge \n\ + movd %%xmm0, -12("EDI","EDX") # store low 4 bytes \n\ + pshufd $0xF9, %%xmm0, %%xmm0 # shift right 4 bytes \n\ + movq %%xmm0, -8("EDI","EDX") # store high 8 bytes \n",\ + /* emms */ "emms") \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), \ + "d" (&rgb_bgr_data), "m" (rgb_bgr_data) \ + : "eax"); + + +/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */ +static int rgba_swapall_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_REV32_SSE2(width*height); + return 1; +} + +/* RGBA<->BGRA: swap bytes 0 and 2 */ +static int rgba_swap02_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_02_SSE2(width*height); + return 1; +} + +/* ARGB<->ABGR: swap bytes 1 and 3 */ +static int rgba_swap13_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_SSE2(width*height); + return 1; +} + +/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */ +static int rgba_alpha30_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_SSE2(width*height); + return 1; +} + +/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */ +static int 
rgba_alpha03_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_SSE2(width*height); + return 1; +} + +/* RGB<->BGR */ +static int rgb24_bgr24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm5 # byte 0 mask \n\ + pshufd $0xD2, %%xmm5, %%xmm6 # byte 1 mask \n\ + pshufd $0xC9, %%xmm5, %%xmm7 # byte 2 mask \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ + "lea ("ECX","ECX",2),"EDX" \n\ + movb -3("ESI","EDX"), %%al \n\ + movb -2("ESI","EDX"), %%ah \n\ + movb %%ah, -2("EDI","EDX") \n\ + movb -1("ESI","EDX"), %%ah \n\ + movb %%ah, -3("EDI","EDX") \n\ + movb %%al, -1("EDI","EDX")", + /* main_loop */ + "lea ("ECX","ECX",2),"EDX" \n\ + # We can't just movdqu, because we might run over the edge \n\ + movd -12("ESI","EDX"), %%xmm1 \n\ + movq -8("ESI","EDX"), %%xmm0 \n\ + pshufd $0xD3, %%xmm0, %%xmm0 # shift left by 4 bytes \n\ + por %%xmm1, %%xmm0 # XMM0: original data \n\ + movdqa %%xmm5, %%xmm2 \n\ + movdqa %%xmm6, %%xmm3 \n\ + movdqa %%xmm7, %%xmm4 \n\ + pand %%xmm0, %%xmm2 # XMM2: byte 0 \n\ + pslldq $2, %%xmm2 # shift to byte 2 position \n\ + pand %%xmm0, %%xmm3 # XMM3: byte 1 \n\ + pand %%xmm0, %%xmm4 # XMM4: byte 2 \n\ + psrldq $2, %%xmm4 # shift to byte 0 position \n\ + por %%xmm2, %%xmm3 \n\ + por %%xmm4, %%xmm3 # XMM3: reversed data \n\ + movd %%xmm3, -12("EDI","EDX") # avoid running over the edge \n\ + pshufd $0xF9, %%xmm3, %%xmm3 # shift right by 4 bytes \n\ + movq %%xmm3, -8("EDI","EDX")", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "d" (&rgb_bgr_data), "m" (rgb_bgr_data) + : "eax"); + return 1; +} + +/* RGB->RGBA */ +static int rgb24_rgba32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB24TO32(0,1,2,3, ""); + return 1; +} + +/* RGB->ABGR */ +static int rgb24_abgr32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB24TO32(3,2,1,0, SHIFT_RBSWAP SHIFT_AFIRST); + return 1; +} + +/* RGB->ARGB */ +static int rgb24_argb32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB24TO32(1,2,3,0, SHIFT_AFIRST); + return 1; +} + +/* RGB->BGRA */ +static int rgb24_bgra32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB24TO32(2,1,0,3, SHIFT_RBSWAP); + return 1; +} + +/* RGBA->RGB */ +static int rgba32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB32TO24(0,1,2,3, ""); + return 1; +} + +/* ABGR->RGB */ +static int abgr32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB32TO24(3,2,1,0, SHIFT_ALAST SHIFT_RBSWAP); + return 1; +} + +/* ARGB->RGB */ +static int argb32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB32TO24(1,2,3,0, SHIFT_ALAST); + return 1; +} + +/* BGRA->RGB */ +static int bgra32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB32TO24(2,1,0,3, SHIFT_RBSWAP); + return 1; +} + +/*************************************************************************/ + +#define R_GRAY 19595 +#define G_GRAY 38470 +#define B_GRAY 7471 +#define INIT_GRAY8 \ + "pxor %%xmm4, %%xmm4 # XMM4: all 0's \n\ + movl %3, %%eax \n\ + movd %%eax, %%xmm5 \n\ + pshuflw $0x00, %%xmm5, %%xmm5 \n\ + pshufd $0x00, %%xmm5, %%xmm5 # XMM5: R->gray constant \n\ + movl %4, %%eax \n\ + movd %%eax, %%xmm6 \n\ + pshuflw $0x00, %%xmm6, %%xmm6 \n\ + pshufd $0x00, %%xmm6, %%xmm6 # XMM6: G->gray constant \n\ + movl %5, %%eax \n\ + movd %%eax, %%xmm7 \n\ + pshuflw $0x00, %%xmm7, %%xmm7 \n\ + pshufd 
$0x00, %%xmm7, %%xmm7 # XMM7: B->gray constant \n\ + pcmpeqd %%xmm3, %%xmm3 \n\ + psllw $15, %%xmm3 \n\ + psrlw $8, %%xmm3 # XMM3: 0x0080*8 (for rounding) \n" +#define SINGLE_GRAY8(idx,ofsR,ofsG,ofsB) \ + "movzbl "#ofsR"("ESI","idx"), %%eax # retrieve red byte \n\ + imull %3, %%eax # multiply by red->gray factor \n\ + movzbl "#ofsG"("ESI","idx"), %%edx # retrieve green byte \n\ + imull %4, %%edx # multiply by green->gray factor\n\ + addl %%edx, %%eax # add to total \n\ + movzbl "#ofsB"("ESI","idx"), %%edx # retrieve blue byte \n\ + imull %5, %%edx # multiply by blue->gray factor \n\ + addl %%edx, %%eax # add to total \n\ + addl $0x8000, %%eax # round \n\ + shrl $16, %%eax # shift back down \n\ + movb %%al, -1("EDI","ECX") # and store \n" +#define STORE_GRAY8 \ + "psllw $8, %%xmm0 # XMM0: add 8 bits of precision \n\ + pmulhuw %%xmm5, %%xmm0 # XMM0: r7 r6 r5 r4 r3 r2 r1 r0 \n\ + psllw $8, %%xmm1 # XMM1: add 8 bits of precision \n\ + pmulhuw %%xmm6, %%xmm1 # XMM1: g7 g6 g5 g4 g3 g2 g1 g0 \n\ + paddw %%xmm3, %%xmm0 # XMM0: add rounding constant \n\ + psllw $8, %%xmm2 # XMM2: add 8 bits of precision \n\ + pmulhuw %%xmm7, %%xmm2 # XMM2: b7 b6 b5 b4 b3 b2 b1 b0 \n\ + paddw %%xmm1, %%xmm0 # XMM0: add green part \n\ + paddw %%xmm2, %%xmm0 # XMM0: add blue part \n\ + psrlw $8, %%xmm0 # XMM0: shift back to bytes \n\ + packuswb %%xmm4, %%xmm0 # XMM0: gray7..gray0 packed \n\ + movq %%xmm0, -8("EDI","ECX") \n" + +#define ASM_RGB24_GRAY(ofsR,ofsG,ofsB,load) \ + asm(INIT_GRAY8 \ + PUSH(EBX)" \n\ + lea ("ECX","ECX",2),"EBX" \n"\ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ SINGLE_GRAY8(EBX, ofsR,ofsG,ofsB) "subl $3, %%ebx;",\ + /* main_loop */ load(4) STORE_GRAY8 "subl $24, %%ebx;", \ + /* emms */ "emms") \ + POP(EBX) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), \ + "i" (R_GRAY), "i" (G_GRAY), "i" (B_GRAY) \ + : "eax", "edx" COMMA_FAKE_PUSH_REG \ + ) + +#define ASM_RGB32_GRAY(ofsR,ofsG,ofsB,load) \ + asm(INIT_GRAY8 \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ SINGLE_GRAY8(ECX",4", ofsR,ofsG,ofsB), \ + /* main_loop */ load(4) STORE_GRAY8, \ + /* emms */ "emms") \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), \ + "i" (R_GRAY), "i" (G_GRAY), "i" (B_GRAY) \ + : "eax", "edx") + + +static int rgb24_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB24_GRAY(-3,-2,-1, SSE2_LOAD_RGB24); + return 1; +} + +static int bgr24_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB24_GRAY(-1,-2,-3, SSE2_LOAD_BGR24); + return 1; +} + +static int rgba32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB32_GRAY(-4,-3,-2, SSE2_LOAD_RGBA32); + return 1; +} + +static int bgra32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB32_GRAY(-2,-3,-4, SSE2_LOAD_BGRA32); + return 1; +} + +static int argb32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB32_GRAY(-3,-2,-1, SSE2_LOAD_ARGB32); + return 1; +} + +static int abgr32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB32_GRAY(-1,-2,-3, SSE2_LOAD_ABGR32); + return 1; +} + +/*************************************************************************/ + +static int gray8_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("# Store all 0's in XMM4 \n\ + pxor %%xmm4, %%xmm4 \n\ + # Generate mask in XMM7 to select 
bytes 0,3,6,9 of an XMM register\n\ + pcmpeqd %%xmm7, %%xmm7 # XMM7: all 1's \n\ + psrlw $8, %%xmm7 # XMM7: 0x00FF * 8 \n\ + pcmpeqd %%xmm6, %%xmm6 # XMM6: all 1's \n\ + psllw $8, %%xmm6 # XMM6: 0xFF00 * 8 \n\ + pslldq $8, %%xmm6 \n\ + psrldq $8, %%xmm7 \n\ + por %%xmm6, %%xmm7 # XMM7: 0xFF00*4, 0x00FF*4 \n\ + pshufd $0xCC, %%xmm7, %%xmm7 # XMM7: {0xFF00*2, 0x00FF*2} * 2\n\ + pshuflw $0xC0, %%xmm7, %%xmm7 # XMM7.l: FF0000FF00FF00FF \n\ + psrldq $4, %%xmm7 # XMM7: 0x00000000FF00FF00 \n\ + # 00FF00FFFF0000FF \n\ + pshufd $0xEC, %%xmm7, %%xmm7 # XMM7: 0x00000000FF00FF00 \n\ + # 00000000FF0000FF \n\ + pshuflw $0x24, %%xmm7, %%xmm7 # XMM7.l: 00FF0000FF0000FF \n\ + pshufhw $0xFC, %%xmm7, %%xmm7 # XMM7.h: 000000000000FF00 \n\ + # Load ECX*3 into EDX ahead of time \n\ + lea ("ECX","ECX",2), "EDX" \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movb -1("ESI","ECX"), %%al # retrieve gray byte \n\ + movb %%al, -3("EDI","EDX") # and store 3 times \n\ + movb %%al, -2("EDI","EDX") \n\ + movb %%al, -1("EDI","EDX") \n\ + subl $3, %%edx \n", + /* main_loop */ "\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: G3..G0 \n\ + pshufd $0xCC, %%xmm0, %%xmm0 # XMM0: {0,0,0,0,G3..G0} * 2 \n\ + pshuflw $0x50, %%xmm0, %%xmm0 # X0.l: G3 G2 G3 G2 G1 G0 G1 G0 \n\ + pshufhw $0x55, %%xmm0, %%xmm0 # X0.h: G3 G2 G3 G2 G3 G2 G3 G2 \n\ + pand %%xmm7, %%xmm0 # XMM0: ------3--2--1--0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: ------3--2--1--0 \n\ + pslldq $1, %%xmm1 # XMM1: -----3--2--1--0- \n\ + movdqa %%xmm0, %%xmm2 # XMM2: ------3--2--1--0 \n\ + pslldq $2, %%xmm2 # XMM2: ----3--2--1--0-- \n\ + por %%xmm1, %%xmm0 # XMM0: -----33-22-11-00 \n\ + por %%xmm2, %%xmm0 # XMM0: ----333222111000 \n\ + movd %%xmm0, -12("EDI","EDX") \n\ + pshufd $0xC9, %%xmm0, %%xmm0 \n\ + movq %%xmm0, -8("EDI","EDX") \n\ + subl $12, %%edx \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax", "edx"); + return 1; +} + +static int gray8_rgba32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("pxor %%xmm4, %%xmm4 # XMM4: all 0's \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movb -1("ESI","ECX"), %%al # retrieve gray byte \n\ + movb %%al, -4("EDI","ECX",4) # and store 3 times \n\ + movb %%al, -3("EDI","ECX",4) \n\ + movb %%al, -2("EDI","ECX",4) \n\ + movb $0, -1("EDI","ECX",4) # clear A byte \n", + /* main_loop */ "\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: 00 00 00 00 G3 G2 G1 G0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: 00 00 00 00 G3 G2 G1 G0 \n\ + punpcklbw %%xmm0, %%xmm0 # XMM0: G3 G3 G2 G2 G1 G1 G0 G0 \n\ + punpcklbw %%xmm4, %%xmm1 # XMM1: 00 G3 00 G2 00 G1 00 G0 \n\ + punpcklbw %%xmm1, %%xmm0 # XMM0: 0GGG3 0GGG2 0GGG1 0GGG0 \n\ + movdqu %%xmm0, -16("EDI","ECX",4) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax"); + return 1; +} + +static int gray8_argb32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("pxor %%xmm4, %%xmm4 # XMM4: all 0's \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movb -1("ESI","ECX"), %%al # retrieve gray byte \n\ + movb %%al, -3("EDI","ECX",4) # and store 3 times \n\ + movb %%al, -2("EDI","ECX",4) \n\ + movb %%al, -1("EDI","ECX",4) \n\ + movb $0, -4("EDI","ECX",4) # clear A byte \n", + /* main_loop */ "\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: 00 00 00 00 G3 G2 G1 G0 \n\ + movdqa %%xmm4, %%xmm1 # XMM1: 00 
00 00 00 00 00 00 00 \n\ + punpcklbw %%xmm0, %%xmm1 # XMM1: G3 00 G2 00 G1 00 G0 00 \n\ + punpcklbw %%xmm0, %%xmm0 # XMM0: G3 G3 G2 G2 G1 G1 G0 G0 \n\ + punpcklbw %%xmm0, %%xmm1 # XMM0: GGG03 GGG02 GGG01 GGG00 \n\ + movdqu %%xmm1, -16("EDI","ECX",4) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax"); + return 1; +} + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ + +#endif /* ARCH_X86 || ARCH_X86_64 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization */ + +int ac_imgconvert_init_rgb_packed(int accel) +{ + if (!register_conversion(IMG_RGB24, IMG_RGB24, rgb_copy) + || !register_conversion(IMG_RGB24, IMG_BGR24, rgb24_bgr24) + || !register_conversion(IMG_RGB24, IMG_RGBA32, rgb24_rgba32) + || !register_conversion(IMG_RGB24, IMG_ABGR32, rgb24_abgr32) + || !register_conversion(IMG_RGB24, IMG_ARGB32, rgb24_argb32) + || !register_conversion(IMG_RGB24, IMG_BGRA32, rgb24_bgra32) + || !register_conversion(IMG_RGB24, IMG_GRAY8, rgb24_gray8) + + || !register_conversion(IMG_BGR24, IMG_BGR24, rgb_copy) + || !register_conversion(IMG_BGR24, IMG_RGB24, rgb24_bgr24) + || !register_conversion(IMG_BGR24, IMG_RGBA32, rgb24_bgra32) + || !register_conversion(IMG_BGR24, IMG_ABGR32, rgb24_argb32) + || !register_conversion(IMG_BGR24, IMG_ARGB32, rgb24_abgr32) + || !register_conversion(IMG_BGR24, IMG_BGRA32, rgb24_rgba32) + || !register_conversion(IMG_BGR24, IMG_GRAY8, bgr24_gray8) + + || !register_conversion(IMG_RGBA32, IMG_RGB24, rgba32_rgb24) + || !register_conversion(IMG_RGBA32, IMG_BGR24, bgra32_rgb24) + || !register_conversion(IMG_RGBA32, IMG_RGBA32, rgba_copy) + || !register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall) + || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30) + || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02) + || !register_conversion(IMG_RGBA32, IMG_GRAY8, rgba32_gray8) + + || !register_conversion(IMG_ABGR32, IMG_RGB24, abgr32_rgb24) + || !register_conversion(IMG_ABGR32, IMG_BGR24, argb32_rgb24) + || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall) + || !register_conversion(IMG_ABGR32, IMG_ABGR32, rgba_copy) + || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13) + || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03) + || !register_conversion(IMG_ABGR32, IMG_GRAY8, abgr32_gray8) + + || !register_conversion(IMG_ARGB32, IMG_RGB24, argb32_rgb24) + || !register_conversion(IMG_ARGB32, IMG_BGR24, abgr32_rgb24) + || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03) + || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13) + || !register_conversion(IMG_ARGB32, IMG_ARGB32, rgba_copy) + || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall) + || !register_conversion(IMG_ARGB32, IMG_GRAY8, argb32_gray8) + + || !register_conversion(IMG_BGRA32, IMG_RGB24, bgra32_rgb24) + || !register_conversion(IMG_BGRA32, IMG_BGR24, rgba32_rgb24) + || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02) + || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30) + || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall) + || !register_conversion(IMG_BGRA32, IMG_BGRA32, rgba_copy) + || !register_conversion(IMG_BGRA32, IMG_GRAY8, bgra32_gray8) + + || !register_conversion(IMG_GRAY8, IMG_RGB24, gray8_rgb24) + || !register_conversion(IMG_GRAY8, IMG_BGR24, gray8_rgb24) + || 
!register_conversion(IMG_GRAY8, IMG_RGBA32, gray8_rgba32) + || !register_conversion(IMG_GRAY8, IMG_ABGR32, gray8_argb32) + || !register_conversion(IMG_GRAY8, IMG_ARGB32, gray8_argb32) + || !register_conversion(IMG_GRAY8, IMG_BGRA32, gray8_rgba32) + || !register_conversion(IMG_GRAY8, IMG_GRAY8, gray8_copy) + ) { + return 0; + } + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + + if (accel & (AC_IA32ASM | AC_AMD64ASM)) { + if (!register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall_x86) + || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30_x86) + || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02_x86) + + || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall_x86) + || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13_x86) + || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03_x86) + + || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03_x86) + || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13_x86) + || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall_x86) + + || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02_x86) + || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30_x86) + || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall_x86) + ) { + return 0; + } + } + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + if (accel & AC_MMX) { + if (!register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall_mmx) + || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30_mmx) + || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02_mmx) + + || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall_mmx) + || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13_mmx) + || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03_mmx) + + || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03_mmx) + || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13_mmx) + || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall_mmx) + + || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02_mmx) + || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30_mmx) + || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall_mmx) + ) { + return 0; + } + } +#endif + +#if defined(HAVE_ASM_SSE2) + if (accel & AC_SSE2) { + if (!register_conversion(IMG_RGB24, IMG_BGR24, rgb24_bgr24_sse2) + || !register_conversion(IMG_RGB24, IMG_RGBA32, rgb24_rgba32_sse2) + || !register_conversion(IMG_RGB24, IMG_ABGR32, rgb24_abgr32_sse2) + || !register_conversion(IMG_RGB24, IMG_ARGB32, rgb24_argb32_sse2) + || !register_conversion(IMG_RGB24, IMG_BGRA32, rgb24_bgra32_sse2) + || !register_conversion(IMG_RGB24, IMG_GRAY8, rgb24_gray8_sse2) + + || !register_conversion(IMG_BGR24, IMG_RGB24, rgb24_bgr24_sse2) + || !register_conversion(IMG_BGR24, IMG_RGBA32, rgb24_bgra32_sse2) + || !register_conversion(IMG_BGR24, IMG_ABGR32, rgb24_argb32_sse2) + || !register_conversion(IMG_BGR24, IMG_ARGB32, rgb24_abgr32_sse2) + || !register_conversion(IMG_BGR24, IMG_BGRA32, rgb24_rgba32_sse2) + || !register_conversion(IMG_BGR24, IMG_GRAY8, bgr24_gray8_sse2) + + || !register_conversion(IMG_RGBA32, IMG_RGB24, rgba32_rgb24_sse2) + || !register_conversion(IMG_RGBA32, IMG_BGR24, bgra32_rgb24_sse2) + || !register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall_sse2) + || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30_sse2) + || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02_sse2) + || !register_conversion(IMG_RGBA32, IMG_GRAY8, rgba32_gray8_sse2) + + || !register_conversion(IMG_ABGR32, IMG_RGB24, abgr32_rgb24_sse2) + || 
!register_conversion(IMG_ABGR32, IMG_BGR24, argb32_rgb24_sse2) + || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall_sse2) + || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13_sse2) + || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03_sse2) + || !register_conversion(IMG_ABGR32, IMG_GRAY8, abgr32_gray8_sse2) + + || !register_conversion(IMG_ARGB32, IMG_RGB24, argb32_rgb24_sse2) + || !register_conversion(IMG_ARGB32, IMG_BGR24, abgr32_rgb24_sse2) + || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03_sse2) + || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13_sse2) + || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall_sse2) + || !register_conversion(IMG_ARGB32, IMG_GRAY8, argb32_gray8_sse2) + + || !register_conversion(IMG_BGRA32, IMG_RGB24, bgra32_rgb24_sse2) + || !register_conversion(IMG_BGRA32, IMG_BGR24, rgba32_rgb24_sse2) + || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02_sse2) + || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30_sse2) + || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall_sse2) + || !register_conversion(IMG_BGRA32, IMG_GRAY8, bgra32_gray8_sse2) + + || !register_conversion(IMG_GRAY8, IMG_RGB24, gray8_rgb24_sse2) + || !register_conversion(IMG_GRAY8, IMG_BGR24, gray8_rgb24_sse2) + || !register_conversion(IMG_GRAY8, IMG_RGBA32, gray8_rgba32_sse2) + || !register_conversion(IMG_GRAY8, IMG_ABGR32, gray8_argb32_sse2) + || !register_conversion(IMG_GRAY8, IMG_ARGB32, gray8_argb32_sse2) + || !register_conversion(IMG_GRAY8, IMG_BGRA32, gray8_rgba32_sse2) + ) { + return 0; + } + } +#endif + +#endif /* ARCH_X86 || ARCH_X86_64 */ + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h b/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h new file mode 100644 index 00000000..13ed851f --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h @@ -0,0 +1,613 @@ +/* + * img_x86_common.h - common x86/x86-64 assembly macros + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#ifndef ACLIB_IMG_X86_COMMON_H +#define ACLIB_IMG_X86_COMMON_H + +/*************************************************************************/ + +/* Register names for pointers */ +#ifdef ARCH_X86_64 +# define EAX "%%rax" +# define EBX "%%rbx" +# define ECX "%%rcx" +# define EDX "%%rdx" +# define ESP "%%rsp" +# define EBP "%%rbp" +# define ESI "%%rsi" +# define EDI "%%rdi" +#else +# define EAX "%%eax" +# define EBX "%%ebx" +# define ECX "%%ecx" +# define EDX "%%edx" +# define ESP "%%esp" +# define EBP "%%ebp" +# define ESI "%%esi" +# define EDI "%%edi" +#endif + +/* Macros to push and pop one or two registers within an assembly block. + * The x86-64 ABI allows leaf functions to write to 128 bytes BELOW + * (yes, below) the stack pointer, so we can't just push our own stuff + * there. Argh. 
*/ +#ifdef ARCH_X86_64 +# define FAKE_PUSH_REG "r12" +# define FAKE_PUSH_REG_2 "r13" +# define COMMA_FAKE_PUSH_REG ,FAKE_PUSH_REG +# define PUSH(reg) "mov " reg ", %%" FAKE_PUSH_REG +# define POP(reg) "mov %%" FAKE_PUSH_REG ", " reg +# define PUSH2(reg1,reg2) PUSH(reg1) "; mov " reg2 ", %%" FAKE_PUSH_REG_2 +# define POP2(reg2,reg1) "mov %%" FAKE_PUSH_REG_2 ", " reg2 "; " POP(reg1) +#else +# define COMMA_FAKE_PUSH_REG /*nothing*/ +# define PUSH(reg) "push " reg +# define POP(reg) "pop " reg +# define PUSH2(reg1,reg2) "push " reg1 "; push " reg2 +# define POP2(reg2,reg1) "pop " reg2 "; pop " reg1 +#endif + +/* Data for isolating particular bytes. Used by the SWAP32 macros; if you + * use them, make sure to define DEFINE_MASK_DATA before including this + * file! */ +#ifdef DEFINE_MASK_DATA +static const struct { uint32_t n[64]; } __attribute__((aligned(16))) mask_data = {{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF, + 0x0000FF00, 0x0000FF00, 0x0000FF00, 0x0000FF00, + 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, + 0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000, + 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, + 0x00FFFF00, 0x00FFFF00, 0x00FFFF00, 0x00FFFF00, + 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, + 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, + 0xFF0000FF, 0xFF0000FF, 0xFF0000FF, 0xFF0000FF, + 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, + 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF, + 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, + 0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF, + 0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, +}}; +#endif + +/*************************************************************************/ + +/* Basic assembly macros, used for odd-count loops */ + +/* Swap bytes in pairs of 16-bit values */ +#define X86_SWAP16_2 \ + "movl -4("ESI","ECX",4), %%eax \n\ + movl %%eax, %%edx \n\ + shll $8, %%eax \n\ + andl $0xFF00FF00, %%eax \n\ + shrl $8, %%edx \n\ + andl $0x00FF00FF, %%edx \n\ + orl %%edx, %%eax \n\ + movl %%eax, -4("EDI","ECX",4)" + +/* Swap words in a 32-bit value */ +#define X86_SWAP32 \ + "movl -4("ESI","ECX",4), %%eax \n\ + roll $16, %%eax \n\ + movl %%eax, -4("EDI","ECX",4)" + +/* Swap bytes 0 and 2 of a 32-bit value */ +#define X86_SWAP32_02 \ + "movw -4("ESI","ECX",4), %%ax \n\ + movw -2("ESI","ECX",4), %%dx \n\ + xchg %%dl, %%al \n\ + movw %%ax, -4("EDI","ECX",4) \n\ + movw %%dx, -2("EDI","ECX",4)" + +/* Swap bytes 1 and 3 of a 32-bit value */ +#define X86_SWAP32_13 \ + "movw -4("ESI","ECX",4), %%ax \n\ + movw -2("ESI","ECX",4), %%dx \n\ + xchg %%dh, %%ah \n\ + movw %%ax, -4("EDI","ECX",4) \n\ + movw %%dx, -2("EDI","ECX",4)" + +/* Reverse the order of bytes in a 32-bit value */ +#define X86_REV32 \ + "movl -4("ESI","ECX",4), %%eax \n\ + xchg %%ah, %%al \n\ + roll $16, %%eax \n\ + xchg %%ah, %%al \n\ + movl %%eax, -4("EDI","ECX",4)" + +/* The same, using the BSWAP instruction */ +#define X86_REV32_BSWAP \ + "movl -4("ESI","ECX",4), %%eax \n\ + bswap %%eax \n\ + movl %%eax, -4("EDI","ECX",4)" + +/* Rotate a 32-bit value left 8 bits */ +#define X86_ROL32 \ + "movl -4("ESI","ECX",4), %%eax \n\ + roll $8, %%eax \n\ + movl %%eax, -4("EDI","ECX",4)" + +/* Rotate a 32-bit value right 8 bits */ +#define X86_ROR32 \ + "movl -4("ESI","ECX",4), %%eax \n\ + rorl $8, %%eax \n\ + movl %%eax, -4("EDI","ECX",4)" + +/*************************************************************************/ + +/* Basic assembly routines. 
Sizes are all given in 32-bit units. */ + +#define ASM_SWAP16_2_X86(size) \ + asm("0: "X86_SWAP16_2" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax", "edx") + +#define ASM_SWAP32_X86(size) \ + asm("0: "X86_SWAP32" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax", "edx") + +#define ASM_SWAP32_02_X86(size) \ + asm("0: "X86_SWAP32_02" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax", "edx") + +#define ASM_SWAP32_13_X86(size) \ + asm("0: "X86_SWAP32_13" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax", "edx") + +#define ASM_REV32_X86(size) \ + asm("0: "X86_REV32" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +#define ASM_ROL32_X86(size) \ + asm("0: "X86_ROL32" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +#define ASM_ROR32_X86(size) \ + asm("0: "X86_ROR32" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +/*************************************************************************/ +/*************************************************************************/ + +/* Wrapper for SIMD loops. This generates the body of an asm() construct + * (the string only, not the input/output/clobber lists) given the data + * block size (number of data units processed per SIMD loop iteration), + * instructions to save and restore unclobberable registers (such as EBX), + * and the bodies of the odd-count and main loops. The data count is + * assumed to be preloaded in ECX. Parameters are: + * blocksize: number of units of data processed per SIMD loop (must be + * a power of 2); can be a constant or a numerical + * expression containing only constants + * push_regs: string constant containing instructions to push registers + * that must be saved over the small loop + * pop_regs: string constant containing instructions to pop registers + * saved by `push_regs' (restored before the main loop) + * small_loop: loop for handling data elements one at a time (when the + * count is not a multiple of `blocksize' + * main_loop: main SIMD loop for processing data + * emms: EMMS/SFENCE instructions to end main loop with, as needed + */ + +#define SIMD_LOOP_WRAPPER(blocksize,push_regs,pop_regs,small_loop,main_loop,emms) \ + /* Check whether the count is a multiple of the blocksize (this \ + * can cause branch mispredicts but seems to be faster overall) */ \ + "testl $(("#blocksize")-1), %%ecx; " \ + "jz 1f; " \ + /* It's not--run the small loop to align the count */ \ + push_regs"; " \ + "0: " \ + small_loop"; " \ + "subl $1, %%ecx; " \ + "testl $(("#blocksize")-1), %%ecx; " \ + "jnz 0b; " \ + pop_regs"; " \ + /* Make sure there's some data left */ \ + "testl %%ecx, %%ecx; " \ + "jz 2f; " \ + /* Now run the main SIMD loop */ \ + "1: " \ + main_loop"; " \ + "subl $("#blocksize"), %%ecx; " \ + "jnz 1b; " \ + /* Clear MMX state and/or SFENCE, as needed */ \ + emms"; " \ + /* Done */ \ + "2: " + +/*************************************************************************/ + +/* MMX- and SSE2-optimized swap/rotate routines. 
These routines are + * identical save for data size, so we use common macros to implement them, + * with register names and data offsets replaced by parameters to the + * macros. */ + +#define ASM_SIMD_MMX(name,size) \ + name((size), 64, \ + "movq", "movq", "movq", "", \ + "%%mm0", "%%mm1", "%%mm2", "%%mm3", \ + "%%mm4", "%%mm5", "%%mm6", "%%mm7") +#define ASM_SIMD_SSE2(name,size) \ + name((size), 128, \ + "movdqu", "movdqa", "movdqu", "", \ + "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3",\ + "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7") +#define ASM_SIMD_SSE2_ALIGNED(name,size) \ + name((size), 128, \ + "movdqa", "movdqa", "movntdq", "sfence",\ + "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3",\ + "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7") + +#define ASM_SWAP16_2_MMX(size) ASM_SIMD_MMX(ASM_SWAP16_2_SIMD,(size)) +#define ASM_SWAP16_2_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP16_2_SIMD,(size)) +#define ASM_SWAP16_2_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP16_2_SIMD,(size)) +#define ASM_SWAP32_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_SIMD,(size)) +#define ASM_SWAP32_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_SIMD,(size)) +#define ASM_SWAP32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_SIMD,(size)) +#define ASM_SWAP32_02_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_02_SIMD,(size)) +#define ASM_SWAP32_02_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_02_SIMD,(size)) +#define ASM_SWAP32_02_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_02_SIMD,(size)) +#define ASM_SWAP32_13_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_13_SIMD,(size)) +#define ASM_SWAP32_13_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_13_SIMD,(size)) +#define ASM_SWAP32_13_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_13_SIMD,(size)) +#define ASM_REV32_MMX(size) ASM_SIMD_MMX(ASM_REV32_SIMD,(size)) +#define ASM_REV32_SSE2(size) ASM_SIMD_SSE2(ASM_REV32_SIMD,(size)) +#define ASM_REV32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_REV32_SIMD,(size)) +#define ASM_ROL32_MMX(size) ASM_SIMD_MMX(ASM_ROL32_SIMD,(size)) +#define ASM_ROL32_SSE2(size) ASM_SIMD_SSE2(ASM_ROL32_SIMD,(size)) +#define ASM_ROL32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_ROL32_SIMD,(size)) +#define ASM_ROR32_MMX(size) ASM_SIMD_MMX(ASM_ROR32_SIMD,(size)) +#define ASM_ROR32_SSE2(size) ASM_SIMD_SSE2(ASM_ROR32_SIMD,(size)) +#define ASM_ROR32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_ROR32_SIMD,(size)) + +/*************************************************************************/ + +/* Actual implementations. Note that unrolling the SIMD loops doesn't seem + * to be a win (only 2-3% improvement at most), and in fact can lose by a + * bit in short loops. 
*/ + +#define ASM_SWAP16_2_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ X86_SWAP16_2, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + psrlw $8, "MM0" # MM0: - 7 - 5 - 3 - 1 \n\ + psllw $8, "MM1" # MM1: 6 - 4 - 2 - 0 - \n\ + por "MM1", "MM0" # MM0: 6 7 4 5 2 3 0 1 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax", "edx") + +#define ASM_SWAP32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ X86_SWAP32, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + psrld $16, "MM0" # MM0: - - 7 6 - - 3 2 \n\ + pslld $16, "MM1" # MM1: 5 4 - - 1 0 - - \n\ + por "MM1", "MM0" # MM0: 5 4 7 6 1 0 3 2 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +#define ASM_SWAP32_02_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "push "EDX, \ + /* pop_regs */ "pop "EDX, \ + /* small_loop */ X86_SWAP32_02, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\ + pand 16("EDX"), "MM1" # MM1: - - - 4 - - - 0 \n\ + pslld $16, "MM1" # MM1: - 4 - - - 0 - - \n\ + pand 64("EDX"), "MM2" # MM2: - 6 - - - 2 - - \n\ + psrld $16, "MM2" # MM2: - - - 6 - - - 2 \n\ + pand 160("EDX"), "MM0" # MM0: 7 - 5 - 3 - 1 - \n\ + por "MM1", "MM0" # MM0: 7 4 5 - 3 0 1 - \n\ + por "MM2", "MM0" # MM0: 7 4 5 6 3 0 1 2 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \ + "m" (mask_data) \ + : "eax") + +#define ASM_SWAP32_13_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "push "EDX, \ + /* pop_regs */ "pop "EDX, \ + /* small_loop */ X86_SWAP32_13, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\ + pand 32("EDX"), "MM1" # MM1: - - 5 - - - 1 - \n\ + pslld $16, "MM1" # MM1: 5 - - - 1 - - - \n\ + pand 128("EDX"), "MM2" # MM2: 7 - - - 3 - - - \n\ + psrld $16, "MM2" # MM2: - - 7 - - - 3 - \n\ + pand 80("EDX"), "MM0" # MM0: - 6 - 4 - 2 - 0 \n\ + por "MM1", "MM0" # MM0: 5 6 - 4 1 2 - 0 \n\ + por "MM2", "MM0" # MM0: 5 6 7 4 1 2 3 0 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \ + "m" (mask_data) \ + : "eax"); + +#define ASM_REV32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ 
"", \ + /* small_loop */ X86_REV32_BSWAP, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM3" # MM3: 7 6 5 4 3 2 1 0 \n\ + psrld $24, "MM0" # MM0: - - - 7 - - - 3 \n\ + pand 32("EDX"), "MM2" # MM2: - - 5 - - - 1 - \n\ + psrld $8, "MM1" # MM1: - 7 6 5 - 3 2 1 \n\ + pand 32("EDX"), "MM1" # MM1: - - 6 - - - 2 - \n\ + pslld $8, "MM2" # MM2: - 5 - - - 1 - - \n\ + pslld $24, "MM3" # MM3: 4 - - - 0 - - - \n\ + por "MM1", "MM0" # MM0: - - 6 7 - - 2 3 \n\ + por "MM2", "MM0" # MM0: - 5 6 7 - 1 2 3 \n\ + por "MM3", "MM0" # MM0: 4 5 6 7 0 1 2 3 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \ + "m" (mask_data) \ + : "eax") + +#define ASM_ROL32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ X86_ROL32, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + pslld $8, "MM0" # MM0: 6 5 4 - 2 1 0 - \n\ + psrld $24, "MM1" # MM1: - - - 7 - - - 3 \n\ + por "MM1", "MM0" # MM0: 6 5 4 7 2 1 0 3 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +#define ASM_ROR32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ X86_ROR32, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + psrld $8, "MM0" # MM0: - 7 6 5 - 3 2 1 \n\ + pslld $24, "MM1" # MM1: 4 - - - 0 - - - \n\ + por "MM1", "MM0" # MM0: 4 7 6 5 0 3 2 1 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +/*************************************************************************/ + +/* SSE2 macros to load 8 24- or 32-bit RGB pixels into XMM0/1/2 (R/G/B) as + * 16-bit values, used for RGB->YUV and RGB->grayscale conversions. + * ZERO is the number of the XMM register containing all zeroes. 
*/ + +#define SSE2_LOAD_RGB24(ZERO) \ + "movl -21("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm0 # XMM0: ----- ----- ----- xBGR1 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR1 ----- ----- ----- \n\ + movl -18("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xBGR1 ----- ----- xBGR2 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR2 xBGR1 ----- ----- \n\ + movl -15("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xBGR2 xBGR1 ----- xBGR3 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR3 xBGR2 xBGR1 ----- \n\ + movl -24("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xBGR3 xBGR2 xBGR1 xBGR0 \n\ + movl -9("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm1 # XMM1: ----- ----- ----- xBGR5 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR5 ----- ----- ----- \n\ + movl -6("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xBGR5 ----- ----- xBGR6 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR6 xBGR5 ----- ----- \n\ + movl -3("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xBGR6 xBGR5 ----- xBGR7 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR7 xBGR6 xBGR5 ----- \n\ + movl -12("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xBGR7 xBGR6 xBGR5 xBGR4 \n"\ + SSE2_MASSAGE_RGBA32(ZERO) + +#define SSE2_LOAD_BGR24(ZERO) \ + "movl -21("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm0 # XMM0: ----- ----- ----- xRGB1 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB1 ----- ----- ----- \n\ + movl -18("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xRGB1 ----- ----- xRGB2 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB2 xRGB1 ----- ----- \n\ + movl -15("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xRGB2 xRGB1 ----- xRGB3 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB3 xRGB2 xRGB1 ----- \n\ + movl -24("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xRGB3 xRGB2 xRGB1 xRGB0 \n\ + movl -9("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm1 # XMM1: ----- ----- ----- xRGB5 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB5 ----- ----- ----- \n\ + movl -6("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xRGB5 ----- ----- xRGB6 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB6 xRGB5 ----- ----- \n\ + movl -3("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xRGB6 xRGB5 ----- xRGB7 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB7 xRGB6 xRGB5 ----- \n\ + movl -12("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xRGB7 xRGB6 xRGB5 xRGB4 \n"\ + SSE2_MASSAGE_BGRA32(ZERO) + +#define SSE2_LOAD_RGBA32(ZERO) "\ + movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ABGR3 ABGR2 ABGR1 ABGR0 \n\ + movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ABGR7 ABGR6 ABGR5 ABGR4 \n"\ + SSE2_MASSAGE_RGBA32(ZERO) +#define SSE2_MASSAGE_RGBA32(ZERO) "\ + movdqa %%xmm0, %%xmm2 # XMM2: ABGR3 ABGR2 ABGR1 ABGR0 \n\ + punpcklbw %%xmm1, %%xmm0 # X0.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\ + punpckhbw %%xmm1, %%xmm2 # X2.l: A6 A2 B6 B2 G6 G2 R6 R2 \n\ + movdqa %%xmm0, %%xmm1 # X1.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\ + punpcklbw %%xmm2, %%xmm0 # X0.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\ + punpckhbw %%xmm2, %%xmm1 # X1.l: G7 G5 G3 G1 R7 R5 R3 R1 \n\ + movdqa %%xmm0, %%xmm2 # X2.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\ + punpcklbw %%xmm1, %%xmm0 # XMM0: G7.......G0 R7.......R0 \n\ + punpckhbw %%xmm1, %%xmm2 # XMM2: A7.......A0 B7.......B0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: G7.......G0 
R7.......R0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + punpckhbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + punpcklbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n" + +#define SSE2_LOAD_BGRA32(ZERO) "\ + movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ARGB3 ARGB2 ARGB1 ARGB0 \n\ + movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ARGB7 ARGB6 ARGB5 ARGB4 \n"\ + SSE2_MASSAGE_BGRA32(ZERO) +#define SSE2_MASSAGE_BGRA32(ZERO) "\ + movdqa %%xmm0, %%xmm2 # XMM2: ARGB3 ARGB2 ARGB1 ARGB0 \n\ + punpcklbw %%xmm1, %%xmm2 # X2.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\ + punpckhbw %%xmm1, %%xmm0 # X0.l: A6 A2 R6 R2 G6 G2 B6 B2 \n\ + movdqa %%xmm2, %%xmm1 # X1.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\ + punpcklbw %%xmm0, %%xmm2 # X2.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\ + punpckhbw %%xmm0, %%xmm1 # X1.l: G7 G5 G3 G1 B7 B5 B3 B1 \n\ + movdqa %%xmm2, %%xmm0 # X0.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\ + punpcklbw %%xmm1, %%xmm2 # XMM2: G7.......G0 B7.......B0 \n\ + punpckhbw %%xmm1, %%xmm0 # XMM0: A7.......A0 R7.......R0 \n\ + movdqa %%xmm2, %%xmm1 # XMM1: G7.......G0 B7.......B0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + punpckhbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + punpcklbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n" + +#define SSE2_LOAD_ARGB32(ZERO) "\ + movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: BGRA3 BGRA2 BGRA1 BGRA0 \n\ + movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: BGRA7 BGRA6 BGRA5 BGRA4 \n"\ + SSE2_MASSAGE_ARGB32(ZERO) +#define SSE2_MASSAGE_ARGB32(ZERO) "\ + movdqa %%xmm0, %%xmm2 # XMM2: BGRA3 BGRA2 BGRA1 BGRA0 \n\ + punpcklbw %%xmm1, %%xmm0 # X0.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\ + punpckhbw %%xmm1, %%xmm2 # X2.l: B6 B2 G6 G2 R6 R2 A6 A2 \n\ + movdqa %%xmm0, %%xmm1 # X1.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\ + punpcklbw %%xmm2, %%xmm0 # X0.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\ + punpckhbw %%xmm2, %%xmm1 # X1.l: R7 R5 R3 R1 A7 A5 A3 A1 \n\ + movdqa %%xmm0, %%xmm2 # X2.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\ + punpcklbw %%xmm1, %%xmm0 # XMM0: R7.......G0 A7.......A0 \n\ + punpckhbw %%xmm1, %%xmm2 # XMM2: B7.......G0 G7.......G0 \n\ + movdqa %%xmm2, %%xmm1 # XMM1: B7.......B0 G7.......G0 \n\ + punpckhbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + punpcklbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + punpckhbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n" + +#define SSE2_LOAD_ABGR32(ZERO) "\ + movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: RGBA3 RGBA2 RGBA1 RGBA0 \n\ + movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: RGBA7 RGBA6 RGBA5 RGBA4 \n"\ + SSE2_MASSAGE_ABGR32(ZERO) +#define SSE2_MASSAGE_ABGR32(ZERO) "\ + movdqa %%xmm0, %%xmm2 # XMM2: RGBA3 RGBA2 RGBA1 RGBA0 \n\ + punpcklbw %%xmm1, %%xmm2 # X2.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\ + punpckhbw %%xmm1, %%xmm0 # X0.l: R6 R2 G6 G2 B6 B2 A6 A2 \n\ + movdqa %%xmm2, %%xmm1 # X1.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\ + punpcklbw %%xmm0, %%xmm2 # X2.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\ + punpckhbw %%xmm0, %%xmm1 # X1.l: B7 B5 B3 B1 A7 A5 A3 A1 \n\ + movdqa %%xmm2, %%xmm0 # X0.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\ + punpcklbw %%xmm1, %%xmm2 # XMM2: B7.......B0 A7.......A0 \n\ + punpckhbw %%xmm1, %%xmm0 # XMM0: R7.......R0 G7.......G0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: R7.......R0 G7.......G0 \n\ + punpckhbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + punpcklbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + punpckhbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n" + +/*************************************************************************/ + +#endif /* ACLIB_IMG_X86_COMMON_H */ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: 
((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_mixed.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_mixed.c new file mode 100644 index 00000000..7f4b8d70 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_mixed.c @@ -0,0 +1,981 @@ +/* + * img_yuv_packed.c - YUV planar<->packed image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "imgconvert.h" +#include "img_internal.h" + +/*************************************************************************/ +/*************************************************************************/ + +/* Standard C implementations */ + +/*************************************************************************/ + +/* Wrappers for UYVY and YVYU */ +/* Note: we rely on YUY2<->{UYVY,YVYU} working for src==dest */ +/* FIXME: when converting from UYVY/YVYU, src is destroyed! */ + +static int uyvy_yvyu_wrapper(uint8_t **src, ImageFormat srcfmt, + uint8_t **dest, ImageFormat destfmt, + int width, int height) +{ + if (srcfmt == IMG_UYVY || srcfmt == IMG_YVYU) + return ac_imgconvert(src, srcfmt, src, IMG_YUY2, width, height) + && ac_imgconvert(src, IMG_YUY2, dest, destfmt, width, height); + else + return ac_imgconvert(src, srcfmt, dest, IMG_YUY2, width, height) + && ac_imgconvert(dest, IMG_YUY2, dest, destfmt, width, height); +} + +static int yuv420p_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV420P, dest, IMG_UYVY, width, height); } + +static int yuv420p_yvyu(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV420P, dest, IMG_YVYU, width, height); } + +static int yuv411p_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV411P, dest, IMG_UYVY, width, height); } + +static int yuv411p_yvyu(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV411P, dest, IMG_YVYU, width, height); } + +static int yuv422p_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV422P, dest, IMG_UYVY, width, height); } + +static int yuv422p_yvyu(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV422P, dest, IMG_YVYU, width, height); } + +static int yuv444p_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV444P, dest, IMG_UYVY, width, height); } + +static int yuv444p_yvyu(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV444P, dest, IMG_YVYU, width, height); } + +static int uyvy_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV420P, width, height); } + +static int yvyu_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV420P, width, height); } + +static int uyvy_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV411P, width, height); } + +static int yvyu_yuv411p(uint8_t **src, uint8_t **dest, int 
width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV411P, width, height); } + +static int uyvy_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV422P, width, height); } + +static int yvyu_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV422P, width, height); } + +static int uyvy_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV444P, width, height); } + +static int yvyu_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV444P, width, height); } + +/*************************************************************************/ + +static int yuv420p_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + + for (y = 0; y < (height & ~1); y++) { + for (x = 0; x < (width & ~1); x += 2) { + dest[0][(y*width+x)*2 ] = src[0][y*width+x]; + dest[0][(y*width+x)*2+1] = src[1][(y/2)*(width/2)+x/2]; + dest[0][(y*width+x)*2+2] = src[0][y*width+x+1]; + dest[0][(y*width+x)*2+3] = src[2][(y/2)*(width/2)+x/2]; + } + } + return 1; +} + +static int yuv411p_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + + for (y = 0; y < height; y++) { + for (x = 0; x < (width & ~1); x += 2) { + dest[0][(y*width+x)*2 ] = src[0][y*width+x]; + dest[0][(y*width+x)*2+1] = src[1][y*(width/4)+x/4]; + dest[0][(y*width+x)*2+2] = src[0][y*width+x+1]; + dest[0][(y*width+x)*2+3] = src[2][y*(width/4)+x/4]; + } + } + return 1; +} + +static int yuv422p_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < (width/2)*height; i++) { + dest[0][i*4 ] = src[0][i*2]; + dest[0][i*4+1] = src[1][i]; + dest[0][i*4+2] = src[0][i*2+1]; + dest[0][i*4+3] = src[2][i]; + } + return 1; +} + +static int yuv444p_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < (width/2)*height; i++) { + dest[0][i*4 ] = src[0][i*2]; + dest[0][i*4+1] = (src[1][i*2] + src[1][i*2+1]) / 2; + dest[0][i*4+2] = src[0][i*2+1]; + dest[0][i*4+3] = (src[2][i*2] + src[2][i*2+1]) / 2; + } + return 1; +} + +/*************************************************************************/ + +static int yuy2_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + + for (y = 0; y < (height & ~1); y++) { + for (x = 0; x < (width & ~1); x += 2) { + dest[0][y*width+x ] = src[0][(y*width+x)*2 ]; + dest[0][y*width+x+1] = src[0][(y*width+x)*2+2]; + if (y%2 == 0) { + dest[1][(y/2)*(width/2)+x/2] = src[0][(y*width+x)*2+1]; + dest[2][(y/2)*(width/2)+x/2] = src[0][(y*width+x)*2+3]; + } else { + dest[1][(y/2)*(width/2)+x/2] = + (dest[1][(y/2)*(width/2)+x/2] + src[0][(y*width+x)*2+1] + 1) / 2; + dest[2][(y/2)*(width/2)+x/2] = + (dest[2][(y/2)*(width/2)+x/2] + src[0][(y*width+x)*2+3] + 1) / 2; + } + } + } + return 1; +} + +static int yuy2_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + for (y = 0; y < height; y++) { + for (x = 0; x < (width & ~3); x += 4) { + dest[0][y*width+x] = src[0][(y*width+x)*2 ]; + dest[0][y*width+x+1] = src[0][(y*width+x)*2+2]; + dest[0][y*width+x+2] = src[0][(y*width+x)*2+4]; + dest[0][y*width+x+3] = src[0][(y*width+x)*2+6]; + dest[1][y*(width/4)+x/4] = (src[0][(y*width+x)*2+1] + + src[0][(y*width+x)*2+5] + 1) / 2; + dest[2][y*(width/4)+x/4] = (src[0][(y*width+x)*2+3] + + src[0][(y*width+x)*2+7] + 1) / 2; + } + } + return 1; +} + 
+static int yuy2_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < (width/2)*height; i++) { + dest[0][i*2] = src[0][i*4 ]; + dest[1][i] = src[0][i*4+1]; + dest[0][i*2+1] = src[0][i*4+2]; + dest[2][i] = src[0][i*4+3]; + } + return 1; +} + +static int yuy2_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < (width & ~1)*height; i += 2) { + dest[0][i] = src[0][i*2 ]; + dest[1][i] = src[0][i*2+1]; + dest[1][i+1] = src[0][i*2+1]; + dest[0][i+1] = src[0][i*2+2]; + dest[2][i] = src[0][i*2+3]; + dest[2][i+1] = src[0][i*2+3]; + } + return 1; +} + +/*************************************************************************/ + +static int y8_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*2 ] = src[0][i]; + dest[0][i*2+1] = 128; + } + return 1; +} + +static int y8_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*2 ] = 128; + dest[0][i*2+1] = src[0][i]; + } + return 1; +} + +static int yuy2_y8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) + dest[0][i] = src[0][i*2]; + return 1; +} + +static int uyvy_y8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) + dest[0][i] = src[0][i*2+1]; + return 1; +} + +/*************************************************************************/ +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE2) + +/* SSE2 routines. See comments in img_x86_common.h for why we don't bother + * unrolling the loops. */ + +/* Common macros/data for x86 code */ +#include "img_x86_common.h" + +/* YUV420P (1 row) or YUV422P -> YUY2 (unit: 2 pixels) */ +#define YUV42XP_YUY2 \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ PUSH(EBX), \ + /* pop_regs */ POP(EBX), \ + /* small_loop */ \ + "movb -1("EDX","ECX"), %%bh \n\ + movb -1("ESI","ECX",2), %%bl \n\ + shll $16, %%ebx \n\ + movb -1("EAX","ECX"), %%bh \n\ + movb -2("ESI","ECX",2), %%bl \n\ + movl %%ebx, -4("EDI","ECX",4)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: YF YE YD ..... Y2 Y1 Y0 \n\ + movq -8("EAX","ECX"), %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + movq -8("EDX","ECX"), %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\ + punpcklbw %%xmm3, %%xmm2 # XMM2: V7 U7 V6 ..... U1 V0 U0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + punpckhbw %%xmm2, %%xmm1 # XMM1: V7 YF U7 ..... Y9 U4 Y8 \n\ + movdqu %%xmm0, -32("EDI","ECX",4) \n\ + movdqu %%xmm1, -16("EDI","ECX",4)", \ + /* emms */ "emms") + +/* YUV411P -> YUY2 (unit: 4 pixels) */ +#define YUV411P_YUY2 \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ PUSH(EBX), \ + /* pop_regs */ POP(EBX), \ + /* small_loop */ \ + "movb -1("EDX","ECX"), %%bh \n\ + movb -1("ESI","ECX",4), %%bl \n\ + shll $16, %%ebx \n\ + movb -1("EAX","ECX"), %%bh \n\ + movb -2("ESI","ECX",4), %%bl \n\ + movl %%ebx, -4("EDI","ECX",8) \n\ + movb -1("EDX","ECX"), %%bh \n\ + movb -3("ESI","ECX",4), %%bl \n\ + shll $16, %%ebx \n\ + movb -1("EAX","ECX"), %%bh \n\ + movb -4("ESI","ECX",4), %%bl \n\ + movl %%ebx, -8("EDI","ECX",8)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: YF YE YD ..... 
Y2 Y1 Y0 \n\ + movd -4("EAX","ECX"), %%xmm2 # XMM2: U3 U2 U1 U0 \n\ + punpcklbw %%xmm2, %%xmm2 # XMM2: U3 U3 U2 U2 U1 U1 U0 U0 \n\ + movd -4("EDX","ECX"), %%xmm3 # XMM3: V3 V2 V1 V0 \n\ + punpcklbw %%xmm3, %%xmm3 # XMM3: V3 V3 V2 V2 V1 V1 V0 V0 \n\ + punpcklbw %%xmm3, %%xmm2 # XMM2: V3 U3 V3 ..... U0 V0 U0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: V1 Y7 U1 ..... Y1 U0 Y0 \n\ + punpckhbw %%xmm2, %%xmm1 # XMM1: V3 YF U3 ..... Y9 U2 Y8 \n\ + movdqu %%xmm0, -32("EDI","ECX",8) \n\ + movdqu %%xmm1, -16("EDI","ECX",8)", \ + /* emms */ "emms") + +/* YUV444P -> YUY2 (unit: 2 pixels) */ +#define YUV444P_YUY2 \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ PUSH2(EBX,EBP), \ + /* pop_regs */ POP2(EBP,EBX), \ + /* small_loop */ \ + "movzbl -1("EDX","ECX",2), %%ebx \n\ + movzbl -2("EDX","ECX",2), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -1("EDI","ECX",4) \n\ + movb -1("ESI","ECX",2), %%bl \n\ + movb %%bl, -2("EDI","ECX",4) \n\ + movzbl -1("EAX","ECX",2), %%ebx \n\ + movzbl -2("EAX","ECX",2), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -3("EDI","ECX",4) \n\ + movb -2("ESI","ECX",2), %%bl \n\ + movb %%bl, -4("EDI","ECX",4)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: YF YE YD ..... Y2 Y1 Y0 \n\ + movdqu -16("EAX","ECX",2), %%xmm2 #XM2: UF UE UD ..... U2 U1 U0 \n\ + movdqu -16("EDX","ECX",2), %%xmm3 #XM3: VF VE VD ..... V2 V1 V0 \n\ + movdqa %%xmm2, %%xmm4 # XMM4: UF UE UD ..... U2 U1 U0 \n\ + pand %%xmm7, %%xmm2 # XMM2: -- UE -- ..... U2 -- U0 \n\ + psrlw $8, %%xmm4 # XMM4: -- UF -- ..... U3 -- U1 \n\ + pavgw %%xmm4, %%xmm2 # XMM2: -- u7 -- ..... u1 -- u0 \n\ + movdqa %%xmm3, %%xmm5 # XMM4: UF UE UD ..... U2 U1 U0 \n\ + pand %%xmm7, %%xmm3 # XMM3: -- VE -- ..... V2 -- V0 \n\ + psrlw $8, %%xmm5 # XMM5: -- VF -- ..... V3 -- V1 \n\ + pavgw %%xmm5, %%xmm3 # XMM3: -- v7 -- ..... v1 -- v0 \n\ + psllw $8, %%xmm3 # XMM3: v7 -- v6 ..... -- v0 -- \n\ + por %%xmm3, %%xmm2 # XMM2: v7 u7 v6 ..... u1 v0 u0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: v3 Y7 u3 ..... Y1 u0 Y0 \n\ + punpckhbw %%xmm2, %%xmm1 # XMM1: v7 YF u7 ..... Y9 u4 Y8 \n\ + movdqu %%xmm0, -32("EDI","ECX",4) \n\ + movdqu %%xmm1, -16("EDI","ECX",4)", \ + /* emms */ "emms") + +/* YUY2 -> YUV420P (U row) (unit: 2 pixels) */ +#define YUY2_YUV420P_U \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ PUSH2(EBX,EBP), \ + /* pop_regs */ POP2(EBP,EBX), \ + /* small_loop */ \ + "movb -4("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EDI","ECX",2) \n\ + movb -2("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EDI","ECX",2) \n\ + movzbl -3("ESI","ECX",4), %%ebx \n\ + movzbl -3("EAX","ECX",4), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -1("EDX","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqu -16("EAX","ECX",4),%%xmm2 #XMM2: Vd Yh Ud ..... Yb Ua Ya \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- Vd -- ..... Va -- Ua \n\ + pavgw %%xmm2, %%xmm1 # XMM1: -- v3 -- ..... 
v0 -- u0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: v3 u3 v2 u2 v1 u1 v0 u0 \n\ + pand %%xmm7, %%xmm1 # XMM1: -- u3 -- u2 -- u1 -- u0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: u3 u2 u1 u0 \n\ + movq %%xmm0, -8("EDI","ECX",2) \n\ + movd %%xmm1, -4("EDX","ECX")", \ + /* emms */ "emms") + +/* YUY2 -> YUV420P (V row) (unit: 2 pixels) */ +#define YUY2_YUV420P_V \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ PUSH2(EBX,EBP), \ + /* pop_regs */ POP2(EBP,EBX), \ + /* small_loop */ \ + "movb -4("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EDI","ECX",2) \n\ + movb -2("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EDI","ECX",2) \n\ + movzbl -1("ESI","ECX",4), %%ebx \n\ + movzbl -1("EAX","ECX",4), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -1("EDX","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqu -16("EAX","ECX",4),%%xmm2 #XMM2: Vd Yh Ud ..... Yb Ua Ya \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- Vd -- ..... Va -- Ua \n\ + pavgw %%xmm1, %%xmm2 # XMM2: -- v3 -- ..... v0 -- u0 \n\ + packuswb %%xmm2, %%xmm2 # XMM2: v3 u3 v2 u2 v1 u1 v0 u0 \n\ + psrlw $8, %%xmm2 # XMM2: -- v3 -- v2 -- v1 -- v0 \n\ + packuswb %%xmm2, %%xmm2 # XMM2: v3 v2 v1 v0 \n\ + movq %%xmm0, -8("EDI","ECX",2) \n\ + movd %%xmm2, -4("EDX","ECX")", \ + /* emms */ "emms") + +/* YUY2 -> YUV411P (unit: 4 pixels) */ +#define YUY2_YUV411P \ + /* Load 0x000..000FFFFFFFF into XMM6, 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm6, %%xmm6; psrldq $12, %%xmm6;" \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 2, \ + /* push_regs */ PUSH2(EBX,EBP), \ + /* pop_regs */ POP2(EBP,EBX), \ + /* small_loop */ \ + "movb -8("ESI","ECX",8), %%bl \n\ + movb %%bl, -4("EDI","ECX",4) \n\ + movb -6("ESI","ECX",8), %%bl \n\ + movb %%bl, -3("EDI","ECX",4) \n\ + movb -4("ESI","ECX",8), %%bl \n\ + movb %%bl, -2("EDI","ECX",4) \n\ + movb -2("ESI","ECX",8), %%bl \n\ + movb %%bl, -1("EDI","ECX",4) \n\ + movzbl -7("ESI","ECX",8), %%ebx \n\ + movzbl -3("ESI","ECX",8), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -1("EAX","ECX") \n\ + movzbl -5("ESI","ECX",8), %%ebx \n\ + movzbl -1("ESI","ECX",8), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -1("EDX","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",8),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... 
V0 -- U0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: U3 U2 U1 U0 \n\ + packuswb %%xmm2, %%xmm2 # XMM2: V3 V2 V1 V0 \n\ + pand %%xmm6, %%xmm1 # XMM1: -- -- -- -- U3 U2 U1 U0 \n\ + psllq $32, %%xmm2 # XMM2: V3 V2 V1 V0 -- -- -- -- \n\ + por %%xmm1, %%xmm2 # XMM2: V3 V2 V1 V0 U3 U2 U1 U0 \n\ + movdqa %%xmm2, %%xmm1 # XMM1: V3 V2 V1 V0 U3 U2 U1 U0 \n\ + pand %%xmm7, %%xmm1 # XMM1: -- V2 -- V0 -- U2 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- V3 -- V1 -- U3 -- U1 \n\ + pavgw %%xmm2, %%xmm1 # XMM1: -- v1 -- v0 -- u1 -- u0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: v1 v0 u1 u0 \n\ + movq %%xmm0, -8("EDI","ECX",4) \n\ + movd %%xmm1, %%ebx \n\ + movw %%bx, -2("EAX","ECX") \n\ + shrl $16, %%ebx; \n\ + movw %%bx, -2("EDX","ECX")", \ + /* emms */ "emms") + +/* YUY2 -> YUV422P (unit: 2 pixels) */ +#define YUY2_YUV422P \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ PUSH(EBX), \ + /* pop_regs */ POP(EBX), \ + /* small_loop */ \ + "movb -4("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EDI","ECX",2) \n\ + movb -2("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EDI","ECX",2) \n\ + movb -3("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EAX","ECX") \n\ + movb -1("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EDX","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: U3 U2 U1 U0 \n\ + packuswb %%xmm2, %%xmm2 # XMM2: V3 V2 V1 V0 \n\ + movq %%xmm0, -8("EDI","ECX",2) \n\ + movd %%xmm1, -4("EAX","ECX") \n\ + movd %%xmm2, -4("EDX","ECX")", \ + /* emms */ "emms") + +/* YUY2 -> YUV444P (unit: 2 pixels) */ +#define YUY2_YUV444P \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ PUSH(EBX), \ + /* pop_regs */ POP(EBX), \ + /* small_loop */ \ + "movb -4("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EDI","ECX",2) \n\ + movb -2("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EDI","ECX",2) \n\ + movb -3("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EAX","ECX",2) \n\ + movb %%bl, -1("EAX","ECX",2) \n\ + movb -1("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EDX","ECX",2) \n\ + movb %%bl, -1("EDX","ECX",2)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... 
V0 -- U0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\ + movdqa %%xmm1, %%xmm3 # XMM3: -- U3 -- U2 -- U1 -- U0 \n\ + psllw $8, %%xmm3 # XMM3: U3 -- U2 -- U1 -- U0 -- \n\ + por %%xmm3, %%xmm1 # XMM1: U3 U3 U2 U2 U1 U1 U0 U0 \n\ + movdqa %%xmm2, %%xmm3 # XMM3: -- V3 -- V2 -- V1 -- V0 \n\ + psllw $8, %%xmm3 # XMM3: V3 -- V2 -- V1 -- V0 -- \n\ + por %%xmm3, %%xmm2 # XMM1: V3 V3 V2 V2 V1 V1 V0 V0 \n\ + movq %%xmm0, -8("EDI","ECX",2) \n\ + movq %%xmm1, -8("EAX","ECX",2) \n\ + movq %%xmm2, -8("EDX","ECX",2)", \ + /* emms */ "emms") + + +/* Y8 -> YUY2/YVYU (unit: 1 pixel) */ +#define Y8_YUY2 \ + /* Load 0x80*16 into XMM7 for interlacing U/V */ \ + "pcmpeqd %%xmm7, %%xmm7; psllw $7, %%xmm7; packsswb %%xmm7, %%xmm7;"\ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 16, \ + /* push_regs */ PUSH(EBX), \ + /* pop_regs */ POP(EBX), \ + /* small_loop */ \ + "movb -1("ESI","ECX"), %%al \n\ + movb %%al, -2("EDI","ECX",2) \n\ + movb $0x80, -1("EDI","ECX",2)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX"),%%xmm0 # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\ + punpcklbw %%xmm7, %%xmm0 # XMM0: 80 Y7 80 ..... Y1 80 Y0 \n\ + movdqu %%xmm0, -32("EDI","ECX",2) \n\ + punpckhbw %%xmm7, %%xmm1 # XMM1: 80 YF 80 ..... Y9 80 Y8 \n\ + movdqu %%xmm1, -16("EDI","ECX",2)", \ + /* emms */ "emms") + +/* Y8 -> UYVY (unit: 1 pixel) */ +#define Y8_UYVY \ + /* Load 0x80*16 into XMM7 for interlacing U/V */ \ + "pcmpeqd %%xmm7, %%xmm7; psllw $7, %%xmm7; packsswb %%xmm7, %%xmm7;"\ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 16, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movb -1("ESI","ECX"), %%al \n\ + movb %%al, -1("EDI","ECX",2) \n\ + movb $0x80, -2("EDI","ECX",2)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX"),%%xmm0 # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\ + movdqa %%xmm7, %%xmm1 # XMM1: 80 80 80 ..... 80 80 80 \n\ + punpcklbw %%xmm0, %%xmm1 # XMM1: Y7 80 Y6 ..... 80 Y0 80 \n\ + movdqu %%xmm1, -32("EDI","ECX",2) \n\ + movdqa %%xmm7, %%xmm2 # XMM2: 80 80 80 ..... 80 80 80 \n\ + punpckhbw %%xmm0, %%xmm2 # XMM0: YF 80 YE ..... 80 Y8 80 \n\ + movdqu %%xmm2, -16("EDI","ECX",2)", \ + /* emms */ "emms") + +/* YUY2/YVYU -> Y8 (unit: 1 pixel) */ +#define YUY2_Y8 \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movb -2("ESI","ECX",2), %%al \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movq %%xmm0, -8("EDI","ECX")", \ + /* emms */ "emms") + +/* UYVY -> Y8 (unit: 1 pixel) */ +#define UYVY_Y8 \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movb -1("ESI","ECX",2), %%al \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: Y7 V3 Y6 ..... V0 Y0 U0 \n\ + psrlw $8, %%xmm0 # XMM0: -- Y7 -- ..... 
Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movq %%xmm0, -8("EDI","ECX")", \ + /* emms */ "emms") + +/*************************************************************************/ + +static int yuv420p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + for (y = 0; y < (height & ~1); y++) { + int dummy; + asm volatile(YUV42XP_YUY2 + : "=c" (dummy) // Ensure GCC reloads ECX each time through + : "S" (src[0]+y*width), "a" (src[1]+(y/2)*(width/2)), + "d" (src[2]+(y/2)*(width/2)), "D" (dest[0]+y*width*2), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } + return 1; +} + +static int yuv411p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 3)) { + asm(YUV411P_YUY2 + : /* no outputs */ + : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]), + "c" ((width/4)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } else { + int y; + for (y = 0; y < height; y++) { + int dummy; + asm volatile(YUV411P_YUY2 + : "=c" (dummy) + : "S" (src[0]+y*width), "a" (src[1]+y*(width/4)), + "d" (src[2]+y*(width/4)), "D" (dest[0]+y*width*2), + "0" (width/4) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } + } + return 1; +} + +static int yuv422p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 1)) { + asm(YUV42XP_YUY2 + : /* no outputs */ + : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]), + "c" ((width/2)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } else { + int y; + for (y = 0; y < height; y++) { + int dummy; + asm volatile(YUV42XP_YUY2 + : "=c" (dummy) + : "S" (src[0]+y*width), "a" (src[1]+y*(width/2)), + "d" (src[2]+y*(width/2)), "D" (dest[0]+y*width*2), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } + } + return 1; +} + +static int yuv444p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 1)) { + asm(YUV444P_YUY2 + : /* no outputs */ + : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]), + "c" ((width/2)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + } else { + int y; + for (y = 0; y < height; y++) { + int dummy; + asm volatile(YUV444P_YUY2 + : "=c" (dummy) + : "S" (src[0]+y*width), "a" (src[1]+y*(width/2)), + "d" (src[2]+y*(width/2)), "D" (dest[0]+y*width*2), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + } + } + return 1; +} + +/*************************************************************************/ + +static int yuy2_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + + for (y = 0; y < (height & ~1); y += 2) { + int dummy; + asm volatile(YUY2_YUV420P_U + : "=c" (dummy) + : "S" (src[0]+y*width*2), "a" (src[0]+(y+1)*width*2), + "D" (dest[0]+y*width), "d" (dest[1]+(y/2)*(width/2)), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + asm volatile(YUY2_YUV420P_V + : "=c" (dummy) + : "S" (src[0]+(y+1)*width*2), "a" (src[0]+y*width*2), + "D" (dest[0]+(y+1)*width), "d" (dest[2]+(y/2)*(width/2)), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + } + return 1; +} + +static int yuy2_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 3)) { + asm(YUY2_YUV411P + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]), + "c" ((width/4)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + } else { + int y; + for (y = 0; y < 
height; y++) { + int dummy; + asm volatile(YUY2_YUV411P + : "=c" (dummy) + : "S" (src[0]+y*width*2), "D" (dest[0]+y*width), + "a" (dest[1]+y*(width/4)), "d" (dest[2]+y*(width/4)), + "0" (width/4) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + } + } + return 1; +} + +static int yuy2_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 1)) { + asm(YUY2_YUV422P + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]), + "c" ((width/2)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } else { + int y; + for (y = 0; y < height; y++) { + int dummy; + asm volatile(YUY2_YUV422P + : "=c" (dummy) + : "S" (src[0]+y*width*2), "D" (dest[0]+y*width), + "a" (dest[1]+y*(width/2)), "d" (dest[2]+y*(width/2)), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } + } + return 1; +} + +static int yuy2_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 1)) { + asm(YUY2_YUV444P + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]), + "c" ((width/2)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } else { + int y; + for (y = 0; y < height; y++) { + int dummy; + asm volatile(YUY2_YUV444P + : "=c" (dummy) + : "S" (src[0]+y*width*2), "D" (dest[0]+y*width), + "a" (dest[1]+y*width), "d" (dest[2]+y*width), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } + } + return 1; +} + +/*************************************************************************/ + +static int y8_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm(Y8_YUY2 + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax" COMMA_FAKE_PUSH_REG + ); + return 1; +} + +static int y8_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm(Y8_UYVY + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax"); + return 1; +} + +static int yuy2_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm(YUY2_Y8 + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax"); + return 1; +} + +static int uyvy_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm(UYVY_Y8 + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax"); + return 1; +} + +/*************************************************************************/ + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization */ + +int ac_imgconvert_init_yuv_mixed(int accel) +{ + if (!register_conversion(IMG_YUV420P, IMG_YUY2, yuv420p_yuy2) + || !register_conversion(IMG_YUV411P, IMG_YUY2, yuv411p_yuy2) + || !register_conversion(IMG_YUV422P, IMG_YUY2, yuv422p_yuy2) + || !register_conversion(IMG_YUV444P, IMG_YUY2, yuv444p_yuy2) + || !register_conversion(IMG_Y8, IMG_YUY2, y8_yuy2) + || !register_conversion(IMG_YUV420P, IMG_UYVY, yuv420p_uyvy) + || !register_conversion(IMG_YUV411P, IMG_UYVY, yuv411p_uyvy) + || !register_conversion(IMG_YUV422P, IMG_UYVY, yuv422p_uyvy) + || !register_conversion(IMG_YUV444P, IMG_UYVY, yuv444p_uyvy) + || !register_conversion(IMG_Y8, IMG_UYVY, y8_uyvy) + || !register_conversion(IMG_YUV420P, IMG_YVYU, yuv420p_yvyu) + || !register_conversion(IMG_YUV411P, IMG_YVYU, yuv411p_yvyu) + || !register_conversion(IMG_YUV422P, IMG_YVYU, yuv422p_yvyu) + || !register_conversion(IMG_YUV444P, 
IMG_YVYU, yuv444p_yvyu) + || !register_conversion(IMG_Y8, IMG_YVYU, y8_yuy2) + + || !register_conversion(IMG_YUY2, IMG_YUV420P, yuy2_yuv420p) + || !register_conversion(IMG_YUY2, IMG_YUV411P, yuy2_yuv411p) + || !register_conversion(IMG_YUY2, IMG_YUV422P, yuy2_yuv422p) + || !register_conversion(IMG_YUY2, IMG_YUV444P, yuy2_yuv444p) + || !register_conversion(IMG_YUY2, IMG_Y8, yuy2_y8) + || !register_conversion(IMG_UYVY, IMG_YUV420P, uyvy_yuv420p) + || !register_conversion(IMG_UYVY, IMG_YUV411P, uyvy_yuv411p) + || !register_conversion(IMG_UYVY, IMG_YUV422P, uyvy_yuv422p) + || !register_conversion(IMG_UYVY, IMG_YUV444P, uyvy_yuv444p) + || !register_conversion(IMG_UYVY, IMG_Y8, uyvy_y8) + || !register_conversion(IMG_YVYU, IMG_YUV420P, yvyu_yuv420p) + || !register_conversion(IMG_YVYU, IMG_YUV411P, yvyu_yuv411p) + || !register_conversion(IMG_YVYU, IMG_YUV422P, yvyu_yuv422p) + || !register_conversion(IMG_YVYU, IMG_YUV444P, yvyu_yuv444p) + || !register_conversion(IMG_YVYU, IMG_Y8, yuy2_y8) + ) { + return 0; + } + +#if defined(HAVE_ASM_SSE2) + if (accel & AC_SSE2) { + if (!register_conversion(IMG_YUV420P, IMG_YUY2, yuv420p_yuy2_sse2) + || !register_conversion(IMG_YUV411P, IMG_YUY2, yuv411p_yuy2_sse2) + || !register_conversion(IMG_YUV422P, IMG_YUY2, yuv422p_yuy2_sse2) + || !register_conversion(IMG_YUV444P, IMG_YUY2, yuv444p_yuy2_sse2) + || !register_conversion(IMG_Y8, IMG_YUY2, y8_yuy2_sse2) + || !register_conversion(IMG_Y8, IMG_UYVY, y8_uyvy_sse2) + || !register_conversion(IMG_Y8, IMG_YVYU, y8_yuy2_sse2) + + || !register_conversion(IMG_YUY2, IMG_YUV420P, yuy2_yuv420p_sse2) + || !register_conversion(IMG_YUY2, IMG_YUV411P, yuy2_yuv411p_sse2) + || !register_conversion(IMG_YUY2, IMG_YUV422P, yuy2_yuv422p_sse2) + || !register_conversion(IMG_YUY2, IMG_YUV444P, yuy2_yuv444p_sse2) + || !register_conversion(IMG_YUY2, IMG_Y8, yuy2_y8_sse2) + || !register_conversion(IMG_UYVY, IMG_Y8, uyvy_y8_sse2) + || !register_conversion(IMG_YVYU, IMG_Y8, yuy2_y8_sse2) + ) { + return 0; + } + } +#endif /* HAVE_ASM_SSE2 */ + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_packed.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_packed.c new file mode 100644 index 00000000..05357405 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_packed.c @@ -0,0 +1,290 @@ +/* + * img_yuv_packed.c - YUV packed image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. 
+ */ + +#include "ac.h" +#include "imgconvert.h" +#include "img_internal.h" + +/*************************************************************************/ +/*************************************************************************/ + +/* Standard C implementations */ + +/*************************************************************************/ + +/* Identity transformation, works when src==dest */ +static int yuv16_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height*2); + return 1; +} + +/* Used for YUY2->UYVY and UYVY->YUY2, works when src==dest */ +static int yuv16_swap16(uint8_t **src, uint8_t **dest, int width, int height) +{ + uint16_t *srcp = (uint16_t *)src[0]; + uint16_t *destp = (uint16_t *)dest[0]; + int i; + for (i = 0; i < width*height; i++) + destp[i] = srcp[i]>>8 | srcp[i]<<8; + return 1; +} + +/* Used for YUY2->YVYU and YVYU->YUY2, works when src==dest */ +static int yuv16_swapuv(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height/2; i++) { + uint8_t tmp = src[0][i*4+1]; + dest[0][i*4 ] = src[0][i*4 ]; + dest[0][i*4+1] = src[0][i*4+3]; + dest[0][i*4+2] = src[0][i*4+2]; + dest[0][i*4+3] = tmp; + } + return 1; +} + +/*************************************************************************/ + +static int uyvy_yvyu(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height/2; i++) { + dest[0][i*4 ] = src[0][i*4+1]; + dest[0][i*4+1] = src[0][i*4+2]; + dest[0][i*4+2] = src[0][i*4+3]; + dest[0][i*4+3] = src[0][i*4 ]; + } + return 1; +} + +static int yvyu_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height/2; i++) { + dest[0][i*4 ] = src[0][i*4+3]; + dest[0][i*4+1] = src[0][i*4 ]; + dest[0][i*4+2] = src[0][i*4+1]; + dest[0][i*4+3] = src[0][i*4+2]; + } + return 1; +} + +/*************************************************************************/ +/*************************************************************************/ + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + +/* Common macros/data for x86 code */ +#define DEFINE_MASK_DATA +#include "img_x86_common.h" + +/*************************************************************************/ + +/* Basic assembly routines */ + +static int yuv16_swap16_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP16_2_X86(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yuv16_swapuv_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_X86(width*height/2); + return 1; +} + +static int uyvy_yvyu_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_X86(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yvyu_uyvy_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_X86(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +/*************************************************************************/ + +/* MMX routines */ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. 
not x86_64 */ + +static int yuv16_swap16_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP16_2_MMX(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yuv16_swapuv_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_MMX(width*height/2); + return 1; +} + +static int uyvy_yvyu_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_MMX(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yvyu_uyvy_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_MMX(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + +/*************************************************************************/ + +/* SSE2 routines */ + +#if defined(HAVE_ASM_SSE2) + +static int yuv16_swap16_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP16_2_SSE2(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yuv16_swapuv_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_SSE2(width*height/2); + return 1; +} + +static int uyvy_yvyu_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_SSE2(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yvyu_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_SSE2(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ + +#endif /* ARCH_X86 || ARCH_X86_64 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization */ + +int ac_imgconvert_init_yuv_packed(int accel) +{ + if (!register_conversion(IMG_YUY2, IMG_YUY2, yuv16_copy) + || !register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16) + || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv) + + || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16) + || !register_conversion(IMG_UYVY, IMG_UYVY, yuv16_copy) + || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu) + + || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv) + || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy) + || !register_conversion(IMG_YVYU, IMG_YVYU, yuv16_copy) + ) { + return 0; + } + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + if (accel & (AC_IA32ASM | AC_AMD64ASM)) { + if (!register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16_x86) + || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv_x86) + || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16_x86) + || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu_x86) + || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv_x86) + || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy_x86) + ) { + return 0; + } + } + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + if (accel & AC_MMX) { + if 
(!register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16_mmx) + || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv_mmx) + || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16_mmx) + || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu_mmx) + || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv_mmx) + || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy_mmx) + ) { + return 0; + } + } +#endif + +#if defined(HAVE_ASM_SSE2) + if (accel & AC_SSE2) { + if (!register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16_sse2) + || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv_sse2) + || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16_sse2) + || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu_sse2) + || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv_sse2) + || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy_sse2) + ) { + return 0; + } + } +#endif + +#endif /* ARCH_X86 || ARCH_X86_64 */ + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_planar.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_planar.c new file mode 100644 index 00000000..e510fa4a --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_planar.c @@ -0,0 +1,788 @@ +/* + * img_yuv_planar.c - YUV planar image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. 
+ */ + +#include "ac.h" +#include "imgconvert.h" +#include "img_internal.h" + +#include <string.h> + +/*************************************************************************/ +/*************************************************************************/ + +/* Standard C implementations */ + +/*************************************************************************/ + +/* Identity transformations */ + +static int yuv420p_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + ac_memcpy(dest[1], src[1], (width/2)*(height/2)); + ac_memcpy(dest[2], src[2], (width/2)*(height/2)); + return 1; +} + +static int yuv411p_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + ac_memcpy(dest[1], src[1], (width/4)*height); + ac_memcpy(dest[2], src[2], (width/4)*height); + return 1; +} + +static int yuv422p_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + ac_memcpy(dest[1], src[1], (width/2)*height); + ac_memcpy(dest[2], src[2], (width/2)*height); + return 1; +} + +static int yuv444p_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + ac_memcpy(dest[1], src[1], width*height); + ac_memcpy(dest[2], src[2], width*height); + return 1; +} + +static int y8_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + return 1; +} + +/*************************************************************************/ + +static int yuv420p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + for (x = 0; x < (width/2 & ~1); x += 2) { + dest[1][y*(width/4)+x/2] = (src[1][(y/2)*(width/2)+x] + + src[1][(y/2)*(width/2)+x+1] + 1) / 2; + dest[2][y*(width/4)+x/2] = (src[2][(y/2)*(width/2)+x] + + src[2][(y/2)*(width/2)+x+1] + 1) / 2; + } + ac_memcpy(dest[1]+(y+1)*(width/4), dest[1]+y*(width/4), width/4); + ac_memcpy(dest[2]+(y+1)*(width/4), dest[2]+y*(width/4), width/4); + } + return 1; +} + +static int yuv420p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + ac_memcpy(dest[1]+(y )*(width/2), src[1]+(y/2)*(width/2), width/2); + ac_memcpy(dest[1]+(y+1)*(width/2), src[1]+(y/2)*(width/2), width/2); + ac_memcpy(dest[2]+(y )*(width/2), src[2]+(y/2)*(width/2), width/2); + ac_memcpy(dest[2]+(y+1)*(width/2), src[2]+(y/2)*(width/2), width/2); + } + return 1; +} + +static int yuv420p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y += 2) { + for (x = 0; x < width; x += 2) { + dest[1][y*width+x ] = + dest[1][y*width+x+1] = src[1][(y/2)*(width/2)+(x/2)]; + dest[2][y*width+x ] = + dest[2][y*width+x+1] = src[2][(y/2)*(width/2)+(x/2)]; + } + ac_memcpy(dest[1]+(y+1)*width, dest[1]+y*width, width); + ac_memcpy(dest[2]+(y+1)*width, dest[2]+y*width, width); + } + return 1; +} + +/*************************************************************************/ + +static int yuv411p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + for (x = 0; x < ((width/2) & ~1); x += 2) { + dest[1][(y/2)*(width/2)+x] = (src[1][y*(width/4)+x/2] + + 
src[1][(y+1)*(width/4)+x/2] + 1) / 2; + dest[2][(y/2)*(width/2)+x] = (src[2][y*(width/4)+x/2] + + src[2][(y+1)*(width/4)+x/2] + 1) / 2; + dest[1][(y/2)*(width/2)+x+1] = dest[1][(y/2)*(width/2)+x]; + dest[2][(y/2)*(width/2)+x+1] = dest[2][(y/2)*(width/2)+x]; + } + } + return 1; +} + +static int yuv411p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x < ((width/2) & ~1); x += 2) { + dest[1][y*(width/2)+x ] = src[1][y*(width/4)+x/2]; + dest[1][y*(width/2)+x+1] = src[1][y*(width/4)+x/2]; + dest[2][y*(width/2)+x ] = src[2][y*(width/4)+x/2]; + dest[2][y*(width/2)+x+1] = src[2][y*(width/4)+x/2]; + } + } + return 1; +} + +static int yuv411p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x < (width & ~3); x += 4) { + dest[1][y*width+x ] = src[1][y*(width/4)+x/4]; + dest[1][y*width+x+1] = src[1][y*(width/4)+x/4]; + dest[1][y*width+x+2] = src[1][y*(width/4)+x/4]; + dest[1][y*width+x+3] = src[1][y*(width/4)+x/4]; + dest[2][y*width+x ] = src[2][y*(width/4)+x/4]; + dest[2][y*width+x+1] = src[2][y*(width/4)+x/4]; + dest[2][y*width+x+2] = src[2][y*(width/4)+x/4]; + dest[2][y*width+x+3] = src[2][y*(width/4)+x/4]; + } + } + return 1; +} + +/*************************************************************************/ + +static int yuv422p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + for (x = 0; x < width/2; x++) { + dest[1][(y/2)*(width/2)+x] = (src[1][y*(width/2)+x] + + src[1][(y+1)*(width/2)+x] + 1) / 2; + dest[2][(y/2)*(width/2)+x] = (src[2][y*(width/2)+x] + + src[2][(y+1)*(width/2)+x] + 1) / 2; + } + } + return 1; +} + +static int yuv422p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x < ((width/2) & ~1); x += 2) { + dest[1][y*(width/4)+x/2] = (src[1][y*(width/2)+x] + + src[1][y*(width/2)+x+1] + 1) / 2; + dest[2][y*(width/4)+x/2] = (src[2][y*(width/2)+x] + + src[2][y*(width/2)+x+1] + 1) / 2; + } + } + return 1; +} + +static int yuv422p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x < (width & ~1); x += 2) { + dest[1][y*width+x ] = src[1][y*(width/2)+x/2]; + dest[1][y*width+x+1] = src[1][y*(width/2)+x/2]; + dest[2][y*width+x ] = src[2][y*(width/2)+x/2]; + dest[2][y*width+x+1] = src[2][y*(width/2)+x/2]; + } + } + return 1; +} + +/*************************************************************************/ + +static int yuv444p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + for (x = 0; x < (width & ~1); x += 2) { + dest[1][(y/2)*(width/2)+x/2] = (src[1][y*width+x] + + src[1][y*width+x+1] + + src[1][(y+1)*width+x] + + src[1][(y+1)*width+x+1] + 2) / 4; + dest[2][(y/2)*(width/2)+x/2] = (src[2][y*width+x] + + src[2][y*width+x+1] + + src[2][(y+1)*width+x] + + src[2][(y+1)*width+x+1] + 2) / 4; + } + } + return 1; +} + +static int yuv444p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x 
< (width & ~3); x += 4) { + dest[1][y*(width/4)+x/4] = (src[1][y*width+x] + + src[1][y*width+x+1] + + src[1][y*width+x+2] + + src[1][y*width+x+3] + 2) / 4; + dest[2][y*(width/4)+x/4] = (src[2][y*width+x] + + src[2][y*width+x+1] + + src[2][y*width+x+2] + + src[2][y*width+x+3] + 2) / 4; + } + } + return 1; +} + +static int yuv444p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x < (width & ~1); x += 2) { + dest[1][y*(width/2)+x/2] = (src[1][y*width+x] + + src[1][y*width+x+1] + 1) / 2; + dest[2][y*(width/2)+x/2] = (src[2][y*width+x] + + src[2][y*width+x+1] + 1) / 2; + } + } + return 1; +} + +/*************************************************************************/ + +/* We treat Y8 as a planar format */ + +static int yuvp_y8(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + return 1; +} + +static int y8_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + memset(dest[1], 128, (width/2)*(height/2)); + memset(dest[2], 128, (width/2)*(height/2)); + return 1; +} + +static int y8_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + memset(dest[1], 128, (width/4)*height); + memset(dest[2], 128, (width/4)*height); + return 1; +} + +static int y8_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + memset(dest[1], 128, (width/2)*height); + memset(dest[2], 128, (width/2)*height); + return 1; +} + +static int y8_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + memset(dest[1], 128, width*height); + memset(dest[2], 128, width*height); + return 1; +} + +/*************************************************************************/ +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE2) + +/* SSE2 routines. See comments in img_x86_common.h for why we don't bother + * unrolling the loops. */ + +/* Common macros/data for x86 code */ +#include "img_x86_common.h" + +/* Average 2 bytes horizontally (e.g. 422P->411P) (unit: 2 source bytes) */ +#define AVG_2H(src,dest,count) do { \ + int dummy; \ + asm volatile( \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" /* XMM7: 0x00FF*8 */ \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movzbl -2("ESI","ECX",2), %%eax \n\ + movzbl -1("ESI","ECX",2), %%edx \n\ + addl %%edx, %%eax \n\ + shrl $1, %%eax \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2),%%xmm0 #XMM0:FEDCBA9876543210 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: FEDCBA9876543210 \n\ + pand %%xmm7, %%xmm0 # XMM0: E C A 8 6 4 2 0 \n\ + psrlw $8, %%xmm1 # XMM1: F D B 9 7 5 3 1 \n\ + pavgw %%xmm1, %%xmm0 # XMM0: w v u t s r q p (avgs) \n\ + packuswb %%xmm0, %%xmm0 # XMM0: wvutsrqpwvutsrqp \n\ + movq %%xmm0, -8("EDI","ECX")", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src), "D" (dest), "0" (count) \ + : "eax", "edx"); \ +} while (0) + +/* Average 4 bytes horizontally (e.g. 
444P->411P) (unit: 4 source bytes) */ +#define AVG_4H(src,dest,count) do { \ + int dummy; \ + asm volatile( \ + "pcmpeqd %%xmm7, %%xmm7; psrld $24, %%xmm7;" /* XMM7: 0x000000FF*4 */ \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movzbl -4("ESI","ECX",4), %%eax \n\ + movzbl -3("ESI","ECX",4), %%edx \n\ + addl %%edx, %%eax \n\ + movzbl -2("ESI","ECX",4), %%edx \n\ + addl %%edx, %%eax \n\ + movzbl -1("ESI","ECX",4), %%edx \n\ + addl %%edx, %%eax \n\ + shrl $2, %%eax \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XMM0:FEDCBA9876543210 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: FEDCBA9876543210 \n\ + movdqa %%xmm0, %%xmm2 # XMM2: FEDCBA9876543210 \n\ + movdqa %%xmm0, %%xmm3 # XMM3: FEDCBA9876543210 \n\ + pand %%xmm7, %%xmm0 # XMM0: C 8 4 0 \n\ + psrld $8, %%xmm1 # XMM1: FED BA9 765 321 \n\ + pand %%xmm7, %%xmm1 # XMM1: D 9 5 1 \n\ + psrld $16, %%xmm2 # XMM2: FE BA 76 32 \n\ + pand %%xmm7, %%xmm2 # XMM2: E A 6 2 \n\ + psrld $24, %%xmm3 # XMM3: F B 7 3 \n\ + pavgw %%xmm1, %%xmm0 # XMM0: C+D 8+9 4+5 0+1 (avgs) \n\ + pavgw %%xmm3, %%xmm2 # XMM2: E+F A+B 6+7 2+3 (avgs) \n\ + pavgw %%xmm2, %%xmm0 # XMM0: s r q p (avgs) \n\ + packuswb %%xmm0, %%xmm0 # XMM0: s r q p s r q p \n\ + packuswb %%xmm0, %%xmm0 # XMM0: srqpsrqpsrqpsrqp \n\ + movd %%xmm0, -4("EDI","ECX")", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src), "D" (dest), "0" (count) \ + : "eax", "edx"); \ +} while (0) + +/* Repeat 2 bytes horizontally (e.g. 422P->444P) (unit: 1 source byte) */ +#define REP_2H(src,dest,count) do { \ + int dummy; \ + asm volatile(SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movb -1("ESI","ECX"), %%al \n\ + movb %%al, %%ah \n\ + movw %%ax, -2("EDI","ECX",2)", \ + /* main_loop */ \ + "movq -8("ESI","ECX"), %%xmm0 # XMM0: 76543210 \n\ + punpcklbw %%xmm0, %%xmm0 # XMM0: 7766554433221100 \n\ + movdqu %%xmm0, -16("EDI","ECX",2)", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src), "D" (dest), "0" (count) \ + : "eax"); \ +} while (0) + +/* Repeat 4 bytes horizontally (e.g. 
411P->444P) (unit: 1 source byte) */ +#define REP_4H(src,dest,count) do { \ + int dummy; \ + asm volatile(SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movzbl -1("ESI","ECX"), %%eax \n\ + movb %%al, %%ah \n\ + movl %%eax, %%edx \n\ + shll $16, %%eax \n\ + orl %%edx, %%eax \n\ + movl %%eax, -4("EDI","ECX",4)", \ + /* main_loop */ \ + "movd -4("ESI","ECX"), %%xmm0 # XMM0: 3210 \n\ + punpcklbw %%xmm0, %%xmm0 # XMM0: 33221100 \n\ + punpcklwd %%xmm0, %%xmm0 # XMM0: 3333222211110000 \n\ + movdqu %%xmm0, -16("EDI","ECX",4)", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src), "D" (dest), "0" (count) \ + : "eax", "edx"); \ +} while (0) + +/* Average 2 bytes vertically and double horizontally (411P->420P) + * (unit: 1 source byte) */ +#define AVG_411_420(src1,src2,dest,count) do { \ + int dummy; \ + asm volatile(SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "push "EBX, \ + /* pop_regs */ "pop "EBX, \ + /* small_loop */ \ + "movzbl -1("ESI","ECX"), %%eax \n\ + movzbl -1("EDX","ECX"), %%ebx \n\ + addl %%ebx, %%eax \n\ + shrl $1, %%eax \n\ + movb %%al, %%ah \n\ + movw %%ax, -2("EDI","ECX",2)", \ + /* main_loop */ \ + "movq -8("ESI","ECX"), %%xmm0 \n\ + movq -8("EDX","ECX"), %%xmm1 \n\ + pavgb %%xmm1, %%xmm0 \n\ + punpcklbw %%xmm0, %%xmm0 \n\ + movdqu %%xmm0, -16("EDI","ECX",2)", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src1), "d" (src2), "D" (dest), "0" (count) \ + : "eax"); \ +} while (0) + +/* Average 2 bytes vertically (422P->420P) (unit: 1 source byte) */ +#define AVG_422_420(src1,src2,dest,count) do { \ + int dummy; \ + asm volatile(SIMD_LOOP_WRAPPER( \ + /* blocksize */ 16, \ + /* push_regs */ "push "EBX, \ + /* pop_regs */ "pop "EBX, \ + /* small_loop */ \ + "movzbl -1("ESI","ECX"), %%eax \n\ + movzbl -1("EDX","ECX"), %%ebx \n\ + addl %%ebx, %%eax \n\ + shrl $1, %%eax \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX"), %%xmm0 \n\ + movdqu -16("EDX","ECX"), %%xmm1 \n\ + pavgb %%xmm1, %%xmm0 \n\ + movdqu %%xmm0, -16("EDI","ECX")", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src1), "d" (src2), "D" (dest), "0" (count) \ + : "eax"); \ +} while (0) + +/* Average 4 bytes, 2 horizontally and 2 vertically (444P->420P) + * (unit: 2 source bytes) */ +#define AVG_444_420(src1,src2,dest,count) do { \ + int dummy; \ + asm volatile( \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" /* XMM7: 0x00FF*8 */ \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "push "EBX, \ + /* pop_regs */ "pop "EBX, \ + /* small_loop */ \ + "movzbl -2("ESI","ECX",2), %%eax \n\ + movzbl -1("ESI","ECX",2), %%ebx \n\ + addl %%ebx, %%eax \n\ + movzbl -2("EDX","ECX",2), %%ebx \n\ + addl %%ebx, %%eax \n\ + movzbl -1("EDX","ECX",2), %%ebx \n\ + addl %%ebx, %%eax \n\ + shrl $2, %%eax \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2), %%xmm0 \n\ + movdqu -16("EDX","ECX",2), %%xmm2 \n\ + movdqa %%xmm0, %%xmm1 \n\ + pand %%xmm7, %%xmm0 \n\ + psrlw $8, %%xmm1 \n\ + pavgw %%xmm1, %%xmm0 \n\ + movdqa %%xmm2, %%xmm3 \n\ + pand %%xmm7, %%xmm2 \n\ + psrlw $8, %%xmm3 \n\ + pavgw %%xmm3, %%xmm2 \n\ + pavgw %%xmm2, %%xmm0 \n\ + packuswb %%xmm0, %%xmm0 \n\ + movq %%xmm0, -8("EDI","ECX")", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src1), "d" (src2), "D" (dest), "c" (count)); \ +} while (0) + +/*************************************************************************/ + +static int yuv420p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int 
height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + AVG_2H(src[1]+(y/2)*(width/2), dest[1]+y*(width/4), width/4); + ac_memcpy(dest[1]+(y+1)*(width/4), dest[1]+y*(width/4), width/4); + AVG_2H(src[2]+(y/2)*(width/2), dest[2]+y*(width/4), width/4); + ac_memcpy(dest[2]+(y+1)*(width/4), dest[2]+y*(width/4), width/4); + } + return 1; +} + +static int yuv420p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y += 2) { + REP_2H(src[1]+(y/2)*(width/2), dest[1]+y*width, width/2); + ac_memcpy(dest[1]+(y+1)*width, dest[1]+y*width, width); + REP_2H(src[2]+(y/2)*(width/2), dest[2]+y*width, width/2); + ac_memcpy(dest[2]+(y+1)*width, dest[2]+y*width, width); + } + return 1; +} + +/*************************************************************************/ + +static int yuv411p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + AVG_411_420(src[1]+y*(width/4), src[1]+(y+1)*(width/4), + dest[1]+(y/2)*(width/2), width/4); + AVG_411_420(src[2]+y*(width/4), src[2]+(y+1)*(width/4), + dest[2]+(y/2)*(width/2), width/4); + } + return 1; +} + +static int yuv411p_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 3)) { + /* Fast version, no bytes at end of row to skip */ + REP_2H(src[1], dest[1], (width/4)*height); + REP_2H(src[2], dest[2], (width/4)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + REP_2H(src[1]+y*(width/4), dest[1]+y*(width/2), width/4); + REP_2H(src[2]+y*(width/4), dest[2]+y*(width/2), width/4); + } + } + return 1; +} + +static int yuv411p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 3)) { + /* Fast version, no bytes at end of row to skip */ + REP_4H(src[1], dest[1], (width/4)*height); + REP_4H(src[2], dest[2], (width/4)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + REP_4H(src[1]+y*(width/4), dest[1]+y*width, width/4); + REP_4H(src[2]+y*(width/4), dest[2]+y*width, width/4); + } + } + return 1; +} + +/*************************************************************************/ + +static int yuv422p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + AVG_422_420(src[1]+y*(width/2), src[1]+(y+1)*(width/2), + dest[1]+(y/2)*(width/2), width/2); + AVG_422_420(src[2]+y*(width/2), src[2]+(y+1)*(width/2), + dest[2]+(y/2)*(width/2), width/2); + } + return 1; +} + +static int yuv422p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 3)) { + /* Fast version, no bytes at end of row to skip */ + AVG_2H(src[1], dest[1], (width/4)*height); + AVG_2H(src[2], dest[2], (width/4)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + AVG_2H(src[1]+y*(width/2), dest[1]+y*(width/4), width/4); + AVG_2H(src[2]+y*(width/2), dest[2]+y*(width/4), width/4); + } + } + return 1; +} + +static int yuv422p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 1)) { + 
/* Fast version, no bytes at end of row to skip */ + REP_2H(src[1], dest[1], (width/2)*height); + REP_2H(src[2], dest[2], (width/2)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + REP_2H(src[1]+y*(width/2), dest[1]+y*width, width/2); + REP_2H(src[2]+y*(width/2), dest[2]+y*width, width/2); + } + } + return 1; +} + +/*************************************************************************/ + +static int yuv444p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + AVG_444_420(src[1]+y*width, src[1]+(y+1)*width, + dest[1]+(y/2)*(width/2), width/2); + AVG_444_420(src[2]+y*width, src[2]+(y+1)*width, + dest[2]+(y/2)*(width/2), width/2); + } + return 1; +} + +static int yuv444p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 3)) { + /* Fast version, no bytes at end of row to skip */ + AVG_4H(src[1], dest[1], (width/4)*height); + AVG_4H(src[2], dest[2], (width/4)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + AVG_4H(src[1]+y*width, dest[1]+y*(width/4), width/4); + AVG_4H(src[2]+y*width, dest[2]+y*(width/4), width/4); + } + } + return 1; +} + +static int yuv444p_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 1)) { + /* Fast version, no bytes at end of row to skip */ + AVG_2H(src[1], dest[1], (width/2)*height); + AVG_2H(src[2], dest[2], (width/2)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + AVG_2H(src[1]+y*width, dest[1]+y*(width/2), width/2); + AVG_2H(src[2]+y*width, dest[2]+y*(width/2), width/2); + } + } + return 1; +} + +/*************************************************************************/ + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization */ + +int ac_imgconvert_init_yuv_planar(int accel) +{ + if (!register_conversion(IMG_YUV420P, IMG_YUV420P, yuv420p_copy) + || !register_conversion(IMG_YUV420P, IMG_YUV411P, yuv420p_yuv411p) + || !register_conversion(IMG_YUV420P, IMG_YUV422P, yuv420p_yuv422p) + || !register_conversion(IMG_YUV420P, IMG_YUV444P, yuv420p_yuv444p) + || !register_conversion(IMG_YUV420P, IMG_Y8, yuvp_y8) + + || !register_conversion(IMG_YUV411P, IMG_YUV420P, yuv411p_yuv420p) + || !register_conversion(IMG_YUV411P, IMG_YUV411P, yuv411p_copy) + || !register_conversion(IMG_YUV411P, IMG_YUV422P, yuv411p_yuv422p) + || !register_conversion(IMG_YUV411P, IMG_YUV444P, yuv411p_yuv444p) + || !register_conversion(IMG_YUV411P, IMG_Y8, yuvp_y8) + + || !register_conversion(IMG_YUV422P, IMG_YUV420P, yuv422p_yuv420p) + || !register_conversion(IMG_YUV422P, IMG_YUV411P, yuv422p_yuv411p) + || !register_conversion(IMG_YUV422P, IMG_YUV422P, yuv422p_copy) + || !register_conversion(IMG_YUV422P, IMG_YUV444P, yuv422p_yuv444p) + || !register_conversion(IMG_YUV422P, IMG_Y8, yuvp_y8) + + || !register_conversion(IMG_YUV444P, IMG_YUV420P, yuv444p_yuv420p) + || !register_conversion(IMG_YUV444P, IMG_YUV411P, yuv444p_yuv411p) + || !register_conversion(IMG_YUV444P, IMG_YUV422P, yuv444p_yuv422p) + || !register_conversion(IMG_YUV444P, IMG_YUV444P, yuv444p_copy) + || !register_conversion(IMG_YUV444P, IMG_Y8, 
yuvp_y8) + + || !register_conversion(IMG_Y8, IMG_YUV420P, y8_yuv420p) + || !register_conversion(IMG_Y8, IMG_YUV411P, y8_yuv411p) + || !register_conversion(IMG_Y8, IMG_YUV422P, y8_yuv422p) + || !register_conversion(IMG_Y8, IMG_YUV444P, y8_yuv444p) + || !register_conversion(IMG_Y8, IMG_Y8, y8_copy) + ) { + return 0; + } + +#if defined(HAVE_ASM_SSE2) + if (accel & AC_SSE2) { + if (!register_conversion(IMG_YUV420P, IMG_YUV411P, yuv420p_yuv411p_sse2) + || !register_conversion(IMG_YUV420P, IMG_YUV444P, yuv420p_yuv444p_sse2) + + || !register_conversion(IMG_YUV411P, IMG_YUV420P, yuv411p_yuv420p_sse2) + || !register_conversion(IMG_YUV411P, IMG_YUV422P, yuv411p_yuv422p_sse2) + || !register_conversion(IMG_YUV411P, IMG_YUV444P, yuv411p_yuv444p_sse2) + + || !register_conversion(IMG_YUV422P, IMG_YUV420P, yuv422p_yuv420p_sse2) + || !register_conversion(IMG_YUV422P, IMG_YUV411P, yuv422p_yuv411p_sse2) + || !register_conversion(IMG_YUV422P, IMG_YUV444P, yuv422p_yuv444p_sse2) + + || !register_conversion(IMG_YUV444P, IMG_YUV420P, yuv444p_yuv420p_sse2) + || !register_conversion(IMG_YUV444P, IMG_YUV411P, yuv444p_yuv411p_sse2) + || !register_conversion(IMG_YUV444P, IMG_YUV422P, yuv444p_yuv422p_sse2) + ) { + return 0; + } + } +#endif /* ARCH_X86 || ARCH_X86_64 */ + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c new file mode 100644 index 00000000..9dc04fcb --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c @@ -0,0 +1,2410 @@ +/* + * img_yuv_rgb.c - YUV<->RGB image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "ac_internal.h" +#include "imgconvert.h" +#include "img_internal.h" + +#include <string.h> + +#define USE_LOOKUP_TABLES /* for YUV420P->RGB24 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Standard C implementations */ + +const int cY = 76309; +const int crV = 104597; +const int cgU = -25675; +const int cgV = -53279; +const int cbU = 132201; + +/*************************************************************************/ + +#ifdef USE_LOOKUP_TABLES +# define TABLE_SCALE 16 /* scale factor for Y */ +static int Ylutbase[768*TABLE_SCALE]; +static int *Ylut = Ylutbase+256*TABLE_SCALE; +static int rVlut[256]; +static int gUlut[256]; +static int gVlut[256]; +static int bUlut[256]; +static void yuv_create_tables(void) { + static int yuv_tables_created = 0; + if (!yuv_tables_created) { + int i; + for (i = -256*TABLE_SCALE; i < 512*TABLE_SCALE; i++) { + int v = ((cY*(i-16*TABLE_SCALE)/TABLE_SCALE) + 32768) >> 16; + Ylut[i] = v<0 ? 0 : v>255 ? 
255 : v; + } + for (i = 0; i < 256; i++) { + rVlut[i] = ((crV * (i-128)) * TABLE_SCALE + cY/2) / cY; + gUlut[i] = ((cgU * (i-128)) * TABLE_SCALE + cY/2) / cY; + gVlut[i] = ((cgV * (i-128)) * TABLE_SCALE + cY/2) / cY; + bUlut[i] = ((cbU * (i-128)) * TABLE_SCALE + cY/2) / cY; + } + yuv_tables_created = 1; + } +} +# define YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs) do { \ + int Y = src[0][y*width+x] * TABLE_SCALE; \ + int U = src[1][(uvofs)]; \ + int V = src[2][(uvofs)]; \ + dest[0][(y*width+x)*rgbsz+rofs] = Ylut[Y+rVlut[V]]; \ + dest[0][(y*width+x)*rgbsz+gofs] = Ylut[Y+gUlut[U]+gVlut[V]];\ + dest[0][(y*width+x)*rgbsz+bofs] = Ylut[Y+bUlut[U]]; \ +} while (0) +# define YUV2RGB_PACKED(yofs,uofs,vofs,rgbsz,rofs,gofs,bofs) do { \ + int Y = src[0][(y*width+x)*2+yofs] * TABLE_SCALE; \ + int U = src[0][(y*width+(x&~1))*2+uofs]; \ + int V = src[0][(y*width+(x&~1))*2+vofs]; \ + dest[0][(y*width+x)*rgbsz+rofs] = Ylut[Y+rVlut[V]]; \ + dest[0][(y*width+x)*rgbsz+gofs] = Ylut[Y+gUlut[U]+gVlut[V]];\ + dest[0][(y*width+x)*rgbsz+bofs] = Ylut[Y+bUlut[U]]; \ +} while (0) +#else /* !USE_LOOKUP_TABLES */ +# define yuv_create_tables() /*nothing*/ +# define YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs) do { \ + int Y = cY * (src[0][y*width+x] - 16); \ + int U = src[1][(uvofs)] - 128; \ + int V = src[2][(uvofs)] - 128; \ + int r = (Y + crV*V + 32768) >> 16; \ + int g = (Y + cgU*U + cgV*V + 32768) >> 16; \ + int b = (Y + cbU*U + 32768) >> 16; \ + dest[0][(y*width+x)*rgbsz+rofs] = r<0 ? 0 : r>255 ? 255 : r;\ + dest[0][(y*width+x)*rgbsz+gofs] = g<0 ? 0 : g>255 ? 255 : g;\ + dest[0][(y*width+x)*rgbsz+bofs] = b<0 ? 0 : b>255 ? 255 : b;\ +} while (0) +# define YUV2RGB_PACKED(yofs,uofs,vofs,rgbsz,rofs,gofs,bofs) do { \ + int Y = cY * (src[0][(y*width+x)*2+yofs] - 16); \ + int U = src[0][(y*width+(x&~1))*2+uofs] - 128; \ + int V = src[0][(y*width+(x&~1))*2+vofs] - 128; \ + int r = (Y + crV*V + 32768) >> 16; \ + int g = (Y + cgU*U + cgV*V + 32768) >> 16; \ + int b = (Y + cbU*U + 32768) >> 16; \ + dest[0][(y*width+x)*rgbsz+rofs] = r<0 ? 0 : r>255 ? 255 : r;\ + dest[0][(y*width+x)*rgbsz+gofs] = g<0 ? 0 : g>255 ? 255 : g;\ + dest[0][(y*width+x)*rgbsz+bofs] = b<0 ? 0 : b>255 ? 
255 : b;\ +} while (0) +#endif + +#define YUV2RGB_420P(s,r,g,b) YUV2RGB((y/2)*(width/2)+(x/2),s,r,g,b) +#define YUV2RGB_411P(s,r,g,b) YUV2RGB((y )*(width/4)+(x/4),s,r,g,b) +#define YUV2RGB_422P(s,r,g,b) YUV2RGB((y )*(width/2)+(x/2),s,r,g,b) +#define YUV2RGB_444P(s,r,g,b) YUV2RGB((y )*(width )+(x ),s,r,g,b) +#define YUV2RGB_YUY2(s,r,g,b) YUV2RGB_PACKED(0,1,3, s,r,g,b) +#define YUV2RGB_UYVY(s,r,g,b) YUV2RGB_PACKED(1,0,2, s,r,g,b) +#define YUV2RGB_YVYU(s,r,g,b) YUV2RGB_PACKED(0,3,1, s,r,g,b) + +#define DEFINE_YUV2RGB(name,op) \ +static int name(uint8_t **src, uint8_t **dest, int width, int height) \ +{ \ + int x, y; \ + \ + yuv_create_tables(); \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < width; x++) { \ + op; \ + } \ + } \ + return 1; \ +} + +#define DEFINE_YUV2RGB_SET(rgb,rgbsz,rofs,gofs,bofs) \ + DEFINE_YUV2RGB(yuv420p_##rgb, YUV2RGB_420P(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(yuv411p_##rgb, YUV2RGB_411P(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(yuv422p_##rgb, YUV2RGB_422P(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(yuv444p_##rgb, YUV2RGB_444P(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(yuy2_##rgb, YUV2RGB_YUY2(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(uyvy_##rgb, YUV2RGB_UYVY(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(yvyu_##rgb, YUV2RGB_YVYU(rgbsz,rofs,gofs,bofs)) + +DEFINE_YUV2RGB_SET(rgb24, 3,0,1,2) +DEFINE_YUV2RGB_SET(bgr24, 3,2,1,0) +DEFINE_YUV2RGB_SET(rgba32, 4,0,1,2) +DEFINE_YUV2RGB_SET(abgr32, 4,3,2,1) +DEFINE_YUV2RGB_SET(argb32, 4,1,2,3) +DEFINE_YUV2RGB_SET(bgra32, 4,2,1,0) + +/* Y8->RGB is defined as part of grayscale stuff below */ + +/*************************************************************************/ + +#define RGB2Y() \ + (dest[0][y*width+x] = ((16829*r + 33039*g + 6416*b + 32768) >> 16) + 16) +#define RGB2U(uvofs) \ + (dest[1][(uvofs)] = ((-9714*r - 19070*g + 28784*b + 32768) >> 16) + 128) +#define RGB2V(uvofs) \ + (dest[2][(uvofs)] = ((28784*r - 24103*g - 4681*b + 32768) >> 16) + 128) +#define RGB2Y_PACKED(ofs) \ + (dest[0][(y*width+x)*2+(ofs)] = ((16829*r + 33039*g + 6416*b + 32768) >> 16) + 16) +#define RGB2U_PACKED(ofs) \ + (dest[0][(y*width+x)*2+(ofs)] = ((-9714*r - 19070*g + 28784*b + 32768) >> 16) + 128) +#define RGB2V_PACKED(ofs) \ + (dest[0][(y*width+x)*2+(ofs)] = ((28784*r - 24103*g - 4681*b + 32768) >> 16) + 128) + +#define RGB2YUV(utest,vtest,uvofs) \ + RGB2Y(); if (utest) RGB2U(uvofs); if (vtest) RGB2V(uvofs) +#define RGB2YUV_PACKED(utest,vtest,yofs,uvofs) \ + RGB2Y_PACKED(yofs); \ + if (utest) RGB2U_PACKED(uvofs); \ + if (vtest) RGB2V_PACKED(uvofs) +/* YUV420P: take Cb/Cr from opposite corners */ +#define RGB2YUV_420P RGB2YUV(!((x|y) & 1), (x&y) & 1, (y/2)*(width/2)+(x/2)) +/* YUV411P: take Cb/Cr from points 2 pixels apart */ +#define RGB2YUV_411P RGB2YUV(!(x & 3), !((x^2) & 3), y*(width/4)+(x/4)) +/* YUV422P: take Cb/Cr from adjacent pixels */ +#define RGB2YUV_422P RGB2YUV(!(x & 1), x & 1, y*(width/2)+(x/2)) +/* YUV444P: every pixel is sampled */ +#define RGB2YUV_444P RGB2YUV(1, 1, y*width+x) +/* YUY2/UYVY/YVYU: take Cb/Cr from the corresponding pixel */ +#define RGB2YUV_YUY2 RGB2YUV_PACKED(!(x & 1), x & 1, 0,1) +#define RGB2YUV_UYVY RGB2YUV_PACKED(!(x & 1), x & 1, 1,0) +#define RGB2YUV_YVYU RGB2YUV_PACKED(x & 1, !(x & 1), 0,1) + +#define DEFINE_RGB2YUV(name,rgbsz,rofs,gofs,bofs,op) \ +static int name(uint8_t **src, uint8_t **dest, int width, int height) \ +{ \ + int x, y; \ + \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < width; x++) { \ + int r = src[0][(y*width+x)*rgbsz+rofs]; \ + int g = src[0][(y*width+x)*rgbsz+gofs]; \ + int b = 
src[0][(y*width+x)*rgbsz+bofs]; \ + op; \ + } \ + } \ + return 1; \ +} + +#define DEFINE_RGB2Y8(name,rgbsz,rofs,gofs,bofs) \ +static int name(uint8_t **src, uint8_t **dest, int width, int height) \ +{ \ + int x, y; \ + \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < width; x++) { \ + int r = src[0][(y*width+x)*rgbsz+rofs]; \ + int g = src[0][(y*width+x)*rgbsz+gofs]; \ + int b = src[0][(y*width+x)*rgbsz+bofs]; \ + RGB2Y(); \ + } \ + } \ + return 1; \ +} + +#define DEFINE_RGB2YUV_SET(rgb,rgbsz,rofs,gofs,bofs) \ + DEFINE_RGB2YUV(rgb##_yuv420p, rgbsz,rofs,gofs,bofs, RGB2YUV_420P) \ + DEFINE_RGB2YUV(rgb##_yuv411p, rgbsz,rofs,gofs,bofs, RGB2YUV_411P) \ + DEFINE_RGB2YUV(rgb##_yuv422p, rgbsz,rofs,gofs,bofs, RGB2YUV_422P) \ + DEFINE_RGB2YUV(rgb##_yuv444p, rgbsz,rofs,gofs,bofs, RGB2YUV_444P) \ + DEFINE_RGB2YUV(rgb##_yuy2, rgbsz,rofs,gofs,bofs, RGB2YUV_YUY2) \ + DEFINE_RGB2YUV(rgb##_uyvy, rgbsz,rofs,gofs,bofs, RGB2YUV_UYVY) \ + DEFINE_RGB2YUV(rgb##_yvyu, rgbsz,rofs,gofs,bofs, RGB2YUV_YVYU) \ + DEFINE_RGB2Y8 (rgb##_y8, rgbsz,rofs,gofs,bofs) + +DEFINE_RGB2YUV_SET(rgb24, 3,0,1,2) +DEFINE_RGB2YUV_SET(bgr24, 3,2,1,0) +DEFINE_RGB2YUV_SET(rgba32, 4,0,1,2) +DEFINE_RGB2YUV_SET(abgr32, 4,3,2,1) +DEFINE_RGB2YUV_SET(argb32, 4,1,2,3) +DEFINE_RGB2YUV_SET(bgra32, 4,2,1,0) + +/*************************************************************************/ + +/* All YUV planar formats convert to grayscale the same way */ + +#ifdef USE_LOOKUP_TABLES +static uint8_t graylut[2][256]; +static int graylut_created = 0; +static void gray8_create_tables(void) +{ + if (!graylut_created) { + int i; + for (i = 0; i < 256; i++) { + if (i <= 16) + graylut[0][i] = 0; + else if (i >= 235) + graylut[0][i] = 255; + else + graylut[0][i] = (i-16) * 255 / 219; + graylut[1][i] = 16 + i*219/255; + } + graylut_created = 1; + } +} +# define Y2GRAY(val) (graylut[0][(val)]) +# define GRAY2Y(val) (graylut[1][(val)]) +#else +# define gray8_create_tables() /*nothing*/ +# define Y2GRAY(val) ((val)<16 ? 0 : (val)>=235 ? 
255 : ((val)-16)*256/219) +# define GRAY2Y(val) (16 + (val)*219/255) +#endif + +static int yuvp_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i] = Y2GRAY(src[0][i]); + return 1; +} + +static int yuy2_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i] = Y2GRAY(src[0][i*2]); + return 1; +} + +static int uyvy_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i] = Y2GRAY(src[0][i*2+1]); + return 1; +} + +/*************************************************************************/ + +static int gray8_y8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i] = GRAY2Y(src[0][i]); + return 1; +} + +static int gray8_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height)) + return 0; + memset(dest[1], 128, (width/2)*(height/2)); + memset(dest[2], 128, (width/2)*(height/2)); + return 1; +} + +static int gray8_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height)) + return 0; + memset(dest[1], 128, (width/4)*height); + memset(dest[2], 128, (width/4)*height); + return 1; +} + +static int gray8_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height)) + return 0; + memset(dest[1], 128, (width/2)*height); + memset(dest[2], 128, (width/2)*height); + return 1; +} + +static int gray8_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height)) + return 0; + memset(dest[1], 128, width*height); + memset(dest[2], 128, width*height); + return 1; +} + +static int gray8_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) { + dest[0][i*2 ] = GRAY2Y(src[0][i]); + dest[0][i*2+1] = 128; + } + return 1; +} + +static int gray8_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) { + dest[0][i*2 ] = 128; + dest[0][i*2+1] = GRAY2Y(src[0][i]); + } + return 1; +} + +/*************************************************************************/ + +/* We only need 3 functions for Y8->RGB (no difference between RGB and BGR) */ + +static int y8_rgb24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i*3] = dest[0][i*3+1] = dest[0][i*3+2] = Y2GRAY(src[0][i]); + return 1; +} + +static int y8_rgba32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i*4] = dest[0][i*4+1] = dest[0][i*4+2] = Y2GRAY(src[0][i]); + return 1; +} + +static int y8_argb32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i*4+1] = dest[0][i*4+2] = dest[0][i*4+3] = Y2GRAY(src[0][i]); + return 1; +} + +/*************************************************************************/ +/*************************************************************************/ + +/* Accelerated versions of 
colorspace routines. */ + +/* Common constant values used in routines: */ + +#if defined(HAVE_ASM_MMX) + +#include "img_x86_common.h" + +static const struct { uint16_t n[72]; } __attribute__((aligned(16))) yuv_data = {{ + 0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF, /* for odd/even */ + 0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010, /* for Y -16 */ + 0x0080,0x0080,0x0080,0x0080,0x0080,0x0080,0x0080,0x0080, /* for U/V -128 */ + 0x2543,0x2543,0x2543,0x2543,0x2543,0x2543,0x2543,0x2543, /* Y constant */ + 0x3313,0x3313,0x3313,0x3313,0x3313,0x3313,0x3313,0x3313, /* rV constant */ + 0xF377,0xF377,0xF377,0xF377,0xF377,0xF377,0xF377,0xF377, /* gU constant */ + 0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC, /* gV constant */ + 0x408D,0x408D,0x408D,0x408D,0x408D,0x408D,0x408D,0x408D, /* bU constant */ + 0x0008,0x0008,0x0008,0x0008,0x0008,0x0008,0x0008,0x0008, /* for rounding */ +}}; +/* Note that G->Y exceeds 0x7FFF, so be careful to treat it as unsigned + * (the rest of the values are signed) */ +static const struct { uint16_t n[96]; } __attribute__((aligned(16))) rgb_data = {{ + 0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD, /* R->Y */ + 0x810F,0x810F,0x810F,0x810F,0x810F,0x810F,0x810F,0x810F, /* G->Y */ + 0x1910,0x1910,0x1910,0x1910,0x1910,0x1910,0x1910,0x1910, /* B->Y */ + 0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E, /* R->U */ + 0xB582,0xB582,0xB582,0xB582,0xB582,0xB582,0xB582,0xB582, /* G->U */ + 0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070, /* B->U */ + 0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070, /* R->V */ + 0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9, /* G->V */ + 0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7, /* B->V */ + 0x0420,0x0420,0x0420,0x0420,0x0420,0x0420,0x0420,0x0420, /* Y +16.5 */ + 0x2020,0x2020,0x2020,0x2020,0x2020,0x2020,0x2020,0x2020, /* U/V +128.5 */ + 0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF, /* for odd/even */ +}}; +#define Y_GRAY 0x4A85 +#define GRAY_Y 0x36F7 +static const struct { uint16_t n[32]; } __attribute__((aligned(16))) gray_data = {{ + Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY, /* 255/219 */ + GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y, /* 219/255 */ + 0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010, /* Y +/-16 */ + 0x00FF,0xFF00,0x0000,0x00FF,0xFF00,0x0000,0x0000,0x0000, /* for Y->RGB */ +}}; + +/* Convert 4 RGB32 pixels in EAX/EBX/ECX/EDX to RGB24 in EAX/EBX/ECX */ +#define IA32_RGB32_TO_RGB24 \ + "movl %%ebx, %%esi # ESI: 00 B1 G1 R1 \n\ + shll $24, %%esi # ESI: R1 00 00 00 \n\ + shrl $8, %%ebx # EBX: 00 00 B1 G1 \n\ + orl %%esi, %%eax # EAX: R1 B0 G0 R0 \n\ + movl %%ecx, %%esi # ESI: 00 B2 G2 R2 \n\ + shll $16, %%esi # ESI: G2 R2 00 00 \n\ + shrl $16, %%ecx # ECX: 00 00 00 B2 \n\ + shll $8, %%edx # EDX: B3 G3 R3 00 \n\ + orl %%esi, %%ebx # EBX: G2 R2 B1 G1 \n\ + orl %%edx, %%ecx # ECX: B3 G3 R3 B2 \n" + +/* Convert 4 RGB24 pixels in EAX/EBX/ECX to RGB32 in EAX/EBX/ECX/EDX */ +#define IA32_RGB24_TO_RGB32 \ + "movl %%ecx, %%edx # EDX: B3 G3 R3 B2 \n\ + shrl $8, %%edx # EDX: 00 B3 G3 R3 \n\ + andl $0xFF, %%ecx # ECX: 00 00 00 B2 \n\ + movl %%ebx, %%edi # EDI: G2 R2 B1 G1 \n\ + andl $0xFFFF0000, %%edi # EDI: G2 R2 00 00 \n\ + orl %%edi, %%ecx # ECX: G2 R2 00 B2 \n\ + rorl $16, %%ecx # ECX: 00 B2 G2 R2 \n\ + movl %%eax, %%edi # EDI: R1 B0 G0 R0 \n\ + andl $0xFF000000, %%edi # EDI: R1 00 00 00 \n\ + andl $0x0000FFFF, %%ebx # EBX: 00 00 B1 G1 \n\ + orl %%edi, %%ebx # EBX: R1 00 B1 G1 \n\ + roll $8, %%ebx # EBX: 00 B1 
G1 R1 \n\ + andl $0x00FFFFFF, %%eax # EAX: 00 B0 G0 R0 \n" + +#endif /* HAVE_ASM_MMX */ + +/*************************************************************************/ +/*************************************************************************/ + +/* MMX routines */ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */ + +static inline void mmx_yuv42Xp_to_rgb(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV); +#define mmx_yuv420p_to_rgb mmx_yuv42Xp_to_rgb +#define mmx_yuv422p_to_rgb mmx_yuv42Xp_to_rgb +static inline void mmx_store_rgb24(uint8_t *dest); +static inline void mmx_store_bgr24(uint8_t *dest); +static inline void mmx_store_rgba32(uint8_t *dest); +static inline void mmx_store_abgr32(uint8_t *dest); +static inline void mmx_store_argb32(uint8_t *dest); +static inline void mmx_store_bgra32(uint8_t *dest); + +#define DEFINE_YUV2RGB_MMX(yuv,rgb,uvofs,rgbsz,rofs,gofs,bofs) \ +static int yuv##_##rgb##_mmx(uint8_t **src, uint8_t **dest, \ + int width, int height) \ +{ \ + int x, y; \ + \ + yuv_create_tables(); \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < (width & ~7); x += 8) { \ + mmx_##yuv##_to_rgb(src[0]+y*width+x, \ + src[1]+(uvofs), src[2]+(uvofs)); \ + mmx_store_##rgb(dest[0]+(y*width+x)*rgbsz); \ + } \ + while (x < width) { \ + YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs); \ + x++; \ + } \ + } \ + asm("emms"); \ + return 1; \ +} + +#define DEFINE_YUV2RGB_MMX_SET(rgb,rgbsz,rofs,gofs,bofs) \ + DEFINE_YUV2RGB_MMX(yuv420p,rgb,(y/2)*(width/2)+(x/2),rgbsz,rofs,gofs,bofs)\ + DEFINE_YUV2RGB_MMX(yuv422p,rgb,(y )*(width/2)+(x/2),rgbsz,rofs,gofs,bofs) + +DEFINE_YUV2RGB_MMX_SET(rgb24, 3,0,1,2) +DEFINE_YUV2RGB_MMX_SET(bgr24, 3,2,1,0) +DEFINE_YUV2RGB_MMX_SET(rgba32, 4,0,1,2) +DEFINE_YUV2RGB_MMX_SET(abgr32, 4,3,2,1) +DEFINE_YUV2RGB_MMX_SET(argb32, 4,1,2,3) +DEFINE_YUV2RGB_MMX_SET(bgra32, 4,2,1,0) + +/************************************/ + +static inline void mmx_yuv42Xp_to_rgb(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV) +{ + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%mm4, %%mm4 # MM4: 00 00 00 00 00 00 00 00 \n\ + movq ("EAX"), %%mm6 # MM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movd ("ECX"), %%mm2 # MM2: U3 U2 U1 U0 \n\ + movd ("EDX"), %%mm3 # MM3: V3 V2 V1 V0 \n\ + movq %%mm6, %%mm7 # MM7: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + pand ("ESI"), %%mm6 # MM6: -Y6- -Y4- -Y2- -Y0- \n\ + psrlw $8, %%mm7 # MM7: -Y7- -Y5- -Y3- -Y1- \n\ + punpcklbw %%mm4, %%mm2 # MM2: -U3- -U2- -U1- -U0- \n\ + punpcklbw %%mm4, %%mm3 # MM3: -V3- -V2- -V1- -V0- \n\ + psubw 16("ESI"), %%mm6 # MM6: subtract 16 \n\ + psubw 16("ESI"), %%mm7 # MM7: subtract 16 \n\ + psubw 32("ESI"), %%mm2 # MM2: subtract 128 \n\ + psubw 32("ESI"), %%mm3 # MM3: subtract 128 \n\ + psllw $7, %%mm6 # MM6: convert to fixed point 8.7 \n\ + psllw $7, %%mm7 # MM7: convert to fixed point 8.7 \n\ + psllw $7, %%mm2 # MM2: convert to fixed point 8.7 \n\ + psllw $7, %%mm3 # MM3: convert to fixed point 8.7 \n\ + # Multiply by constants \n\ + pmulhw 48("ESI"), %%mm6 # MM6: -cY6- -cY4- -cY2- -cY0- \n\ + pmulhw 48("ESI"), %%mm7 # MM6: -cY7- -cY5- -cY3- -cY1- \n\ + movq 80("ESI"), %%mm4 # MM4: gU constant \n\ + movq 96("ESI"), %%mm5 # MM5: gV constant \n\ + pmulhw %%mm2, %%mm4 # MM4: -gU3- -gU2- -gU1- -gU0- \n\ + pmulhw %%mm3, %%mm5 # MM5: -gV3- -gV2- -gV1- -gV0- \n\ + paddw %%mm5, %%mm4 # MM4: -g3- -g2- -g1- -g0- \n\ + pmulhw 64("ESI"), %%mm3 # MM3: -r3- -r2- -r1- -r0- \n\ + pmulhw 112("ESI"),%%mm2 # MM2: -b3- -b2- -b1- -b0- \n\ + movq %%mm3, %%mm0 # MM0: -r3- -r2- -r1- -r0- \n\ + movq %%mm4, %%mm1 # MM1: -g3- -g2- -g1- -g0- \n\ + movq %%mm2, 
%%mm5 # MM5: -b3- -b2- -b1- -b0- \n\ + # Add intermediate results and round/shift to get R/G/B values \n\ + paddw 128("ESI"), %%mm6 # Add rounding value (0.5 @ 8.4 fixed) \n\ + paddw 128("ESI"), %%mm7 \n\ + paddw %%mm6, %%mm0 # MM0: -R6- -R4- -R2- -R0- \n\ + psraw $4, %%mm0 # Shift back to 8.0 fixed \n\ + paddw %%mm6, %%mm1 # MM1: -G6- -G4- -G2- -G0- \n\ + psraw $4, %%mm1 \n\ + paddw %%mm6, %%mm2 # MM2: -B6- -B4- -B2- -B0- \n\ + psraw $4, %%mm2 \n\ + paddw %%mm7, %%mm3 # MM3: -R7- -R5- -R3- -R1- \n\ + psraw $4, %%mm3 \n\ + paddw %%mm7, %%mm4 # MM4: -G7- -G5- -G3- -G1- \n\ + psraw $4, %%mm4 \n\ + paddw %%mm7, %%mm5 # MM5: -B7- -B5- -B3- -B1- \n\ + psraw $4, %%mm5 \n\ + # Saturate to 0-255 and pack into bytes \n\ + packuswb %%mm0, %%mm0 # MM0: R6 R4 R2 R0 R6 R4 R2 R0 \n\ + packuswb %%mm1, %%mm1 # MM1: G6 G4 G2 G0 G6 G4 G2 G0 \n\ + packuswb %%mm2, %%mm2 # MM2: B6 B4 B2 B0 B6 B4 B2 B0 \n\ + packuswb %%mm3, %%mm3 # MM3: R7 R5 R3 R1 R7 R5 R3 R1 \n\ + packuswb %%mm4, %%mm4 # MM4: G7 G5 G3 G1 G7 G5 G3 G1 \n\ + packuswb %%mm5, %%mm5 # MM5: B7 B5 B3 B1 B7 B5 B3 B1 \n\ + punpcklbw %%mm3, %%mm0 # MM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + punpcklbw %%mm4, %%mm1 # MM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + punpcklbw %%mm5, %%mm2 # MM2: B7 B6 B5 B4 B3 B2 B1 B0 \n" + : /* no outputs */ + : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data) + ); +} + +/************************************/ + +/* Convert YUV->RGB output to RGBA pixels in MM0..MM3 */ +#define MMX_RGB_TO_RGBA "\ + pxor %%mm7, %%mm7 # MM7: 00 00 00 00 00 00 00 00 \n\ + movq %%mm0, %%mm3 # MM3: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + movq %%mm1, %%mm4 # MM4: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + movq %%mm2, %%mm5 # MM5: B7 B6 B5 B4 B3 B2 B1 B0 \n\ + punpcklbw %%mm1, %%mm0 # MM0: G3 R3 G2 R2 G1 R1 G0 R0 \n\ + punpcklbw %%mm7, %%mm2 # MM2: 00 B3 00 B2 00 B1 00 B0 \n\ + movq %%mm0, %%mm1 # MM1: G3 R3 G2 R2 G1 R1 G0 R0 \n\ + punpcklwd %%mm2, %%mm0 # MM0: 00 B1 G1 R1 00 B0 G0 R0 \n\ + punpckhwd %%mm2, %%mm1 # MM1: 00 B3 G3 R3 00 B2 G2 R2 \n\ + punpckhbw %%mm4, %%mm3 # MM3: G7 R7 G6 R6 G5 R5 G4 R4 \n\ + punpckhbw %%mm7, %%mm5 # MM5: 00 B7 00 B6 00 B5 00 B4 \n\ + movq %%mm3, %%mm2 # MM2: G7 R7 G6 R6 G5 R5 G4 R4 \n\ + punpckhwd %%mm5, %%mm3 # MM3: 00 B7 G7 R7 00 B6 G6 R6 \n\ + punpcklwd %%mm5, %%mm2 # MM2: 00 B5 G5 R5 00 B4 G4 R4 \n" + +/* Convert YUV->RGB output to BGRA pixels in MM0..MM3 */ +#define MMX_RGB_TO_BGRA "\ + pxor %%mm7, %%mm7 # MM7: 00 00 00 00 00 00 00 00 \n\ + movq %%mm0, %%mm5 # MM5: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + movq %%mm1, %%mm4 # MM4: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + movq %%mm2, %%mm3 # MM3: B7 B6 B5 B4 B3 B2 B1 B0 \n\ + punpcklbw %%mm1, %%mm2 # MM2: G3 B3 G2 B2 G1 B1 G0 B0 \n\ + punpcklbw %%mm7, %%mm0 # MM0: 00 R3 00 R2 00 R1 00 R0 \n\ + movq %%mm2, %%mm1 # MM1: G3 B3 G2 B2 G1 B1 G0 B0 \n\ + punpcklwd %%mm0, %%mm2 # MM2: 00 R1 G1 B1 00 R0 G0 B0 \n\ + punpckhwd %%mm0, %%mm1 # MM1: 00 R3 G3 B3 00 R2 G2 B2 \n\ + movq %%mm2, %%mm0 # MM0: 00 R1 G1 B1 00 R0 G0 B0 \n\ + punpckhbw %%mm4, %%mm3 # MM3: G7 B7 G6 B6 G5 B5 G4 B4 \n\ + punpckhbw %%mm7, %%mm5 # MM5: 00 R7 00 R6 00 R5 00 R4 \n\ + movq %%mm3, %%mm2 # MM2: G7 B7 G6 B6 G5 B5 G4 B4 \n\ + punpckhwd %%mm5, %%mm3 # MM3: 00 R7 G7 B7 00 R6 G6 B6 \n\ + punpcklwd %%mm5, %%mm2 # MM2: 00 R5 G5 B5 00 R4 G4 B4 \n" + + +static inline void mmx_store_rgb24(uint8_t *dest) +{ + /* It looks like it's fastest to go to RGB32 first, then shift the + * result to merge the 24-bit pixels together. 
*/ + asm(MMX_RGB_TO_RGBA "\ + movq %%mm0, %%mm4 # MM4: 00 B1 G1 R1 00 B0 G0 R0 \n\ + movq %%mm1, %%mm5 # MM5: 00 B3 G3 R3 00 B2 G2 R2 \n\ + movq %%mm2, %%mm6 # MM6: 00 B5 G5 R5 00 B4 G4 R4 \n\ + movq %%mm3, %%mm7 # MM7: 00 B7 G7 R7 00 B6 G6 R6 \n\ + psrlq $32, %%mm4 # MM4: 00 00 00 00 00 B1 G1 R1 \n\ + psrlq $32, %%mm5 # MM5: 00 00 00 00 00 B3 G3 R3 \n\ + psrlq $32, %%mm6 # MM6: 00 00 00 00 00 B5 G5 R5 \n\ + psrlq $32, %%mm7 # MM7: 00 00 00 00 00 B7 G7 R7 \n\ + push "EBX" \n\ + movd %%mm0, %%eax # EAX: 00 B0 G0 R0 \n\ + movd %%mm4, %%ebx # EBX: 00 B1 G1 R1 \n\ + movd %%mm1, %%ecx # ECX: 00 B2 G2 R2 \n\ + movd %%mm5, %%edx # EDX: 00 B3 G3 R3 \n\ + "IA32_RGB32_TO_RGB24" \n\ + movl %%eax, ("EDI") \n\ + movl %%ebx, 4("EDI") \n\ + movl %%ecx, 8("EDI") \n\ + movd %%mm2, %%eax # EAX: 00 B4 G4 R4 \n\ + movd %%mm6, %%ebx # EBX: 00 B5 G5 R5 \n\ + movd %%mm3, %%ecx # ECX: 00 B6 G6 R6 \n\ + movd %%mm7, %%edx # EDX: 00 B7 G7 R7 \n\ + "IA32_RGB32_TO_RGB24" \n\ + movl %%eax, 12("EDI") \n\ + movl %%ebx, 16("EDI") \n\ + movl %%ecx, 20("EDI") \n\ + pop "EBX" \n" + : /* no outputs */ + : "D" (dest) + : "eax", "ecx", "edx", "esi" + ); +} + +static inline void mmx_store_bgr24(uint8_t *dest) +{ + asm(MMX_RGB_TO_BGRA "\ + movq %%mm0, %%mm4 # MM4: 00 B1 G1 R1 00 B0 G0 R0 \n\ + movq %%mm1, %%mm5 # MM5: 00 B3 G3 R3 00 B2 G2 R2 \n\ + movq %%mm2, %%mm6 # MM6: 00 B5 G5 R5 00 B4 G4 R4 \n\ + movq %%mm3, %%mm7 # MM7: 00 B7 G7 R7 00 B6 G6 R6 \n\ + psrlq $32, %%mm4 # MM4: 00 00 00 00 00 B1 G1 R1 \n\ + psrlq $32, %%mm5 # MM5: 00 00 00 00 00 B3 G3 R3 \n\ + psrlq $32, %%mm6 # MM6: 00 00 00 00 00 B5 G5 R5 \n\ + psrlq $32, %%mm7 # MM7: 00 00 00 00 00 B7 G7 R7 \n\ + push "EBX" \n\ + movd %%mm0, %%eax # EAX: 00 B0 G0 R0 \n\ + movd %%mm4, %%ebx # EBX: 00 B1 G1 R1 \n\ + movd %%mm1, %%ecx # ECX: 00 B2 G2 R2 \n\ + movd %%mm5, %%edx # EDX: 00 B3 G3 R3 \n\ + "IA32_RGB32_TO_RGB24" \n\ + movl %%eax, ("EDI") \n\ + movl %%ebx, 4("EDI") \n\ + movl %%ecx, 8("EDI") \n\ + movd %%mm2, %%eax # EAX: 00 B4 G4 R4 \n\ + movd %%mm6, %%ebx # EBX: 00 B5 G5 R5 \n\ + movd %%mm3, %%ecx # ECX: 00 B6 G6 R6 \n\ + movd %%mm7, %%edx # EDX: 00 B7 G7 R7 \n\ + "IA32_RGB32_TO_RGB24" \n\ + movl %%eax, 12("EDI") \n\ + movl %%ebx, 16("EDI") \n\ + movl %%ecx, 20("EDI") \n\ + pop "EBX" \n" + : /* no outputs */ + : "D" (dest) + : "eax", "ecx", "edx", "esi" + ); +} + +static inline void mmx_store_rgba32(uint8_t *dest) +{ + asm(MMX_RGB_TO_RGBA "\ + movq %%mm0, ("EDI") \n\ + movq %%mm1, 8("EDI") \n\ + movq %%mm2, 16("EDI") \n\ + movq %%mm3, 24("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void mmx_store_abgr32(uint8_t *dest) +{ + asm(MMX_RGB_TO_BGRA "\ + psllq $8, %%mm0 \n\ + psllq $8, %%mm1 \n\ + psllq $8, %%mm2 \n\ + psllq $8, %%mm3 \n\ + movq %%mm0, ("EDI") \n\ + movq %%mm1, 8("EDI") \n\ + movq %%mm2, 16("EDI") \n\ + movq %%mm3, 24("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void mmx_store_argb32(uint8_t *dest) +{ + asm(MMX_RGB_TO_RGBA "\ + psllq $8, %%mm0 \n\ + psllq $8, %%mm1 \n\ + psllq $8, %%mm2 \n\ + psllq $8, %%mm3 \n\ + movq %%mm0, ("EDI") \n\ + movq %%mm1, 8("EDI") \n\ + movq %%mm2, 16("EDI") \n\ + movq %%mm3, 24("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void mmx_store_bgra32(uint8_t *dest) +{ + asm(MMX_RGB_TO_BGRA "\ + movq %%mm0, ("EDI") \n\ + movq %%mm1, 8("EDI") \n\ + movq %%mm2, 16("EDI") \n\ + movq %%mm3, 24("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + 
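For reference, the arithmetic that the scalar YUV2RGB macros above and the MMX/SSE2 paths below both implement is the BT.601 "studio swing" conversion driven by the cY/crV/cgU/cgV/cbU constants declared near the top of this file. The following standalone sketch is editorial, not part of aclib, and the function names are illustrative only; it restates the same 16.16 fixed-point math for a single pixel so the SIMD register comments are easier to follow.

#include <stdint.h>

/* Clamp an intermediate result into the 0..255 byte range, as the
 * packuswb instructions do in the SIMD paths. */
static inline uint8_t clamp255(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* One-pixel YUV->RGB using the file's 16.16 fixed-point constants
 * (cY = 76309, crV = 104597, cgU = -25675, cgV = -53279, cbU = 132201).
 * The MMX/SSE2 routines compute exactly this, 8 or 16 pixels at a time. */
static inline void yuv_to_rgb_pixel(uint8_t y, uint8_t u, uint8_t v,
                                    uint8_t *r, uint8_t *g, uint8_t *b)
{
    int Y = 76309 * (y - 16);           /* cY * (Y - 16)          */
    int U = u - 128, V = v - 128;
    *r = clamp255((Y + 104597*V             + 32768) >> 16);  /* crV      */
    *g = clamp255((Y -  25675*U -  53279*V  + 32768) >> 16);  /* cgU, cgV */
    *b = clamp255((Y + 132201*U             + 32768) >> 16);  /* cbU      */
}

The reverse direction uses the coefficients from the RGB2Y/RGB2U/RGB2V macros, e.g. Y = ((16829*R + 33039*G + 6416*B + 32768) >> 16) + 16, which is what the SSE2_RGB2Y block later in the file evaluates in 8.6 fixed point.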
+/*************************************************************************/ +/*************************************************************************/ + +/* SSE2 routines */ + +#if defined(HAVE_ASM_SSE2) + +/*************************************************************************/ + +static inline void sse2_load_yuv420p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_yuv411p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_yuv422p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_yuv444p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_yuy2(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_uyvy(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_yvyu(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_yuv_to_rgb(void); +static inline void sse2_yuv444_to_rgb(void); +static inline void sse2_store_rgb24(uint8_t *dest); +static inline void sse2_store_bgr24(uint8_t *dest); +static inline void sse2_store_rgba32(uint8_t *dest); +static inline void sse2_store_abgr32(uint8_t *dest); +static inline void sse2_store_argb32(uint8_t *dest); +static inline void sse2_store_bgra32(uint8_t *dest); + +#define DEFINE_YUV2RGB_SSE2(yuv,y2r,rgb,rgbsz,slowop) \ +static int yuv##_##rgb##_sse2(uint8_t **src, uint8_t **dest, \ + int width, int height) \ +{ \ + int x, y; \ + \ + yuv_create_tables(); \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < (width & ~15); x += 16) { \ + sse2_load_##yuv(src[0], src[1], src[2], x, y, width); \ + sse2_##y2r(); \ + sse2_store_##rgb(dest[0] + (y*width+x)*rgbsz); \ + } \ + while (x < width) { \ + slowop; \ + x++; \ + } \ + } \ + asm("emms"); \ + return 1; \ +} + +#define DEFINE_YUV2RGB_SSE2_SET(rgb,sz,r,g,b) \ + DEFINE_YUV2RGB_SSE2(yuv420p, yuv_to_rgb, rgb,sz, YUV2RGB_420P(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(yuv411p, yuv_to_rgb, rgb,sz, YUV2RGB_411P(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(yuv422p, yuv_to_rgb, rgb,sz, YUV2RGB_422P(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(yuv444p, yuv444_to_rgb,rgb,sz, YUV2RGB_444P(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(yuy2, yuv_to_rgb, rgb,sz, YUV2RGB_YUY2(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(uyvy, yuv_to_rgb, rgb,sz, YUV2RGB_UYVY(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(yvyu, yuv_to_rgb, rgb,sz, YUV2RGB_YVYU(sz,r,g,b)) + +DEFINE_YUV2RGB_SSE2_SET(rgb24, 3,0,1,2) +DEFINE_YUV2RGB_SSE2_SET(bgr24, 3,2,1,0) +DEFINE_YUV2RGB_SSE2_SET(rgba32, 4,0,1,2) +DEFINE_YUV2RGB_SSE2_SET(abgr32, 4,3,2,1) +DEFINE_YUV2RGB_SSE2_SET(argb32, 4,1,2,3) +DEFINE_YUV2RGB_SSE2_SET(bgra32, 4,2,1,0) + +/************************************/ + +static inline void sse2_load_yuv420p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += y*width+x; + srcU += (y/2)*(width/2)+(x/2); + srcV += (y/2)*(width/2)+(x/2); + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\ + movq ("ECX"), %%xmm2 # XMM2: U7.......U0 \n\ + movq ("EDX"), %%xmm3 # XMM3: V7.......V0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n\ + punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + 
punpcklbw %%xmm4,%%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n" + : /* no outputs */ + : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_yuv411p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += y*width+x; + srcU += y*(width/4)+(x/4); + srcV += y*(width/4)+(x/4); + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\ + movd ("ECX"), %%xmm2 # XMM2: U3.U0 \n\ + punpcklbw %%xmm2,%%xmm2 # XMM2: U3 U3.U0 U0 \n\ + movd ("EDX"), %%xmm3 # XMM3: V3.V0 \n\ + punpcklbw %%xmm3,%%xmm3 # XMM2: V3 V3.V0 V0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n\ + punpcklbw %%xmm4,%%xmm2 # XMM2: U3 U3 U2 U2 U1 U1 U0 U0 \n\ + punpcklbw %%xmm4,%%xmm3 # XMM3: V3 V3 V2 V2 V1 V1 V0 V0 \n" + : /* no outputs */ + : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_yuv422p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += y*width+x; + srcU += y*(width/2)+(x/2); + srcV += y*(width/2)+(x/2); + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\ + movq ("ECX"), %%xmm2 # XMM2: U7.......U0 \n\ + movq ("EDX"), %%xmm3 # XMM3: V7.......V0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n\ + punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + punpcklbw %%xmm4,%%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n" + : /* no outputs */ + : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_yuv444p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += y*width+x; + srcU += y*width+x; + srcV += y*width+x; + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\ + movdqu ("ECX"), %%xmm2 # XMM2: UF...................U0 \n\ + movdqu ("EDX"), %%xmm0 # XMM0: VF...................V0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + punpcklbw %%xmm4,%%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + punpckhbw %%xmm4,%%xmm7 # XMM7: YF YE YD YC YB YA Y9 Y8 \n\ + movdqa %%xmm2, %%xmm5 # XMM5: UF...................U0 \n\ + punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + punpckhbw %%xmm4,%%xmm5 # XMM5: UF UE UD UC UB UA U9 U8 \n\ + movdqa %%xmm0, %%xmm3 # XMM3: VF...................V0 \n\ + punpcklbw %%xmm4,%%xmm0 # XMM0: V7 V6 V5 V4 V3 V2 V1 V0 \n\ + punpckhbw %%xmm4,%%xmm3 # XMM3: VF VE VD VC VB VA V9 V8 \n" + : /* no outputs */ + : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_yuy2(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += (y*width+x)*2; + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: V3 Y7.............U0 Y0 \n\ + movdqu 16("EAX"),%%xmm7 # XMM7: V7 YF.............U4 Y8 \n\ + movdqa %%xmm6, %%xmm2 # XMM2: V3 Y7.............U0 Y0 \n\ + psrlw $8, %%xmm2 # 
XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + pand ("ESI"), %%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movdqa %%xmm7, %%xmm3 # XMM3: V7 YF.............U4 Y8 \n\ + psrlw $8, %%xmm3 # XMM3: V7 U7 V6 U6 V5 U5 V4 U4 \n\ + pand ("ESI"), %%xmm7 # XMM6: YF YE YD YC YB YA Y9 Y8 \n\ + packuswb %%xmm3, %%xmm2 # XMM2: V7 U7.............V0 U0 \n\ + movdqa %%xmm2, %%xmm3 # XMM3: V7 U7.............V0 U0 \n\ + pand ("ESI"), %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + psrlw $8, %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\ + packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n" + : /* no outputs */ + : "a" (srcY), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_uyvy(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += (y*width+x)*2; + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: Y7 V3.............Y0 00 \n\ + movdqu 16("EAX"),%%xmm7 # XMM7: YF V7.............Y8 U4 \n\ + movdqa %%xmm6, %%xmm2 # XMM2: Y7 V3.............Y0 U0 \n\ + pand ("ESI"), %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + psrlw $8, %%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movdqa %%xmm7, %%xmm3 # XMM3: YF V7.............Y8 U4 \n\ + pand ("ESI"), %%xmm3 # XMM3: V7 U7 V6 U6 V5 U5 V4 U4 \n\ + psrlw $8, %%xmm7 # XMM6: YF YE YD YC YB YA Y9 Y8 \n\ + packuswb %%xmm3, %%xmm2 # XMM2: V7 U7.............V0 U0 \n\ + movdqa %%xmm2, %%xmm3 # XMM3: V7 U7.............V0 U0 \n\ + pand ("ESI"), %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + psrlw $8, %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\ + packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n" + : /* no outputs */ + : "a" (srcY), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_yvyu(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += (y*width+x)*2; + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: U3 Y7.............V0 Y0 \n\ + movdqu 16("EAX"),%%xmm7 # XMM7: U7 YF.............V4 Y8 \n\ + movdqa %%xmm6, %%xmm2 # XMM2: U3 Y7.............V0 Y0 \n\ + psrlw $8, %%xmm2 # XMM2: U3 V3 U2 V2 U1 V1 U0 V0 \n\ + pand ("ESI"), %%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movdqa %%xmm7, %%xmm3 # XMM3: U7 YF.............V4 Y8 \n\ + psrlw $8, %%xmm3 # XMM3: U7 V7 U6 V6 U5 V5 U4 V4 \n\ + pand ("ESI"), %%xmm7 # XMM6: YF YE YD YC YB YA Y9 Y8 \n\ + packuswb %%xmm3, %%xmm2 # XMM2: U7 V7.............U0 V0 \n\ + movdqa %%xmm2, %%xmm3 # XMM3: U7 V7.............U0 V0 \n\ + psrlw $8, %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + pand ("ESI"), %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\ + packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n" + : /* no outputs */ + : "a" (srcY), "S" (&yuv_data), "m" (yuv_data) + ); +} + +/************************************/ + +/* Standard YUV->RGB (Yodd=XMM7 Yeven=XMM6 U=XMM2 V=XMM3) */ +static inline void sse2_yuv_to_rgb(void) +{ + asm("\ + psubw 16("ESI"), %%xmm6 # XMM6: 
subtract 16 \n\ + psllw $7, %%xmm6 # XMM6: convert to fixed point 8.7 \n\ + psubw 16("ESI"), %%xmm7 # XMM7: subtract 16 \n\ + psllw $7, %%xmm7 # XMM7: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm2 # XMM2: subtract 128 \n\ + psllw $7, %%xmm2 # XMM2: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm3 # XMM3: subtract 128 \n\ + psllw $7, %%xmm3 # XMM3: convert to fixed point 8.7 \n\ + # Multiply by constants \n\ + pmulhw 48("ESI"),%%xmm6 # XMM6: cYE.................cY0 \n\ + pmulhw 48("ESI"),%%xmm7 # XMM7: cYF.................cY1 \n\ + movdqa 80("ESI"),%%xmm4 # XMM4: gU constant \n\ + pmulhw %%xmm2, %%xmm4 # XMM4: gU7.................gU0 \n\ + movdqa 96("ESI"),%%xmm5 # XMM5: gV constant \n\ + pmulhw %%xmm3, %%xmm5 # XMM5: gV7.................gV0 \n\ + paddw %%xmm5, %%xmm4 # XMM4: g7 g6 g5 g4 g3 g2 g1 g0 \n\ + pmulhw 64("ESI"),%%xmm3 # XMM3: r7 r6 r5 r4 r3 r2 r1 r0 \n\ + pmulhw 112("ESI"),%%xmm2 #XMM2: b7 b6 b5 b4 b3 b2 b1 b0 \n\ + movdqa %%xmm3, %%xmm0 # XMM0: r7 r6 r5 r4 r3 r2 r1 r0 \n\ + movdqa %%xmm4, %%xmm1 # XMM1: g7 g6 g5 g4 g3 g2 g1 g0 \n\ + movdqa %%xmm2, %%xmm5 # XMM5: b7 b6 b5 b4 b3 b2 b1 b0 \n\ + # Add intermediate results and round/shift to get R/G/B values \n\ + paddw 128("ESI"),%%xmm6 # Add rounding value (0.5 @ 8.4 fixed) \n\ + paddw 128("ESI"),%%xmm7 \n\ + paddw %%xmm6, %%xmm0 # XMM0: RE RC RA R8 R6 R4 R2 R0 \n\ + psraw $4, %%xmm0 # Shift back to 8.0 fixed \n\ + paddw %%xmm6, %%xmm1 # XMM1: GE GC GA G8 G6 G4 G2 G0 \n\ + psraw $4, %%xmm1 \n\ + paddw %%xmm6, %%xmm2 # XMM2: BE BC BA B8 B6 B4 B2 B0 \n\ + psraw $4, %%xmm2 \n\ + paddw %%xmm7, %%xmm3 # XMM3: RF RD RB R9 R7 R5 R3 R1 \n\ + psraw $4, %%xmm3 \n\ + paddw %%xmm7, %%xmm4 # XMM4: GF GD GB G9 G7 G5 G3 G1 \n\ + psraw $4, %%xmm4 \n\ + paddw %%xmm7, %%xmm5 # XMM5: BF BD BB B9 B7 B5 B3 B1 \n\ + psraw $4, %%xmm5 \n\ + # Saturate to 0-255 and pack into bytes \n\ + packuswb %%xmm0, %%xmm0 # XMM0: RE.......R0 RE.......R0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: GE.......G0 GE.......G0 \n\ + packuswb %%xmm2, %%xmm2 # XMM2: BE.......B0 BE.......B0 \n\ + packuswb %%xmm3, %%xmm3 # XMM3: RF.......R1 RF.......R1 \n\ + packuswb %%xmm4, %%xmm4 # XMM4: GF.......G1 GF.......G1 \n\ + packuswb %%xmm5, %%xmm5 # XMM5: BF.......B1 BF.......B1 \n\ + punpcklbw %%xmm3,%%xmm0 # XMM0: RF...................R0 \n\ + punpcklbw %%xmm4,%%xmm1 # XMM1: GF...................G0 \n\ + punpcklbw %%xmm5,%%xmm2 # XMM2: BF...................B0 \n" + : /* no outputs */ + : "S" (&yuv_data), "m" (yuv_data) + ); +} + +/* YUV444 YUV->RGB (Y=XMM7:XMM6 U=XMM5:XMM2 V=XMM3:XMM0) */ +static inline void sse2_yuv444_to_rgb(void) +{ + asm("\ + psubw 16("ESI"), %%xmm6 # XMM6: subtract 16 \n\ + psllw $7, %%xmm6 # XMM6: convert to fixed point 8.7 \n\ + psubw 16("ESI"), %%xmm7 # XMM7: subtract 16 \n\ + psllw $7, %%xmm7 # XMM7: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm2 # XMM2: subtract 128 \n\ + psllw $7, %%xmm2 # XMM2: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm5 # XMM5: subtract 128 \n\ + psllw $7, %%xmm5 # XMM5: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm0 # XMM0: subtract 128 \n\ + psllw $7, %%xmm0 # XMM0: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm3 # XMM3: subtract 128 \n\ + psllw $7, %%xmm3 # XMM3: convert to fixed point 8.7 \n\ + # Multiply by constants \n\ + pmulhw 48("ESI"),%%xmm6 # XMM6: cY7.................cY0 \n\ + movdqa 80("ESI"),%%xmm1 # XMM1: gU constant \n\ + pmulhw %%xmm2, %%xmm1 # XMM1: gU7.................gU0 \n\ + movdqa 96("ESI"),%%xmm4 # XMM4: gV constant \n\ + pmulhw %%xmm0, %%xmm4 # XMM4: 
gV7.................gV0 \n\ + paddw %%xmm4, %%xmm1 # XMM1: g7 g6 g5 g4 g3 g2 g1 g0 \n\ + pmulhw 64("ESI"),%%xmm0 # XMM0: r7 r6 r5 r4 r3 r2 r1 r0 \n\ + pmulhw 112("ESI"),%%xmm2 #XMM2: b7 b6 b5 b4 b3 b2 b1 b0 \n\ + # Add intermediate results and round/shift to get R/G/B values \n\ + paddw 128("ESI"),%%xmm6 # Add rounding value (0.5 @ 8.4 fixed) \n\ + paddw %%xmm6, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + psraw $4, %%xmm0 # Shift back to 8.0 fixed \n\ + paddw %%xmm6, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + psraw $4, %%xmm1 \n\ + paddw %%xmm6, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n\ + psraw $4, %%xmm2 \n\ + # Do it all over again for pixels 8-15 \n\ + pmulhw 48("ESI"),%%xmm7 # XMM7: cYF.................cY8 \n\ + movdqa 80("ESI"),%%xmm6 # XMM6: gU constant \n\ + pmulhw %%xmm5, %%xmm6 # XMM6: gUF.................gU8 \n\ + movdqa 96("ESI"),%%xmm4 # XMM4: gV constant \n\ + pmulhw %%xmm3, %%xmm4 # XMM4: gVF.................gV8 \n\ + paddw %%xmm6, %%xmm4 # XMM4: gF gE gD gC gB gA g9 g8 \n\ + pmulhw 64("ESI"),%%xmm3 # XMM3: rF rE rD rC rB rA r9 r8 \n\ + pmulhw 112("ESI"),%%xmm5 #XMM5: bF bE bD bC bB bA b9 b8 \n\ + paddw 128("ESI"),%%xmm7 # Add rounding value (0.5 @ 8.4 fixed) \n\ + paddw %%xmm7, %%xmm3 # XMM3: RF RE RD RC RB RA R9 R8 \n\ + psraw $4, %%xmm3 \n\ + paddw %%xmm7, %%xmm4 # XMM4: GF GE GD GC GB GA G9 G8 \n\ + psraw $4, %%xmm4 \n\ + paddw %%xmm7, %%xmm5 # XMM5: BF BE BD BC BB BA B9 B8 \n\ + psraw $4, %%xmm5 \n\ + # Saturate to 0-255 and pack into bytes \n\ + packuswb %%xmm3, %%xmm0 # XMM0: RF...................R0 \n\ + packuswb %%xmm4, %%xmm1 # XMM1: GF...................G0 \n\ + packuswb %%xmm5, %%xmm2 # XMM2: BF...................B0 \n" + : /* no outputs */ + : "S" (&yuv_data), "m" (yuv_data) + ); +} + +/************************************/ + +/* Convert YUV->RGB output to RGBA pixels in XMM0..XMM3 */ +#define SSE2_RGB_TO_RGBA "\ + pxor %%xmm7, %%xmm7 # XMM7: 00 00 00 00 00 00 00 00 \n\ + movdqa %%xmm0, %%xmm3 # XMM3: RF...................R0 \n\ + movdqa %%xmm1, %%xmm4 # XMM4: GF...................G0 \n\ + movdqa %%xmm2, %%xmm5 # XMM5: BF...................B0 \n\ + punpcklbw %%xmm1,%%xmm0 # XMM0: G7 R7.............G0 R0 \n\ + punpcklbw %%xmm7,%%xmm2 # XMM2: 00 B7.............00 B0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: G7 R7.............G0 R0 \n\ + punpcklwd %%xmm2,%%xmm0 # XMM0: 0BGR3 0BGR2 0BGR1 0BGR0 \n\ + punpckhwd %%xmm2,%%xmm1 # XMM1: 0BGR7 0BGR6 0BGR5 0BGR4 \n\ + punpckhbw %%xmm4,%%xmm3 # XMM3: GF RF.............G8 R8 \n\ + punpckhbw %%xmm7,%%xmm5 # XMM5: 00 BF.............00 B8 \n\ + movdqa %%xmm3, %%xmm2 # XMM2: GF RF.............G8 R8 \n\ + punpckhwd %%xmm5,%%xmm3 # XMM3: 0BGRF 0BGRE 0BGRD 0BGRC \n\ + punpcklwd %%xmm5,%%xmm2 # XMM2: 0BGRB 0BGRA 0BGR9 0BGR8 \n" + +/* Convert YUV->RGB output to BGRA pixels in XMM0..XMM3 */ +#define SSE2_RGB_TO_BGRA "\ + pxor %%xmm7, %%xmm7 # XMM7: 00 00 00 00 00 00 00 00 \n\ + movdqa %%xmm0, %%xmm5 # XMM5: RF...................R0 \n\ + movdqa %%xmm1, %%xmm4 # XMM4: GF...................G0 \n\ + movdqa %%xmm2, %%xmm3 # XMM3: BF...................B0 \n\ + punpcklbw %%xmm1,%%xmm2 # XMM0: G7 B7.............G0 B0 \n\ + punpcklbw %%xmm7,%%xmm0 # XMM2: 00 R7.............00 R0 \n\ + movdqa %%xmm2, %%xmm1 # XMM1: G7 B7.............G0 B0 \n\ + punpcklwd %%xmm0,%%xmm2 # XMM2: 0RGB3 0RGB2 0RGB1 0RGB0 \n\ + punpckhwd %%xmm0,%%xmm1 # XMM1: 0RGB7 0RGB6 0RGB5 0RGB4 \n\ + movdqa %%xmm2, %%xmm0 # XMM0: 0RGB3 0RGB2 0RGB1 0RGB0 \n\ + punpckhbw %%xmm4,%%xmm3 # XMM3: GF BF.............G8 B8 \n\ + punpckhbw %%xmm7,%%xmm5 # XMM5: 00 RF.............00 R8 \n\ + 
movdqa %%xmm3, %%xmm2 # XMM2: GF BF.............G8 B8 \n\ + punpckhwd %%xmm5,%%xmm3 # XMM3: 0RGBF 0RGBE 0RGBD 0RGBC \n\ + punpcklwd %%xmm5,%%xmm2 # XMM2: 0RGBB 0RGBA 0RGB9 0RGB8 \n" + +/* Convert and 4 RGBA32 (BGRA32) pixels in XMMn to RGB24 (BGR24) and store + * at EDI+(12*n) */ +#define SSE2_RGB32_TO_RGB24(n) "\ + movd %%xmm"#n", %%eax # EAX: 00 B0 G0 R0 \n\ + psrldq $4, %%xmm"#n" # XMMn: 00000 0BGR3 0BGR2 0BGR1 \n\ + movd %%xmm"#n", %%ebx # EBX: 00 B1 G1 R1 \n\ + psrldq $4, %%xmm"#n" # XMMn: 00000 00000 0BGR3 0BGR2 \n\ + movd %%xmm"#n", %%ecx # ECX: 00 B2 G2 R2 \n\ + psrldq $4, %%xmm"#n" # XMMn: 00000 00000 00000 0BGR3 \n\ + movd %%xmm"#n", %%edx # EDX: 00 B3 G3 R3 \n\ + "IA32_RGB32_TO_RGB24" \n\ + movl %%eax, 12*"#n"+0("EDI") \n\ + movl %%ebx, 12*"#n"+4("EDI") \n\ + movl %%ecx, 12*"#n"+8("EDI") \n" + + +static inline void sse2_store_rgb24(uint8_t *dest) +{ + /* It looks like it's fastest to go to RGB32 first, then shift the + * result to merge the 24-bit pixels together. */ + asm(SSE2_RGB_TO_RGBA" \n\ + "PUSH(EBX)" \n\ + "SSE2_RGB32_TO_RGB24(0)" \n\ + "SSE2_RGB32_TO_RGB24(1)" \n\ + "SSE2_RGB32_TO_RGB24(2)" \n\ + "SSE2_RGB32_TO_RGB24(3)" \n\ + "POP(EBX)" \n" + : /* no outputs */ + : "D" (dest) + : "eax", "ecx", "edx", "esi" COMMA_FAKE_PUSH_REG + ); +} + +static inline void sse2_store_bgr24(uint8_t *dest) +{ + asm(SSE2_RGB_TO_BGRA "\ + "PUSH(EBX)" \n\ + "SSE2_RGB32_TO_RGB24(0)" \n\ + "SSE2_RGB32_TO_RGB24(1)" \n\ + "SSE2_RGB32_TO_RGB24(2)" \n\ + "SSE2_RGB32_TO_RGB24(3)" \n\ + "POP(EBX)" \n" + : /* no outputs */ + : "D" (dest) + : "eax", "ecx", "edx", "esi" COMMA_FAKE_PUSH_REG + ); +} + +/* It would be nice to be able to use movntdq here for a 50% speedup, + * but we're not guaranteed alignment... (think 766x512 for example) */ +static inline void sse2_store_rgba32(uint8_t *dest) +{ + asm(SSE2_RGB_TO_RGBA "\ + movdqu %%xmm0, ("EDI") \n\ + movdqu %%xmm1, 16("EDI") \n\ + movdqu %%xmm2, 32("EDI") \n\ + movdqu %%xmm3, 48("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void sse2_store_abgr32(uint8_t *dest) +{ + asm(SSE2_RGB_TO_BGRA "\ + pslldq $1, %%xmm0 \n\ + pslldq $1, %%xmm1 \n\ + pslldq $1, %%xmm2 \n\ + pslldq $1, %%xmm3 \n\ + movdqu %%xmm0, ("EDI") \n\ + movdqu %%xmm1, 16("EDI") \n\ + movdqu %%xmm2, 32("EDI") \n\ + movdqu %%xmm3, 48("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void sse2_store_argb32(uint8_t *dest) +{ + asm(SSE2_RGB_TO_RGBA "\ + pslldq $1, %%xmm0 \n\ + pslldq $1, %%xmm1 \n\ + pslldq $1, %%xmm2 \n\ + pslldq $1, %%xmm3 \n\ + movdqu %%xmm0, ("EDI") \n\ + movdqu %%xmm1, 16("EDI") \n\ + movdqu %%xmm2, 32("EDI") \n\ + movdqu %%xmm3, 48("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void sse2_store_bgra32(uint8_t *dest) +{ + asm(SSE2_RGB_TO_BGRA "\ + movdqu %%xmm0, ("EDI") \n\ + movdqu %%xmm1, 16("EDI") \n\ + movdqu %%xmm2, 32("EDI") \n\ + movdqu %%xmm3, 48("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +/*************************************************************************/ + +static inline void sse2_load_rgb24(uint8_t *src); +static inline void sse2_load_bgr24(uint8_t *src); +static inline void sse2_load_rgba32(uint8_t *src); +static inline void sse2_load_abgr32(uint8_t *src); +static inline void sse2_load_argb32(uint8_t *src); +static inline void sse2_load_bgra32(uint8_t *src); +static inline void sse2_rgb_to_yuv420p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_yuv411p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, 
int y, int width); +static inline void sse2_rgb_to_yuv422p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_yuv444p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_yuy2( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_uyvy( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_yvyu( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_y8( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); + +#define DEFINE_RGB2YUV_SSE2(rgb,yuv,rgbsz,rofs,gofs,bofs,slowop) \ +static int rgb##_##yuv##_sse2(uint8_t **src, uint8_t **dest, \ + int width, int height) \ +{ \ + int x, y; \ + \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < (width & ~7); x += 8) { \ + sse2_load_##rgb(src[0]+(y*width+x)*rgbsz); \ + sse2_rgb_to_##yuv(dest[0], dest[1], dest[2], x, y, width); \ + } \ + while (x < width) { \ + int r = src[0][(y*width+x)*rgbsz+rofs]; \ + int g = src[0][(y*width+x)*rgbsz+gofs]; \ + int b = src[0][(y*width+x)*rgbsz+bofs]; \ + slowop; \ + x++; \ + } \ + } \ + asm("emms"); \ + return 1; \ +} + +#define DEFINE_RGB2YUV_SSE2_SET(rgb,sz,r,g,b) \ + DEFINE_RGB2YUV_SSE2(rgb,yuv420p, sz,r,g,b, RGB2YUV_420P) \ + DEFINE_RGB2YUV_SSE2(rgb,yuv411p, sz,r,g,b, RGB2YUV_411P) \ + DEFINE_RGB2YUV_SSE2(rgb,yuv422p, sz,r,g,b, RGB2YUV_422P) \ + DEFINE_RGB2YUV_SSE2(rgb,yuv444p, sz,r,g,b, RGB2YUV_444P) \ + DEFINE_RGB2YUV_SSE2(rgb,yuy2, sz,r,g,b, RGB2YUV_YUY2) \ + DEFINE_RGB2YUV_SSE2(rgb,uyvy, sz,r,g,b, RGB2YUV_UYVY) \ + DEFINE_RGB2YUV_SSE2(rgb,yvyu, sz,r,g,b, RGB2YUV_YVYU) \ + DEFINE_RGB2YUV_SSE2(rgb,y8, sz,r,g,b, RGB2Y()) + +DEFINE_RGB2YUV_SSE2_SET(rgb24, 3,0,1,2) +DEFINE_RGB2YUV_SSE2_SET(bgr24, 3,2,1,0) +DEFINE_RGB2YUV_SSE2_SET(rgba32, 4,0,1,2) +DEFINE_RGB2YUV_SSE2_SET(abgr32, 4,3,2,1) +DEFINE_RGB2YUV_SSE2_SET(argb32, 4,1,2,3) +DEFINE_RGB2YUV_SSE2_SET(bgra32, 4,2,1,0) + +/************************************/ + +/* Split 8 RGBA pixels in XMMr/XMMb into R/G/B in XMM0/XMM1/XMM2. + * r and b are 0 and 2 for RGB, 2 and 0 for BGR */ +#define SSE2_SPLIT_RGB32(r,b) "\ + movdqa 176("EDI"), %%xmm7 # XMM7: 00FF*8 \n\ + movdqa %%xmm"#r", %%xmm1 # XMM1: XBGR3 XBGR2 XBGR1 XBGR0 \n\ + movdqa %%xmm"#b", %%xmm3 # XMM3: XBGR7 XBGR6 XBGR5 XBGR4 \n\ + pand %%xmm7, %%xmm"#r" # XMMr: B3 R3 B2 R2 B1 R1 B0 R0 \n\ + psrld $8, %%xmm1 # XMM1: -XBG3 -XBG2 -XBG1 -XBG0 \n\ + pand %%xmm7, %%xmm"#b" # XMMb: B7 R7 B6 R6 B5 R5 B4 R4 \n\ + psrld $8, %%xmm3 # XMM3: -XBG7 -XBG6 -XBG5 -XBG4 \n\ + pand %%xmm7, %%xmm1 # XMM1: XX G3 XX G2 XX G1 XX G0 \n\ + packuswb %%xmm"#b", %%xmm"#r" # XMMr: B7 R7 ........... B0 R0 \n\ + pand %%xmm7, %%xmm3 # XMM3: XX G7 XX G6 XX G5 XX G4 \n\ + movdqa %%xmm"#r", %%xmm"#b" # XMMb: B7 R7 ........... B0 R0 \n\ + packuswb %%xmm3, %%xmm1 # XMM1: XX G7 ........... 
XX G0 \n\ + pand %%xmm7, %%xmm"#r" # XMMr: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + psrlw $8, %%xmm"#b" # XMMb: B7 B6 B5 B4 B3 B2 B1 B0 \n\ + pand %%xmm7, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n" + +static inline void sse2_load_rgb24(uint8_t *src) +{ + asm("\ + "PUSH(EBX)" \n\ + # Make stack space for loading XMM registers \n" +#ifdef ARCH_X86_64 +" sub $24+128, "ESP" \n" +#else +" sub $24, "ESP" \n" +#endif +" # Copy source pixels to appropriate positions in stack (this \n\ + # seems to be the fastest way to get them where we want them) \n\ + movl $8, %%ebx \n\ + movl $24, %%edx \n\ + 0: \n\ + movb -3("ESI","EDX"), %%al \n\ + movb %%al, 0-1("ESP","EBX") \n\ + movb -2("ESI","EDX"), %%al \n\ + movb %%al, 8-1("ESP","EBX") \n\ + movb -1("ESI","EDX"), %%al \n\ + movb %%al, 16-1("ESP","EBX") \n\ + subl $3, %%edx \n\ + subl $1, %%ebx \n\ + jnz 0b \n\ + # Load XMM0-XMM2 with R/G/B values and expand to 16-bit \n\ + pxor %%xmm7, %%xmm7 \n\ + movq ("ESP"), %%xmm0 \n\ + punpcklbw %%xmm7, %%xmm0 \n\ + movq 8("ESP"), %%xmm1 \n\ + punpcklbw %%xmm7, %%xmm1 \n\ + movq 16("ESP"), %%xmm2 \n\ + punpcklbw %%xmm7, %%xmm2 \n" +#ifdef ARCH_X86_64 +" add $24+128, "ESP" \n" +#else +" add $24, "ESP" \n" +#endif +" "POP(EBX)" \n" + : /* no outputs */ + : "S" (src) + : "eax", "ecx", "edx", "edi" COMMA_FAKE_PUSH_REG + ); +} + +static inline void sse2_load_bgr24(uint8_t *src) +{ + /* Load as RGB and swap registers */ + sse2_load_rgb24(src); + asm("\ + movdqa %%xmm0, %%xmm3 \n\ + movdqa %%xmm2, %%xmm0 \n\ + movdqa %%xmm3, %%xmm2 \n" + : /* no outputs */ + : /* no inputs */ + ); +} + +static inline void sse2_load_rgba32(uint8_t *src) +{ + asm("\ + movdqu ("ESI"), %%xmm0 # XMM0: XBGR3 XBGR2 XBGR1 XBGR0 \n\ + movdqu 16("ESI"), %%xmm2 # XMM2: XBGR7 XBGR6 XBGR5 XBGR4 \n\ + "SSE2_SPLIT_RGB32(0,2)" \n" + : /* no outputs */ + : "S" (src), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_load_abgr32(uint8_t *src) +{ + asm("\ + movdqu ("ESI"), %%xmm2 # XMM2: RGBX3 RGBX2 RGBX1 RGBX0 \n\ + movdqu 16("ESI"), %%xmm0 # XMM0: RGBX7 RGBX6 RGBX5 RGBX4 \n\ + psrld $8, %%xmm2 # XMM2: -RGB3 -RGB2 -RGB1 -RGB0 \n\ + psrld $8, %%xmm0 # XMM0: -RGB7 -RGB6 -RGB5 -RGB4 \n\ + "SSE2_SPLIT_RGB32(2,0)" \n" + : /* no outputs */ + : "S" (src), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_load_argb32(uint8_t *src) +{ + asm("\ + movdqu ("ESI"), %%xmm0 # XMM0: BGRX3 BGRX2 BGRX1 BGRX0 \n\ + movdqu 16("ESI"), %%xmm2 # XMM2: BGRX7 BGRX6 BGRX5 BGRX4 \n\ + psrld $8, %%xmm0 # XMM0: -BGR3 -BGR2 -BGR1 -BGR0 \n\ + psrld $8, %%xmm2 # XMM2: -BGR7 -BGR6 -BGR5 -BGR4 \n\ + "SSE2_SPLIT_RGB32(0,2)" \n" + : /* no outputs */ + : "S" (src), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_load_bgra32(uint8_t *src) +{ + asm("\ + movdqu ("ESI"), %%xmm2 # XMM2: XRGB3 XRGB2 XRGB1 XRGB0 \n\ + movdqu 16("ESI"), %%xmm0 # XMM0: XRGB7 XRGB6 XRGB5 XRGB4 \n\ + "SSE2_SPLIT_RGB32(2,0)" \n" + : /* no outputs */ + : "S" (src), "D" (&rgb_data), "m" (rgb_data) + ); +} + +/************************************/ + +#define SSE2_RGB2Y "\ + # Make RGB data into 8.6 fixed-point, then create 8.6 \n\ + # fixed-point Y data in XMM3 \n\ + psllw $6, %%xmm0 \n\ + movdqa %%xmm0, %%xmm3 \n\ + pmulhuw ("EDI"), %%xmm3 \n\ + psllw $6, %%xmm1 \n\ + movdqa %%xmm1, %%xmm6 \n\ + pmulhuw 16("EDI"), %%xmm6 \n\ + psllw $6, %%xmm2 \n\ + movdqa %%xmm2, %%xmm7 \n\ + pmulhuw 32("EDI"), %%xmm7 \n\ + paddw %%xmm6, %%xmm3 # No possibility of overflow \n\ + paddw %%xmm7, %%xmm3 \n\ + paddw 144("EDI"), %%xmm3 \n" +#define SSE2_RGB2U "\ + # Create 8.6 fixed-point U data in 
XMM4 \n\ + movdqa %%xmm0, %%xmm4 \n\ + pmulhw 48("EDI"), %%xmm4 \n\ + movdqa %%xmm1, %%xmm6 \n\ + pmulhw 64("EDI"), %%xmm6 \n\ + movdqa %%xmm2, %%xmm7 \n\ + pmulhw 80("EDI"), %%xmm7 \n\ + paddw %%xmm6, %%xmm4 \n\ + paddw %%xmm7, %%xmm4 \n\ + paddw 160("EDI"), %%xmm4 \n" +#define SSE2_RGB2U0 "\ + # Create 8.6 fixed-point U data in XMM0 \n\ + pmulhw 48("EDI"), %%xmm0 \n\ + pmulhw 64("EDI"), %%xmm1 \n\ + pmulhw 80("EDI"), %%xmm2 \n\ + paddw %%xmm1, %%xmm0 \n\ + paddw %%xmm2, %%xmm0 \n\ + paddw 160("EDI"), %%xmm0 \n" +#define SSE2_RGB2V "\ + # Create 8.6 fixed-point V data in XMM0 \n\ + pmulhw 96("EDI"), %%xmm0 \n\ + pmulhw 112("EDI"), %%xmm1 \n\ + pmulhw 128("EDI"), %%xmm2 \n\ + paddw %%xmm1, %%xmm0 \n\ + paddw %%xmm2, %%xmm0 \n\ + paddw 160("EDI"), %%xmm0 \n" +#define SSE2_PACKYU "\ + # Shift back down to 8-bit values \n\ + psraw $6, %%xmm3 \n\ + psraw $6, %%xmm0 \n\ + # Pack into bytes \n\ + pxor %%xmm7, %%xmm7 \n\ + packuswb %%xmm7, %%xmm3 \n\ + packuswb %%xmm7, %%xmm0 \n" +#define SSE2_PACKYUV "\ + # Shift back down to 8-bit values \n\ + psraw $6, %%xmm3 \n\ + psraw $6, %%xmm4 \n\ + psraw $6, %%xmm0 \n\ + # Pack into bytes \n\ + pxor %%xmm7, %%xmm7 \n\ + packuswb %%xmm7, %%xmm3 \n\ + packuswb %%xmm7, %%xmm4 \n\ + packuswb %%xmm7, %%xmm0 \n" +#define SSE2_STRIPU(N) "\ + # Remove every odd U value \n\ + pand 176("EDI"), %%xmm"#N" \n\ + packuswb %%xmm7, %%xmm"#N" \n" +#define SSE2_STRIPV "\ + # Remove every even V value \n\ + psrlw $8, %%xmm0 \n\ + packuswb %%xmm7, %%xmm0 \n" + +static inline void sse2_rgb_to_yuv420p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + if (y%2 == 0) { + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U0" \n\ + "SSE2_PACKYU" \n\ + "SSE2_STRIPU(0)" \n\ + # Store into destination pointers \n\ + movq %%xmm3, ("EAX") \n\ + movd %%xmm0, ("ECX") \n" + : /* no outputs */ + : "a" (destY+y*width+x), "c" (destU+(y/2)*(width/2)+(x/2)), + "D" (&rgb_data), "m" (rgb_data) + ); + } else { + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYU" \n\ + "SSE2_STRIPV" \n\ + # Store into destination pointers \n\ + movq %%xmm3, ("EAX") \n\ + movd %%xmm0, ("EDX") \n" + : /* no outputs */ + : "a" (destY+y*width+x), "d" (destV+(y/2)*(width/2)+(x/2)), + "D" (&rgb_data), "m" (rgb_data) + ); + } +} + +static inline void sse2_rgb_to_yuv411p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + "SSE2_STRIPU(4)" \n\ + "SSE2_STRIPU(4)" \n\ + "SSE2_STRIPU(0)" \n\ + "SSE2_STRIPV" \n\ + # Store into destination pointers \n\ + movq %%xmm3, ("EAX") \n\ + "PUSH(EAX)" # needed because GCC might rely on it later \n\ + movd %%xmm4, %%eax \n\ + movw %%ax, ("ECX") \n\ + movd %%xmm0, %%eax \n\ + movw %%ax, ("EDX") \n\ + "POP(EAX)" \n" + : /* no outputs */ + : "a" (destY+y*width+x), "c" (destU+y*(width/4)+(x/4)), + "d" (destV+y*(width/4)+(x/4)), "D" (&rgb_data), "m" (rgb_data) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); +} + +static inline void sse2_rgb_to_yuv422p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + "SSE2_STRIPU(4)" \n\ + "SSE2_STRIPV" \n\ + # Store into destination pointers \n\ + movq %%xmm3, ("EAX") \n\ + movd %%xmm4, ("ECX") \n\ + movd %%xmm0, ("EDX") \n" + : /* no outputs */ + : "a" (destY+y*width+x), "c" (destU+y*(width/2)+(x/2)), + "d" (destV+y*(width/2)+(x/2)), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void 
sse2_rgb_to_yuv444p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + # Store into destination pointers \n\ + movq %%xmm3, ("EAX") \n\ + movq %%xmm4, ("ECX") \n\ + movq %%xmm0, ("EDX") \n" + : /* no outputs */ + : "a" (destY+y*width+x), "c" (destU+y*width+x), "d" (destV+y*width+x), + "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_rgb_to_yuy2( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + "SSE2_STRIPU(4)" \n\ + "SSE2_STRIPV" \n\ + # Interleave Y/U/V \n\ + punpcklbw %%xmm0, %%xmm4 \n\ + punpcklbw %%xmm4, %%xmm3 \n\ + # Store into destination pointer \n\ + movdqu %%xmm3, ("EAX") \n" + : /* no outputs */ + : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_rgb_to_uyvy( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + "SSE2_STRIPU(4)" \n\ + "SSE2_STRIPV" \n\ + # Interleave Y/U/V \n\ + punpcklbw %%xmm0, %%xmm4 \n\ + punpcklbw %%xmm3, %%xmm4 \n\ + # Store into destination pointer \n\ + movdqu %%xmm4, ("EAX") \n" + : /* no outputs */ + : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_rgb_to_yvyu( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + # Remove every odd V value \n\ + pand 176("EDI"), %%xmm0 \n\ + packuswb %%xmm7, %%xmm0 \n\ + # Remove every even U value \n\ + psrlw $8, %%xmm4 \n\ + packuswb %%xmm7, %%xmm4 \n\ + # Interleave Y/U/V \n\ + punpcklbw %%xmm4, %%xmm0 \n\ + punpcklbw %%xmm0, %%xmm3 \n\ + # Store into destination pointer \n\ + movdqu %%xmm3, ("EAX") \n" + : /* no outputs */ + : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_rgb_to_y8( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + psllw $6, %%xmm0 \n\ + pmulhuw ("EDI"), %%xmm0 \n\ + psllw $6, %%xmm1 \n\ + pmulhuw 16("EDI"), %%xmm1 \n\ + psllw $6, %%xmm2 \n\ + pmulhuw 32("EDI"), %%xmm2 \n\ + paddw %%xmm1, %%xmm0 # No possibility of overflow \n\ + paddw %%xmm2, %%xmm0 \n\ + paddw 144("EDI"), %%xmm0 \n\ + psraw $6, %%xmm0 \n\ + packuswb %%xmm0, %%xmm0 \n\ + movq %%xmm0, ("EAX") \n" + : /* no outputs */ + : "a" (destY+y*width+x), "D" (&rgb_data), "m" (rgb_data) + ); +} + +/*************************************************************************/ + +static int yuvp_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 16, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%edx # (trash EDX, we don't need it \n\ + cmovnz %%edx, %%eax # anymore) \n\ + movl $0, %%edx \n\ + cmovs %%edx, %%eax \n\ + movb %%al, -1("EDI","ECX") # and store \n", + /* main_loop */ "\ + movdqu -16("ESI","ECX"), %%xmm0 # XMM0: Y15..Y0 \n\ + movdqa %%xmm0, %%xmm1 # 
XMM1: Y15..Y0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: Y7..Y0 \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + punpckhbw %%xmm4, %%xmm1 # XMM1: Y15..Y8 << 8 \n\ + psubw %%xmm6, %%xmm1 # XMM1: unbias by 16 \n\ + psllw $2, %%xmm1 # XMM1: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm1 # XMM1: multiply by 255/219>>2 \n\ + packuswb %%xmm1, %%xmm0 # XMM0: G15..G0, saturated \n\ + movdqu %%xmm0, -16("EDI","ECX") \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +static int yuy2_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pcmpeqd %%xmm5, %%xmm5 \n\ + psrlw $8, %%xmm5 # constant: 0x00FF \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 8, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -2("ESI","ECX",2), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%edx # (trash EDX, we don't need it \n\ + cmovnz %%edx, %%eax # anymore) \n\ + movl $0, %%edx \n\ + cmovs %%edx, %%eax \n\ + movb %%al, -1("EDI","ECX") # and store \n", + /* main_loop */ "\ + movdqu -16("ESI","ECX",2),%%xmm0 #XMM0: V3 Y7..U0 Y0 \n\ + pand %%xmm5, %%xmm0 # XMM0: Y7..Y0 \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: G7..G0, saturated \n\ + movq %%xmm0, -8("EDI","ECX") \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +static int uyvy_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 \n\ + psllw $2, %%xmm6 # constant: 16<<2 \n\ + pcmpeqd %%xmm5, %%xmm5 \n\ + psllw $8, %%xmm5 # constant: 0xFF00 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 8, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX",2), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%edx # (trash EDX, we don't need it \n\ + cmovnz %%edx, %%eax # anymore) \n\ + movl $0, %%edx \n\ + cmovs %%edx, %%eax \n\ + movb %%al, -1("EDI","ECX") # and store \n", + /* main_loop */ "\ + movdqu -16("ESI","ECX",2),%%xmm0 #XMM0: Y7 V3..Y0 U0 \n\ + pand %%xmm5, %%xmm0 # XMM0: Y7..Y0 << 8 \n\ + psrlw $6, %%xmm0 # XMM0: fixed point 8.2 \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: G7..G0, saturated \n\ + movq %%xmm0, -8("EDI","ECX") \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +/*************************************************************************/ + +static int gray8_y8_sse2(uint8_t **src, uint8_t 
**dest, int width, int height) +{ + asm("movdqa 16("EDX"), %%xmm7 # constant: 219/255 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 16, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve gray byte \n\ + imull %3, %%eax # multiply by 219/255 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + addl $16, %%eax # add 16 \n\ + movb %%al, -1("EDI","ECX") # and store \n", + /* main_loop */ "\ + movdqu -16("ESI","ECX"), %%xmm2 # XMM2: G15..G0 \n\ + movdqa %%xmm4, %%xmm0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: G7..G0 << 8 \n\ + pmulhuw %%xmm7, %%xmm0 # XMM0: multiply by 219/255>>2 \n\ + movdqa %%xmm4, %%xmm1 \n\ + punpckhbw %%xmm2, %%xmm1 # XMM1: G15..G8 << 8 \n\ + pmulhuw %%xmm7, %%xmm1 # XMM1: multiply by 219/255>>2 \n\ + psrlw $6, %%xmm0 # XMM0: shift down to 8 bits \n\ + paddw %%xmm6, %%xmm0 # XMM0: bias by 16 \n\ + psrlw $6, %%xmm1 # XMM1: shift down to 8 bits \n\ + paddw %%xmm6, %%xmm1 # XMM1: bias by 16 \n\ + packuswb %%xmm1, %%xmm0 # XMM0: Y15..Y0 \n\ + movdqu %%xmm0, -16("EDI","ECX") \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +static int gray8_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa 16("EDX"), %%xmm7 # constant: 219/255 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pcmpeqd %%xmm5, %%xmm5 \n\ + psllw $15, %%xmm5 # constant: 0x8000 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 8, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve gray byte \n\ + imull %3, %%eax # multiply by 219/255 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + addl $16, %%eax # add 16 \n\ + movb %%al, -2("EDI","ECX",2) # and store \n\ + movb $128, -1("EDI","ECX",2) # store 128 in U/V byte \n", + /* main_loop */ "\ + movq -8("ESI","ECX"), %%xmm2 # XMM2: G5..G0 \n\ + movdqa %%xmm4, %%xmm0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: G7..G0 << 8 \n\ + pmulhuw %%xmm7, %%xmm0 # XMM0: multiply by 219/255>>2 \n\ + psrlw $6, %%xmm0 # XMM0: shift down to 8 bits \n\ + paddw %%xmm6, %%xmm0 # XMM0: bias by 16 \n\ + por %%xmm5, %%xmm0 # XMM0: OR in U/V bytes \n\ + movdqu %%xmm0, -16("EDI","ECX",2) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +static int gray8_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height) { + asm("movdqa 16("EDX"), %%xmm7 # constant: 219/255 \n\ + movdqa 32("EDX"), %%xmm6 \n\ + psllw $8, %%xmm6 # constant: 16 << 8 \n\ + pcmpeqd %%xmm5, %%xmm5 \n\ + psllw $15, %%xmm5 \n\ + psrlw $8, %%xmm5 # constant: 0x0080 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n\ + pcmpeqd %%xmm3, %%xmm3 \n\ + psllw $8, %%xmm3 # constant: 0xFF00 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 8, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve gray byte \n\ + imull %3, %%eax # multiply by 219/255 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + addl $16, %%eax # add 16 \n\ + movb %%al, -1("EDI","ECX",2) # and store \n\ + movb $128, -2("EDI","ECX",2) # store 128 in U/V byte \n", + /* main_loop */ "\ + movq -8("ESI","ECX"), %%xmm2 # XMM2: G5..G0 \n\ + movdqa %%xmm4, %%xmm0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: G7..G0 << 8 \n\ + pmulhuw 
%%xmm7, %%xmm0 # XMM0: multiply by 219/255>>2 \n\ + psllw $2, %%xmm0 # XMM0: shift results to hi byte\n\ + pand %%xmm3, %%xmm0 # XMM0: clear low byte \n\ + paddw %%xmm6, %%xmm0 # XMM0: bias by 16 \n\ + por %%xmm5, %%xmm0 # XMM0: OR in U/V bytes \n\ + movdqu %%xmm0, -16("EDI","ECX",2) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +/*************************************************************************/ + +static int y8_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + movdqa 48("EDX"), %%xmm5 # constant: bytes 0/3/6/9 mask \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "push "EBX, + /* pop_regs */ "pop "EBX, + /* small_loop */ "\ + lea ("ECX","ECX",2), "EDX" # 3*count for RGB offset \n\ + movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%ebx \n\ + cmovnz %%ebx, %%eax \n\ + movl $0, %%ebx \n\ + cmovs %%ebx, %%eax \n\ + movb %%al, -3("EDI","EDX") # and store \n\ + movb %%al, -2("EDI","EDX") \n\ + movb %%al, -1("EDI","EDX") \n", + /* main_loop */ "\ + lea ("ECX","ECX",2), "EDX" \n\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: Y3..Y0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: Y3..Y0 in 16 bits \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: G3..G0, saturated \n\ + pshuflw $0x50, %%xmm0, %%xmm0 # X0.l: G3 G2 G3 G2 G1 G0 G1 G0 \n\ + pshufhw $0x55, %%xmm0, %%xmm0 # X0.h: G3 G2 G3 G2 G3 G2 G3 G2 \n\ + pand %%xmm5, %%xmm0 # XMM0: ------3--2--1--0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: ------3--2--1--0 \n\ + pslldq $1, %%xmm1 # XMM1: -----3--2--1--0- \n\ + movdqa %%xmm0, %%xmm2 # XMM2: ------3--2--1--0 \n\ + pslldq $2, %%xmm2 # XMM2: ----3--2--1--0-- \n\ + por %%xmm1, %%xmm0 # XMM0: -----33-22-11-00 \n\ + por %%xmm2, %%xmm0 # XMM0: ----333222111000 \n\ + movd %%xmm0, -12("EDI","EDX") \n\ + pshufd $0xC9, %%xmm0, %%xmm0 \n\ + movq %%xmm0, -8("EDI","EDX") \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +/* 4BPP is slightly easier... 
*/ +static int y8_rgba32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%edx \n\ + cmovnz %%edx, %%eax \n\ + movl $0, %%edx \n\ + cmovs %%edx, %%eax \n\ + movb %%al, -4("EDI","ECX",4) # and store \n\ + movb %%al, -3("EDI","ECX",4) \n\ + movb %%al, -2("EDI","ECX",4) \n", + /* main_loop */ "\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: Y3..Y0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: Y3..Y0 in 16 bits \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: G3..G0, saturated \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: G3..G0 in 16 bits \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: ---3---2---1---0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: ---3---2---1---0 \n\ + pslldq $1, %%xmm1 # XMM1: --3---2---1---0- \n\ + movdqa %%xmm0, %%xmm2 # XMM2: ---3---2---1---0 \n\ + pslldq $2, %%xmm2 # XMM2: -3---2---1---0-- \n\ + por %%xmm1, %%xmm0 # XMM0: --33--22--11--00 \n\ + por %%xmm2, %%xmm0 # XMM0: -333-222-111-000 \n\ + movntdq %%xmm0, -16("EDI","ECX",4) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +static int y8_argb32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%edx \n\ + cmovnz %%edx, %%eax \n\ + movl $0, %%edx \n\ + cmovs %%edx, %%eax \n\ + movb %%al, -3("EDI","ECX",4) # and store \n\ + movb %%al, -2("EDI","ECX",4) \n\ + movb %%al, -1("EDI","ECX",4) \n", + /* main_loop */ "\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: Y3..Y0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: Y3..Y0 in 16 bits \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: G3..G0, saturated \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: G3..G0 in 16 bits \n\ + movdqa %%xmm4, %%xmm3 # XMM3: 0 \n\ + punpcklbw %%xmm0, %%xmm3 # XMM3: --3---2---1---0- \n\ + movdqa %%xmm3, %%xmm1 # XMM1: --3---2---1---0- \n\ + pslldq $1, %%xmm1 # XMM1: -3---2---1---0-- \n\ + movdqa %%xmm3, %%xmm2 # XMM2: --3---2---1---0- \n\ + pslldq $2, %%xmm2 # XMM2: 3---2---1---0--- \n\ + por %%xmm1, %%xmm3 # XMM3: -33--22--11--00- \n\ + por %%xmm2, %%xmm3 # XMM3: 333-222-111-000- \n\ + movntdq %%xmm3, -16("EDI","ECX",4) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + 
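For readers tracing the fixed-point arithmetic in the Y8/GRAY8 routines above, the following is a minimal scalar sketch of the same full-range (0..255) versus BT.601 limited-range (16..235) luma scaling, assuming 2.14 fixed-point constants as implied by the `shrl $14` in the small-loop fallbacks; the helper names are illustrative and not part of aclib, and the exact rounding of the constants may differ from the tables aclib loads from gray_data:

#include <stdint.h>

/* Scalar sketch of the Y8 <-> GRAY8 scaling used by the SIMD code above:
 * multiply by 255/219 (or 219/255) in 2.14 fixed point, shift right by 14,
 * then saturate or bias as appropriate. */
static uint8_t y8_to_gray8(uint8_t y)
{
    int v = ((y - 16) * ((255 << 14) / 219)) >> 14;
    if (v < 0)   v = 0;      /* saturation, as the cmov/packuswb sequences do */
    if (v > 255) v = 255;
    return (uint8_t)v;
}

static uint8_t gray8_to_y8(uint8_t g)
{
    return (uint8_t)(((g * ((219 << 14) / 255)) >> 14) + 16);
}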
+/*************************************************************************/ + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization */ + +int ac_imgconvert_init_yuv_rgb(int accel) +{ + /******** Standard C implementations ********/ + + //---- YUV->RGB ----// + + if (!register_conversion(IMG_YUV420P, IMG_RGB24, yuv420p_rgb24) + || !register_conversion(IMG_YUV411P, IMG_RGB24, yuv411p_rgb24) + || !register_conversion(IMG_YUV422P, IMG_RGB24, yuv422p_rgb24) + || !register_conversion(IMG_YUV444P, IMG_RGB24, yuv444p_rgb24) + || !register_conversion(IMG_YUY2, IMG_RGB24, yuy2_rgb24) + || !register_conversion(IMG_UYVY, IMG_RGB24, uyvy_rgb24) + || !register_conversion(IMG_YVYU, IMG_RGB24, yvyu_rgb24) + || !register_conversion(IMG_Y8, IMG_RGB24, y8_rgb24) + + || !register_conversion(IMG_YUV420P, IMG_BGR24, yuv420p_bgr24) + || !register_conversion(IMG_YUV411P, IMG_BGR24, yuv411p_bgr24) + || !register_conversion(IMG_YUV422P, IMG_BGR24, yuv422p_bgr24) + || !register_conversion(IMG_YUV444P, IMG_BGR24, yuv444p_bgr24) + || !register_conversion(IMG_YUY2, IMG_BGR24, yuy2_bgr24) + || !register_conversion(IMG_UYVY, IMG_BGR24, uyvy_bgr24) + || !register_conversion(IMG_YVYU, IMG_BGR24, yvyu_bgr24) + || !register_conversion(IMG_Y8, IMG_BGR24, y8_rgb24) + + || !register_conversion(IMG_YUV420P, IMG_RGBA32, yuv420p_rgba32) + || !register_conversion(IMG_YUV411P, IMG_RGBA32, yuv411p_rgba32) + || !register_conversion(IMG_YUV422P, IMG_RGBA32, yuv422p_rgba32) + || !register_conversion(IMG_YUV444P, IMG_RGBA32, yuv444p_rgba32) + || !register_conversion(IMG_YUY2, IMG_RGBA32, yuy2_rgba32) + || !register_conversion(IMG_UYVY, IMG_RGBA32, uyvy_rgba32) + || !register_conversion(IMG_YVYU, IMG_RGBA32, yvyu_rgba32) + || !register_conversion(IMG_Y8, IMG_RGBA32, y8_rgba32) + + || !register_conversion(IMG_YUV420P, IMG_ABGR32, yuv420p_abgr32) + || !register_conversion(IMG_YUV411P, IMG_ABGR32, yuv411p_abgr32) + || !register_conversion(IMG_YUV422P, IMG_ABGR32, yuv422p_abgr32) + || !register_conversion(IMG_YUV444P, IMG_ABGR32, yuv444p_abgr32) + || !register_conversion(IMG_YUY2, IMG_ABGR32, yuy2_abgr32) + || !register_conversion(IMG_UYVY, IMG_ABGR32, uyvy_abgr32) + || !register_conversion(IMG_YVYU, IMG_ABGR32, yvyu_abgr32) + || !register_conversion(IMG_Y8, IMG_ABGR32, y8_argb32) + + || !register_conversion(IMG_YUV420P, IMG_ARGB32, yuv420p_argb32) + || !register_conversion(IMG_YUV411P, IMG_ARGB32, yuv411p_argb32) + || !register_conversion(IMG_YUV422P, IMG_ARGB32, yuv422p_argb32) + || !register_conversion(IMG_YUV444P, IMG_ARGB32, yuv444p_argb32) + || !register_conversion(IMG_YUY2, IMG_ARGB32, yuy2_argb32) + || !register_conversion(IMG_UYVY, IMG_ARGB32, uyvy_argb32) + || !register_conversion(IMG_YVYU, IMG_ARGB32, yvyu_argb32) + || !register_conversion(IMG_Y8, IMG_ARGB32, y8_argb32) + + || !register_conversion(IMG_YUV420P, IMG_BGRA32, yuv420p_bgra32) + || !register_conversion(IMG_YUV411P, IMG_BGRA32, yuv411p_bgra32) + || !register_conversion(IMG_YUV422P, IMG_BGRA32, yuv422p_bgra32) + || !register_conversion(IMG_YUV444P, IMG_BGRA32, yuv444p_bgra32) + || !register_conversion(IMG_YUY2, IMG_BGRA32, yuy2_bgra32) + || !register_conversion(IMG_UYVY, IMG_BGRA32, uyvy_bgra32) + || !register_conversion(IMG_YVYU, IMG_BGRA32, yvyu_bgra32) + || !register_conversion(IMG_Y8, IMG_BGRA32, y8_rgba32) + + //---- RGB->YUV ----// + + || !register_conversion(IMG_RGB24, IMG_YUV420P, rgb24_yuv420p) + || 
!register_conversion(IMG_RGB24, IMG_YUV411P, rgb24_yuv411p) + || !register_conversion(IMG_RGB24, IMG_YUV422P, rgb24_yuv422p) + || !register_conversion(IMG_RGB24, IMG_YUV444P, rgb24_yuv444p) + || !register_conversion(IMG_RGB24, IMG_YUY2, rgb24_yuy2) + || !register_conversion(IMG_RGB24, IMG_UYVY, rgb24_uyvy) + || !register_conversion(IMG_RGB24, IMG_YVYU, rgb24_yvyu) + || !register_conversion(IMG_RGB24, IMG_Y8, rgb24_y8) + + || !register_conversion(IMG_BGR24, IMG_YUV420P, bgr24_yuv420p) + || !register_conversion(IMG_BGR24, IMG_YUV411P, bgr24_yuv411p) + || !register_conversion(IMG_BGR24, IMG_YUV422P, bgr24_yuv422p) + || !register_conversion(IMG_BGR24, IMG_YUV444P, bgr24_yuv444p) + || !register_conversion(IMG_BGR24, IMG_YUY2, bgr24_yuy2) + || !register_conversion(IMG_BGR24, IMG_UYVY, bgr24_uyvy) + || !register_conversion(IMG_BGR24, IMG_YVYU, bgr24_yvyu) + || !register_conversion(IMG_BGR24, IMG_Y8, bgr24_y8) + + || !register_conversion(IMG_RGBA32, IMG_YUV420P, rgba32_yuv420p) + || !register_conversion(IMG_RGBA32, IMG_YUV411P, rgba32_yuv411p) + || !register_conversion(IMG_RGBA32, IMG_YUV422P, rgba32_yuv422p) + || !register_conversion(IMG_RGBA32, IMG_YUV444P, rgba32_yuv444p) + || !register_conversion(IMG_RGBA32, IMG_YUY2, rgba32_yuy2) + || !register_conversion(IMG_RGBA32, IMG_UYVY, rgba32_uyvy) + || !register_conversion(IMG_RGBA32, IMG_YVYU, rgba32_yvyu) + || !register_conversion(IMG_RGBA32, IMG_Y8, rgba32_y8) + + || !register_conversion(IMG_ABGR32, IMG_YUV420P, abgr32_yuv420p) + || !register_conversion(IMG_ABGR32, IMG_YUV411P, abgr32_yuv411p) + || !register_conversion(IMG_ABGR32, IMG_YUV422P, abgr32_yuv422p) + || !register_conversion(IMG_ABGR32, IMG_YUV444P, abgr32_yuv444p) + || !register_conversion(IMG_ABGR32, IMG_YUY2, abgr32_yuy2) + || !register_conversion(IMG_ABGR32, IMG_UYVY, abgr32_uyvy) + || !register_conversion(IMG_ABGR32, IMG_YVYU, abgr32_yvyu) + || !register_conversion(IMG_ABGR32, IMG_Y8, abgr32_y8) + + || !register_conversion(IMG_ARGB32, IMG_YUV420P, argb32_yuv420p) + || !register_conversion(IMG_ARGB32, IMG_YUV411P, argb32_yuv411p) + || !register_conversion(IMG_ARGB32, IMG_YUV422P, argb32_yuv422p) + || !register_conversion(IMG_ARGB32, IMG_YUV444P, argb32_yuv444p) + || !register_conversion(IMG_ARGB32, IMG_YUY2, argb32_yuy2) + || !register_conversion(IMG_ARGB32, IMG_UYVY, argb32_uyvy) + || !register_conversion(IMG_ARGB32, IMG_YVYU, argb32_yvyu) + || !register_conversion(IMG_ARGB32, IMG_Y8, argb32_y8) + + || !register_conversion(IMG_BGRA32, IMG_YUV420P, bgra32_yuv420p) + || !register_conversion(IMG_BGRA32, IMG_YUV411P, bgra32_yuv411p) + || !register_conversion(IMG_BGRA32, IMG_YUV422P, bgra32_yuv422p) + || !register_conversion(IMG_BGRA32, IMG_YUV444P, bgra32_yuv444p) + || !register_conversion(IMG_BGRA32, IMG_YUY2, bgra32_yuy2) + || !register_conversion(IMG_BGRA32, IMG_UYVY, bgra32_uyvy) + || !register_conversion(IMG_BGRA32, IMG_YVYU, bgra32_yvyu) + || !register_conversion(IMG_BGRA32, IMG_Y8, bgra32_y8) + + //---- Grayscale ----// + + || !register_conversion(IMG_YUV420P, IMG_GRAY8, yuvp_gray8) + || !register_conversion(IMG_YUV411P, IMG_GRAY8, yuvp_gray8) + || !register_conversion(IMG_YUV422P, IMG_GRAY8, yuvp_gray8) + || !register_conversion(IMG_YUV444P, IMG_GRAY8, yuvp_gray8) + || !register_conversion(IMG_YUY2, IMG_GRAY8, yuy2_gray8) + || !register_conversion(IMG_UYVY, IMG_GRAY8, uyvy_gray8) + || !register_conversion(IMG_YVYU, IMG_GRAY8, yuy2_gray8) + || !register_conversion(IMG_Y8, IMG_GRAY8, yuvp_gray8) + + || !register_conversion(IMG_GRAY8, IMG_YUV420P, gray8_yuv420p) + || 
!register_conversion(IMG_GRAY8, IMG_YUV411P, gray8_yuv411p) + || !register_conversion(IMG_GRAY8, IMG_YUV422P, gray8_yuv422p) + || !register_conversion(IMG_GRAY8, IMG_YUV444P, gray8_yuv444p) + || !register_conversion(IMG_GRAY8, IMG_YUY2, gray8_yuy2) + || !register_conversion(IMG_GRAY8, IMG_UYVY, gray8_uyvy) + || !register_conversion(IMG_GRAY8, IMG_YVYU, gray8_yuy2) + || !register_conversion(IMG_GRAY8, IMG_Y8, gray8_y8) + ) { + return 0; + } + + /******** MMX implementations ********/ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + if (accel & AC_MMX) { + + //---- YUV->RGB ----// + + if (!register_conversion(IMG_YUV420P, IMG_RGB24, yuv420p_rgb24_mmx) + || !register_conversion(IMG_YUV422P, IMG_RGB24, yuv422p_rgb24_mmx) + || !register_conversion(IMG_YUV420P, IMG_BGR24, yuv420p_bgr24_mmx) + || !register_conversion(IMG_YUV422P, IMG_BGR24, yuv422p_bgr24_mmx) + || !register_conversion(IMG_YUV420P, IMG_RGBA32, yuv420p_rgba32_mmx) + || !register_conversion(IMG_YUV422P, IMG_RGBA32, yuv422p_rgba32_mmx) + || !register_conversion(IMG_YUV420P, IMG_ABGR32, yuv420p_abgr32_mmx) + || !register_conversion(IMG_YUV422P, IMG_ABGR32, yuv422p_abgr32_mmx) + || !register_conversion(IMG_YUV420P, IMG_ARGB32, yuv420p_argb32_mmx) + || !register_conversion(IMG_YUV422P, IMG_ARGB32, yuv422p_argb32_mmx) + || !register_conversion(IMG_YUV420P, IMG_BGRA32, yuv420p_bgra32_mmx) + || !register_conversion(IMG_YUV422P, IMG_BGRA32, yuv422p_bgra32_mmx) + ) { + return 0; + } + } +#endif + + /******** SSE2 implementations ********/ + +#if defined(HAVE_ASM_SSE2) + if (HAS_ACCEL(accel, AC_SSE2)) { + + //---- YUV->RGB ----// + + if (!register_conversion(IMG_YUV420P, IMG_RGB24, yuv420p_rgb24_sse2) + || !register_conversion(IMG_YUV411P, IMG_RGB24, yuv411p_rgb24_sse2) + || !register_conversion(IMG_YUV422P, IMG_RGB24, yuv422p_rgb24_sse2) + || !register_conversion(IMG_YUV444P, IMG_RGB24, yuv444p_rgb24_sse2) + || !register_conversion(IMG_YUY2, IMG_RGB24, yuy2_rgb24_sse2) + || !register_conversion(IMG_UYVY, IMG_RGB24, uyvy_rgb24_sse2) + || !register_conversion(IMG_YVYU, IMG_RGB24, yvyu_rgb24_sse2) + || !register_conversion(IMG_Y8, IMG_RGB24, y8_rgb24_sse2) + + || !register_conversion(IMG_YUV420P, IMG_BGR24, yuv420p_bgr24_sse2) + || !register_conversion(IMG_YUV411P, IMG_BGR24, yuv411p_bgr24_sse2) + || !register_conversion(IMG_YUV422P, IMG_BGR24, yuv422p_bgr24_sse2) + || !register_conversion(IMG_YUV444P, IMG_BGR24, yuv444p_bgr24_sse2) + || !register_conversion(IMG_YUY2, IMG_BGR24, yuy2_bgr24_sse2) + || !register_conversion(IMG_UYVY, IMG_BGR24, uyvy_bgr24_sse2) + || !register_conversion(IMG_YVYU, IMG_BGR24, yvyu_bgr24_sse2) + || !register_conversion(IMG_Y8, IMG_BGR24, y8_rgb24_sse2) + + || !register_conversion(IMG_YUV420P, IMG_RGBA32, yuv420p_rgba32_sse2) + || !register_conversion(IMG_YUV411P, IMG_RGBA32, yuv411p_rgba32_sse2) + || !register_conversion(IMG_YUV422P, IMG_RGBA32, yuv422p_rgba32_sse2) + || !register_conversion(IMG_YUV444P, IMG_RGBA32, yuv444p_rgba32_sse2) + || !register_conversion(IMG_YUY2, IMG_RGBA32, yuy2_rgba32_sse2) + || !register_conversion(IMG_UYVY, IMG_RGBA32, uyvy_rgba32_sse2) + || !register_conversion(IMG_YVYU, IMG_RGBA32, yvyu_rgba32_sse2) + || !register_conversion(IMG_Y8, IMG_RGBA32, y8_rgba32_sse2) + + || !register_conversion(IMG_YUV420P, IMG_ABGR32, yuv420p_abgr32_sse2) + || !register_conversion(IMG_YUV411P, IMG_ABGR32, yuv411p_abgr32_sse2) + || !register_conversion(IMG_YUV422P, IMG_ABGR32, yuv422p_abgr32_sse2) + || !register_conversion(IMG_YUV444P, IMG_ABGR32, yuv444p_abgr32_sse2) + || !register_conversion(IMG_YUY2, 
IMG_ABGR32, yuy2_abgr32_sse2) + || !register_conversion(IMG_UYVY, IMG_ABGR32, uyvy_abgr32_sse2) + || !register_conversion(IMG_YVYU, IMG_ABGR32, yvyu_abgr32_sse2) + || !register_conversion(IMG_Y8, IMG_ABGR32, y8_argb32_sse2) + + || !register_conversion(IMG_YUV420P, IMG_ARGB32, yuv420p_argb32_sse2) + || !register_conversion(IMG_YUV411P, IMG_ARGB32, yuv411p_argb32_sse2) + || !register_conversion(IMG_YUV422P, IMG_ARGB32, yuv422p_argb32_sse2) + || !register_conversion(IMG_YUV444P, IMG_ARGB32, yuv444p_argb32_sse2) + || !register_conversion(IMG_YUY2, IMG_ARGB32, yuy2_argb32_sse2) + || !register_conversion(IMG_UYVY, IMG_ARGB32, uyvy_argb32_sse2) + || !register_conversion(IMG_YVYU, IMG_ARGB32, yvyu_argb32_sse2) + || !register_conversion(IMG_Y8, IMG_ARGB32, y8_argb32_sse2) + + || !register_conversion(IMG_YUV420P, IMG_BGRA32, yuv420p_bgra32_sse2) + || !register_conversion(IMG_YUV411P, IMG_BGRA32, yuv411p_bgra32_sse2) + || !register_conversion(IMG_YUV422P, IMG_BGRA32, yuv422p_bgra32_sse2) + || !register_conversion(IMG_YUV444P, IMG_BGRA32, yuv444p_bgra32_sse2) + || !register_conversion(IMG_YUY2, IMG_BGRA32, yuy2_bgra32_sse2) + || !register_conversion(IMG_UYVY, IMG_BGRA32, uyvy_bgra32_sse2) + || !register_conversion(IMG_YVYU, IMG_BGRA32, yvyu_bgra32_sse2) + || !register_conversion(IMG_Y8, IMG_BGRA32, y8_rgba32_sse2) + + //---- RGB->YUV ----// + + || !register_conversion(IMG_RGB24, IMG_YUV420P, rgb24_yuv420p_sse2) + || !register_conversion(IMG_RGB24, IMG_YUV411P, rgb24_yuv411p_sse2) + || !register_conversion(IMG_RGB24, IMG_YUV422P, rgb24_yuv422p_sse2) + || !register_conversion(IMG_RGB24, IMG_YUV444P, rgb24_yuv444p_sse2) + || !register_conversion(IMG_RGB24, IMG_YUY2, rgb24_yuy2_sse2) + || !register_conversion(IMG_RGB24, IMG_UYVY, rgb24_uyvy_sse2) + || !register_conversion(IMG_RGB24, IMG_YVYU, rgb24_yvyu_sse2) + || !register_conversion(IMG_RGB24, IMG_Y8, rgb24_y8_sse2) + + || !register_conversion(IMG_BGR24, IMG_YUV420P, bgr24_yuv420p_sse2) + || !register_conversion(IMG_BGR24, IMG_YUV411P, bgr24_yuv411p_sse2) + || !register_conversion(IMG_BGR24, IMG_YUV422P, bgr24_yuv422p_sse2) + || !register_conversion(IMG_BGR24, IMG_YUV444P, bgr24_yuv444p_sse2) + || !register_conversion(IMG_BGR24, IMG_YUY2, bgr24_yuy2_sse2) + || !register_conversion(IMG_BGR24, IMG_UYVY, bgr24_uyvy_sse2) + || !register_conversion(IMG_BGR24, IMG_YVYU, bgr24_yvyu_sse2) + || !register_conversion(IMG_BGR24, IMG_Y8, bgr24_y8_sse2) + + || !register_conversion(IMG_RGBA32, IMG_YUV420P, rgba32_yuv420p_sse2) + || !register_conversion(IMG_RGBA32, IMG_YUV411P, rgba32_yuv411p_sse2) + || !register_conversion(IMG_RGBA32, IMG_YUV422P, rgba32_yuv422p_sse2) + || !register_conversion(IMG_RGBA32, IMG_YUV444P, rgba32_yuv444p_sse2) + || !register_conversion(IMG_RGBA32, IMG_YUY2, rgba32_yuy2_sse2) + || !register_conversion(IMG_RGBA32, IMG_UYVY, rgba32_uyvy_sse2) + || !register_conversion(IMG_RGBA32, IMG_YVYU, rgba32_yvyu_sse2) + || !register_conversion(IMG_RGBA32, IMG_Y8, rgba32_y8_sse2) + + || !register_conversion(IMG_ABGR32, IMG_YUV420P, abgr32_yuv420p_sse2) + || !register_conversion(IMG_ABGR32, IMG_YUV411P, abgr32_yuv411p_sse2) + || !register_conversion(IMG_ABGR32, IMG_YUV422P, abgr32_yuv422p_sse2) + || !register_conversion(IMG_ABGR32, IMG_YUV444P, abgr32_yuv444p_sse2) + || !register_conversion(IMG_ABGR32, IMG_YUY2, abgr32_yuy2_sse2) + || !register_conversion(IMG_ABGR32, IMG_UYVY, abgr32_uyvy_sse2) + || !register_conversion(IMG_ABGR32, IMG_YVYU, abgr32_yvyu_sse2) + || !register_conversion(IMG_ABGR32, IMG_Y8, abgr32_y8_sse2) + + || 
!register_conversion(IMG_ARGB32, IMG_YUV420P, argb32_yuv420p_sse2) + || !register_conversion(IMG_ARGB32, IMG_YUV411P, argb32_yuv411p_sse2) + || !register_conversion(IMG_ARGB32, IMG_YUV422P, argb32_yuv422p_sse2) + || !register_conversion(IMG_ARGB32, IMG_YUV444P, argb32_yuv444p_sse2) + || !register_conversion(IMG_ARGB32, IMG_YUY2, argb32_yuy2_sse2) + || !register_conversion(IMG_ARGB32, IMG_UYVY, argb32_uyvy_sse2) + || !register_conversion(IMG_ARGB32, IMG_YVYU, argb32_yvyu_sse2) + || !register_conversion(IMG_ARGB32, IMG_Y8, argb32_y8_sse2) + + || !register_conversion(IMG_BGRA32, IMG_YUV420P, bgra32_yuv420p_sse2) + || !register_conversion(IMG_BGRA32, IMG_YUV411P, bgra32_yuv411p_sse2) + || !register_conversion(IMG_BGRA32, IMG_YUV422P, bgra32_yuv422p_sse2) + || !register_conversion(IMG_BGRA32, IMG_YUV444P, bgra32_yuv444p_sse2) + || !register_conversion(IMG_BGRA32, IMG_YUY2, bgra32_yuy2_sse2) + || !register_conversion(IMG_BGRA32, IMG_UYVY, bgra32_uyvy_sse2) + || !register_conversion(IMG_BGRA32, IMG_YVYU, bgra32_yvyu_sse2) + || !register_conversion(IMG_BGRA32, IMG_Y8, bgra32_y8_sse2) + + //---- Grayscale ----// + + || !register_conversion(IMG_GRAY8, IMG_YUY2, gray8_yuy2_sse2) + || !register_conversion(IMG_GRAY8, IMG_UYVY, gray8_uyvy_sse2) + || !register_conversion(IMG_GRAY8, IMG_YVYU, gray8_yuy2_sse2) + || !register_conversion(IMG_GRAY8, IMG_Y8, gray8_y8_sse2) + ) { + return 0; + } + } + + /* YUV->GRAY8 routines use CMOVcc */ + if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2)) { + if (!register_conversion(IMG_YUV420P, IMG_GRAY8, yuvp_gray8_sse2) + || !register_conversion(IMG_YUV411P, IMG_GRAY8, yuvp_gray8_sse2) + || !register_conversion(IMG_YUV422P, IMG_GRAY8, yuvp_gray8_sse2) + || !register_conversion(IMG_YUV444P, IMG_GRAY8, yuvp_gray8_sse2) + || !register_conversion(IMG_YUY2, IMG_GRAY8, yuy2_gray8_sse2) + || !register_conversion(IMG_UYVY, IMG_GRAY8, uyvy_gray8_sse2) + || !register_conversion(IMG_YVYU, IMG_GRAY8, yuy2_gray8_sse2) + || !register_conversion(IMG_Y8, IMG_GRAY8, yuvp_gray8_sse2) + ) { + return 0; + } + } +#endif + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/imgconvert.c b/debian/transcode/transcode-1.1.7/aclib/imgconvert.c new file mode 100644 index 00000000..cc502977 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/imgconvert.c @@ -0,0 +1,119 @@ +/* + * imgconvert.c - image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "imgconvert.h" +#include "img_internal.h" + +#include <stdio.h> +#include <stdlib.h> + +/*************************************************************************/ + +static struct { + ImageFormat srcfmt, destfmt; + ConversionFunc func; +} *conversions; +static int n_conversions = 0; + +/*************************************************************************/ +/*************************************************************************/ + +/* Image conversion routine. 
src and dest are arrays of pointers to planes + * (for packed formats with only one plane, just use `&data'); srcfmt and + * destfmt specify the source and destination image formats (IMG_*). + * width and height are in pixels. Returns 1 on success, 0 on failure. */ + +int ac_imgconvert(uint8_t **src, ImageFormat srcfmt, + uint8_t **dest, ImageFormat destfmt, + int width, int height) +{ + int i; + + /* Hack to handle YV12 easily, because conversion routines don't get + * format tags */ + uint8_t *newsrc[3], *newdest[3]; + if (srcfmt == IMG_YV12) { + srcfmt = IMG_YUV420P; + newsrc[0] = src[0]; + newsrc[1] = src[2]; + newsrc[2] = src[1]; + src = newsrc; + } + if (destfmt == IMG_YV12) { + destfmt = IMG_YUV420P; + newdest[0] = dest[0]; + newdest[1] = dest[2]; + newdest[2] = dest[1]; + dest = newdest; + } + + for (i = 0; i < n_conversions; i++) { + if (conversions[i].srcfmt==srcfmt && conversions[i].destfmt==destfmt) + return (*conversions[i].func)(src, dest, width, height); + } + + return 0; +} + +/*************************************************************************/ +/*************************************************************************/ + +/* Internal use only! */ + +int ac_imgconvert_init(int accel) +{ + if (!ac_imgconvert_init_yuv_planar(accel) + || !ac_imgconvert_init_yuv_packed(accel) + || !ac_imgconvert_init_yuv_mixed(accel) + || !ac_imgconvert_init_yuv_rgb(accel) + || !ac_imgconvert_init_rgb_packed(accel) + ) { + fprintf(stderr, "ac_imgconvert_init() failed"); + return 0; + } + return 1; +} + +int register_conversion(ImageFormat srcfmt, ImageFormat destfmt, + ConversionFunc function) +{ + int i; + + for (i = 0; i < n_conversions; i++) { + if (conversions[i].srcfmt==srcfmt && conversions[i].destfmt==destfmt) { + conversions[i].func = function; + return 1; + } + } + + if (!(conversions = realloc(conversions, + (n_conversions+1) * sizeof(*conversions)))) { + fprintf(stderr, "register_conversion(): out of memory\n"); + return 0; + } + conversions[n_conversions].srcfmt = srcfmt; + conversions[n_conversions].destfmt = destfmt; + conversions[n_conversions].func = function; + n_conversions++; + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/imgconvert.h b/debian/transcode/transcode-1.1.7/aclib/imgconvert.h new file mode 100644 index 00000000..c02d5a01 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/imgconvert.h @@ -0,0 +1,105 @@ +/* + * imgconvert.h - defines for image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. 
+ */ + +#ifndef ACLIB_IMGCONVERT_H +#define ACLIB_IMGCONVERT_H + +/*************************************************************************/ + +/* Image format defines */ +typedef enum { + IMG_UNKNOWN = 0, /* Unknown/unset (dummy value, guaranteed to be 0) */ + /* YUV formats */ + IMG_YUV_BASE = 0x1000, + IMG_YUV420P, /* YUV planar, 1 U/V per 2x2 Y pixels */ + IMG_YV12, /* YUV420P with U and V reversed */ + IMG_YUV411P, /* YUV planar, 1 U/V per 4x1 Y pixels */ + IMG_YUV422P, /* YUV planar, 1 U/V per 2x1 Y pixels */ + IMG_YUV444P, /* YUV planar, 1 U/V per 1x1 Y pixels */ + IMG_YUY2, /* YUV packed, 1 U/V per 2x1 Y pixels, Y:U:Y:V */ + IMG_UYVY, /* YUV packed, 1 U/V per 2x1 Y pixels, U:Y:V:Y */ + IMG_YVYU, /* YUV packed, 1 U/V per 2x1 Y pixels, Y:V:Y:U */ + IMG_Y8, /* Y-only 8-bit data */ + IMG_YUV_LAST, + /* RGB formats */ + IMG_RGB_BASE = 0x2000, + IMG_RGB24, /* RGB packed, 8 bits per component, R:G:B */ + IMG_BGR24, /* RGB packed, 8 bits per component, B:G:R */ + IMG_RGBA32, /* RGB+alpha packed, 8 bits per component, R:G:B:A */ + IMG_ABGR32, /* RGB+alpha packed, 8 bits per component, A:B:G:R */ + IMG_ARGB32, /* RGB+alpha packed, 8 bits per component, A:R:G:B */ + IMG_BGRA32, /* RGB+alpha packed, 8 bits per component, B:G:R:A */ + IMG_GRAY8, /* Grayscale 8-bit data */ + IMG_RGB_LAST, +} ImageFormat; + +/* Alias */ +#define IMG_NONE IMG_UNKNOWN + +/* Default YUV and RGB formats */ +#define IMG_YUV_DEFAULT IMG_YUV420P +#define IMG_RGB_DEFAULT IMG_RGB24 + +/* Is the given image format a YUV/RGB one? */ +#define IS_YUV_FORMAT(fmt) ((fmt) > IMG_YUV_BASE && (fmt) < IMG_YUV_LAST) +#define IS_RGB_FORMAT(fmt) ((fmt) > IMG_RGB_BASE && (fmt) < IMG_RGB_LAST) + +/* U/V plane size for YUV planar formats (Y plane size is always w*h) */ +#define UV_PLANE_SIZE(fmt,w,h) \ + ((fmt)==IMG_YUV420P ? ((w)/2)*((h)/2) : \ + (fmt)==IMG_YV12 ? ((w)/2)*((h)/2) : \ + (fmt)==IMG_YUV411P ? ((w)/4)* (h) : \ + (fmt)==IMG_YUV422P ? ((w)/2)* (h) : \ + (fmt)==IMG_YUV444P ? (w) * (h) : 0) + +/* Macro to initialize an array of planes from a buffer */ +#define YUV_INIT_PLANES(planes,buffer,fmt,w,h) \ + ((planes)[0] = (buffer), \ + (planes)[1] = (planes)[0] + (w)*(h), \ + (planes)[2] = (planes)[1] + UV_PLANE_SIZE((fmt),(w),(h))) + +#if 0 +/* Structure describing an image. FIXME: not currently used--this should + * eventually replace the (planes,format) pairs passed to ac_imgconvert. */ +typedef struct { + ImageFormat format; /* Format of image data */ + int width, height; /* Size of image */ + uint8_t *planes[4]; /* Data planes (use planes[0] for packed data) */ + int stride[4]; /* Length of one row in each plane, incl. padding */ +} Image; +#endif + +/*************************************************************************/ + +/* Initialization routine. Returns 1 on success, 0 on failure. */ +extern int ac_imgconvert_init(int accel); + +/* Conversion routine. Returns 1 on success, 0 on failure. */ +extern int ac_imgconvert(uint8_t **src, /* Array of source planes */ + ImageFormat srcfmt, /* Source image format */ + uint8_t **dest, /* Array of dest planes */ + ImageFormat destfmt, /* Destination image format */ + int width, /* Image width in pixels */ + int height /* Image height in pixels */ + ); + +/*************************************************************************/ + +#endif /* ACLIB_IMGCONVERT_H */ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . 
*)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/memcpy.c b/debian/transcode/transcode-1.1.7/aclib/memcpy.c new file mode 100644 index 00000000..05cdf41c --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/memcpy.c @@ -0,0 +1,543 @@ +/* + * memcpy.c - optimized memcpy() routines for aclib + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "ac_internal.h" +#include <string.h> + +/* Use memmove because memcpy isn't guaranteed to be ascending */ +static void *(*memcpy_ptr)(void *, const void *, size_t) = memmove; + +/*************************************************************************/ + +/* External interface */ + +void *ac_memcpy(void *dest, const void *src, size_t size) +{ + return (*memcpy_ptr)(dest, src, size); +} + +/*************************************************************************/ +/*************************************************************************/ + +/* Note the check for ARCH_X86 here: this is to prevent compilation of this + * code on x86_64, since all x86_64 processors support SSE2, and because + * this code is not set up to use the 64-bit registers for addressing on + * x86_64. */ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + +/* MMX-optimized routine, intended for PMMX/PII processors. + * Nonstandard instructions used: + * (CPUID.MMX) MOVQ + */ + +static void *memcpy_mmx(void *dest, const void *src, size_t bytes) +{ + asm("\ +PENTIUM_LINE_SIZE = 32 # PMMX/PII cache line size \n\ +PENTIUM_CACHE_SIZE = 8192 # PMMX/PII total cache size \n\ +# Use only half because writes may touch the cache too (PII) \n\ +PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE) \n\ + \n\ + push %%ebx # Save PIC register \n\ + push %%edi # Save destination for return value \n\ + cld # MOVS* should ascend \n\ + \n\ + mov $64, %%ebx # Constant \n\ + \n\ + cmp %%ebx, %%ecx \n\ + jb mmx.memcpy_last # Just use movs if <64 bytes \n\ + \n\ + # First align destination address to a multiple of 8 bytes \n\ + mov $8, %%eax # EAX <- (8-dest) & 7 \n\ + sub %%edi, %%eax \n\ + and $7, %%eax # ... which is the number of bytes to copy\n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS // Because "lea 0f" requires a textrel +" xchg %%eax, %%ecx \n\ + mov %%ecx, %%edx \n\ + repz movsb \n\ + mov %%eax, %%ecx \n\ + mov %%edx, %%eax \n" +#else +" lea 0f, %%edx # Use a computed jump--faster than a loop\n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n" +#endif +"0: sub %%eax, %%ecx # Update count \n\ + \n\ + # Now copy data in blocks \n\ +0: mov %%ecx, %%edx # EDX <- ECX >> 6 (cache lines to copy) \n\ + shr $6, %%edx \n\ + jz mmx.memcpy_last # <64 bytes left? 
Skip to end \n\ + cmp $PENTIUM_CACHE_BLOCK/64, %%edx \n\ + jb 1f # Limit size of block \n\ + mov $PENTIUM_CACHE_BLOCK/64, %%edx \n\ +1: mov %%edx, %%eax # EAX <- EDX << 6 (bytes to copy) \n\ + shl $6, %%eax \n\ + sub %%eax, %%ecx # Update remaining count \n\ + add %%eax, %%esi # Point to end of region to be block-copied\n\ +2: test %%eax, -32(%%esi) # Touch each cache line in reverse order\n\ + test %%eax, -64(%%esi) \n\ + sub %%ebx, %%esi # Update pointer \n\ + sub %%ebx, %%eax # And loop \n\ + jnz 2b \n\ + # Note that ESI now points to the beginning of the block \n\ +3: movq (%%esi), %%mm0 # Do the actual copy, 64 bytes at a time\n\ + movq 8(%%esi), %%mm1 \n\ + movq 16(%%esi), %%mm2 \n\ + movq 24(%%esi), %%mm3 \n\ + movq 32(%%esi), %%mm4 \n\ + movq 40(%%esi), %%mm5 \n\ + movq 48(%%esi), %%mm6 \n\ + movq 56(%%esi), %%mm7 \n\ + movq %%mm0, (%%edi) \n\ + movq %%mm1, 8(%%edi) \n\ + movq %%mm2, 16(%%edi) \n\ + movq %%mm3, 24(%%edi) \n\ + movq %%mm4, 32(%%edi) \n\ + movq %%mm5, 40(%%edi) \n\ + movq %%mm6, 48(%%edi) \n\ + movq %%mm7, 56(%%edi) \n\ + add %%ebx, %%esi # Update pointers \n\ + add %%ebx, %%edi \n\ + dec %%edx # And loop \n\ + jnz 3b \n\ + jmp 0b \n\ + \n\ +mmx.memcpy_last: \n\ + # Copy last <64 bytes, using the computed jump trick \n\ + mov %%ecx, %%eax # EAX <- ECX>>2 \n\ + shr $2, %%eax \n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS +" xchg %%eax, %%ecx \n\ + repz movsd \n\ + mov %%eax, %%ecx \n" +#else +" lea 0f, %%edx \n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-15 MOVSD's \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n" +#endif +"0: and $3, %%ecx # ECX <- ECX & 3 \n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS +" repz movsb \n" +#else +" lea 0f, %%edx \n\ + sub %%ecx, %%edx \n\ + jmp *%%edx # Execute 0-3 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n" +#endif +"0: \n\ + # All done! \n\ + emms # Clean up MMX state \n\ + pop %%edi # Restore destination (return value) \n\ + pop %%ebx # Restore PIC register \n\ + " : /* no outputs */ + : "D" (dest), "S" (src), "c" (bytes) + : "%eax", "%edx" + ); + return dest; +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE) && defined(ARCH_X86) + +/* SSE-optimized routine. Backported from AMD64 routine below. + * Nonstandard instructions used: + * (CPUID.CMOVE) CMOVA + * (CPUID.MMX) MOVQ + * (CPUID.SSE) MOVNTQ + */ + +static void *memcpy_sse(void *dest, const void *src, size_t bytes) +{ + asm("\ + push %%ebx # Save PIC register \n\ + push %%edi # Save destination for return value \n\ + cld # MOVS* should ascend \n\ + \n\ + cmp $64, %%ecx # Skip block copy for small blocks \n\ + jb sse.memcpy_last \n\ + \n\ + mov $128, %%ebx # Constant used later \n\ + \n\ + # First align destination address to a multiple of 8 bytes \n\ + mov $8, %%eax # EAX <- (8-dest) & 7 \n\ + sub %%edi, %%eax \n\ + and $7, %%eax # ... which is the number of bytes to copy\n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS +" xchg %%eax, %%ecx \n\ + mov %%ecx, %%edx \n\ + repz movsb \n\ + mov %%eax, %%ecx \n\ + mov %%edx, %%eax \n" +#else +" lea 0f, %%edx # Use a computed jump--faster than a loop\n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n" +#endif +"0: sub %%eax, %%ecx # Update count \n\ + \n\ + cmp $0x10040, %%ecx # Is this a large block? 
(0x10040 is an \n\ + # arbitrary value where prefetching and \n\ + # write combining seem to start becoming\n\ + # faster) \n\ + jae sse.memcpy_bp # Yup, use prefetch copy \n\ + \n\ +sse.memcpy_small: # Small block copy routine--no prefetch \n" +#if 0 +" mov %%ecx, %%edx # EDX <- bytes to copy / 8 \n\ + shr $3, %%edx \n\ + mov %%edx, %%eax # Leave remainder in ECX for later \n\ + shl $3, %%eax \n\ + sub %%eax, %%ecx \n\ + .balign 16 \n\ +0: movq (%%esi), %%mm0 # Copy 8 bytes of data \n\ + movq %%mm0, (%%edi) \n\ + add $8, %%esi # Update pointers \n\ + add $8, %%edi \n\ + dec %%edx # And loop \n\ + jg 0b \n\ + jmp sse.memcpy_last # Copy any remaining bytes \n\ + \n\ + nop # Align loops below \n" +#else +" # It appears that a simple rep movs is faster than cleverness \n\ + # with movq... \n\ + mov %%ecx, %%edx # EDX <- ECX & 3 \n\ + and $3, %%edx \n\ + shr $2, %%ecx # ECX <- ECX >> 2 \n\ + rep movsl # Copy away! \n\ + mov %%edx, %%ecx # Take care of last 0-3 bytes \n\ + rep movsb \n\ + jmp sse.memcpy_end # And exit \n\ + \n\ + .balign 16 \n\ + nop \n\ + nop \n" +#endif +"sse.memcpy_bp: # Block prefetch copy routine \n\ +0: mov %%ecx, %%edx # EDX: temp counter \n\ + shr $6, %%edx # Divide by cache line size (64 bytes) \n\ + cmp %%ebx, %%edx # ... and cap at 128 (8192 bytes) \n\ + cmova %%ebx, %%edx \n\ + shl $3, %%edx # EDX <- cache lines to copy * 8 \n\ + mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\ + # (also used as memory offset) \n\ +1: test %%eax, -64(%%esi,%%eax,8) # Preload cache lines in pairs \n\ + test %%eax, -128(%%esi,%%eax,8) # (going backwards) \n\ + # (note that test %%eax,... seems to be faster than prefetchnta \n\ + # on x86) \n\ + sub $16, %%eax # And loop \n\ + jg 1b \n\ + \n\ + # Then copy--forward, which seems to be faster than reverse for \n\ + # certain alignments \n\ + xor %%eax, %%eax \n\ +2: movq (%%esi,%%eax,8), %%mm0 # Copy 8 bytes and loop \n\ + movntq %%mm0, (%%edi,%%eax,8) \n\ + inc %%eax \n\ + cmp %%edx, %%eax \n\ + jb 2b \n\ + \n\ + # Finally, update pointers and count, and loop \n\ + shl $3, %%edx # EDX <- bytes copied \n\ + add %%edx, %%esi \n\ + add %%edx, %%edi \n\ + sub %%edx, %%ecx \n\ + cmp $64, %%ecx # At least one cache line left? \n\ + jae 0b # Yup, loop \n\ + \n\ +sse.memcpy_last: \n\ + # Copy last <64 bytes, using the computed jump trick \n\ + mov %%ecx, %%eax # EAX <- ECX>>2 \n\ + shr $2, %%eax \n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS +" xchg %%eax, %%ecx \n\ + repz movsd \n\ + mov %%eax, %%ecx \n" +#else +" lea 0f, %%edx \n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-15 MOVSD's \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n" +#endif +"0: and $3, %%ecx # ECX <- ECX & 3 \n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS +" repz movsb \n" +#else +" lea sse.memcpy_end, %%edx \n\ + sub %%ecx, %%edx \n\ + jmp *%%edx # Execute 0-3 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n" +#endif +" \n\ +sse.memcpy_end: \n\ + # All done! \n\ + emms # Clean up after MMX instructions \n\ + sfence # Flush the write buffer \n\ + pop %%edi # Restore destination (return value) \n\ + pop %%ebx # Restore PIC register \n\ + " : /* no outputs */ + : "D" (dest), "S" (src), "c" (bytes) + : "%eax", "%edx" + ); + return dest; +} + +#endif /* HAVE_ASM_SSE && ARCH_X86 */ + +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64) + +/* AMD64-optimized routine, using SSE2. 
Derived from AMD64 optimization + * guide section 5.13: Appropriate Memory Copying Routines. + * Nonstandard instructions used: + * (CPUID.CMOVE) CMOVA + * (CPUID.SSE2) MOVDQA, MOVDQU, MOVNTDQ + * + * Note that this routine will also run more or less as-is (modulo register + * names and label(%%rip) references) on x86 CPUs, but tests have shown the + * SSE1 version above to be faster. + */ + +/* The block copying code--macroized because we use two versions of it + * depending on whether the source is 16-byte-aligned or not. Pass either + * movdqa or movdqu (unquoted) for the parameter. */ +#define AMD64_BLOCK_MEMCPY(movdq) \ +" # First prefetch (note that if we end on an odd number of cache \n\ + # lines, we skip prefetching the last one--faster that way than \n\ + # prefetching line by line or treating it as a special case) \n\ +0: mov %%ecx, %%edx # EDX: temp counter (always <32 bits) \n\ + shr $6, %%edx # Divide by cache line size (64 bytes) \n\ + cmp %%ebx, %%edx # ... and cap at 128 (8192 bytes) \n\ + cmova %%ebx, %%edx \n\ + shl $3, %%edx # EDX <- cache lines to copy * 8 \n\ + mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\ + # (also used as memory offset) \n\ +1: prefetchnta -64(%%rsi,%%rax,8) # Preload cache lines in pairs \n\ + prefetchnta -128(%%rsi,%%rax,8) # (going backwards) \n\ + sub $16, %%eax # And loop \n\ + jg 1b \n\ + \n\ + # Then copy--forward, which seems to be faster than reverse for \n\ + # certain alignments \n\ + xor %%eax, %%eax \n\ +2: " #movdq " (%%rsi,%%rax,8), %%xmm0 # Copy 16 bytes and loop \n\ + movntdq %%xmm0, (%%rdi,%%rax,8) \n\ + add $2, %%eax \n\ + cmp %%edx, %%eax \n\ + jb 2b \n\ + \n\ + # Finally, update pointers and count, and loop \n\ + shl $3, %%edx # EDX <- bytes copied \n\ + add %%rdx, %%rsi \n\ + add %%rdx, %%rdi \n\ + sub %%rdx, %%rcx \n\ + cmp $64, %%rcx # At least one cache line left? \n\ + jae 0b # Yup, loop \n" + +static void *memcpy_amd64(void *dest, const void *src, size_t bytes) +{ + asm("\ + push %%rdi # Save destination for return value \n\ + cld # MOVS* should ascend \n\ + \n\ + cmp $64, %%rcx # Skip block copy for small blocks \n\ + jb amd64.memcpy_last \n\ + \n\ + mov $128, %%ebx # Constant used later \n\ + \n\ + # First align destination address to a multiple of 16 bytes \n\ + mov $8, %%eax # EAX <- (8-dest) & 7 \n\ + sub %%edi, %%eax # (we don't care about the top 32 bits) \n\ + and $7, %%eax # ... which is the number of bytes to copy\n\ + lea 0f(%%rip), %%rdx # Use a computed jump--faster than a loop\n\ + sub %%rax, %%rdx \n\ + jmp *%%rdx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ +0: sub %%rax, %%rcx # Update count \n\ + test $8, %%edi # Is destination not 16-byte aligned? \n\ + je 1f \n\ + movsq # Then move 8 bytes to align it \n\ + sub $8, %%rcx \n\ + \n\ +1: cmp $0x38000, %%rcx # Is this a large block? (0x38000 is an \n\ + # arbitrary value where prefetching and \n\ + # write combining seem to start becoming\n\ + # faster) \n\ + jb amd64.memcpy_small # Nope, use small copy (no prefetch/WC) \n\ + test $15, %%esi # Is source also 16-byte aligned? 
\n\ + # (use ESI to save a REX prefix byte) \n\ + jnz amd64.memcpy_normal_bp # Nope, use slow copy \n\ + jmp amd64.memcpy_fast_bp # Yup, use fast copy \n\ + \n\ +amd64.memcpy_small: # Small block copy routine--no prefetch \n\ + mov %%ecx, %%edx # EDX <- bytes to copy / 16 \n\ + shr $4, %%edx # (count known to fit in 32 bits) \n\ + mov %%edx, %%eax # Leave remainder in ECX for later \n\ + shl $4, %%eax \n\ + sub %%eax, %%ecx \n\ + .balign 16 \n\ +0: movdqu (%%rsi), %%xmm0 # Copy 16 bytes of data \n\ + movdqa %%xmm0, (%%rdi) \n\ + add $16, %%rsi # Update pointers \n\ + add $16, %%rdi \n\ + dec %%edx # And loop \n\ + jnz 0b \n\ + jmp amd64.memcpy_last # Copy any remaining bytes \n\ + \n\ + .balign 16 \n\ + nop \n\ + nop \n\ +amd64.memcpy_fast_bp: # Fast block prefetch loop \n" +AMD64_BLOCK_MEMCPY(movdqa) +" jmp amd64.memcpy_last # Copy any remaining bytes \n\ + \n\ + .balign 16 \n\ + nop \n\ + nop \n\ +amd64.memcpy_normal_bp: # Normal (unaligned) block prefetch loop\n" +AMD64_BLOCK_MEMCPY(movdqu) +" \n\ +amd64.memcpy_last: \n\ + # Copy last <64 bytes, using the computed jump trick \n\ + mov %%ecx, %%eax # EAX <- ECX>>3 \n\ + shr $3, %%eax \n\ + lea 0f(%%rip), %%rdx \n\ + add %%eax, %%eax # Watch out, MOVSQ is 2 bytes! \n\ + sub %%rax, %%rdx \n\ + jmp *%%rdx # Execute 0-7 MOVSQ's \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ +0: and $7, %%ecx # ECX <- ECX & 7 \n\ + lea 0f(%%rip), %%rdx \n\ + sub %%rcx, %%rdx \n\ + jmp *%%rdx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ +0: \n\ + # All done! \n\ + emms # Clean up after MMX instructions \n\ + sfence # Flush the write buffer \n\ + pop %%rdi # Restore destination (return value) \n\ + " : /* no outputs */ + : "D" (dest), "S" (src), "c" (bytes) + : "%rax", "%rbx", "%rdx" + ); + return dest; +} + +#endif /* HAVE_ASM_SSE2 && ARCH_X86_64 */ + +/*************************************************************************/ + +/* Initialization routine. */ + +int ac_memcpy_init(int accel) +{ + memcpy_ptr = memmove; + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + if (HAS_ACCEL(accel, AC_MMX)) + memcpy_ptr = memcpy_mmx; +#endif + +#if defined(HAVE_ASM_SSE) && defined(ARCH_X86) + if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE)) + memcpy_ptr = memcpy_sse; +#endif + +#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64) + if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2)) + memcpy_ptr = memcpy_amd64; +#endif + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/rescale.c b/debian/transcode/transcode-1.1.7/aclib/rescale.c new file mode 100644 index 00000000..5a619735 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/rescale.c @@ -0,0 +1,280 @@ +/* + * rescale.c -- take the weighted average of two sets of byte data + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. 
+ */ + +#include "ac.h" +#include "ac_internal.h" + +static void rescale(const uint8_t *, const uint8_t *, uint8_t *, int, + uint32_t, uint32_t); +static void (*rescale_ptr)(const uint8_t *, const uint8_t *, uint8_t *, int, + uint32_t, uint32_t) = rescale; + +/*************************************************************************/ + +/* External interface */ + +void ac_rescale(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, uint32_t weight1, uint32_t weight2) +{ + if (weight1 >= 0x10000) + ac_memcpy(dest, src1, bytes); + else if (weight2 >= 0x10000) + ac_memcpy(dest, src2, bytes); + else + (*rescale_ptr)(src1, src2, dest, bytes, weight1, weight2); +} + +/*************************************************************************/ +/*************************************************************************/ + +/* Vanilla C version */ + +static void rescale(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, + uint32_t weight1, uint32_t weight2) +{ + int i; + for (i = 0; i < bytes; i++) + dest[i] = (src1[i]*weight1 + src2[i]*weight2 + 32768) >> 16; +} + +/*************************************************************************/ + +/* MMX version */ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */ + +static void rescale_mmx(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, + uint32_t weight1, uint32_t weight2) +{ + if (bytes >= 8) { + /* First store weights in MM4/MM5 to relieve register pressure; + * save time by making 2 copies ahead of time in the general + * registers. Note that we divide by 2 for MMX due to the lack + * of an unsigned SIMD multiply instruction (PMULHUW). */ + int half1 = weight1 / 2; + int half2 = weight2 / 2; + half2 += weight1 & weight2 & 1; // pick up the lost bit here + asm("movd %%eax, %%mm4; movd %%edx, %%mm5" + : : "a" (half1<<16|half1), "d" (half2<<16|half2)); + asm("\ + movq %%mm4, %%mm6 # MM6: 00 00 W1 W1 \n\ + psllq $32, %%mm4 # MM4: W1 W1 00 00 \n\ + por %%mm6, %%mm4 # MM4: W1 W1 W1 W1 \n\ + movq %%mm5, %%mm7 # MM7: 00 00 W2 W2 \n\ + psllq $32, %%mm5 # MM5: W2 W2 00 00 \n\ + por %%mm7, %%mm5 # MM5: W2 W2 W2 W2 \n\ + pxor %%mm7, %%mm7 # MM7: 00 00 00 00 \n\ + pxor %%mm6, %%mm6 # Put 0x0020*4 in MM6 (rounding)\n\ + pcmpeqw %%mm3, %%mm3 \n\ + psubw %%mm3, %%mm6 \n\ + psllw $5, %%mm6 \n\ + 0: \n\ + movq -8(%%esi,%%ecx), %%mm0 \n\ + movq %%mm0, %%mm1 \n\ + punpcklbw %%mm7, %%mm0 \n\ + psllw $7, %%mm0 # 9.7 fixed point \n\ + pmulhw %%mm4, %%mm0 # Multiply to get 10.6 fixed \n\ + punpckhbw %%mm7, %%mm1 \n\ + psllw $7, %%mm1 \n\ + pmulhw %%mm4, %%mm1 \n\ + movq -8(%%edx,%%ecx), %%mm2 \n\ + movq %%mm2, %%mm3 \n\ + punpcklbw %%mm7, %%mm2 \n\ + psllw $7, %%mm2 \n\ + pmulhw %%mm5, %%mm2 \n\ + punpckhbw %%mm7, %%mm3 \n\ + psllw $7, %%mm3 \n\ + pmulhw %%mm5, %%mm3 \n\ + paddw %%mm2, %%mm0 \n\ + paddw %%mm6, %%mm0 \n\ + psrlw $6, %%mm0 \n\ + paddw %%mm3, %%mm1 \n\ + paddw %%mm6, %%mm1 \n\ + psrlw $6, %%mm1 \n\ + packuswb %%mm1, %%mm0 \n\ + movq %%mm0, -8(%%edi,%%ecx) \n\ + subl $8, %%ecx \n\ + jnz 0b \n\ + emms" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "c" (bytes & ~7)); + } + if (UNLIKELY(bytes & 7)) { + rescale(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7), + bytes & 7, weight1, weight2); + } +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + +/*************************************************************************/ + +/* MMXEXT version (also for SSE) */ + +#if (defined(HAVE_ASM_MMXEXT) || defined(HAVE_ASM_SSE)) && defined(ARCH_X86) + +static void 
rescale_mmxext(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, + uint32_t weight1, uint32_t weight2) +{ + if (bytes >= 8) { + asm("movd %%eax, %%mm4; movd %%edx, %%mm5" + : : "a" (weight1), "d" (weight2)); + asm("\ + pshufw $0, %%mm4, %%mm4 # MM4: W1 W1 W1 W1 \n\ + pshufw $0, %%mm5, %%mm5 # MM5: W2 W2 W2 W2 \n\ + pxor %%mm6, %%mm6 # Put 0x0080*4 in MM6 (rounding)\n\ + pcmpeqw %%mm7, %%mm7 \n\ + psubw %%mm7, %%mm6 \n\ + psllw $7, %%mm6 \n\ + 0: \n\ + movq -8(%%esi,%%ecx), %%mm7 \n\ + pxor %%mm0, %%mm0 # Load data into high bytes \n\ + punpcklbw %%mm7, %%mm0 # (gives 8.8 fixed point) \n\ + pmulhuw %%mm4, %%mm0 # Result: 0000..FF00 \n\ + pxor %%mm1, %%mm1 \n\ + punpckhbw %%mm7, %%mm1 \n\ + pmulhuw %%mm4, %%mm1 \n\ + movq -8(%%edx,%%ecx), %%mm7 \n\ + pxor %%mm2, %%mm2 \n\ + punpcklbw %%mm7, %%mm2 \n\ + pmulhuw %%mm5, %%mm2 \n\ + pxor %%mm3, %%mm3 \n\ + punpckhbw %%mm7, %%mm3 \n\ + pmulhuw %%mm5, %%mm3 \n\ + paddw %%mm2, %%mm0 \n\ + paddw %%mm6, %%mm0 \n\ + psrlw $8, %%mm0 # Shift back down to 00..FF \n\ + paddw %%mm3, %%mm1 \n\ + paddw %%mm6, %%mm1 \n\ + psrlw $8, %%mm1 \n\ + packuswb %%mm1, %%mm0 \n\ + movq %%mm0, -8(%%edi,%%ecx) \n\ + subl $8, %%ecx \n\ + jnz 0b \n\ + emms" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "c" (bytes & ~7)); + } + if (UNLIKELY(bytes & 7)) { + rescale(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7), + bytes & 7, weight1, weight2); + } +} + +#endif /* (HAVE_ASM_MMXEXT || HAVE_ASM_SSE) && ARCH_X86 */ + +/*************************************************************************/ + +/* SSE2 version */ + +#if defined(HAVE_ASM_SSE2) + +#ifdef ARCH_X86_64 +# define ECX "%%rcx" +# define EDX "%%rdx" +# define ESI "%%rsi" +# define EDI "%%rdi" +#else +# define ECX "%%ecx" +# define EDX "%%edx" +# define ESI "%%esi" +# define EDI "%%edi" +#endif + +static void rescale_sse2(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, + uint32_t weight1, uint32_t weight2) +{ + if (bytes >= 16) { + asm("movd %%eax, %%xmm4; movd %%edx, %%xmm5" + : : "a" (weight1<<16|weight1), "d" (weight2<<16|weight2)); + asm("\ + pshufd $0, %%xmm4, %%xmm4 # XMM4: W1 W1 W1 W1 W1 W1 W1 W1 \n\ + pshufd $0, %%xmm5, %%xmm5 # XMM5: W2 W2 W2 W2 W2 W2 W2 W2 \n\ + pxor %%xmm6, %%xmm6 # Put 0x0080*4 in XMM6 (rounding)\n\ + pcmpeqw %%xmm7, %%xmm7 \n\ + psubw %%xmm7, %%xmm6 \n\ + psllw $7, %%xmm6 \n\ + 0: \n\ + movdqu -16("ESI","ECX"), %%xmm7 \n\ + pxor %%xmm0, %%xmm0 \n\ + punpcklbw %%xmm7, %%xmm0 \n\ + pmulhuw %%xmm4, %%xmm0 \n\ + pxor %%xmm1, %%xmm1 \n\ + punpckhbw %%xmm7, %%xmm1 \n\ + pmulhuw %%xmm4, %%xmm1 \n\ + movdqu -16("EDX","ECX"), %%xmm7 \n\ + pxor %%xmm2, %%xmm2 \n\ + punpcklbw %%xmm7, %%xmm2 \n\ + pmulhuw %%xmm5, %%xmm2 \n\ + pxor %%xmm3, %%xmm3 \n\ + punpckhbw %%xmm7, %%xmm3 \n\ + pmulhuw %%xmm5, %%xmm3 \n\ + paddw %%xmm2, %%xmm0 \n\ + paddw %%xmm6, %%xmm0 \n\ + psrlw $8, %%xmm0 \n\ + paddw %%xmm3, %%xmm1 \n\ + paddw %%xmm6, %%xmm1 \n\ + psrlw $8, %%xmm1 \n\ + packuswb %%xmm1, %%xmm0 \n\ + movdqu %%xmm0, -16("EDI","ECX") \n\ + subl $16, %%ecx \n\ + jnz 0b \n\ + emms" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "c" (bytes & ~15)); + } + if (UNLIKELY(bytes & 15)) { + rescale(src1+(bytes & ~15), src2+(bytes & ~15), dest+(bytes & ~15), + bytes & 15, weight1, weight2); + } +} + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization routine. 
*/
+
+int ac_rescale_init(int accel)
+{
+    rescale_ptr = rescale;
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
+    if (HAS_ACCEL(accel, AC_MMX))
+        rescale_ptr = rescale_mmx;
+#endif
+#if (defined(HAVE_ASM_MMXEXT) || defined(HAVE_ASM_SSE)) && defined(ARCH_X86)
+    if (HAS_ACCEL(accel, AC_MMXEXT) || HAS_ACCEL(accel, AC_SSE))
+        rescale_ptr = rescale_mmxext;
+#endif
+#if defined(HAVE_ASM_SSE2)
+    if (HAS_ACCEL(accel, AC_SSE2))
+        rescale_ptr = rescale_sse2;
+#endif
+
+    return 1;
+}
+
+/*************************************************************************/
+
+/*
+ * Local variables:
+ *   c-file-style: "stroustrup"
+ *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ *   indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/rgb-yuv-conv.pl b/debian/transcode/transcode-1.1.7/aclib/rgb-yuv-conv.pl
new file mode 100755
index 00000000..a2b6257c
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/rgb-yuv-conv.pl
@@ -0,0 +1,48 @@
+#!/usr/bin/perl -w
+# Calculate conversion matrices for RGB<->YUV given Kb and Kr
+
+die "Usage: $0 Kb Kr [scale]\n" if @ARGV < 2;
+$scale = $ARGV[2] || 1;
+$Kb = $ARGV[0];
+$Kr = $ARGV[1];
+$Kg = 1 - $Kr - $Kb;
+$a11 = $Kr;
+$a12 = $Kg;
+$a13 = $Kb;
+$a21 = -$Kr/(1-$Kb)/2;
+$a22 = -$Kg/(1-$Kb)/2;
+$a23 = 1/2;
+$a31 = 1/2;
+$a32 = -$Kg/(1-$Kr)/2;
+$a33 = -$Kb/(1-$Kr)/2;
+print "Y [R] = ".($a11*$scale)."\n";
+print "Y [G] = ".($a12*$scale)."\n";
+print "Y [B] = ".($a13*$scale)."\n";
+print "Cb[R] = ".($a21*$scale)."\n";
+print "Cb[G] = ".($a22*$scale)."\n";
+print "Cb[B] = ".($a23*$scale)."\n";
+print "Cr[R] = ".($a31*$scale)."\n";
+print "Cr[G] = ".($a32*$scale)."\n";
+print "Cr[B] = ".($a33*$scale)."\n";
+$det = $a11*$a22*$a33 - $a11*$a23*$a32
+     + $a12*$a23*$a31 - $a12*$a21*$a33
+     + $a13*$a21*$a32 - $a13*$a22*$a31;
+$b11 = (1/$det)*($a22*$a33-$a23*$a32);
+$b12 = (1/$det)*($a13*$a32-$a12*$a33);
+$b13 = (1/$det)*($a12*$a23-$a13*$a22);
+$b21 = (1/$det)*($a23*$a31-$a21*$a33);
+$b22 = (1/$det)*($a11*$a33-$a13*$a31);
+$b23 = (1/$det)*($a13*$a21-$a11*$a23);
+$b31 = (1/$det)*($a21*$a32-$a22*$a31);
+$b32 = (1/$det)*($a12*$a31-$a11*$a32);
+$b33 = (1/$det)*($a11*$a22-$a12*$a21);
+map {$_ = 0 if abs($_) < 1e-10} ($b11,$b12,$b13,$b21,$b22,$b23,$b31,$b32,$b33);
+print "R[Y ] = ".($b11*$scale)."\n";
+print "R[Cb] = ".($b12*$scale)."\n";
+print "R[Cr] = ".($b13*$scale)."\n";
+print "G[Y ] = ".($b21*$scale)."\n";
+print "G[Cb] = ".($b22*$scale)."\n";
+print "G[Cr] = ".($b23*$scale)."\n";
+print "B[Y ] = ".($b31*$scale)."\n";
+print "B[Cb] = ".($b32*$scale)."\n";
+print "B[Cr] = ".($b33*$scale)."\n";
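
A minimal usage sketch of the rescale API added above, assuming only what the diff itself declares (ac_rescale_init() and ac_rescale() from ac.h); the buffer contents and sizes are made up for illustration. The weights are 16-bit fixed point, so two weights summing to 0x10000 give a weighted average, and passing 0 as the acceleration mask keeps the portable C path instead of the MMX/SSE routines; a real caller would pass the CPU feature mask obtained from the library's detection code, which is not shown in this hunk.

#include <stdio.h>
#include <stdint.h>
#include "ac.h"   /* declares ac_rescale_init() and ac_rescale() as in this diff */

int main(void)
{
    uint8_t a[16], b[16], out[16];
    int i;

    for (i = 0; i < 16; i++) {
        a[i] = 0;     /* e.g. a black row of pixels */
        b[i] = 200;   /* and a grey one */
    }

    /* 0 = no SIMD flags, so dispatch stays on the portable C routine */
    ac_rescale_init(0);

    /* weight1 + weight2 == 0x10000 (1.0), so this is a 50/50 blend:
     * out[i] = (a[i]*0x8000 + b[i]*0x8000 + 32768) >> 16 */
    ac_rescale(a, b, out, 16, 0x8000, 0x8000);

    printf("out[0] = %d\n", out[0]);   /* 100 */
    return 0;
}

The rgb-yuv-conv.pl helper is independent of this C API: run as, for example, ./rgb-yuv-conv.pl 0.114 0.299 it prints the BT.601 forward and inverse matrices, and an optional third argument scales the printed coefficients (e.g. 65536 to get 16-bit fixed-point values).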