author     Michele Calgaro <[email protected]>  2020-09-11 14:38:47 +0900
committer  Michele Calgaro <[email protected]>  2020-09-11 14:38:47 +0900
commit     884c8093d63402a1ad0b502244b791e3c6782be3 (patch)
tree       a600d4ab0d431a2bdfe4c15b70df43c14fbd8dd0 /debian/transcode/transcode-1.1.7/aclib
parent     14e1aa2006796f147f3f4811fb908a6b01e79253 (diff)
download   extra-dependencies-884c8093d63402a1ad0b502244b791e3c6782be3.tar.gz
           extra-dependencies-884c8093d63402a1ad0b502244b791e3c6782be3.zip

Added debian extra dependency packages.

Signed-off-by: Michele Calgaro <[email protected]>
Diffstat (limited to 'debian/transcode/transcode-1.1.7/aclib')
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/Makefile.am          27
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/Makefile.in         610
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/ac.h                107
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/ac_internal.h        42
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/accore.c            320
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/average.c           243
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/img_internal.h       40
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/img_rgb_packed.c   1106
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/img_x86_common.h    613
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/img_yuv_mixed.c     981
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/img_yuv_packed.c    290
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/img_yuv_planar.c    788
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c      2410
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/imgconvert.c        119
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/imgconvert.h        105
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/memcpy.c            543
-rw-r--r--  debian/transcode/transcode-1.1.7/aclib/rescale.c           280
-rwxr-xr-x  debian/transcode/transcode-1.1.7/aclib/rgb-yuv-conv.pl      48
18 files changed, 8672 insertions(+), 0 deletions(-)
diff --git a/debian/transcode/transcode-1.1.7/aclib/Makefile.am b/debian/transcode/transcode-1.1.7/aclib/Makefile.am
new file mode 100644
index 00000000..54951ce6
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/Makefile.am
@@ -0,0 +1,27 @@
+# # Process this file with automake to produce Makefile.in.
+
+AM_CPPFLAGS = \
+ $(PTHREAD_CFLAGS) \
+ -I$(top_srcdir)
+
+noinst_LTLIBRARIES = libac.la
+
+libac_la_SOURCES = \
+ accore.c \
+ average.c \
+ imgconvert.c \
+ img_rgb_packed.c \
+ img_yuv_mixed.c \
+ img_yuv_packed.c \
+ img_yuv_planar.c \
+ img_yuv_rgb.c \
+ memcpy.c \
+ rescale.c
+
+EXTRA_DIST = \
+ ac.h \
+ ac_internal.h \
+ imgconvert.h \
+ img_internal.h \
+ img_x86_common.h \
+ rgb-yuv-conv.pl
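+
+# Note (illustrative, not part of the original file): libac.la is a noinst
+# libtool convenience library, so other transcode Makefile.am files would
+# typically pull it in with something like
+#   foo_LDADD = $(top_builddir)/aclib/libac.la $(PTHREAD_LIBS)
+# where "foo" stands for whichever program or module links against aclib.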
diff --git a/debian/transcode/transcode-1.1.7/aclib/Makefile.in b/debian/transcode/transcode-1.1.7/aclib/Makefile.in
new file mode 100644
index 00000000..8f3a132a
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/Makefile.in
@@ -0,0 +1,610 @@
+# Makefile.in generated by automake 1.11.1 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation,
+# Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# # Process this file with automake to produce Makefile.in.
+
+VPATH = @srcdir@
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = aclib
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \
+ $(top_srcdir)/configure.in
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+ $(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libac_la_LIBADD =
+am_libac_la_OBJECTS = accore.lo average.lo imgconvert.lo \
+ img_rgb_packed.lo img_yuv_mixed.lo img_yuv_packed.lo \
+ img_yuv_planar.lo img_yuv_rgb.lo memcpy.lo rescale.lo
+libac_la_OBJECTS = $(am_libac_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/autotools/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+ --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+ --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+ $(LDFLAGS) -o $@
+SOURCES = $(libac_la_SOURCES)
+DIST_SOURCES = $(libac_la_SOURCES)
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+A52_CFLAGS = @A52_CFLAGS@
+A52_LIBS = @A52_LIBS@
+ACLIB_LIBS = @ACLIB_LIBS@
+ACLOCAL = @ACLOCAL@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVILIB_LIBS = @AVILIB_LIBS@
+AWK = @AWK@
+BSDAV_CFLAGS = @BSDAV_CFLAGS@
+BSDAV_LIBS = @BSDAV_LIBS@
+CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CXXCPP = @CXXCPP@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLDARWIN_CFLAGS = @DLDARWIN_CFLAGS@
+DLDARWIN_LIBS = @DLDARWIN_LIBS@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+FAAC_CFLAGS = @FAAC_CFLAGS@
+FAAC_LIBS = @FAAC_LIBS@
+FGREP = @FGREP@
+FREETYPE2_CFLAGS = @FREETYPE2_CFLAGS@
+FREETYPE2_LIBS = @FREETYPE2_LIBS@
+GREP = @GREP@
+IBP_LIBS = @IBP_LIBS@
+ICONV_CFLAGS = @ICONV_CFLAGS@
+ICONV_LIBS = @ICONV_LIBS@
+IMAGEMAGICK_CFLAGS = @IMAGEMAGICK_CFLAGS@
+IMAGEMAGICK_LIBS = @IMAGEMAGICK_LIBS@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LAME_CFLAGS = @LAME_CFLAGS@
+LAME_LIBS = @LAME_LIBS@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBAVCODEC_CFLAGS = @LIBAVCODEC_CFLAGS@
+LIBAVCODEC_LIBS = @LIBAVCODEC_LIBS@
+LIBAVFORMAT_CFLAGS = @LIBAVFORMAT_CFLAGS@
+LIBAVFORMAT_LIBS = @LIBAVFORMAT_LIBS@
+LIBDVDREAD_CFLAGS = @LIBDVDREAD_CFLAGS@
+LIBDVDREAD_LIBS = @LIBDVDREAD_LIBS@
+LIBDV_CFLAGS = @LIBDV_CFLAGS@
+LIBDV_LIBS = @LIBDV_LIBS@
+LIBJPEG_CFLAGS = @LIBJPEG_CFLAGS@
+LIBJPEG_LIBS = @LIBJPEG_LIBS@
+LIBMPEG2CONVERT_CFLAGS = @LIBMPEG2CONVERT_CFLAGS@
+LIBMPEG2CONVERT_LIBS = @LIBMPEG2CONVERT_LIBS@
+LIBMPEG2_CFLAGS = @LIBMPEG2_CFLAGS@
+LIBMPEG2_LIBS = @LIBMPEG2_LIBS@
+LIBOBJS = @LIBOBJS@
+LIBPOSTPROC_CFLAGS = @LIBPOSTPROC_CFLAGS@
+LIBPOSTPROC_LIBS = @LIBPOSTPROC_LIBS@
+LIBQUICKTIME_CFLAGS = @LIBQUICKTIME_CFLAGS@
+LIBQUICKTIME_LIBS = @LIBQUICKTIME_LIBS@
+LIBS = @LIBS@
+LIBTCAUDIO_LIBS = @LIBTCAUDIO_LIBS@
+LIBTCVIDEO_LIBS = @LIBTCVIDEO_LIBS@
+LIBTC_LIBS = @LIBTC_LIBS@
+LIBTOOL = @LIBTOOL@
+LIBV4L2_CFLAGS = @LIBV4L2_CFLAGS@
+LIBV4L2_LIBS = @LIBV4L2_LIBS@
+LIBV4LCONVERT_CFLAGS = @LIBV4LCONVERT_CFLAGS@
+LIBV4LCONVERT_LIBS = @LIBV4LCONVERT_LIBS@
+LIBXML2_CFLAGS = @LIBXML2_CFLAGS@
+LIBXML2_LIBS = @LIBXML2_LIBS@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+LZO_CFLAGS = @LZO_CFLAGS@
+LZO_LIBS = @LZO_LIBS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MJPEGTOOLS_CFLAGS = @MJPEGTOOLS_CFLAGS@
+MJPEGTOOLS_LIBS = @MJPEGTOOLS_LIBS@
+MKDIR_P = @MKDIR_P@
+MOD_PATH = @MOD_PATH@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OGG_CFLAGS = @OGG_CFLAGS@
+OGG_LIBS = @OGG_LIBS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PATH_TO_AWK = @PATH_TO_AWK@
+PKG_CONFIG = @PKG_CONFIG@
+PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
+PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
+PROF_PATH = @PROF_PATH@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+PVM3_CFLAGS = @PVM3_CFLAGS@
+PVM3_LIBS = @PVM3_LIBS@
+PVM3_PVMGS = @PVM3_PVMGS@
+RANLIB = @RANLIB@
+SDL_CFLAGS = @SDL_CFLAGS@
+SDL_LIBS = @SDL_LIBS@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+SIMD_FLAGS = @SIMD_FLAGS@
+STRIP = @STRIP@
+THEORA_CFLAGS = @THEORA_CFLAGS@
+THEORA_LIBS = @THEORA_LIBS@
+USE_DLDARWIN = @USE_DLDARWIN@
+VERSION = @VERSION@
+VORBIS_CFLAGS = @VORBIS_CFLAGS@
+VORBIS_LIBS = @VORBIS_LIBS@
+WAVLIB_LIBS = @WAVLIB_LIBS@
+X264_CFLAGS = @X264_CFLAGS@
+X264_LIBS = @X264_LIBS@
+XIO_CFLAGS = @XIO_CFLAGS@
+XIO_LIBS = @XIO_LIBS@
+XMKMF = @XMKMF@
+XVID_CFLAGS = @XVID_CFLAGS@
+XVID_LIBS = @XVID_LIBS@
+X_CFLAGS = @X_CFLAGS@
+X_EXTRA_LIBS = @X_EXTRA_LIBS@
+X_LIBS = @X_LIBS@
+X_PRE_LIBS = @X_PRE_LIBS@
+a52_config = @a52_config@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+bsdav_config = @bsdav_config@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+faac_config = @faac_config@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+iconv_config = @iconv_config@
+imagemagick_config = @imagemagick_config@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+lame_config = @lame_config@
+libdir = @libdir@
+libdvdread_config = @libdvdread_config@
+libexecdir = @libexecdir@
+libjpeg_config = @libjpeg_config@
+libjpegmmx_config = @libjpegmmx_config@
+localedir = @localedir@
+localstatedir = @localstatedir@
+lzo_config = @lzo_config@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+pvm3_config = @pvm3_config@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+x_includes = @x_includes@
+x_libraries = @x_libraries@
+xvid_config = @xvid_config@
+AM_CPPFLAGS = \
+ $(PTHREAD_CFLAGS) \
+ -I$(top_srcdir)
+
+noinst_LTLIBRARIES = libac.la
+libac_la_SOURCES = \
+ accore.c \
+ average.c \
+ imgconvert.c \
+ img_rgb_packed.c \
+ img_yuv_mixed.c \
+ img_yuv_packed.c \
+ img_yuv_planar.c \
+ img_yuv_rgb.c \
+ memcpy.c \
+ rescale.c
+
+EXTRA_DIST = \
+ ac.h \
+ ac_internal.h \
+ imgconvert.h \
+ img_internal.h \
+ img_x86_common.h \
+ rgb-yuv-conv.pl
+
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+ @for dep in $?; do \
+ case '$(am__configure_deps)' in \
+ *$$dep*) \
+ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+ && { if test -f $@; then exit 0; else break; fi; }; \
+ exit 1;; \
+ esac; \
+ done; \
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu aclib/Makefile'; \
+ $(am__cd) $(top_srcdir) && \
+ $(AUTOMAKE) --gnu aclib/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ @case '$?' in \
+ *config.status*) \
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+ *) \
+ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+ esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+ -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+ @list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+ dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+ test "$$dir" != "$$p" || dir=.; \
+ echo "rm -f \"$${dir}/so_locations\""; \
+ rm -f "$${dir}/so_locations"; \
+ done
+libac.la: $(libac_la_OBJECTS) $(libac_la_DEPENDENCIES)
+ $(LINK) $(libac_la_OBJECTS) $(libac_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+ -rm -f *.$(OBJEXT)
+
+distclean-compile:
+ -rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/accore.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/average.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_rgb_packed.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_mixed.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_packed.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_planar.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_rgb.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/imgconvert.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcpy.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rescale.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in files) print i; }; }'`; \
+ mkid -fID $$unique
+tags: TAGS
+
+TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ set x; \
+ here=`pwd`; \
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in files) print i; }; }'`; \
+ shift; \
+ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+ test -n "$$unique" || unique=$$empty_fix; \
+ if test $$# -gt 0; then \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ "$$@" $$unique; \
+ else \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ $$unique; \
+ fi; \
+ fi
+ctags: CTAGS
+CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in files) print i; }; }'`; \
+ test -z "$(CTAGS_ARGS)$$unique" \
+ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+ $$unique
+
+GTAGS:
+ here=`$(am__cd) $(top_builddir) && pwd` \
+ && $(am__cd) $(top_srcdir) \
+ && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ list='$(DISTFILES)'; \
+ dist_files=`for file in $$list; do echo $$file; done | \
+ sed -e "s|^$$srcdirstrip/||;t" \
+ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+ case $$dist_files in \
+ */*) $(MKDIR_P) `echo "$$dist_files" | \
+ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+ sort -u` ;; \
+ esac; \
+ for file in $$dist_files; do \
+ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+ if test -d $$d/$$file; then \
+ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+ if test -d "$(distdir)/$$file"; then \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+ else \
+ test -f "$(distdir)/$$file" \
+ || cp -p $$d/$$file "$(distdir)/$$file" \
+ || exit 1; \
+ fi; \
+ done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ `test -z '$(STRIP)' || \
+ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+ @echo "This command is intended for maintainers to use"
+ @echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+ mostlyclean-am
+
+distclean: distclean-am
+ -rm -rf ./$(DEPDIR)
+ -rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+ distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+ -rm -rf ./$(DEPDIR)
+ -rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+ mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+ clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+ distclean-compile distclean-generic distclean-libtool \
+ distclean-tags distdir dvi dvi-am html html-am info info-am \
+ install install-am install-data install-data-am install-dvi \
+ install-dvi-am install-exec install-exec-am install-html \
+ install-html-am install-info install-info-am install-man \
+ install-pdf install-pdf-am install-ps install-ps-am \
+ install-strip installcheck installcheck-am installdirs \
+ maintainer-clean maintainer-clean-generic mostlyclean \
+ mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+ pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/debian/transcode/transcode-1.1.7/aclib/ac.h b/debian/transcode/transcode-1.1.7/aclib/ac.h
new file mode 100644
index 00000000..d2a542b2
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/ac.h
@@ -0,0 +1,107 @@
+/*
+ * ac.h -- main aclib include
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#ifndef ACLIB_AC_H
+#define ACLIB_AC_H
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+/*************************************************************************/
+
+/* CPU acceleration support flags, for use with ac_init(): */
+
+#define AC_IA32ASM 0x0001 /* x86-32: standard assembly (no MMX) */
+#define AC_AMD64ASM 0x0002 /* x86-64: standard assembly (no MMX) */
+#define AC_CMOVE 0x0004 /* x86: CMOVcc instruction */
+#define AC_MMX 0x0008 /* x86: MMX instructions */
+#define AC_MMXEXT 0x0010 /* x86: MMX extended instructions (AMD) */
+#define AC_3DNOW 0x0020 /* x86: 3DNow! instructions (AMD) */
+#define AC_3DNOWEXT 0x0040 /* x86: 3DNow! extended instructions (AMD) */
+#define AC_SSE 0x0080 /* x86: SSE instructions */
+#define AC_SSE2 0x0100 /* x86: SSE2 instructions */
+#define AC_SSE3 0x0200 /* x86: SSE3 instructions */
+#define AC_SSSE3 0x0400 /* x86: SSSE3 instructions */
+#define AC_SSE41 0x0800 /* x86: SSE4.1 instructions */
+#define AC_SSE42 0x1000 /* x86: SSE4.2 instructions (Intel) */
+#define AC_SSE4A 0x2000 /* x86: SSE4a instructions (AMD) */
+#define AC_SSE5 0x4000 /* x86: SSE5 instructions (AMD) */
+
+#define AC_NONE 0 /* No acceleration (vanilla C functions) */
+#define AC_ALL (~0) /* All available acceleration */
+
+
+/* Endianness flag: */
+#define AC_LITTLE_ENDIAN 1
+#define AC_BIG_ENDIAN 2
+
+/*************************************************************************/
+
+/* Library initialization function--MUST be called before any other aclib
+ * functions are used! `accel' selects the accelerations to enable:
+ * AC_NONE, AC_ALL, or a combination of the other AC_* flags above. The
+ * value will always be masked to the acceleration options available on the
+ * actual CPU, as returned by ac_cpuinfo(). Returns 1 on success, 0 on
+ * failure. This function can be called multiple times to change the set
+ * of acceleration features to be used. */
+extern int ac_init(int accel);
+
+/* Returns the set of acceleration features supported by this CPU. */
+extern int ac_cpuinfo(void);
+
+/* Returns the endianness of this CPU (AC_BIG_ENDIAN or AC_LITTLE_ENDIAN). */
+extern int ac_endian(void);
+
+/* Utility routine to convert a set of flags to a descriptive string. The
+ * string is stored in a static buffer overwritten each call. */
+extern const char *ac_flagstotext(int accel);
+
+/* Utility routine to parse a comma-separated descriptive string into the
+   corresponding flags. The reverse of ac_flagstotext.
+   Returns 1 on success, 0 on failure. */
+extern int ac_parseflags(const char *text, int *accel);
+
+/*************************************************************************/
+
+/* Acceleration-enabled functions: */
+
+/* Optimized memcpy(). The copy direction is guaranteed to be ascending
+ * (so ac_memcpy(ptr, ptr+1, size) will work). */
+extern void *ac_memcpy(void *dest, const void *src, size_t size);
+
+/* Average of two sets of data */
+extern void ac_average(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes);
+
+/* Weighted average of two sets of data (weight1+weight2 should be 65536) */
+extern void ac_rescale(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes,
+ uint32_t weight1, uint32_t weight2);
+
+/* Image format manipulation is available in aclib/imgconvert.h */
+
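+/* Example usage (an illustrative sketch only; the buffers and sizes below
+ * are hypothetical caller data, not part of aclib):
+ *
+ *     if (!ac_init(AC_ALL))        // enable every feature the CPU reports
+ *         return -1;
+ *     printf("using: %s\n", ac_flagstotext(ac_cpuinfo()));
+ *     ac_memcpy(dest, src, size);                  // accelerated copy
+ *     ac_average(frame_a, frame_b, out, size);     // (a+b+1)/2 per byte
+ *     ac_rescale(frame_a, frame_b, out, size, 49152, 16384);  // 75%/25% mix
+ */
+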
+/*************************************************************************/
+
+#endif /* ACLIB_AC_H */
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/ac_internal.h b/debian/transcode/transcode-1.1.7/aclib/ac_internal.h
new file mode 100644
index 00000000..67a9c59f
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/ac_internal.h
@@ -0,0 +1,42 @@
+/*
+ * ac_internal.h -- internal include file for aclib functions
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#ifndef ACLIB_AC_INTERNAL_H
+#define ACLIB_AC_INTERNAL_H
+
+
+/* Compiler hint that a condition is unlikely */
+#ifdef __GNUC__
+# define UNLIKELY(x) (__builtin_expect((x) != 0, 0))
+#else
+# define UNLIKELY(x) (x)
+#endif
+
+/* Are _all_ of the given acceleration flags (`test') available? */
+#define HAS_ACCEL(accel,test) (((accel) & (test)) == (test))
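+/* For example, HAS_ACCEL(accel, AC_SSE|AC_SSE2) is true only if *both* the
+ * SSE and SSE2 bits are set in `accel'; a single matching bit is not enough. */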
+
+/* Initialization subfunctions */
+extern int ac_average_init(int accel);
+extern int ac_imgconvert_init(int accel);
+extern int ac_memcpy_init(int accel);
+extern int ac_rescale_init(int accel);
+
+
+#endif /* ACLIB_AC_INTERNAL_H */
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/accore.c b/debian/transcode/transcode-1.1.7/aclib/accore.c
new file mode 100644
index 00000000..ec7ea2dd
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/accore.c
@@ -0,0 +1,320 @@
+/*
+ * accore.c -- core aclib functions
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#include "ac.h"
+#include "ac_internal.h"
+#include "imgconvert.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static int cpuinfo_x86(void);
+#endif
+
+/*************************************************************************/
+
+/* Library initialization function. Determines CPU features, then calls
+ * all initialization subfunctions with appropriate flags. Returns 1 on
+ * success, 0 on failure. This function can be called multiple times to
+ * change the set of acceleration features to be used. */
+
+int ac_init(int accel)
+{
+ accel &= ac_cpuinfo();
+ if (!ac_average_init(accel)
+ || !ac_imgconvert_init(accel)
+ || !ac_memcpy_init(accel)
+ || !ac_rescale_init(accel)
+ ) {
+ return 0;
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+/* Returns the set of acceleration features supported by this CPU. */
+
+int ac_cpuinfo(void)
+{
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+ return cpuinfo_x86();
+#else
+ return 0;
+#endif
+}
+
+/*************************************************************************/
+
+/* Returns the endianness of this CPU (AC_BIG_ENDIAN or AC_LITTLE_ENDIAN). */
+
+int ac_endian(void)
+{
+ volatile int test;
+
+ test = 1;
+ if (*((uint8_t *)&test))
+ return AC_LITTLE_ENDIAN;
+ else
+ return AC_BIG_ENDIAN;
+}
+
+/*************************************************************************/
+
+/* Utility routine to convert a set of flags to a descriptive string. The
+ * string is stored in a static buffer that is overwritten on each call. */
+
+const char *ac_flagstotext(int accel)
+{
+ static char retbuf[1000];
+ if (!accel)
+ return "none";
+ snprintf(retbuf, sizeof(retbuf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+ accel & AC_SSE5 ? " sse5" : "",
+ accel & AC_SSE4A ? " sse4a" : "",
+ accel & AC_SSE42 ? " sse42" : "",
+ accel & AC_SSE41 ? " sse41" : "",
+ accel & AC_SSSE3 ? " ssse3" : "",
+ accel & AC_SSE3 ? " sse3" : "",
+ accel & AC_SSE2 ? " sse2" : "",
+ accel & AC_SSE ? " sse" : "",
+ accel & AC_3DNOWEXT ? " 3dnowext" : "",
+ accel & AC_3DNOW ? " 3dnow" : "",
+ accel & AC_MMXEXT ? " mmxext" : "",
+ accel & AC_MMX ? " mmx" : "",
+ accel & AC_CMOVE ? " cmove" : "",
+ accel & (AC_IA32ASM|AC_AMD64ASM) ? " asm" : "");
+ return *retbuf ? retbuf+1 : retbuf; /* skip initial space */
+}
+
+/* Utility routine to parse a comma-separated descriptive string into the
+   corresponding flags. The reverse of ac_flagstotext.
+   Returns 1 on success, 0 on failure. */
+
+#define AC_FLAG_LEN 16
+
+int ac_parseflags(const char *text, int *accel)
+{
+ int parsed = 1, done = 0;
+ if (!text || !accel)
+ return 0;
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+ *accel = 0;
+
+ while (parsed && !done) {
+ char buf[AC_FLAG_LEN + 1] = { '\0' };
+ const char *comma = strchr(text, ',');
+ if (!comma) {
+ strncpy(buf, text, AC_FLAG_LEN);
+ done = 1;
+ } else {
+ /* take the token up to the comma and keep parsing the rest */
+ size_t len = (comma - text);
+ if (len > AC_FLAG_LEN)
+ len = AC_FLAG_LEN;
+ strncpy(buf, text, len);
+ }
+//fprintf(stderr, "(%s) buf=[%s]\n", __func__, buf);
+ if (strcasecmp(buf, "C") == 0) // dummy for "no accel"
+ *accel |= 0;
+#ifdef ARCH_X86
+ else if (strcasecmp(buf, "asm" ) == 0)
+ *accel |= AC_IA32ASM;
+#endif
+#ifdef ARCH_X86_64
+ else if (strcasecmp(buf, "asm" ) == 0)
+ *accel |= AC_AMD64ASM;
+#endif
+ else if (strcasecmp(buf, "mmx" ) == 0)
+ *accel |= AC_MMX;
+ else if (strcasecmp(buf, "mmxext" ) == 0)
+ *accel |= AC_MMXEXT;
+ else if (strcasecmp(buf, "3dnow" ) == 0)
+ *accel |= AC_3DNOW;
+ else if (strcasecmp(buf, "3dnowext") == 0)
+ *accel |= AC_3DNOWEXT;
+ else if (strcasecmp(buf, "sse" ) == 0)
+ *accel |= AC_SSE;
+ else if (strcasecmp(buf, "sse2" ) == 0)
+ *accel |= AC_SSE2;
+ else if (strcasecmp(buf, "sse3" ) == 0)
+ *accel |= AC_SSE3;
+ else if (strcasecmp(buf, "ssse3" ) == 0)
+ *accel |= AC_SSSE3;
+ else if (strcasecmp(buf, "sse41" ) == 0)
+ *accel |= AC_SSE41;
+ else if (strcasecmp(buf, "sse42" ) == 0)
+ *accel |= AC_SSE42;
+ else if (strcasecmp(buf, "sse4a" ) == 0)
+ *accel |= AC_SSE4A;
+ else if (strcasecmp(buf, "sse5" ) == 0)
+ *accel |= AC_SSE5;
+ else
+ parsed = 0;
+ text = comma + 1;
+ }
+#endif
+ return parsed;
+}
+
+#undef AC_FLAG_LEN
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Private functions to return acceleration flags corresponding to available
+ * CPU features for various CPUs. Currently only x86 is supported. */
+
+/*************************************************************************/
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+
+#ifdef ARCH_X86_64
+# define EAX "%%rax"
+# define EBX "%%rbx"
+# define ESI "%%rsi"
+# define PUSHF "pushfq"
+# define POPF "popfq"
+#else
+# define EAX "%%eax"
+# define EBX "%%ebx"
+# define ESI "%%esi"
+# define PUSHF "pushfl"
+# define POPF "popfl"
+#endif
+
+/* Macro to execute the CPUID instruction with EAX = func. Results are
+ * placed in ret_a (EAX), ret_b (EBX), ret_c (ECX), and ret_d (EDX), which
+ * must be lvalues. Note that we save and restore EBX (RBX on x86-64)
+ * because it is the PIC register. */
+#define CPUID(func,ret_a,ret_b,ret_c,ret_d) \
+ asm("mov "EBX", "ESI"; cpuid; xchg "EBX", "ESI \
+ : "=a" (ret_a), "=S" (ret_b), "=c" (ret_c), "=d" (ret_d) \
+ : "a" (func))
+
+/* Various CPUID flags. The second word of the macro name indicates the
+ * function (1: function 1, X1: function 0x80000001) and register (D: EDX)
+ * to which the value belongs. */
+#define CPUID_1D_CMOVE (1UL<<15)
+#define CPUID_1D_MMX (1UL<<23)
+#define CPUID_1D_SSE (1UL<<25)
+#define CPUID_1D_SSE2 (1UL<<26)
+#define CPUID_1C_SSE3 (1UL<< 0)
+#define CPUID_1C_SSSE3 (1UL<< 9)
+#define CPUID_1C_SSE41 (1UL<<19)
+#define CPUID_1C_SSE42 (1UL<<20)
+#define CPUID_X1D_AMD_MMXEXT (1UL<<22) /* AMD only */
+#define CPUID_X1D_AMD_3DNOW (1UL<<31) /* AMD only */
+#define CPUID_X1D_AMD_3DNOWEXT (1UL<<30) /* AMD only */
+#define CPUID_X1D_CYRIX_MMXEXT (1UL<<24) /* Cyrix only */
+#define CPUID_X1C_AMD_SSE4A (1UL<< 6) /* AMD only */
+#define CPUID_X1C_AMD_SSE5 (1UL<<11) /* AMD only */
+
+static int cpuinfo_x86(void)
+{
+ uint32_t eax, ebx, ecx, edx;
+ uint32_t cpuid_max, cpuid_ext_max; /* Maximum CPUID function numbers */
+ union {
+ char string[13];
+ struct { uint32_t ebx, edx, ecx; } regs;
+ } cpu_vendor; /* 12-byte CPU vendor string + trailing null */
+ uint32_t cpuid_1D, cpuid_1C, cpuid_X1C, cpuid_X1D;
+ int accel;
+
+ /* First see if the CPUID instruction is even available. We try to
+ * toggle bit 21 (ID) of the flags register; if the bit changes, then
+ * CPUID is available. */
+ asm(PUSHF" \n\
+ pop "EAX" \n\
+ mov %%eax, %%edx \n\
+ xor $0x200000, %%eax \n\
+ push "EAX" \n\
+ "POPF" \n\
+ "PUSHF" \n\
+ pop "EAX" \n\
+ xor %%edx, %%eax"
+ : "=a" (eax) : : "edx");
+ if (!eax)
+ return 0;
+
+ /* Determine the maximum function number available, and save the vendor
+ * string */
+ CPUID(0, cpuid_max, ebx, ecx, edx);
+ cpu_vendor.regs.ebx = ebx;
+ cpu_vendor.regs.ecx = ecx;
+ cpu_vendor.regs.edx = edx;
+ cpu_vendor.string[12] = 0;
+ cpuid_ext_max = 0; /* FIXME: how do early CPUs respond to 0x80000000? */
+ CPUID(0x80000000, cpuid_ext_max, ebx, ecx, edx);
+
+ /* Read available features */
+ cpuid_1D = cpuid_1C = cpuid_X1C = cpuid_X1D = 0;
+ if (cpuid_max >= 1)
+ CPUID(1, eax, ebx, cpuid_1C, cpuid_1D);
+ if (cpuid_ext_max >= 0x80000001)
+ CPUID(0x80000001, eax, ebx, cpuid_X1C, cpuid_X1D);
+
+ /* Convert to acceleration flags */
+#ifdef ARCH_X86_64
+ accel = AC_AMD64ASM; /* but not IA32! (register size issues) */
+#else
+ accel = AC_IA32ASM;
+#endif
+ if (cpuid_1D & CPUID_1D_CMOVE)
+ accel |= AC_CMOVE;
+ if (cpuid_1D & CPUID_1D_MMX)
+ accel |= AC_MMX;
+ if (cpuid_1D & CPUID_1D_SSE)
+ accel |= AC_SSE;
+ if (cpuid_1D & CPUID_1D_SSE2)
+ accel |= AC_SSE2;
+ if (cpuid_1C & CPUID_1C_SSE3)
+ accel |= AC_SSE3;
+ if (cpuid_1C & CPUID_1C_SSSE3)
+ accel |= AC_SSSE3;
+ if (cpuid_1C & CPUID_1C_SSE41)
+ accel |= AC_SSE41;
+ if (cpuid_1C & CPUID_1C_SSE42)
+ accel |= AC_SSE42;
+ if (strcmp(cpu_vendor.string, "AuthenticAMD") == 0) {
+ if (cpuid_X1D & CPUID_X1D_AMD_MMXEXT)
+ accel |= AC_MMXEXT;
+ if (cpuid_X1D & CPUID_X1D_AMD_3DNOW)
+ accel |= AC_3DNOW;
+ if (cpuid_X1D & CPUID_X1D_AMD_3DNOWEXT)
+ accel |= AC_3DNOWEXT;
+ if (cpuid_X1C & CPUID_X1C_AMD_SSE4A)
+ accel |= AC_SSE4A;
+ if (cpuid_X1C & CPUID_X1C_AMD_SSE5)
+ accel |= AC_SSE5;
+ } else if (strcmp(cpu_vendor.string, "CyrixInstead") == 0) {
+ if (cpuid_X1D & CPUID_X1D_CYRIX_MMXEXT)
+ accel |= AC_MMXEXT;
+ }
+
+ /* And return */
+ return accel;
+}
+
+#endif /* ARCH_X86 || ARCH_X86_64 */
+
+/*************************************************************************/
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/average.c b/debian/transcode/transcode-1.1.7/aclib/average.c
new file mode 100644
index 00000000..517102e6
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/average.c
@@ -0,0 +1,243 @@
+/*
+ * average.c -- average two sets of byte data
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#include "ac.h"
+#include "ac_internal.h"
+
+static void average(const uint8_t *, const uint8_t *, uint8_t *, int);
+static void (*average_ptr)(const uint8_t *, const uint8_t *, uint8_t *, int)
+ = average;
+
+/*************************************************************************/
+
+/* External interface */
+
+void ac_average(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes)
+{
+ (*average_ptr)(src1, src2, dest, bytes);
+}
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Vanilla C version */
+
+static void average(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes)
+{
+ int i;
+ for (i = 0; i < bytes; i++)
+ dest[i] = (src1[i]+src2[i]+1) / 2;
+}
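+
+/* Note: the "+1" makes the average round halves upward, which matches the
+ * PAVGB instruction used by the SSE/SSE2 versions below (and the +1-per-word
+ * bias in the MMX version), so all code paths produce identical results. */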
+
+/*************************************************************************/
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */
+
+static void average_mmx(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes)
+{
+ if (bytes >= 8) {
+ asm("\
+ pxor %%mm7, %%mm7 \n\
+ movq %%mm7, %%mm6 \n\
+ pcmpeqw %%mm5, %%mm5 \n\
+ psubw %%mm5, %%mm6 # Put 0x0001*4 in MM6 \n\
+ 0: \n\
+ movq -8(%%esi,%%eax), %%mm0 \n\
+ movq %%mm0, %%mm1 \n\
+ punpcklbw %%mm7, %%mm0 \n\
+ punpckhbw %%mm7, %%mm1 \n\
+ movq -8(%%edx,%%eax), %%mm2 \n\
+ movq %%mm2, %%mm3 \n\
+ punpcklbw %%mm7, %%mm2 \n\
+ punpckhbw %%mm7, %%mm3 \n\
+ paddw %%mm2, %%mm0 \n\
+ paddw %%mm6, %%mm0 \n\
+ psrlw $1, %%mm0 \n\
+ paddw %%mm3, %%mm1 \n\
+ paddw %%mm6, %%mm1 \n\
+ psrlw $1, %%mm1 \n\
+ packuswb %%mm1, %%mm0 \n\
+ movq %%mm0, -8(%%edi,%%eax) \n\
+ subl $8, %%eax \n\
+ jnz 0b \n\
+ emms"
+ : /* no outputs */
+ : "S" (src1), "d" (src2), "D" (dest), "a" (bytes & ~7));
+ }
+ if (UNLIKELY(bytes & 7)) {
+ average(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7),
+ bytes & 7);
+ }
+}
+
+#endif /* HAVE_ASM_MMX && ARCH_X86 */
+
+/*************************************************************************/
+
+#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)
+
+/* SSE has PAVGB */
+
+static void average_sse(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes)
+{
+ if (bytes >= 8) {
+ asm("\
+ testl $~0x1F, %%eax \n\
+ jz 1f \n\
+ 0: \n\
+ movq -32(%%esi,%%eax), %%mm0 \n\
+ movq -24(%%esi,%%eax), %%mm1 \n\
+ movq -16(%%esi,%%eax), %%mm2 \n\
+ movq -8(%%esi,%%eax), %%mm3 \n\
+ movq -32(%%edx,%%eax), %%mm4 \n\
+ pavgb %%mm4, %%mm0 \n\
+ movq -24(%%edx,%%eax), %%mm5 \n\
+ pavgb %%mm5, %%mm1 \n\
+ movq -16(%%edx,%%eax), %%mm6 \n\
+ pavgb %%mm6, %%mm2 \n\
+ movq -8(%%edx,%%eax), %%mm7 \n\
+ pavgb %%mm7, %%mm3 \n\
+ movntq %%mm0, -32(%%edi,%%eax) \n\
+ movntq %%mm1, -24(%%edi,%%eax) \n\
+ movntq %%mm2, -16(%%edi,%%eax) \n\
+ movntq %%mm3, -8(%%edi,%%eax) \n\
+ subl $32, %%eax \n\
+ testl $~0x1F, %%eax \n\
+ jnz 0b \n\
+ testl %%eax, %%eax \n\
+ jz 2f \n\
+ 1: \n\
+ movq -8(%%esi,%%eax), %%mm0 \n\
+ movq -8(%%edx,%%eax), %%mm1 \n\
+ pavgb %%mm1, %%mm0 \n\
+ movntq %%mm0, -8(%%edi,%%eax) \n\
+ subl $8, %%eax \n\
+ jnz 1b \n\
+ 2: \n\
+ emms \n\
+ sfence"
+ : /* no outputs */
+ : "S" (src1), "d" (src2), "D" (dest), "a" (bytes & ~7));
+ }
+ if (UNLIKELY(bytes & 7)) {
+ average(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7),
+ bytes & 7);
+ }
+}
+
+#endif /* HAVE_ASM_SSE && ARCH_X86 */
+
+/*************************************************************************/
+
+#if defined(HAVE_ASM_SSE2)
+
+#if defined(ARCH_X86_64)
+# define EAX "%%rax"
+# define EDX "%%rdx"
+# define ESI "%%rsi"
+# define EDI "%%rdi"
+#else
+# define EAX "%%eax"
+# define EDX "%%edx"
+# define ESI "%%esi"
+# define EDI "%%edi"
+#endif
+
+static void average_sse2(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes)
+{
+ if (bytes >= 8) {
+ asm("\
+ testl $~0x3F, %%eax \n\
+ jz 1f \n\
+ 0: \n\
+ movdqu -64("ESI","EAX"), %%xmm0 \n\
+ movdqu -48("ESI","EAX"), %%xmm1 \n\
+ movdqu -32("ESI","EAX"), %%xmm2 \n\
+ movdqu -16("ESI","EAX"), %%xmm3 \n\
+ movdqu -64("EDX","EAX"), %%xmm4 \n\
+ pavgb %%xmm4, %%xmm0 \n\
+ movdqu -48("EDX","EAX"), %%xmm5 \n\
+ pavgb %%xmm5, %%xmm1 \n\
+ movdqu -32("EDX","EAX"), %%xmm6 \n\
+ pavgb %%xmm6, %%xmm2 \n\
+ movdqu -16("EDX","EAX"), %%xmm7 \n\
+ pavgb %%xmm7, %%xmm3 \n\
+ # Note that movntdq requires 16-byte alignment, which we're \n\
+ # not guaranteed \n\
+ movdqu %%xmm0, -64("EDI","EAX") \n\
+ movdqu %%xmm1, -48("EDI","EAX") \n\
+ movdqu %%xmm2, -32("EDI","EAX") \n\
+ movdqu %%xmm3, -16("EDI","EAX") \n\
+ subl $64, %%eax \n\
+ testl $~0x3F, %%eax \n\
+ jnz 0b \n\
+ testl %%eax, %%eax \n\
+ jz 2f \n\
+ 1: \n\
+ movq -8("ESI","EAX"), %%mm0 \n\
+ movq -8("EDX","EAX"), %%mm1 \n\
+ pavgb %%mm1, %%mm0 \n\
+ movq %%mm0, -8("EDI","EAX") \n\
+ subl $8, %%eax \n\
+ jnz 1b \n\
+ 2: \n\
+ emms"
+ : /* no outputs */
+ : "S" (src1), "d" (src2), "D" (dest), "a" (bytes & ~7));
+ }
+ if (UNLIKELY(bytes & 7)) {
+ average(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7),
+ bytes & 7);
+ }
+}
+
+#endif /* HAVE_ASM_SSE2 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Initialization routine. */
+
+int ac_average_init(int accel)
+{
+ average_ptr = average;
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
+ if (HAS_ACCEL(accel, AC_MMX))
+ average_ptr = average_mmx;
+#endif
+#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)
+ if (HAS_ACCEL(accel, AC_SSE))
+ average_ptr = average_sse;
+#endif
+#if defined(HAVE_ASM_SSE2)
+ if (HAS_ACCEL(accel, AC_SSE2))
+ average_ptr = average_sse2;
+#endif
+
+ return 1;
+}
+
+/*************************************************************************/
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/img_internal.h b/debian/transcode/transcode-1.1.7/aclib/img_internal.h
new file mode 100644
index 00000000..153a2fb6
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/img_internal.h
@@ -0,0 +1,40 @@
+/*
+ * img_internal.h - imgconvert internal use header
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#ifndef ACLIB_IMG_INTERNAL_H
+#define ACLIB_IMG_INTERNAL_H
+
+/* Type of a conversion function */
+typedef int (*ConversionFunc)(uint8_t **src, uint8_t **dest,
+ int width, int height);
+
+/* Function to register a conversion */
+extern int register_conversion(ImageFormat srcfmt, ImageFormat destfmt,
+ ConversionFunc function);
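+
+/* Illustrative sketch of how a converter module registers its routines
+ * (IMG_RGB24 and IMG_GRAY8 are assumed format names from imgconvert.h):
+ *
+ *     int ac_imgconvert_init_rgb_packed(int accel)
+ *     {
+ *         if (!register_conversion(IMG_RGB24, IMG_GRAY8, rgb24_gray8))
+ *             return 0;
+ *         return 1;
+ *     }
+ */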
+
+/* Initialization routines */
+extern int ac_imgconvert_init(int accel);
+extern int ac_imgconvert_init_yuv_planar(int accel);
+extern int ac_imgconvert_init_yuv_packed(int accel);
+extern int ac_imgconvert_init_yuv_mixed(int accel);
+extern int ac_imgconvert_init_yuv_rgb(int accel);
+extern int ac_imgconvert_init_rgb_packed(int accel);
+
+#endif /* ACLIB_IMG_INTERNAL_H */
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/img_rgb_packed.c b/debian/transcode/transcode-1.1.7/aclib/img_rgb_packed.c
new file mode 100644
index 00000000..e6d5bf35
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/img_rgb_packed.c
@@ -0,0 +1,1106 @@
+/*
+ * img_rgb_packed.c - RGB packed image format conversion routines
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#include "ac.h"
+#include "imgconvert.h"
+#include "img_internal.h"
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Standard C implementations */
+
+/*************************************************************************/
+
+/* Identity transformations, all work when src==dest */
+
+static int rgb_copy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height*3);
+ return 1;
+}
+
+static int rgba_copy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height*4);
+ return 1;
+}
+
+static int gray8_copy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ return 1;
+}
+
+/*************************************************************************/
+
+/* Conversions between various 32-bit formats, all usable when src==dest */
+
+/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */
+static int rgba_swapall(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ uint32_t *srcp = (uint32_t *)src[0];
+ uint32_t *destp = (uint32_t *)dest[0];
+ int i;
+ for (i = 0; i < width*height; i++) {
+ /* This shortcut works regardless of CPU endianness */
+ destp[i] = srcp[i] >> 24
+ | (srcp[i] & 0x00FF0000) >> 8
+ | (srcp[i] & 0x0000FF00) << 8
+ | srcp[i] << 24;
+ }
+ return 1;
+}
+
+/* RGBA<->BGRA: swap bytes 0 and 2 */
+static int rgba_swap02(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ uint8_t tmp = src[0][i*4+2];
+ dest[0][i*4+2] = src[0][i*4 ];
+ dest[0][i*4 ] = tmp;
+ dest[0][i*4+1] = src[0][i*4+1];
+ dest[0][i*4+3] = src[0][i*4+3];
+ }
+ return 1;
+}
+
+/* ARGB<->ABGR: swap bytes 1 and 3 */
+static int rgba_swap13(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ uint8_t tmp = src[0][i*4+3];
+ dest[0][i*4+3] = src[0][i*4+1];
+ dest[0][i*4+1] = tmp;
+ dest[0][i*4 ] = src[0][i*4 ];
+ dest[0][i*4+2] = src[0][i*4+2];
+ }
+ return 1;
+}
+
+/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */
+static int rgba_alpha30(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ uint8_t tmp = src[0][i*4+3];
+ dest[0][i*4+3] = src[0][i*4+2];
+ dest[0][i*4+2] = src[0][i*4+1];
+ dest[0][i*4+1] = src[0][i*4 ];
+ dest[0][i*4 ] = tmp;
+ }
+ return 1;
+}
+
+/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */
+static int rgba_alpha03(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ uint8_t tmp = src[0][i*4 ];
+ dest[0][i*4 ] = src[0][i*4+1];
+ dest[0][i*4+1] = src[0][i*4+2];
+ dest[0][i*4+2] = src[0][i*4+3];
+ dest[0][i*4+3] = tmp;
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int rgb24_bgr24(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*3 ] = src[0][i*3+2];
+ dest[0][i*3+1] = src[0][i*3+1];
+ dest[0][i*3+2] = src[0][i*3 ];
+ }
+ return 1;
+}
+
+static int rgb24_rgba32(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*4 ] = src[0][i*3 ];
+ dest[0][i*4+1] = src[0][i*3+1];
+ dest[0][i*4+2] = src[0][i*3+2];
+ dest[0][i*4+3] = 0;
+ }
+ return 1;
+}
+
+static int rgb24_abgr32(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*4 ] = 0;
+ dest[0][i*4+1] = src[0][i*3+2];
+ dest[0][i*4+2] = src[0][i*3+1];
+ dest[0][i*4+3] = src[0][i*3 ];
+ }
+ return 1;
+}
+
+static int rgb24_argb32(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*4 ] = 0;
+ dest[0][i*4+1] = src[0][i*3 ];
+ dest[0][i*4+2] = src[0][i*3+1];
+ dest[0][i*4+3] = src[0][i*3+2];
+ }
+ return 1;
+}
+
+static int rgb24_bgra32(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*4 ] = src[0][i*3+2];
+ dest[0][i*4+1] = src[0][i*3+1];
+ dest[0][i*4+2] = src[0][i*3 ];
+ dest[0][i*4+3] = 0;
+ }
+ return 1;
+}
+
+static int rgb24_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ /* Use the Y part of a YUV transformation, scaled to 0..255 */
+ int r = src[0][i*3 ];
+ int g = src[0][i*3+1];
+ int b = src[0][i*3+2];
+ dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16;
+ }
+ return 1;
+}
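+
+/* The fixed-point coefficients above are the BT.601 luma weights scaled by
+ * 65536: 0.299*65536 ~ 19595, 0.587*65536 ~ 38470, 0.114*65536 ~ 7471; the
+ * +32768 term rounds the result before the final >>16 shift. */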
+
+static int bgr24_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ /* Use the Y part of a YUV transformation, scaled to 0..255 */
+ int r = src[0][i*3+2];
+ int g = src[0][i*3+1];
+ int b = src[0][i*3 ];
+ dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16;
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int rgba32_rgb24(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*3 ] = src[0][i*4 ];
+ dest[0][i*3+1] = src[0][i*4+1];
+ dest[0][i*3+2] = src[0][i*4+2];
+ }
+ return 1;
+}
+
+static int bgra32_rgb24(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*3 ] = src[0][i*4+2];
+ dest[0][i*3+1] = src[0][i*4+1];
+ dest[0][i*3+2] = src[0][i*4 ];
+ }
+ return 1;
+}
+
+static int rgba32_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ /* Use the Y part of a YUV transformation, scaled to 0..255 */
+ int r = src[0][i*4 ];
+ int g = src[0][i*4+1];
+ int b = src[0][i*4+2];
+ dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16;
+ }
+ return 1;
+}
+
+static int bgra32_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ /* Use the Y part of a YUV transformation, scaled to 0..255 */
+ int r = src[0][i*4+2];
+ int g = src[0][i*4+1];
+ int b = src[0][i*4 ];
+ dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16;
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int argb32_rgb24(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*3 ] = src[0][i*4+1];
+ dest[0][i*3+1] = src[0][i*4+2];
+ dest[0][i*3+2] = src[0][i*4+3];
+ }
+ return 1;
+}
+
+static int abgr32_rgb24(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*3 ] = src[0][i*4+3];
+ dest[0][i*3+1] = src[0][i*4+2];
+ dest[0][i*3+2] = src[0][i*4+1];
+ }
+ return 1;
+}
+
+static int argb32_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ /* Use the Y part of a YUV transformation, scaled to 0..255 */
+ int r = src[0][i*4+1];
+ int g = src[0][i*4+2];
+ int b = src[0][i*4+3];
+ dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16;
+ }
+ return 1;
+}
+
+static int abgr32_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ /* Use the Y part of a YUV transformation, scaled to 0..255 */
+ int r = src[0][i*4+3];
+ int g = src[0][i*4+2];
+ int b = src[0][i*4+1];
+ dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16;
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int gray8_rgb24(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*3 ] = src[0][i];
+ dest[0][i*3+1] = src[0][i];
+ dest[0][i*3+2] = src[0][i];
+ }
+ return 1;
+}
+
+static int gray8_rgba32(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*4 ] = src[0][i];
+ dest[0][i*4+1] = src[0][i];
+ dest[0][i*4+2] = src[0][i];
+ dest[0][i*4+3] = 0;
+ }
+ return 1;
+}
+
+static int gray8_argb32(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*4 ] = 0;
+ dest[0][i*4+1] = src[0][i];
+ dest[0][i*4+2] = src[0][i];
+ dest[0][i*4+3] = src[0][i];
+ }
+ return 1;
+}
+
+/*************************************************************************/
+/*************************************************************************/
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+
+#define DEFINE_MASK_DATA
+#include "img_x86_common.h"
+
+/*************************************************************************/
+
+/* Basic assembly routines */
+
+/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */
+static int rgba_swapall_x86(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_REV32_X86(width*height);
+ return 1;
+}
+
+/* RGBA<->BGRA: swap bytes 0 and 2 */
+static int rgba_swap02_x86(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP32_02_X86(width*height);
+ return 1;
+}
+
+/* ARGB<->ABGR: swap bytes 1 and 3 */
+static int rgba_swap13_x86(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP32_13_X86(width*height);
+ return 1;
+}
+
+/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */
+static int rgba_alpha30_x86(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROL32_X86(width*height);
+ return 1;
+}
+
+/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */
+static int rgba_alpha03_x86(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROR32_X86(width*height);
+ return 1;
+}
+
+/*************************************************************************/
+
+/* MMX routines */
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */
+
+/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */
+static int rgba_swapall_mmx(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_REV32_MMX(width*height);
+ return 1;
+}
+
+/* RGBA<->BGRA: swap bytes 0 and 2 */
+static int rgba_swap02_mmx(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP32_02_MMX(width*height);
+ return 1;
+}
+
+/* ARGB<->ABGR: swap bytes 1 and 3 */
+static int rgba_swap13_mmx(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP32_13_MMX(width*height);
+ return 1;
+}
+
+/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */
+static int rgba_alpha30_mmx(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROL32_MMX(width*height);
+ return 1;
+}
+
+/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */
+static int rgba_alpha03_mmx(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROR32_MMX(width*height);
+ return 1;
+}
+
+#endif /* HAVE_ASM_MMX && ARCH_X86 */
+
+/*************************************************************************/
+
+/* SSE2 routines */
+
+#if defined(HAVE_ASM_SSE2)
+
+static const struct { uint32_t n[4]; } __attribute__((aligned(16))) rgb_bgr_data = {{
+ 0xFF0000FF, 0x00FF0000, 0x0000FF00, 0x00000000
+}};
+
+#define SHIFT_RBSWAP \
+ "movdqa %%xmm6, %%xmm2 # XMM2: low bytes mask \n\
+ pand %%xmm0, %%xmm2 # XMM2: R/B bytes \n\
+ pshuflw $0xB1, %%xmm2, %%xmm2 # XMM2: swap R and B (low quad) \n\
+ pand %%xmm7, %%xmm0 # XMM0: G bytes \n\
+ pshufhw $0xB1, %%xmm2, %%xmm2 # XMM2: swap R and B (high quad)\n\
+ por %%xmm2, %%xmm0 # XMM0: data now in BGRA32 \n"
+
+#define SHIFT_AFIRST \
+ "pslldq $1, %%xmm0 # XMM0: move A first \n"
+
+#define SHIFT_ALAST \
+ "psrldq $1, %%xmm0 # XMM0: move A last \n"
+
+#define RGB24TO32(ROFS,GOFS,BOFS,AOFS,SHIFT) \
+ asm("pcmpeqd %%xmm5, %%xmm5 \n\
+ movdqa %%xmm5, %%xmm6 \n\
+ psrldq $13, %%xmm5 # XMM5: 24-bit mask \n\
+ movdqa %%xmm6, %%xmm7 \n\
+ psrlw $8, %%xmm6 # XMM6: low bytes mask \n\
+ psllw $8, %%xmm7 # XMM7: high bytes mask \n"\
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 4, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ \
+ "lea ("ECX","ECX",2),"EDX" \n\
+ movb -3("ESI","EDX"), %%al \n\
+ movb %%al, ("#ROFS"-4)("EDI","ECX",4) \n\
+ movb -2("ESI","EDX"), %%al \n\
+ movb %%al, ("#GOFS"-4)("EDI","ECX",4) \n\
+ movb -1("ESI","EDX"), %%al \n\
+ movb %%al, ("#BOFS"-4)("EDI","ECX",4) \n\
+ movb $0, ("#AOFS"-4)("EDI","ECX",4)", \
+ /* main_loop */ \
+ "lea ("ECX","ECX",2),"EDX" \n\
+ # We can't just movdqu, because we might run over the edge \n\
+ movd -12("ESI","EDX"), %%xmm1 \n\
+ movq -8("ESI","EDX"), %%xmm0 \n\
+ pshufd $0xD3, %%xmm0, %%xmm0 # shift left by 4 bytes \n\
+ por %%xmm1, %%xmm0 # XMM0: original RGB24 data \n\
+ pshufd $0xF3, %%xmm5, %%xmm2 # XMM2: pixel 1 mask \n\
+ movdqa %%xmm5, %%xmm1 # XMM1: pixel 0 mask \n\
+ pshufd $0xCF, %%xmm5, %%xmm3 # XMM3: pixel 2 mask \n\
+ pand %%xmm0, %%xmm1 # XMM1: pixel 0 \n\
+ pslldq $1, %%xmm0 \n\
+ pand %%xmm0, %%xmm2 # XMM2: pixel 1 \n\
+ pshufd $0x3F, %%xmm5, %%xmm4 # XMM4: pixel 3 mask \n\
+ por %%xmm2, %%xmm1 # XMM1: pixels 0 and 1 \n\
+ pslldq $1, %%xmm0 \n\
+ pand %%xmm0, %%xmm3 # XMM3: pixel 2 \n\
+ por %%xmm3, %%xmm1 # XMM1: pixels 0, 1, and 2 \n\
+ pslldq $1, %%xmm0 \n\
+ pand %%xmm4, %%xmm0 # XMM0: pixel 3 \n\
+ por %%xmm1, %%xmm0 # XMM0: RGBA32 data \n\
+ "SHIFT" # shift bytes to target position\n\
+ movdqu %%xmm0, -16("EDI","ECX",4)", \
+ /* emms */ "emms") \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height), \
+ "d" (&rgb_bgr_data), "m" (rgb_bgr_data) \
+ : "eax");
+
+#define RGB32TO24(ROFS,GOFS,BOFS,AOFS,SHIFT) \
+ asm("pcmpeqd %%xmm5, %%xmm5 \n\
+ movdqa %%xmm5, %%xmm6 \n\
+ psrldq $13, %%xmm5 # 24-bit mask \n\
+ movdqa %%xmm6, %%xmm7 \n\
+ psrlw $8, %%xmm6 # low bytes mask \n\
+ psllw $8, %%xmm7 # high bytes mask \n"\
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 4, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ \
+ "lea ("ECX","ECX",2),"EDX" \n\
+ movb ("#ROFS"-4)("ESI","ECX",4), %%al \n\
+ movb %%al, -3("EDI","EDX") \n\
+ movb ("#GOFS"-4)("ESI","ECX",4), %%al \n\
+ movb %%al, -2("EDI","EDX") \n\
+ movb ("#BOFS"-4)("ESI","ECX",4), %%al \n\
+ movb %%al, -1("EDI","EDX")", \
+ /* main_loop */ \
+ "lea ("ECX","ECX",2),"EDX" \n\
+ movdqu -16("ESI","ECX",4), %%xmm0 \n\
+ "SHIFT" # shift source data to RGBA \n\
+ pshufd $0xF3, %%xmm5, %%xmm1 # XMM1: pixel 1 mask \n\
+ pshufd $0xCF, %%xmm5, %%xmm2 # XMM2: pixel 2 mask \n\
+ pshufd $0x3F, %%xmm5, %%xmm3 # XMM3: pixel 3 mask \n\
+ pand %%xmm0, %%xmm3 # XMM3: pixel 3 \n\
+ psrldq $1, %%xmm3 \n\
+ pand %%xmm0, %%xmm2 # XMM2: pixel 2 \n\
+ por %%xmm3, %%xmm2 # XMM2: pixels 2 and 3 \n\
+ psrldq $1, %%xmm2 \n\
+ pand %%xmm0, %%xmm1 # XMM1: pixel 1 \n\
+ pand %%xmm5, %%xmm0 # XMM0: pixel 0 \n\
+ por %%xmm2, %%xmm1 # XMM1: pixels 1, 2, and 3 \n\
+ psrldq $1, %%xmm1 \n\
+ por %%xmm1, %%xmm0 # XMM0: RGB24 data \n\
+ # We can't just movdqu, because we might run over the edge \n\
+ movd %%xmm0, -12("EDI","EDX") # store low 4 bytes \n\
+ pshufd $0xF9, %%xmm0, %%xmm0 # shift right 4 bytes \n\
+ movq %%xmm0, -8("EDI","EDX") # store high 8 bytes \n",\
+ /* emms */ "emms") \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height), \
+ "d" (&rgb_bgr_data), "m" (rgb_bgr_data) \
+ : "eax");
+
+
+/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */
+static int rgba_swapall_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_REV32_SSE2(width*height);
+ return 1;
+}
+
+/* RGBA<->BGRA: swap bytes 0 and 2 */
+static int rgba_swap02_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP32_02_SSE2(width*height);
+ return 1;
+}
+
+/* ARGB<->ABGR: swap bytes 1 and 3 */
+static int rgba_swap13_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP32_13_SSE2(width*height);
+ return 1;
+}
+
+/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */
+static int rgba_alpha30_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROL32_SSE2(width*height);
+ return 1;
+}
+
+/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */
+static int rgba_alpha03_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROR32_SSE2(width*height);
+ return 1;
+}
+
+/* RGB<->BGR */
+static int rgb24_bgr24_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("movdqa ("EDX"), %%xmm5 # byte 0 mask \n\
+ pshufd $0xD2, %%xmm5, %%xmm6 # byte 1 mask \n\
+ pshufd $0xC9, %%xmm5, %%xmm7 # byte 2 mask \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 4,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */
+ "lea ("ECX","ECX",2),"EDX" \n\
+ movb -3("ESI","EDX"), %%al \n\
+ movb -2("ESI","EDX"), %%ah \n\
+ movb %%ah, -2("EDI","EDX") \n\
+ movb -1("ESI","EDX"), %%ah \n\
+ movb %%ah, -3("EDI","EDX") \n\
+ movb %%al, -1("EDI","EDX")",
+ /* main_loop */
+ "lea ("ECX","ECX",2),"EDX" \n\
+ # We can't just movdqu, because we might run over the edge \n\
+ movd -12("ESI","EDX"), %%xmm1 \n\
+ movq -8("ESI","EDX"), %%xmm0 \n\
+ pshufd $0xD3, %%xmm0, %%xmm0 # shift left by 4 bytes \n\
+ por %%xmm1, %%xmm0 # XMM0: original data \n\
+ movdqa %%xmm5, %%xmm2 \n\
+ movdqa %%xmm6, %%xmm3 \n\
+ movdqa %%xmm7, %%xmm4 \n\
+ pand %%xmm0, %%xmm2 # XMM2: byte 0 \n\
+ pslldq $2, %%xmm2 # shift to byte 2 position \n\
+ pand %%xmm0, %%xmm3 # XMM3: byte 1 \n\
+ pand %%xmm0, %%xmm4 # XMM4: byte 2 \n\
+ psrldq $2, %%xmm4 # shift to byte 0 position \n\
+ por %%xmm2, %%xmm3 \n\
+ por %%xmm4, %%xmm3 # XMM3: reversed data \n\
+ movd %%xmm3, -12("EDI","EDX") # avoid running over the edge \n\
+ pshufd $0xF9, %%xmm3, %%xmm3 # shift right by 4 bytes \n\
+ movq %%xmm3, -8("EDI","EDX")",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+ "d" (&rgb_bgr_data), "m" (rgb_bgr_data)
+ : "eax");
+ return 1;
+}
+
+/* RGB->RGBA */
+static int rgb24_rgba32_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ RGB24TO32(0,1,2,3, "");
+ return 1;
+}
+
+/* RGB->ABGR */
+static int rgb24_abgr32_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ RGB24TO32(3,2,1,0, SHIFT_RBSWAP SHIFT_AFIRST);
+ return 1;
+}
+
+/* RGB->ARGB */
+static int rgb24_argb32_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ RGB24TO32(1,2,3,0, SHIFT_AFIRST);
+ return 1;
+}
+
+/* RGB->BGRA */
+static int rgb24_bgra32_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ RGB24TO32(2,1,0,3, SHIFT_RBSWAP);
+ return 1;
+}
+
+/* RGBA->RGB */
+static int rgba32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ RGB32TO24(0,1,2,3, "");
+ return 1;
+}
+
+/* ABGR->RGB */
+static int abgr32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ RGB32TO24(3,2,1,0, SHIFT_ALAST SHIFT_RBSWAP);
+ return 1;
+}
+
+/* ARGB->RGB */
+static int argb32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ RGB32TO24(1,2,3,0, SHIFT_ALAST);
+ return 1;
+}
+
+/* BGRA->RGB */
+static int bgra32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ RGB32TO24(2,1,0,3, SHIFT_RBSWAP);
+ return 1;
+}
+
+/*************************************************************************/
+
+#define R_GRAY 19595
+#define G_GRAY 38470
+#define B_GRAY 7471
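+/* These are the ITU-R BT.601 luma weights in 16.16 fixed point:
+ * R_GRAY = round(0.299*65536), G_GRAY = round(0.587*65536),
+ * B_GRAY = round(0.114*65536).  As a hypothetical scalar reference (a
+ * sketch of what SINGLE_GRAY8/STORE_GRAY8 below compute, not part of
+ * the build):
+ *
+ *     static inline uint8_t rgb_to_gray8(uint8_t r, uint8_t g, uint8_t b)
+ *     {
+ *         return (r*R_GRAY + g*G_GRAY + b*B_GRAY + 0x8000) >> 16;
+ *     }
+ */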
+#define INIT_GRAY8 \
+ "pxor %%xmm4, %%xmm4 # XMM4: all 0's \n\
+ movl %3, %%eax \n\
+ movd %%eax, %%xmm5 \n\
+ pshuflw $0x00, %%xmm5, %%xmm5 \n\
+ pshufd $0x00, %%xmm5, %%xmm5 # XMM5: R->gray constant \n\
+ movl %4, %%eax \n\
+ movd %%eax, %%xmm6 \n\
+ pshuflw $0x00, %%xmm6, %%xmm6 \n\
+ pshufd $0x00, %%xmm6, %%xmm6 # XMM6: G->gray constant \n\
+ movl %5, %%eax \n\
+ movd %%eax, %%xmm7 \n\
+ pshuflw $0x00, %%xmm7, %%xmm7 \n\
+ pshufd $0x00, %%xmm7, %%xmm7 # XMM7: B->gray constant \n\
+ pcmpeqd %%xmm3, %%xmm3 \n\
+ psllw $15, %%xmm3 \n\
+ psrlw $8, %%xmm3 # XMM3: 0x0080*8 (for rounding) \n"
+#define SINGLE_GRAY8(idx,ofsR,ofsG,ofsB) \
+ "movzbl "#ofsR"("ESI","idx"), %%eax # retrieve red byte \n\
+ imull %3, %%eax # multiply by red->gray factor \n\
+ movzbl "#ofsG"("ESI","idx"), %%edx # retrieve green byte \n\
+ imull %4, %%edx # multiply by green->gray factor\n\
+ addl %%edx, %%eax # add to total \n\
+ movzbl "#ofsB"("ESI","idx"), %%edx # retrieve blue byte \n\
+ imull %5, %%edx # multiply by blue->gray factor \n\
+ addl %%edx, %%eax # add to total \n\
+ addl $0x8000, %%eax # round \n\
+ shrl $16, %%eax # shift back down \n\
+ movb %%al, -1("EDI","ECX") # and store \n"
+#define STORE_GRAY8 \
+ "psllw $8, %%xmm0 # XMM0: add 8 bits of precision \n\
+ pmulhuw %%xmm5, %%xmm0 # XMM0: r7 r6 r5 r4 r3 r2 r1 r0 \n\
+ psllw $8, %%xmm1 # XMM1: add 8 bits of precision \n\
+ pmulhuw %%xmm6, %%xmm1 # XMM1: g7 g6 g5 g4 g3 g2 g1 g0 \n\
+ paddw %%xmm3, %%xmm0 # XMM0: add rounding constant \n\
+ psllw $8, %%xmm2 # XMM2: add 8 bits of precision \n\
+ pmulhuw %%xmm7, %%xmm2 # XMM2: b7 b6 b5 b4 b3 b2 b1 b0 \n\
+ paddw %%xmm1, %%xmm0 # XMM0: add green part \n\
+ paddw %%xmm2, %%xmm0 # XMM0: add blue part \n\
+ psrlw $8, %%xmm0 # XMM0: shift back to bytes \n\
+ packuswb %%xmm4, %%xmm0 # XMM0: gray7..gray0 packed \n\
+ movq %%xmm0, -8("EDI","ECX") \n"
+
+#define ASM_RGB24_GRAY(ofsR,ofsG,ofsB,load) \
+ asm(INIT_GRAY8 \
+ PUSH(EBX)" \n\
+ lea ("ECX","ECX",2),"EBX" \n"\
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 8, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ SINGLE_GRAY8(EBX, ofsR,ofsG,ofsB) "subl $3, %%ebx;",\
+ /* main_loop */ load(4) STORE_GRAY8 "subl $24, %%ebx;", \
+ /* emms */ "emms") \
+ POP(EBX) \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height), \
+ "i" (R_GRAY), "i" (G_GRAY), "i" (B_GRAY) \
+ : "eax", "edx" COMMA_FAKE_PUSH_REG \
+ )
+
+#define ASM_RGB32_GRAY(ofsR,ofsG,ofsB,load) \
+ asm(INIT_GRAY8 \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 8, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ SINGLE_GRAY8(ECX",4", ofsR,ofsG,ofsB), \
+ /* main_loop */ load(4) STORE_GRAY8, \
+ /* emms */ "emms") \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height), \
+ "i" (R_GRAY), "i" (G_GRAY), "i" (B_GRAY) \
+ : "eax", "edx")
+
+
+static int rgb24_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_RGB24_GRAY(-3,-2,-1, SSE2_LOAD_RGB24);
+ return 1;
+}
+
+static int bgr24_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_RGB24_GRAY(-1,-2,-3, SSE2_LOAD_BGR24);
+ return 1;
+}
+
+static int rgba32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_RGB32_GRAY(-4,-3,-2, SSE2_LOAD_RGBA32);
+ return 1;
+}
+
+static int bgra32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_RGB32_GRAY(-2,-3,-4, SSE2_LOAD_BGRA32);
+ return 1;
+}
+
+static int argb32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_RGB32_GRAY(-3,-2,-1, SSE2_LOAD_ARGB32);
+ return 1;
+}
+
+static int abgr32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_RGB32_GRAY(-1,-2,-3, SSE2_LOAD_ABGR32);
+ return 1;
+}
+
+/*************************************************************************/
+
+static int gray8_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("# Store all 0's in XMM4 \n\
+ pxor %%xmm4, %%xmm4 \n\
+ # Generate mask in XMM7 to select bytes 0,3,6,9 of an XMM register\n\
+ pcmpeqd %%xmm7, %%xmm7 # XMM7: all 1's \n\
+ psrlw $8, %%xmm7 # XMM7: 0x00FF * 8 \n\
+ pcmpeqd %%xmm6, %%xmm6 # XMM6: all 1's \n\
+ psllw $8, %%xmm6 # XMM6: 0xFF00 * 8 \n\
+ pslldq $8, %%xmm6 \n\
+ psrldq $8, %%xmm7 \n\
+ por %%xmm6, %%xmm7 # XMM7: 0xFF00*4, 0x00FF*4 \n\
+ pshufd $0xCC, %%xmm7, %%xmm7 # XMM7: {0xFF00*2, 0x00FF*2} * 2\n\
+ pshuflw $0xC0, %%xmm7, %%xmm7 # XMM7.l: FF0000FF00FF00FF \n\
+ psrldq $4, %%xmm7 # XMM7: 0x00000000FF00FF00 \n\
+ # 00FF00FFFF0000FF \n\
+ pshufd $0xEC, %%xmm7, %%xmm7 # XMM7: 0x00000000FF00FF00 \n\
+ # 00000000FF0000FF \n\
+ pshuflw $0x24, %%xmm7, %%xmm7 # XMM7.l: 00FF0000FF0000FF \n\
+ pshufhw $0xFC, %%xmm7, %%xmm7 # XMM7.h: 000000000000FF00 \n\
+ # Load ECX*3 into EDX ahead of time \n\
+ lea ("ECX","ECX",2), "EDX" \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 4,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */ "\
+ movb -1("ESI","ECX"), %%al # retrieve gray byte \n\
+ movb %%al, -3("EDI","EDX") # and store 3 times \n\
+ movb %%al, -2("EDI","EDX") \n\
+ movb %%al, -1("EDI","EDX") \n\
+ subl $3, %%edx \n",
+ /* main_loop */ "\
+ movd -4("ESI","ECX"), %%xmm0 # XMM0: G3..G0 \n\
+ pshufd $0xCC, %%xmm0, %%xmm0 # XMM0: {0,0,0,0,G3..G0} * 2 \n\
+ pshuflw $0x50, %%xmm0, %%xmm0 # X0.l: G3 G2 G3 G2 G1 G0 G1 G0 \n\
+ pshufhw $0x55, %%xmm0, %%xmm0 # X0.h: G3 G2 G3 G2 G3 G2 G3 G2 \n\
+ pand %%xmm7, %%xmm0 # XMM0: ------3--2--1--0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: ------3--2--1--0 \n\
+ pslldq $1, %%xmm1 # XMM1: -----3--2--1--0- \n\
+ movdqa %%xmm0, %%xmm2 # XMM2: ------3--2--1--0 \n\
+ pslldq $2, %%xmm2 # XMM2: ----3--2--1--0-- \n\
+ por %%xmm1, %%xmm0 # XMM0: -----33-22-11-00 \n\
+ por %%xmm2, %%xmm0 # XMM0: ----333222111000 \n\
+ movd %%xmm0, -12("EDI","EDX") \n\
+ pshufd $0xC9, %%xmm0, %%xmm0 \n\
+ movq %%xmm0, -8("EDI","EDX") \n\
+ subl $12, %%edx \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height)
+ : "eax", "edx");
+ return 1;
+}
+
+static int gray8_rgba32_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("pxor %%xmm4, %%xmm4 # XMM4: all 0's \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 4,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */ "\
+ movb -1("ESI","ECX"), %%al # retrieve gray byte \n\
+ movb %%al, -4("EDI","ECX",4) # and store 3 times \n\
+ movb %%al, -3("EDI","ECX",4) \n\
+ movb %%al, -2("EDI","ECX",4) \n\
+ movb $0, -1("EDI","ECX",4) # clear A byte \n",
+ /* main_loop */ "\
+ movd -4("ESI","ECX"), %%xmm0 # XMM0: 00 00 00 00 G3 G2 G1 G0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: 00 00 00 00 G3 G2 G1 G0 \n\
+ punpcklbw %%xmm0, %%xmm0 # XMM0: G3 G3 G2 G2 G1 G1 G0 G0 \n\
+ punpcklbw %%xmm4, %%xmm1 # XMM1: 00 G3 00 G2 00 G1 00 G0 \n\
+ punpcklbw %%xmm1, %%xmm0 # XMM0: 0GGG3 0GGG2 0GGG1 0GGG0 \n\
+ movdqu %%xmm0, -16("EDI","ECX",4) \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height)
+ : "eax");
+ return 1;
+}
+
+static int gray8_argb32_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("pxor %%xmm4, %%xmm4 # XMM4: all 0's \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 4,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */ "\
+ movb -1("ESI","ECX"), %%al # retrieve gray byte \n\
+ movb %%al, -3("EDI","ECX",4) # and store 3 times \n\
+ movb %%al, -2("EDI","ECX",4) \n\
+ movb %%al, -1("EDI","ECX",4) \n\
+ movb $0, -4("EDI","ECX",4) # clear A byte \n",
+ /* main_loop */ "\
+ movd -4("ESI","ECX"), %%xmm0 # XMM0: 00 00 00 00 G3 G2 G1 G0 \n\
+ movdqa %%xmm4, %%xmm1 # XMM1: 00 00 00 00 00 00 00 00 \n\
+ punpcklbw %%xmm0, %%xmm1 # XMM1: G3 00 G2 00 G1 00 G0 00 \n\
+ punpcklbw %%xmm0, %%xmm0 # XMM0: G3 G3 G2 G2 G1 G1 G0 G0 \n\
+	punpcklbw %%xmm0, %%xmm1	# XMM1: GGG03 GGG02 GGG01 GGG00 \n\
+ movdqu %%xmm1, -16("EDI","ECX",4) \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height)
+ : "eax");
+ return 1;
+}
+
+#endif /* HAVE_ASM_SSE2 */
+
+/*************************************************************************/
+
+#endif /* ARCH_X86 || ARCH_X86_64 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Initialization */
+
+int ac_imgconvert_init_rgb_packed(int accel)
+{
+ if (!register_conversion(IMG_RGB24, IMG_RGB24, rgb_copy)
+ || !register_conversion(IMG_RGB24, IMG_BGR24, rgb24_bgr24)
+ || !register_conversion(IMG_RGB24, IMG_RGBA32, rgb24_rgba32)
+ || !register_conversion(IMG_RGB24, IMG_ABGR32, rgb24_abgr32)
+ || !register_conversion(IMG_RGB24, IMG_ARGB32, rgb24_argb32)
+ || !register_conversion(IMG_RGB24, IMG_BGRA32, rgb24_bgra32)
+ || !register_conversion(IMG_RGB24, IMG_GRAY8, rgb24_gray8)
+
+ || !register_conversion(IMG_BGR24, IMG_BGR24, rgb_copy)
+ || !register_conversion(IMG_BGR24, IMG_RGB24, rgb24_bgr24)
+ || !register_conversion(IMG_BGR24, IMG_RGBA32, rgb24_bgra32)
+ || !register_conversion(IMG_BGR24, IMG_ABGR32, rgb24_argb32)
+ || !register_conversion(IMG_BGR24, IMG_ARGB32, rgb24_abgr32)
+ || !register_conversion(IMG_BGR24, IMG_BGRA32, rgb24_rgba32)
+ || !register_conversion(IMG_BGR24, IMG_GRAY8, bgr24_gray8)
+
+ || !register_conversion(IMG_RGBA32, IMG_RGB24, rgba32_rgb24)
+ || !register_conversion(IMG_RGBA32, IMG_BGR24, bgra32_rgb24)
+ || !register_conversion(IMG_RGBA32, IMG_RGBA32, rgba_copy)
+ || !register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall)
+ || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30)
+ || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02)
+ || !register_conversion(IMG_RGBA32, IMG_GRAY8, rgba32_gray8)
+
+ || !register_conversion(IMG_ABGR32, IMG_RGB24, abgr32_rgb24)
+ || !register_conversion(IMG_ABGR32, IMG_BGR24, argb32_rgb24)
+ || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall)
+ || !register_conversion(IMG_ABGR32, IMG_ABGR32, rgba_copy)
+ || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13)
+ || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03)
+ || !register_conversion(IMG_ABGR32, IMG_GRAY8, abgr32_gray8)
+
+ || !register_conversion(IMG_ARGB32, IMG_RGB24, argb32_rgb24)
+ || !register_conversion(IMG_ARGB32, IMG_BGR24, abgr32_rgb24)
+ || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03)
+ || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13)
+ || !register_conversion(IMG_ARGB32, IMG_ARGB32, rgba_copy)
+ || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall)
+ || !register_conversion(IMG_ARGB32, IMG_GRAY8, argb32_gray8)
+
+ || !register_conversion(IMG_BGRA32, IMG_RGB24, bgra32_rgb24)
+ || !register_conversion(IMG_BGRA32, IMG_BGR24, rgba32_rgb24)
+ || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02)
+ || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30)
+ || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall)
+ || !register_conversion(IMG_BGRA32, IMG_BGRA32, rgba_copy)
+ || !register_conversion(IMG_BGRA32, IMG_GRAY8, bgra32_gray8)
+
+ || !register_conversion(IMG_GRAY8, IMG_RGB24, gray8_rgb24)
+ || !register_conversion(IMG_GRAY8, IMG_BGR24, gray8_rgb24)
+ || !register_conversion(IMG_GRAY8, IMG_RGBA32, gray8_rgba32)
+ || !register_conversion(IMG_GRAY8, IMG_ABGR32, gray8_argb32)
+ || !register_conversion(IMG_GRAY8, IMG_ARGB32, gray8_argb32)
+ || !register_conversion(IMG_GRAY8, IMG_BGRA32, gray8_rgba32)
+ || !register_conversion(IMG_GRAY8, IMG_GRAY8, gray8_copy)
+ ) {
+ return 0;
+ }
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+
+ if (accel & (AC_IA32ASM | AC_AMD64ASM)) {
+ if (!register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall_x86)
+ || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30_x86)
+ || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02_x86)
+
+ || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall_x86)
+ || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13_x86)
+ || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03_x86)
+
+ || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03_x86)
+ || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13_x86)
+ || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall_x86)
+
+ || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02_x86)
+ || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30_x86)
+ || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall_x86)
+ ) {
+ return 0;
+ }
+ }
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
+ if (accel & AC_MMX) {
+ if (!register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall_mmx)
+ || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30_mmx)
+ || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02_mmx)
+
+ || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall_mmx)
+ || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13_mmx)
+ || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03_mmx)
+
+ || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03_mmx)
+ || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13_mmx)
+ || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall_mmx)
+
+ || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02_mmx)
+ || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30_mmx)
+ || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall_mmx)
+ ) {
+ return 0;
+ }
+ }
+#endif
+
+#if defined(HAVE_ASM_SSE2)
+ if (accel & AC_SSE2) {
+ if (!register_conversion(IMG_RGB24, IMG_BGR24, rgb24_bgr24_sse2)
+ || !register_conversion(IMG_RGB24, IMG_RGBA32, rgb24_rgba32_sse2)
+ || !register_conversion(IMG_RGB24, IMG_ABGR32, rgb24_abgr32_sse2)
+ || !register_conversion(IMG_RGB24, IMG_ARGB32, rgb24_argb32_sse2)
+ || !register_conversion(IMG_RGB24, IMG_BGRA32, rgb24_bgra32_sse2)
+ || !register_conversion(IMG_RGB24, IMG_GRAY8, rgb24_gray8_sse2)
+
+ || !register_conversion(IMG_BGR24, IMG_RGB24, rgb24_bgr24_sse2)
+ || !register_conversion(IMG_BGR24, IMG_RGBA32, rgb24_bgra32_sse2)
+ || !register_conversion(IMG_BGR24, IMG_ABGR32, rgb24_argb32_sse2)
+ || !register_conversion(IMG_BGR24, IMG_ARGB32, rgb24_abgr32_sse2)
+ || !register_conversion(IMG_BGR24, IMG_BGRA32, rgb24_rgba32_sse2)
+ || !register_conversion(IMG_BGR24, IMG_GRAY8, bgr24_gray8_sse2)
+
+ || !register_conversion(IMG_RGBA32, IMG_RGB24, rgba32_rgb24_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_BGR24, bgra32_rgb24_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_GRAY8, rgba32_gray8_sse2)
+
+ || !register_conversion(IMG_ABGR32, IMG_RGB24, abgr32_rgb24_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_BGR24, argb32_rgb24_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_GRAY8, abgr32_gray8_sse2)
+
+ || !register_conversion(IMG_ARGB32, IMG_RGB24, argb32_rgb24_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_BGR24, abgr32_rgb24_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_GRAY8, argb32_gray8_sse2)
+
+ || !register_conversion(IMG_BGRA32, IMG_RGB24, bgra32_rgb24_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_BGR24, rgba32_rgb24_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_GRAY8, bgra32_gray8_sse2)
+
+ || !register_conversion(IMG_GRAY8, IMG_RGB24, gray8_rgb24_sse2)
+ || !register_conversion(IMG_GRAY8, IMG_BGR24, gray8_rgb24_sse2)
+ || !register_conversion(IMG_GRAY8, IMG_RGBA32, gray8_rgba32_sse2)
+ || !register_conversion(IMG_GRAY8, IMG_ABGR32, gray8_argb32_sse2)
+ || !register_conversion(IMG_GRAY8, IMG_ARGB32, gray8_argb32_sse2)
+ || !register_conversion(IMG_GRAY8, IMG_BGRA32, gray8_rgba32_sse2)
+ ) {
+ return 0;
+ }
+ }
+#endif
+
+#endif /* ARCH_X86 || ARCH_X86_64 */
+
+ return 1;
+}
+
+/*************************************************************************/
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h b/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h
new file mode 100644
index 00000000..13ed851f
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h
@@ -0,0 +1,613 @@
+/*
+ * img_x86_common.h - common x86/x86-64 assembly macros
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#ifndef ACLIB_IMG_X86_COMMON_H
+#define ACLIB_IMG_X86_COMMON_H
+
+/*************************************************************************/
+
+/* Register names for pointers */
+#ifdef ARCH_X86_64
+# define EAX "%%rax"
+# define EBX "%%rbx"
+# define ECX "%%rcx"
+# define EDX "%%rdx"
+# define ESP "%%rsp"
+# define EBP "%%rbp"
+# define ESI "%%rsi"
+# define EDI "%%rdi"
+#else
+# define EAX "%%eax"
+# define EBX "%%ebx"
+# define ECX "%%ecx"
+# define EDX "%%edx"
+# define ESP "%%esp"
+# define EBP "%%ebp"
+# define ESI "%%esi"
+# define EDI "%%edi"
+#endif
+
+/* Macros to push and pop one or two registers within an assembly block.
+ * The x86-64 ABI allows leaf functions to write to 128 bytes BELOW
+ * (yes, below) the stack pointer, so we can't just push our own stuff
+ * there. Argh. */
+#ifdef ARCH_X86_64
+# define FAKE_PUSH_REG "r12"
+# define FAKE_PUSH_REG_2 "r13"
+# define COMMA_FAKE_PUSH_REG ,FAKE_PUSH_REG
+# define PUSH(reg) "mov " reg ", %%" FAKE_PUSH_REG
+# define POP(reg) "mov %%" FAKE_PUSH_REG ", " reg
+# define PUSH2(reg1,reg2) PUSH(reg1) "; mov " reg2 ", %%" FAKE_PUSH_REG_2
+# define POP2(reg2,reg1) "mov %%" FAKE_PUSH_REG_2 ", " reg2 "; " POP(reg1)
+#else
+# define COMMA_FAKE_PUSH_REG /*nothing*/
+# define PUSH(reg) "push " reg
+# define POP(reg) "pop " reg
+# define PUSH2(reg1,reg2) "push " reg1 "; push " reg2
+# define POP2(reg2,reg1) "pop " reg2 "; pop " reg1
+#endif
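+/* For example, PUSH(EBX) expands to "push %ebx" on x86 but to
+ * "mov %rbx, %r12" on x86-64, with POP(EBX) restoring EBX from r12;
+ * asm() statements that use these macros therefore add r12 to their
+ * clobber list via COMMA_FAKE_PUSH_REG on x86-64. */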
+
+/* Data for isolating particular bytes. Used by the SWAP32 macros; if you
+ * use them, make sure to define DEFINE_MASK_DATA before including this
+ * file! */
+#ifdef DEFINE_MASK_DATA
+static const struct { uint32_t n[64]; } __attribute__((aligned(16))) mask_data = {{
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,
+ 0x0000FF00, 0x0000FF00, 0x0000FF00, 0x0000FF00,
+ 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
+ 0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000,
+ 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
+ 0x00FFFF00, 0x00FFFF00, 0x00FFFF00, 0x00FFFF00,
+ 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF,
+ 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
+ 0xFF0000FF, 0xFF0000FF, 0xFF0000FF, 0xFF0000FF,
+ 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00,
+ 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF,
+ 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000,
+ 0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF,
+ 0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00,
+ 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+}};
+#endif
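+/* Each mask above occupies 16 bytes (four identical 32-bit words), so mask
+ * n lives at byte offset 16*n; bit k of n selects byte k of each 32-bit
+ * word (e.g. offset 16 keeps byte 0, offset 160 keeps bytes 1 and 3). */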
+
+/*************************************************************************/
+
+/* Basic assembly macros, used for odd-count loops */
+
+/* Swap bytes in pairs of 16-bit values */
+#define X86_SWAP16_2 \
+ "movl -4("ESI","ECX",4), %%eax \n\
+ movl %%eax, %%edx \n\
+ shll $8, %%eax \n\
+ andl $0xFF00FF00, %%eax \n\
+ shrl $8, %%edx \n\
+ andl $0x00FF00FF, %%edx \n\
+ orl %%edx, %%eax \n\
+ movl %%eax, -4("EDI","ECX",4)"
+
+/* Swap words in a 32-bit value */
+#define X86_SWAP32 \
+ "movl -4("ESI","ECX",4), %%eax \n\
+ roll $16, %%eax \n\
+ movl %%eax, -4("EDI","ECX",4)"
+
+/* Swap bytes 0 and 2 of a 32-bit value */
+#define X86_SWAP32_02 \
+ "movw -4("ESI","ECX",4), %%ax \n\
+ movw -2("ESI","ECX",4), %%dx \n\
+ xchg %%dl, %%al \n\
+ movw %%ax, -4("EDI","ECX",4) \n\
+ movw %%dx, -2("EDI","ECX",4)"
+
+/* Swap bytes 1 and 3 of a 32-bit value */
+#define X86_SWAP32_13 \
+ "movw -4("ESI","ECX",4), %%ax \n\
+ movw -2("ESI","ECX",4), %%dx \n\
+ xchg %%dh, %%ah \n\
+ movw %%ax, -4("EDI","ECX",4) \n\
+ movw %%dx, -2("EDI","ECX",4)"
+
+/* Reverse the order of bytes in a 32-bit value */
+#define X86_REV32 \
+ "movl -4("ESI","ECX",4), %%eax \n\
+ xchg %%ah, %%al \n\
+ roll $16, %%eax \n\
+ xchg %%ah, %%al \n\
+ movl %%eax, -4("EDI","ECX",4)"
+
+/* The same, using the BSWAP instruction */
+#define X86_REV32_BSWAP \
+ "movl -4("ESI","ECX",4), %%eax \n\
+ bswap %%eax \n\
+ movl %%eax, -4("EDI","ECX",4)"
+
+/* Rotate a 32-bit value left 8 bits */
+#define X86_ROL32 \
+ "movl -4("ESI","ECX",4), %%eax \n\
+ roll $8, %%eax \n\
+ movl %%eax, -4("EDI","ECX",4)"
+
+/* Rotate a 32-bit value right 8 bits */
+#define X86_ROR32 \
+ "movl -4("ESI","ECX",4), %%eax \n\
+ rorl $8, %%eax \n\
+ movl %%eax, -4("EDI","ECX",4)"
+
+/*************************************************************************/
+
+/* Basic assembly routines. Sizes are all given in 32-bit units. */
+
+#define ASM_SWAP16_2_X86(size) \
+ asm("0: "X86_SWAP16_2" \n\
+ subl $1, %%ecx \n\
+ jnz 0b" \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size) \
+ : "eax", "edx")
+
+#define ASM_SWAP32_X86(size) \
+ asm("0: "X86_SWAP32" \n\
+ subl $1, %%ecx \n\
+ jnz 0b" \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size) \
+ : "eax", "edx")
+
+#define ASM_SWAP32_02_X86(size) \
+ asm("0: "X86_SWAP32_02" \n\
+ subl $1, %%ecx \n\
+ jnz 0b" \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size) \
+ : "eax", "edx")
+
+#define ASM_SWAP32_13_X86(size) \
+ asm("0: "X86_SWAP32_13" \n\
+ subl $1, %%ecx \n\
+ jnz 0b" \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size) \
+ : "eax", "edx")
+
+#define ASM_REV32_X86(size) \
+ asm("0: "X86_REV32" \n\
+ subl $1, %%ecx \n\
+ jnz 0b" \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size) \
+ : "eax")
+
+#define ASM_ROL32_X86(size) \
+ asm("0: "X86_ROL32" \n\
+ subl $1, %%ecx \n\
+ jnz 0b" \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size) \
+ : "eax")
+
+#define ASM_ROR32_X86(size) \
+ asm("0: "X86_ROR32" \n\
+ subl $1, %%ecx \n\
+ jnz 0b" \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size) \
+ : "eax")
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Wrapper for SIMD loops. This generates the body of an asm() construct
+ * (the string only, not the input/output/clobber lists) given the data
+ * block size (number of data units processed per SIMD loop iteration),
+ * instructions to save and restore unclobberable registers (such as EBX),
+ * and the bodies of the odd-count and main loops. The data count is
+ * assumed to be preloaded in ECX. Parameters are:
+ * blocksize: number of units of data processed per SIMD loop (must be
+ * a power of 2); can be a constant or a numerical
+ * expression containing only constants
+ * push_regs: string constant containing instructions to push registers
+ * that must be saved over the small loop
+ * pop_regs: string constant containing instructions to pop registers
+ * saved by `push_regs' (restored before the main loop)
+ * small_loop: loop for handling data elements one at a time (when the
+ *              count is not a multiple of `blocksize')
+ * main_loop: main SIMD loop for processing data
+ * emms: EMMS/SFENCE instructions to end main loop with, as needed
+ */
+
+#define SIMD_LOOP_WRAPPER(blocksize,push_regs,pop_regs,small_loop,main_loop,emms) \
+ /* Check whether the count is a multiple of the blocksize (this \
+ * can cause branch mispredicts but seems to be faster overall) */ \
+ "testl $(("#blocksize")-1), %%ecx; " \
+ "jz 1f; " \
+ /* It's not--run the small loop to align the count */ \
+ push_regs"; " \
+ "0: " \
+ small_loop"; " \
+ "subl $1, %%ecx; " \
+ "testl $(("#blocksize")-1), %%ecx; " \
+ "jnz 0b; " \
+ pop_regs"; " \
+ /* Make sure there's some data left */ \
+ "testl %%ecx, %%ecx; " \
+ "jz 2f; " \
+ /* Now run the main SIMD loop */ \
+ "1: " \
+ main_loop"; " \
+ "subl $("#blocksize"), %%ecx; " \
+ "jnz 1b; " \
+ /* Clear MMX state and/or SFENCE, as needed */ \
+ emms"; " \
+ /* Done */ \
+ "2: "
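+/* A minimal usage sketch (hypothetical, for illustration only): copy
+ * `size' 32-bit units from src[0] to dest[0], with the count preloaded
+ * into ECX as required.  The arguments are, in order: blocksize,
+ * push_regs, pop_regs, small_loop, main_loop, emms.
+ *
+ *     asm(SIMD_LOOP_WRAPPER(
+ *             4,
+ *             "", "",
+ *             "movl -4("ESI","ECX",4), %%eax \n\
+ *              movl %%eax, -4("EDI","ECX",4)",
+ *             "movdqu -16("ESI","ECX",4), %%xmm0 \n\
+ *              movdqu %%xmm0, -16("EDI","ECX",4)",
+ *             "")
+ *         : : "S" (src[0]), "D" (dest[0]), "c" (size)
+ *         : "eax");
+ */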
+
+/*************************************************************************/
+
+/* MMX- and SSE2-optimized swap/rotate routines. These routines are
+ * identical save for data size, so we use common macros to implement them,
+ * with register names and data offsets replaced by parameters to the
+ * macros. */
+
+#define ASM_SIMD_MMX(name,size) \
+ name((size), 64, \
+ "movq", "movq", "movq", "", \
+ "%%mm0", "%%mm1", "%%mm2", "%%mm3", \
+ "%%mm4", "%%mm5", "%%mm6", "%%mm7")
+#define ASM_SIMD_SSE2(name,size) \
+ name((size), 128, \
+ "movdqu", "movdqa", "movdqu", "", \
+ "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3",\
+ "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7")
+#define ASM_SIMD_SSE2_ALIGNED(name,size) \
+ name((size), 128, \
+ "movdqa", "movdqa", "movntdq", "sfence",\
+ "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3",\
+ "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7")
+
+#define ASM_SWAP16_2_MMX(size) ASM_SIMD_MMX(ASM_SWAP16_2_SIMD,(size))
+#define ASM_SWAP16_2_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP16_2_SIMD,(size))
+#define ASM_SWAP16_2_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP16_2_SIMD,(size))
+#define ASM_SWAP32_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_SIMD,(size))
+#define ASM_SWAP32_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_SIMD,(size))
+#define ASM_SWAP32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_SIMD,(size))
+#define ASM_SWAP32_02_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_02_SIMD,(size))
+#define ASM_SWAP32_02_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_02_SIMD,(size))
+#define ASM_SWAP32_02_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_02_SIMD,(size))
+#define ASM_SWAP32_13_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_13_SIMD,(size))
+#define ASM_SWAP32_13_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_13_SIMD,(size))
+#define ASM_SWAP32_13_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_13_SIMD,(size))
+#define ASM_REV32_MMX(size) ASM_SIMD_MMX(ASM_REV32_SIMD,(size))
+#define ASM_REV32_SSE2(size) ASM_SIMD_SSE2(ASM_REV32_SIMD,(size))
+#define ASM_REV32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_REV32_SIMD,(size))
+#define ASM_ROL32_MMX(size) ASM_SIMD_MMX(ASM_ROL32_SIMD,(size))
+#define ASM_ROL32_SSE2(size) ASM_SIMD_SSE2(ASM_ROL32_SIMD,(size))
+#define ASM_ROL32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_ROL32_SIMD,(size))
+#define ASM_ROR32_MMX(size) ASM_SIMD_MMX(ASM_ROR32_SIMD,(size))
+#define ASM_ROR32_SSE2(size) ASM_SIMD_SSE2(ASM_ROR32_SIMD,(size))
+#define ASM_ROR32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_ROR32_SIMD,(size))
+
+/*************************************************************************/
+
+/* Actual implementations. Note that unrolling the SIMD loops doesn't seem
+ * to be a win (only 2-3% improvement at most), and in fact can lose by a
+ * bit in short loops. */
+
+#define ASM_SWAP16_2_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
+ asm(SIMD_LOOP_WRAPPER( \
+ /* blocksize */ (regsize)/32, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ X86_SWAP16_2, \
+ /* main_loop */ \
+ ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
+ # MM0: 7 6 5 4 3 2 1 0 \n\
+ "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
+ psrlw $8, "MM0" # MM0: - 7 - 5 - 3 - 1 \n\
+ psllw $8, "MM1" # MM1: 6 - 4 - 2 - 0 - \n\
+ por "MM1", "MM0" # MM0: 6 7 4 5 2 3 0 1 \n\
+ "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
+ /* emms */ "emms; "sfence) \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size) \
+ : "eax", "edx")
+
+#define ASM_SWAP32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
+ asm(SIMD_LOOP_WRAPPER( \
+ /* blocksize */ (regsize)/32, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ X86_SWAP32, \
+ /* main_loop */ \
+ ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
+ # MM0: 7 6 5 4 3 2 1 0 \n\
+ "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
+ psrld $16, "MM0" # MM0: - - 7 6 - - 3 2 \n\
+ pslld $16, "MM1" # MM1: 5 4 - - 1 0 - - \n\
+ por "MM1", "MM0" # MM0: 5 4 7 6 1 0 3 2 \n\
+ "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
+ /* emms */ "emms; "sfence) \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size) \
+ : "eax")
+
+#define ASM_SWAP32_02_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
+ asm(SIMD_LOOP_WRAPPER( \
+ /* blocksize */ (regsize)/32, \
+ /* push_regs */ "push "EDX, \
+ /* pop_regs */ "pop "EDX, \
+ /* small_loop */ X86_SWAP32_02, \
+ /* main_loop */ \
+ ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
+ # MM0: 7 6 5 4 3 2 1 0 \n\
+ "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
+ "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\
+ pand 16("EDX"), "MM1" # MM1: - - - 4 - - - 0 \n\
+ pslld $16, "MM1" # MM1: - 4 - - - 0 - - \n\
+ pand 64("EDX"), "MM2" # MM2: - 6 - - - 2 - - \n\
+ psrld $16, "MM2" # MM2: - - - 6 - - - 2 \n\
+ pand 160("EDX"), "MM0" # MM0: 7 - 5 - 3 - 1 - \n\
+ por "MM1", "MM0" # MM0: 7 4 5 - 3 0 1 - \n\
+ por "MM2", "MM0" # MM0: 7 4 5 6 3 0 1 2 \n\
+ "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
+ /* emms */ "emms; "sfence) \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
+ "m" (mask_data) \
+ : "eax")
+
+#define ASM_SWAP32_13_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
+ asm(SIMD_LOOP_WRAPPER( \
+ /* blocksize */ (regsize)/32, \
+ /* push_regs */ "push "EDX, \
+ /* pop_regs */ "pop "EDX, \
+ /* small_loop */ X86_SWAP32_13, \
+ /* main_loop */ \
+ ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
+ # MM0: 7 6 5 4 3 2 1 0 \n\
+ "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
+ "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\
+ pand 32("EDX"), "MM1" # MM1: - - 5 - - - 1 - \n\
+ pslld $16, "MM1" # MM1: 5 - - - 1 - - - \n\
+ pand 128("EDX"), "MM2" # MM2: 7 - - - 3 - - - \n\
+ psrld $16, "MM2" # MM2: - - 7 - - - 3 - \n\
+ pand 80("EDX"), "MM0" # MM0: - 6 - 4 - 2 - 0 \n\
+ por "MM1", "MM0" # MM0: 5 6 - 4 1 2 - 0 \n\
+ por "MM2", "MM0" # MM0: 5 6 7 4 1 2 3 0 \n\
+ "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
+ /* emms */ "emms; "sfence) \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
+ "m" (mask_data) \
+        : "eax")
+
+#define ASM_REV32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
+ asm(SIMD_LOOP_WRAPPER( \
+ /* blocksize */ (regsize)/32, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ X86_REV32_BSWAP, \
+ /* main_loop */ \
+ ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
+ # MM0: 7 6 5 4 3 2 1 0 \n\
+ "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
+ "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\
+ "movq" "MM0", "MM3" # MM3: 7 6 5 4 3 2 1 0 \n\
+ psrld $24, "MM0" # MM0: - - - 7 - - - 3 \n\
+ pand 32("EDX"), "MM2" # MM2: - - 5 - - - 1 - \n\
+ psrld $8, "MM1" # MM1: - 7 6 5 - 3 2 1 \n\
+ pand 32("EDX"), "MM1" # MM1: - - 6 - - - 2 - \n\
+ pslld $8, "MM2" # MM2: - 5 - - - 1 - - \n\
+ pslld $24, "MM3" # MM3: 4 - - - 0 - - - \n\
+ por "MM1", "MM0" # MM0: - - 6 7 - - 2 3 \n\
+ por "MM2", "MM0" # MM0: - 5 6 7 - 1 2 3 \n\
+ por "MM3", "MM0" # MM0: 4 5 6 7 0 1 2 3 \n\
+ "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
+ /* emms */ "emms; "sfence) \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
+ "m" (mask_data) \
+ : "eax")
+
+#define ASM_ROL32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
+ asm(SIMD_LOOP_WRAPPER( \
+ /* blocksize */ (regsize)/32, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ X86_ROL32, \
+ /* main_loop */ \
+ ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
+ # MM0: 7 6 5 4 3 2 1 0 \n\
+ "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
+ pslld $8, "MM0" # MM0: 6 5 4 - 2 1 0 - \n\
+ psrld $24, "MM1" # MM1: - - - 7 - - - 3 \n\
+ por "MM1", "MM0" # MM0: 6 5 4 7 2 1 0 3 \n\
+ "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
+ /* emms */ "emms; "sfence) \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size) \
+ : "eax")
+
+#define ASM_ROR32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
+ asm(SIMD_LOOP_WRAPPER( \
+ /* blocksize */ (regsize)/32, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ X86_ROR32, \
+ /* main_loop */ \
+ ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
+ # MM0: 7 6 5 4 3 2 1 0 \n\
+ "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
+ psrld $8, "MM0" # MM0: - 7 6 5 - 3 2 1 \n\
+ pslld $24, "MM1" # MM1: 4 - - - 0 - - - \n\
+ por "MM1", "MM0" # MM0: 4 7 6 5 0 3 2 1 \n\
+ "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
+ /* emms */ "emms; "sfence) \
+ : /* no outputs */ \
+ : "S" (src[0]), "D" (dest[0]), "c" (size) \
+ : "eax")
+
+/*************************************************************************/
+
+/* SSE2 macros to load 8 24- or 32-bit RGB pixels into XMM0/1/2 (R/G/B) as
+ * 16-bit values, used for RGB->YUV and RGB->grayscale conversions.
+ * ZERO is the number of the XMM register containing all zeroes. */
+
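+/* Note that the 24-bit loaders below fetch each 3-byte pixel with a 32-bit
+ * movl, so the top byte of each loaded dword (the "x" in the comments) is
+ * junk from the neighbouring pixel; it lands where the alpha channel would
+ * and is never used, since only the R/G/B words left in XMM0/1/2 are
+ * consumed. */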
+#define SSE2_LOAD_RGB24(ZERO) \
+ "movl -21("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm0 # XMM0: ----- ----- ----- xBGR1 \n\
+ pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR1 ----- ----- ----- \n\
+ movl -18("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm0 # XMM0: xBGR1 ----- ----- xBGR2 \n\
+ pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR2 xBGR1 ----- ----- \n\
+ movl -15("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm0 # XMM0: xBGR2 xBGR1 ----- xBGR3 \n\
+ pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR3 xBGR2 xBGR1 ----- \n\
+ movl -24("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm0 # XMM0: xBGR3 xBGR2 xBGR1 xBGR0 \n\
+ movl -9("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm1 # XMM1: ----- ----- ----- xBGR5 \n\
+ pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR5 ----- ----- ----- \n\
+ movl -6("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm1 # XMM1: xBGR5 ----- ----- xBGR6 \n\
+ pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR6 xBGR5 ----- ----- \n\
+ movl -3("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm1 # XMM1: xBGR6 xBGR5 ----- xBGR7 \n\
+ pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR7 xBGR6 xBGR5 ----- \n\
+ movl -12("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm1 # XMM1: xBGR7 xBGR6 xBGR5 xBGR4 \n"\
+ SSE2_MASSAGE_RGBA32(ZERO)
+
+#define SSE2_LOAD_BGR24(ZERO) \
+ "movl -21("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm0 # XMM0: ----- ----- ----- xRGB1 \n\
+ pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB1 ----- ----- ----- \n\
+ movl -18("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm0 # XMM0: xRGB1 ----- ----- xRGB2 \n\
+ pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB2 xRGB1 ----- ----- \n\
+ movl -15("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm0 # XMM0: xRGB2 xRGB1 ----- xRGB3 \n\
+ pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB3 xRGB2 xRGB1 ----- \n\
+ movl -24("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm0 # XMM0: xRGB3 xRGB2 xRGB1 xRGB0 \n\
+ movl -9("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm1 # XMM1: ----- ----- ----- xRGB5 \n\
+ pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB5 ----- ----- ----- \n\
+ movl -6("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm1 # XMM1: xRGB5 ----- ----- xRGB6 \n\
+ pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB6 xRGB5 ----- ----- \n\
+ movl -3("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm1 # XMM1: xRGB6 xRGB5 ----- xRGB7 \n\
+ pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB7 xRGB6 xRGB5 ----- \n\
+ movl -12("ESI","EBX"), %%eax \n\
+ movd %%eax, %%xmm2 \n\
+ por %%xmm2, %%xmm1 # XMM1: xRGB7 xRGB6 xRGB5 xRGB4 \n"\
+ SSE2_MASSAGE_BGRA32(ZERO)
+
+#define SSE2_LOAD_RGBA32(ZERO) "\
+ movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ABGR3 ABGR2 ABGR1 ABGR0 \n\
+ movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ABGR7 ABGR6 ABGR5 ABGR4 \n"\
+ SSE2_MASSAGE_RGBA32(ZERO)
+#define SSE2_MASSAGE_RGBA32(ZERO) "\
+ movdqa %%xmm0, %%xmm2 # XMM2: ABGR3 ABGR2 ABGR1 ABGR0 \n\
+ punpcklbw %%xmm1, %%xmm0 # X0.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\
+ punpckhbw %%xmm1, %%xmm2 # X2.l: A6 A2 B6 B2 G6 G2 R6 R2 \n\
+ movdqa %%xmm0, %%xmm1 # X1.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\
+ punpcklbw %%xmm2, %%xmm0 # X0.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\
+ punpckhbw %%xmm2, %%xmm1 # X1.l: G7 G5 G3 G1 R7 R5 R3 R1 \n\
+ movdqa %%xmm0, %%xmm2 # X2.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\
+ punpcklbw %%xmm1, %%xmm0 # XMM0: G7.......G0 R7.......R0 \n\
+ punpckhbw %%xmm1, %%xmm2 # XMM2: A7.......A0 B7.......B0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: G7.......G0 R7.......R0 \n\
+ punpcklbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
+ punpckhbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
+ punpcklbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"
+
+#define SSE2_LOAD_BGRA32(ZERO) "\
+ movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ARGB3 ARGB2 ARGB1 ARGB0 \n\
+ movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ARGB7 ARGB6 ARGB5 ARGB4 \n"\
+ SSE2_MASSAGE_BGRA32(ZERO)
+#define SSE2_MASSAGE_BGRA32(ZERO) "\
+ movdqa %%xmm0, %%xmm2 # XMM2: ARGB3 ARGB2 ARGB1 ARGB0 \n\
+ punpcklbw %%xmm1, %%xmm2 # X2.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\
+ punpckhbw %%xmm1, %%xmm0 # X0.l: A6 A2 R6 R2 G6 G2 B6 B2 \n\
+ movdqa %%xmm2, %%xmm1 # X1.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\
+ punpcklbw %%xmm0, %%xmm2 # X2.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\
+ punpckhbw %%xmm0, %%xmm1 # X1.l: G7 G5 G3 G1 B7 B5 B3 B1 \n\
+ movdqa %%xmm2, %%xmm0 # X0.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\
+ punpcklbw %%xmm1, %%xmm2 # XMM2: G7.......G0 B7.......B0 \n\
+ punpckhbw %%xmm1, %%xmm0 # XMM0: A7.......A0 R7.......R0 \n\
+ movdqa %%xmm2, %%xmm1 # XMM1: G7.......G0 B7.......B0 \n\
+ punpcklbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
+ punpckhbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
+ punpcklbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"
+
+#define SSE2_LOAD_ARGB32(ZERO) "\
+ movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: BGRA3 BGRA2 BGRA1 BGRA0 \n\
+ movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: BGRA7 BGRA6 BGRA5 BGRA4 \n"\
+ SSE2_MASSAGE_ARGB32(ZERO)
+#define SSE2_MASSAGE_ARGB32(ZERO) "\
+ movdqa %%xmm0, %%xmm2 # XMM2: BGRA3 BGRA2 BGRA1 BGRA0 \n\
+ punpcklbw %%xmm1, %%xmm0 # X0.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\
+ punpckhbw %%xmm1, %%xmm2 # X2.l: B6 B2 G6 G2 R6 R2 A6 A2 \n\
+ movdqa %%xmm0, %%xmm1 # X1.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\
+ punpcklbw %%xmm2, %%xmm0 # X0.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\
+ punpckhbw %%xmm2, %%xmm1 # X1.l: R7 R5 R3 R1 A7 A5 A3 A1 \n\
+ movdqa %%xmm0, %%xmm2 # X2.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\
+    punpcklbw %%xmm1, %%xmm0	# XMM0: R7.......R0 A7.......A0	\n\
+    punpckhbw %%xmm1, %%xmm2	# XMM2: B7.......B0 G7.......G0	\n\
+ movdqa %%xmm2, %%xmm1 # XMM1: B7.......B0 G7.......G0 \n\
+ punpckhbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
+ punpcklbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
+ punpckhbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"
+
+#define SSE2_LOAD_ABGR32(ZERO) "\
+ movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: RGBA3 RGBA2 RGBA1 RGBA0 \n\
+ movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: RGBA7 RGBA6 RGBA5 RGBA4 \n"\
+ SSE2_MASSAGE_ABGR32(ZERO)
+#define SSE2_MASSAGE_ABGR32(ZERO) "\
+ movdqa %%xmm0, %%xmm2 # XMM2: RGBA3 RGBA2 RGBA1 RGBA0 \n\
+ punpcklbw %%xmm1, %%xmm2 # X2.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\
+ punpckhbw %%xmm1, %%xmm0 # X0.l: R6 R2 G6 G2 B6 B2 A6 A2 \n\
+ movdqa %%xmm2, %%xmm1 # X1.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\
+ punpcklbw %%xmm0, %%xmm2 # X2.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\
+ punpckhbw %%xmm0, %%xmm1 # X1.l: B7 B5 B3 B1 A7 A5 A3 A1 \n\
+ movdqa %%xmm2, %%xmm0 # X0.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\
+ punpcklbw %%xmm1, %%xmm2 # XMM2: B7.......B0 A7.......A0 \n\
+ punpckhbw %%xmm1, %%xmm0 # XMM0: R7.......R0 G7.......G0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: R7.......R0 G7.......G0 \n\
+ punpckhbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
+ punpcklbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
+ punpckhbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"
+
+/*************************************************************************/
+
+#endif /* ACLIB_IMG_X86_COMMON_H */
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_mixed.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_mixed.c
new file mode 100644
index 00000000..7f4b8d70
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_mixed.c
@@ -0,0 +1,981 @@
+/*
+ * img_yuv_mixed.c - YUV planar<->packed image format conversion routines
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#include "ac.h"
+#include "imgconvert.h"
+#include "img_internal.h"
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Standard C implementations */
+
+/*************************************************************************/
+
+/* Wrappers for UYVY and YVYU */
+/* Note: we rely on YUY2<->{UYVY,YVYU} working for src==dest */
+/* FIXME: when converting from UYVY/YVYU, src is destroyed! */
+
+static int uyvy_yvyu_wrapper(uint8_t **src, ImageFormat srcfmt,
+ uint8_t **dest, ImageFormat destfmt,
+ int width, int height)
+{
+ if (srcfmt == IMG_UYVY || srcfmt == IMG_YVYU)
+ return ac_imgconvert(src, srcfmt, src, IMG_YUY2, width, height)
+ && ac_imgconvert(src, IMG_YUY2, dest, destfmt, width, height);
+ else
+ return ac_imgconvert(src, srcfmt, dest, IMG_YUY2, width, height)
+ && ac_imgconvert(dest, IMG_YUY2, dest, destfmt, width, height);
+}
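+/* Converting to UYVY/YVYU goes planar -> YUY2 into dest, then YUY2 ->
+ * destfmt in place in dest; converting from UYVY/YVYU first rewrites src
+ * to YUY2 in place, which is why src is clobbered (see FIXME above). */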
+
+static int yuv420p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YUV420P, dest, IMG_UYVY, width, height); }
+
+static int yuv420p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YUV420P, dest, IMG_YVYU, width, height); }
+
+static int yuv411p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YUV411P, dest, IMG_UYVY, width, height); }
+
+static int yuv411p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YUV411P, dest, IMG_YVYU, width, height); }
+
+static int yuv422p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YUV422P, dest, IMG_UYVY, width, height); }
+
+static int yuv422p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YUV422P, dest, IMG_YVYU, width, height); }
+
+static int yuv444p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YUV444P, dest, IMG_UYVY, width, height); }
+
+static int yuv444p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YUV444P, dest, IMG_YVYU, width, height); }
+
+static int uyvy_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV420P, width, height); }
+
+static int yvyu_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV420P, width, height); }
+
+static int uyvy_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV411P, width, height); }
+
+static int yvyu_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV411P, width, height); }
+
+static int uyvy_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV422P, width, height); }
+
+static int yvyu_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV422P, width, height); }
+
+static int uyvy_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV444P, width, height); }
+
+static int yvyu_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
+{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV444P, width, height); }
+
+/*************************************************************************/
+
+static int yuv420p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+
+ for (y = 0; y < (height & ~1); y++) {
+ for (x = 0; x < (width & ~1); x += 2) {
+ dest[0][(y*width+x)*2 ] = src[0][y*width+x];
+ dest[0][(y*width+x)*2+1] = src[1][(y/2)*(width/2)+x/2];
+ dest[0][(y*width+x)*2+2] = src[0][y*width+x+1];
+ dest[0][(y*width+x)*2+3] = src[2][(y/2)*(width/2)+x/2];
+ }
+ }
+ return 1;
+}
+
+static int yuv411p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < (width & ~1); x += 2) {
+ dest[0][(y*width+x)*2 ] = src[0][y*width+x];
+ dest[0][(y*width+x)*2+1] = src[1][y*(width/4)+x/4];
+ dest[0][(y*width+x)*2+2] = src[0][y*width+x+1];
+ dest[0][(y*width+x)*2+3] = src[2][y*(width/4)+x/4];
+ }
+ }
+ return 1;
+}
+
+static int yuv422p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < (width/2)*height; i++) {
+ dest[0][i*4 ] = src[0][i*2];
+ dest[0][i*4+1] = src[1][i];
+ dest[0][i*4+2] = src[0][i*2+1];
+ dest[0][i*4+3] = src[2][i];
+ }
+ return 1;
+}
+
+static int yuv444p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < (width/2)*height; i++) {
+ dest[0][i*4 ] = src[0][i*2];
+ dest[0][i*4+1] = (src[1][i*2] + src[1][i*2+1]) / 2;
+ dest[0][i*4+2] = src[0][i*2+1];
+ dest[0][i*4+3] = (src[2][i*2] + src[2][i*2+1]) / 2;
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int yuy2_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+
+ for (y = 0; y < (height & ~1); y++) {
+ for (x = 0; x < (width & ~1); x += 2) {
+ dest[0][y*width+x ] = src[0][(y*width+x)*2 ];
+ dest[0][y*width+x+1] = src[0][(y*width+x)*2+2];
+ if (y%2 == 0) {
+ dest[1][(y/2)*(width/2)+x/2] = src[0][(y*width+x)*2+1];
+ dest[2][(y/2)*(width/2)+x/2] = src[0][(y*width+x)*2+3];
+ } else {
+ dest[1][(y/2)*(width/2)+x/2] =
+ (dest[1][(y/2)*(width/2)+x/2] + src[0][(y*width+x)*2+1] + 1) / 2;
+ dest[2][(y/2)*(width/2)+x/2] =
+ (dest[2][(y/2)*(width/2)+x/2] + src[0][(y*width+x)*2+3] + 1) / 2;
+ }
+ }
+ }
+ return 1;
+}
+
+static int yuy2_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < (width & ~3); x += 4) {
+ dest[0][y*width+x] = src[0][(y*width+x)*2 ];
+ dest[0][y*width+x+1] = src[0][(y*width+x)*2+2];
+ dest[0][y*width+x+2] = src[0][(y*width+x)*2+4];
+ dest[0][y*width+x+3] = src[0][(y*width+x)*2+6];
+ dest[1][y*(width/4)+x/4] = (src[0][(y*width+x)*2+1]
+ + src[0][(y*width+x)*2+5] + 1) / 2;
+ dest[2][y*(width/4)+x/4] = (src[0][(y*width+x)*2+3]
+ + src[0][(y*width+x)*2+7] + 1) / 2;
+ }
+ }
+ return 1;
+}
+
+static int yuy2_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < (width/2)*height; i++) {
+ dest[0][i*2] = src[0][i*4 ];
+ dest[1][i] = src[0][i*4+1];
+ dest[0][i*2+1] = src[0][i*4+2];
+ dest[2][i] = src[0][i*4+3];
+ }
+ return 1;
+}
+
+static int yuy2_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < (width & ~1)*height; i += 2) {
+ dest[0][i] = src[0][i*2 ];
+ dest[1][i] = src[0][i*2+1];
+ dest[1][i+1] = src[0][i*2+1];
+ dest[0][i+1] = src[0][i*2+2];
+ dest[2][i] = src[0][i*2+3];
+ dest[2][i+1] = src[0][i*2+3];
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int y8_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*2 ] = src[0][i];
+ dest[0][i*2+1] = 128;
+ }
+ return 1;
+}
+
+static int y8_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*2 ] = 128;
+ dest[0][i*2+1] = src[0][i];
+ }
+ return 1;
+}
+
+static int yuy2_y8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++)
+ dest[0][i] = src[0][i*2];
+ return 1;
+}
+
+static int uyvy_y8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height; i++)
+ dest[0][i] = src[0][i*2+1];
+ return 1;
+}
+
+/*************************************************************************/
+/*************************************************************************/
+
+#if defined(HAVE_ASM_SSE2)
+
+/* SSE2 routines. See comments in img_x86_common.h for why we don't bother
+ * unrolling the loops. */
+
+/* Common macros/data for x86 code */
+#include "img_x86_common.h"
+
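+/* Each conversion below is a single SIMD_LOOP_WRAPPER() invocation (the
+ * wrapper itself lives in img_x86_common.h): "main_loop" is the SSE2 body
+ * handling "blocksize" units per iteration, "small_loop" is the scalar
+ * fallback for the leftover units, and ECX carries the unit counter; the
+ * negative displacements suggest the loops walk the buffers backwards from
+ * base+ECX*scale.  The register bindings come from the asm constraints in
+ * the caller functions further below. */
+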
+/* YUV420P (1 row) or YUV422P -> YUY2 (unit: 2 pixels) */
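+/* (Per the callers below: ESI = Y row, EAX = U row, EDX = V row, EDI =
+ * output YUY2 row, ECX = count in the unit given above; YUV411P_YUY2 and
+ * YUV444P_YUY2 below use the same binding.) */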
+#define YUV42XP_YUY2 \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 8, \
+ /* push_regs */ PUSH(EBX), \
+ /* pop_regs */ POP(EBX), \
+ /* small_loop */ \
+ "movb -1("EDX","ECX"), %%bh \n\
+ movb -1("ESI","ECX",2), %%bl \n\
+ shll $16, %%ebx \n\
+ movb -1("EAX","ECX"), %%bh \n\
+ movb -2("ESI","ECX",2), %%bl \n\
+ movl %%ebx, -4("EDI","ECX",4)", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: YF YE YD ..... Y2 Y1 Y0 \n\
+ movq -8("EAX","ECX"), %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\
+ movq -8("EDX","ECX"), %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\
+ punpcklbw %%xmm3, %%xmm2 # XMM2: V7 U7 V6 ..... U1 V0 U0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
+ punpcklbw %%xmm2, %%xmm0 # XMM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ punpckhbw %%xmm2, %%xmm1 # XMM1: V7 YF U7 ..... Y9 U4 Y8 \n\
+ movdqu %%xmm0, -32("EDI","ECX",4) \n\
+ movdqu %%xmm1, -16("EDI","ECX",4)", \
+ /* emms */ "emms")
+
+/* YUV411P -> YUY2 (unit: 4 pixels) */
+#define YUV411P_YUY2 \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 4, \
+ /* push_regs */ PUSH(EBX), \
+ /* pop_regs */ POP(EBX), \
+ /* small_loop */ \
+ "movb -1("EDX","ECX"), %%bh \n\
+ movb -1("ESI","ECX",4), %%bl \n\
+ shll $16, %%ebx \n\
+ movb -1("EAX","ECX"), %%bh \n\
+ movb -2("ESI","ECX",4), %%bl \n\
+ movl %%ebx, -4("EDI","ECX",8) \n\
+ movb -1("EDX","ECX"), %%bh \n\
+ movb -3("ESI","ECX",4), %%bl \n\
+ shll $16, %%ebx \n\
+ movb -1("EAX","ECX"), %%bh \n\
+ movb -4("ESI","ECX",4), %%bl \n\
+ movl %%ebx, -8("EDI","ECX",8)", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: YF YE YD ..... Y2 Y1 Y0 \n\
+ movd -4("EAX","ECX"), %%xmm2 # XMM2: U3 U2 U1 U0 \n\
+ punpcklbw %%xmm2, %%xmm2 # XMM2: U3 U3 U2 U2 U1 U1 U0 U0 \n\
+ movd -4("EDX","ECX"), %%xmm3 # XMM3: V3 V2 V1 V0 \n\
+ punpcklbw %%xmm3, %%xmm3 # XMM3: V3 V3 V2 V2 V1 V1 V0 V0 \n\
+ punpcklbw %%xmm3, %%xmm2 # XMM2: V3 U3 V3 ..... U0 V0 U0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
+ punpcklbw %%xmm2, %%xmm0 # XMM0: V1 Y7 U1 ..... Y1 U0 Y0 \n\
+ punpckhbw %%xmm2, %%xmm1 # XMM1: V3 YF U3 ..... Y9 U2 Y8 \n\
+ movdqu %%xmm0, -32("EDI","ECX",8) \n\
+ movdqu %%xmm1, -16("EDI","ECX",8)", \
+ /* emms */ "emms")
+
+/* YUV444P -> YUY2 (unit: 2 pixels) */
+#define YUV444P_YUY2 \
+ /* Load 0x00FF*8 into XMM7 for masking */ \
+ "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 8, \
+ /* push_regs */ PUSH2(EBX,EBP), \
+ /* pop_regs */ POP2(EBP,EBX), \
+ /* small_loop */ \
+ "movzbl -1("EDX","ECX",2), %%ebx \n\
+ movzbl -2("EDX","ECX",2), %%ebp \n\
+ addl %%ebp, %%ebx \n\
+ shrl $1, %%ebx \n\
+ movb %%bl, -1("EDI","ECX",4) \n\
+ movb -1("ESI","ECX",2), %%bl \n\
+ movb %%bl, -2("EDI","ECX",4) \n\
+ movzbl -1("EAX","ECX",2), %%ebx \n\
+ movzbl -2("EAX","ECX",2), %%ebp \n\
+ addl %%ebp, %%ebx \n\
+ shrl $1, %%ebx \n\
+ movb %%bl, -3("EDI","ECX",4) \n\
+ movb -2("ESI","ECX",2), %%bl \n\
+ movb %%bl, -4("EDI","ECX",4)", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: YF YE YD ..... Y2 Y1 Y0 \n\
+ movdqu -16("EAX","ECX",2), %%xmm2 #XM2: UF UE UD ..... U2 U1 U0 \n\
+ movdqu -16("EDX","ECX",2), %%xmm3 #XM3: VF VE VD ..... V2 V1 V0 \n\
+ movdqa %%xmm2, %%xmm4 # XMM4: UF UE UD ..... U2 U1 U0 \n\
+ pand %%xmm7, %%xmm2 # XMM2: -- UE -- ..... U2 -- U0 \n\
+ psrlw $8, %%xmm4 # XMM4: -- UF -- ..... U3 -- U1 \n\
+ pavgw %%xmm4, %%xmm2 # XMM2: -- u7 -- ..... u1 -- u0 \n\
+	movdqa	%%xmm3, %%xmm5	# XMM5: VF VE VD ..... V2 V1 V0	\n\
+ pand %%xmm7, %%xmm3 # XMM3: -- VE -- ..... V2 -- V0 \n\
+ psrlw $8, %%xmm5 # XMM5: -- VF -- ..... V3 -- V1 \n\
+ pavgw %%xmm5, %%xmm3 # XMM3: -- v7 -- ..... v1 -- v0 \n\
+ psllw $8, %%xmm3 # XMM3: v7 -- v6 ..... -- v0 -- \n\
+ por %%xmm3, %%xmm2 # XMM2: v7 u7 v6 ..... u1 v0 u0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
+ punpcklbw %%xmm2, %%xmm0 # XMM0: v3 Y7 u3 ..... Y1 u0 Y0 \n\
+ punpckhbw %%xmm2, %%xmm1 # XMM1: v7 YF u7 ..... Y9 u4 Y8 \n\
+ movdqu %%xmm0, -32("EDI","ECX",4) \n\
+ movdqu %%xmm1, -16("EDI","ECX",4)", \
+ /* emms */ "emms")
+
+/* YUY2 -> YUV420P (U row) (unit: 2 pixels) */
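+/* (Per the yuy2_yuv420p_sse2 caller below: ESI = the YUY2 row being
+ * converted, EAX = the other YUY2 row of the pair (for chroma averaging),
+ * EDI = the output Y row, EDX = the output chroma row; the _V variant uses
+ * the same binding with the source rows swapped and the V row in EDX.) */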
+#define YUY2_YUV420P_U \
+ /* Load 0x00FF*8 into XMM7 for masking */ \
+ "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 4, \
+ /* push_regs */ PUSH2(EBX,EBP), \
+ /* pop_regs */ POP2(EBP,EBX), \
+ /* small_loop */ \
+ "movb -4("ESI","ECX",4), %%bl \n\
+ movb %%bl, -2("EDI","ECX",2) \n\
+ movb -2("ESI","ECX",4), %%bl \n\
+ movb %%bl, -1("EDI","ECX",2) \n\
+ movzbl -3("ESI","ECX",4), %%ebx \n\
+ movzbl -3("EAX","ECX",4), %%ebp \n\
+ addl %%ebp, %%ebx \n\
+ shrl $1, %%ebx \n\
+ movb %%bl, -1("EDX","ECX")", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ movdqu -16("EAX","ECX",4),%%xmm2 #XMM2: Vd Yh Ud ..... Yb Ua Ya \n\
+ pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\
+ psrlw $8, %%xmm2 # XMM2: -- Vd -- ..... Va -- Ua \n\
+ pavgw %%xmm2, %%xmm1 # XMM1: -- v3 -- ..... v0 -- u0 \n\
+ packuswb %%xmm1, %%xmm1 # XMM1: v3 u3 v2 u2 v1 u1 v0 u0 \n\
+ pand %%xmm7, %%xmm1 # XMM1: -- u3 -- u2 -- u1 -- u0 \n\
+ packuswb %%xmm1, %%xmm1 # XMM1: u3 u2 u1 u0 \n\
+ movq %%xmm0, -8("EDI","ECX",2) \n\
+ movd %%xmm1, -4("EDX","ECX")", \
+ /* emms */ "emms")
+
+/* YUY2 -> YUV420P (V row) (unit: 2 pixels) */
+#define YUY2_YUV420P_V \
+ /* Load 0x00FF*8 into XMM7 for masking */ \
+ "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 4, \
+ /* push_regs */ PUSH2(EBX,EBP), \
+ /* pop_regs */ POP2(EBP,EBX), \
+ /* small_loop */ \
+ "movb -4("ESI","ECX",4), %%bl \n\
+ movb %%bl, -2("EDI","ECX",2) \n\
+ movb -2("ESI","ECX",4), %%bl \n\
+ movb %%bl, -1("EDI","ECX",2) \n\
+ movzbl -1("ESI","ECX",4), %%ebx \n\
+ movzbl -1("EAX","ECX",4), %%ebp \n\
+ addl %%ebp, %%ebx \n\
+ shrl $1, %%ebx \n\
+ movb %%bl, -1("EDX","ECX")", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ movdqu -16("EAX","ECX",4),%%xmm2 #XMM2: Vd Yh Ud ..... Yb Ua Ya \n\
+ pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\
+ psrlw $8, %%xmm2 # XMM2: -- Vd -- ..... Va -- Ua \n\
+ pavgw %%xmm1, %%xmm2 # XMM2: -- v3 -- ..... v0 -- u0 \n\
+ packuswb %%xmm2, %%xmm2 # XMM2: v3 u3 v2 u2 v1 u1 v0 u0 \n\
+ psrlw $8, %%xmm2 # XMM2: -- v3 -- v2 -- v1 -- v0 \n\
+ packuswb %%xmm2, %%xmm2 # XMM2: v3 v2 v1 v0 \n\
+ movq %%xmm0, -8("EDI","ECX",2) \n\
+ movd %%xmm2, -4("EDX","ECX")", \
+ /* emms */ "emms")
+
+/* YUY2 -> YUV411P (unit: 4 pixels) */
+#define YUY2_YUV411P \
+ /* Load 0x000..000FFFFFFFF into XMM6, 0x00FF*8 into XMM7 for masking */ \
+ "pcmpeqd %%xmm6, %%xmm6; psrldq $12, %%xmm6;" \
+ "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 2, \
+ /* push_regs */ PUSH2(EBX,EBP), \
+ /* pop_regs */ POP2(EBP,EBX), \
+ /* small_loop */ \
+ "movb -8("ESI","ECX",8), %%bl \n\
+ movb %%bl, -4("EDI","ECX",4) \n\
+ movb -6("ESI","ECX",8), %%bl \n\
+ movb %%bl, -3("EDI","ECX",4) \n\
+ movb -4("ESI","ECX",8), %%bl \n\
+ movb %%bl, -2("EDI","ECX",4) \n\
+ movb -2("ESI","ECX",8), %%bl \n\
+ movb %%bl, -1("EDI","ECX",4) \n\
+ movzbl -7("ESI","ECX",8), %%ebx \n\
+ movzbl -3("ESI","ECX",8), %%ebp \n\
+ addl %%ebp, %%ebx \n\
+ shrl $1, %%ebx \n\
+ movb %%bl, -1("EAX","ECX") \n\
+ movzbl -5("ESI","ECX",8), %%ebx \n\
+ movzbl -1("ESI","ECX",8), %%ebp \n\
+ addl %%ebp, %%ebx \n\
+ shrl $1, %%ebx \n\
+ movb %%bl, -1("EDX","ECX")", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",8),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\
+ packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\
+ movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\
+ pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\
+ psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\
+ packuswb %%xmm1, %%xmm1 # XMM1: U3 U2 U1 U0 \n\
+ packuswb %%xmm2, %%xmm2 # XMM2: V3 V2 V1 V0 \n\
+ pand %%xmm6, %%xmm1 # XMM1: -- -- -- -- U3 U2 U1 U0 \n\
+ psllq $32, %%xmm2 # XMM2: V3 V2 V1 V0 -- -- -- -- \n\
+ por %%xmm1, %%xmm2 # XMM2: V3 V2 V1 V0 U3 U2 U1 U0 \n\
+ movdqa %%xmm2, %%xmm1 # XMM1: V3 V2 V1 V0 U3 U2 U1 U0 \n\
+ pand %%xmm7, %%xmm1 # XMM1: -- V2 -- V0 -- U2 -- U0 \n\
+ psrlw $8, %%xmm2 # XMM2: -- V3 -- V1 -- U3 -- U1 \n\
+ pavgw %%xmm2, %%xmm1 # XMM1: -- v1 -- v0 -- u1 -- u0 \n\
+ packuswb %%xmm1, %%xmm1 # XMM1: v1 v0 u1 u0 \n\
+ movq %%xmm0, -8("EDI","ECX",4) \n\
+ movd %%xmm1, %%ebx \n\
+ movw %%bx, -2("EAX","ECX") \n\
+ shrl $16, %%ebx; \n\
+ movw %%bx, -2("EDX","ECX")", \
+ /* emms */ "emms")
+
+/* YUY2 -> YUV422P (unit: 2 pixels) */
+#define YUY2_YUV422P \
+ /* Load 0x00FF*8 into XMM7 for masking */ \
+ "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 4, \
+ /* push_regs */ PUSH(EBX), \
+ /* pop_regs */ POP(EBX), \
+ /* small_loop */ \
+ "movb -4("ESI","ECX",4), %%bl \n\
+ movb %%bl, -2("EDI","ECX",2) \n\
+ movb -2("ESI","ECX",4), %%bl \n\
+ movb %%bl, -1("EDI","ECX",2) \n\
+ movb -3("ESI","ECX",4), %%bl \n\
+ movb %%bl, -1("EAX","ECX") \n\
+ movb -1("ESI","ECX",4), %%bl \n\
+ movb %%bl, -1("EDX","ECX")", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\
+ packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\
+ movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\
+ pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\
+ psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\
+ packuswb %%xmm1, %%xmm1 # XMM1: U3 U2 U1 U0 \n\
+ packuswb %%xmm2, %%xmm2 # XMM2: V3 V2 V1 V0 \n\
+ movq %%xmm0, -8("EDI","ECX",2) \n\
+ movd %%xmm1, -4("EAX","ECX") \n\
+ movd %%xmm2, -4("EDX","ECX")", \
+ /* emms */ "emms")
+
+/* YUY2 -> YUV444P (unit: 2 pixels) */
+#define YUY2_YUV444P \
+ /* Load 0x00FF*8 into XMM7 for masking */ \
+ "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 4, \
+ /* push_regs */ PUSH(EBX), \
+ /* pop_regs */ POP(EBX), \
+ /* small_loop */ \
+ "movb -4("ESI","ECX",4), %%bl \n\
+ movb %%bl, -2("EDI","ECX",2) \n\
+ movb -2("ESI","ECX",4), %%bl \n\
+ movb %%bl, -1("EDI","ECX",2) \n\
+ movb -3("ESI","ECX",4), %%bl \n\
+ movb %%bl, -2("EAX","ECX",2) \n\
+ movb %%bl, -1("EAX","ECX",2) \n\
+ movb -1("ESI","ECX",4), %%bl \n\
+ movb %%bl, -2("EDX","ECX",2) \n\
+ movb %%bl, -1("EDX","ECX",2)", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\
+ packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\
+ movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\
+ pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\
+ psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\
+ movdqa %%xmm1, %%xmm3 # XMM3: -- U3 -- U2 -- U1 -- U0 \n\
+ psllw $8, %%xmm3 # XMM3: U3 -- U2 -- U1 -- U0 -- \n\
+ por %%xmm3, %%xmm1 # XMM1: U3 U3 U2 U2 U1 U1 U0 U0 \n\
+ movdqa %%xmm2, %%xmm3 # XMM3: -- V3 -- V2 -- V1 -- V0 \n\
+ psllw $8, %%xmm3 # XMM3: V3 -- V2 -- V1 -- V0 -- \n\
+	por	%%xmm3, %%xmm2	# XMM2: V3 V3 V2 V2 V1 V1 V0 V0	\n\
+ movq %%xmm0, -8("EDI","ECX",2) \n\
+ movq %%xmm1, -8("EAX","ECX",2) \n\
+ movq %%xmm2, -8("EDX","ECX",2)", \
+ /* emms */ "emms")
+
+
+/* Y8 -> YUY2/YVYU (unit: 1 pixel) */
+#define Y8_YUY2 \
+ /* Load 0x80*16 into XMM7 for interlacing U/V */ \
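+    /* (pcmpeqd sets every bit; psllw $7 leaves 0xFF80 in each word;	\
+     * packsswb then saturates -128 to 0x80 in every byte) */		\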
+ "pcmpeqd %%xmm7, %%xmm7; psllw $7, %%xmm7; packsswb %%xmm7, %%xmm7;"\
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 16, \
+ /* push_regs */ PUSH(EBX), \
+ /* pop_regs */ POP(EBX), \
+ /* small_loop */ \
+ "movb -1("ESI","ECX"), %%al \n\
+ movb %%al, -2("EDI","ECX",2) \n\
+ movb $0x80, -1("EDI","ECX",2)", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX"),%%xmm0 # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
+ punpcklbw %%xmm7, %%xmm0 # XMM0: 80 Y7 80 ..... Y1 80 Y0 \n\
+ movdqu %%xmm0, -32("EDI","ECX",2) \n\
+ punpckhbw %%xmm7, %%xmm1 # XMM1: 80 YF 80 ..... Y9 80 Y8 \n\
+ movdqu %%xmm1, -16("EDI","ECX",2)", \
+ /* emms */ "emms")
+
+/* Y8 -> UYVY (unit: 1 pixel) */
+#define Y8_UYVY \
+ /* Load 0x80*16 into XMM7 for interlacing U/V */ \
+ "pcmpeqd %%xmm7, %%xmm7; psllw $7, %%xmm7; packsswb %%xmm7, %%xmm7;"\
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 16, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ \
+ "movb -1("ESI","ECX"), %%al \n\
+ movb %%al, -1("EDI","ECX",2) \n\
+ movb $0x80, -2("EDI","ECX",2)", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX"),%%xmm0 # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\
+ movdqa %%xmm7, %%xmm1 # XMM1: 80 80 80 ..... 80 80 80 \n\
+ punpcklbw %%xmm0, %%xmm1 # XMM1: Y7 80 Y6 ..... 80 Y0 80 \n\
+ movdqu %%xmm1, -32("EDI","ECX",2) \n\
+ movdqa %%xmm7, %%xmm2 # XMM2: 80 80 80 ..... 80 80 80 \n\
+	punpckhbw %%xmm0, %%xmm2	# XMM2: YF 80 YE ..... 80 Y8 80	\n\
+ movdqu %%xmm2, -16("EDI","ECX",2)", \
+ /* emms */ "emms")
+
+/* YUY2/YVYU -> Y8 (unit: 1 pixel) */
+#define YUY2_Y8 \
+ /* Load 0x00FF*8 into XMM7 for masking */ \
+ "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 8, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ \
+ "movb -2("ESI","ECX",2), %%al \n\
+ movb %%al, -1("EDI","ECX")", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
+ pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ movq %%xmm0, -8("EDI","ECX")", \
+ /* emms */ "emms")
+
+/* UYVY -> Y8 (unit: 1 pixel) */
+#define UYVY_Y8 \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 8, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ \
+ "movb -1("ESI","ECX",2), %%al \n\
+ movb %%al, -1("EDI","ECX")", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: Y7 V3 Y6 ..... V0 Y0 U0 \n\
+ psrlw $8, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ movq %%xmm0, -8("EDI","ECX")", \
+ /* emms */ "emms")
+
+/*************************************************************************/
+
+static int yuv420p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int y;
+ for (y = 0; y < (height & ~1); y++) {
+ int dummy;
+ asm volatile(YUV42XP_YUY2
+ : "=c" (dummy) // Ensure GCC reloads ECX each time through
+ : "S" (src[0]+y*width), "a" (src[1]+(y/2)*(width/2)),
+ "d" (src[2]+(y/2)*(width/2)), "D" (dest[0]+y*width*2),
+ "0" (width/2)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG
+#endif
+ );
+ }
+ return 1;
+}
+
+static int yuv411p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ if (!(width & 3)) {
+ asm(YUV411P_YUY2
+ : /* no outputs */
+ : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]),
+ "c" ((width/4)*height)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG
+#endif
+ );
+ } else {
+ int y;
+ for (y = 0; y < height; y++) {
+ int dummy;
+ asm volatile(YUV411P_YUY2
+ : "=c" (dummy)
+ : "S" (src[0]+y*width), "a" (src[1]+y*(width/4)),
+ "d" (src[2]+y*(width/4)), "D" (dest[0]+y*width*2),
+ "0" (width/4)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG
+#endif
+ );
+ }
+ }
+ return 1;
+}
+
+static int yuv422p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ if (!(width & 1)) {
+ asm(YUV42XP_YUY2
+ : /* no outputs */
+ : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]),
+ "c" ((width/2)*height)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG
+#endif
+ );
+ } else {
+ int y;
+ for (y = 0; y < height; y++) {
+ int dummy;
+ asm volatile(YUV42XP_YUY2
+ : "=c" (dummy)
+ : "S" (src[0]+y*width), "a" (src[1]+y*(width/2)),
+ "d" (src[2]+y*(width/2)), "D" (dest[0]+y*width*2),
+ "0" (width/2)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG
+#endif
+ );
+ }
+ }
+ return 1;
+}
+
+static int yuv444p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ if (!(width & 1)) {
+ asm(YUV444P_YUY2
+ : /* no outputs */
+ : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]),
+ "c" ((width/2)*height)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG, FAKE_PUSH_REG_2
+#endif
+ );
+ } else {
+ int y;
+ for (y = 0; y < height; y++) {
+ int dummy;
+ asm volatile(YUV444P_YUY2
+ : "=c" (dummy)
+                         : "S" (src[0]+y*width), "a" (src[1]+y*width),
+                           "d" (src[2]+y*width), "D" (dest[0]+y*width*2),
+ "0" (width/2)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG, FAKE_PUSH_REG_2
+#endif
+ );
+ }
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int yuy2_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int y;
+
+ for (y = 0; y < (height & ~1); y += 2) {
+ int dummy;
+ asm volatile(YUY2_YUV420P_U
+ : "=c" (dummy)
+ : "S" (src[0]+y*width*2), "a" (src[0]+(y+1)*width*2),
+ "D" (dest[0]+y*width), "d" (dest[1]+(y/2)*(width/2)),
+ "0" (width/2)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG, FAKE_PUSH_REG_2
+#endif
+ );
+ asm volatile(YUY2_YUV420P_V
+ : "=c" (dummy)
+ : "S" (src[0]+(y+1)*width*2), "a" (src[0]+y*width*2),
+ "D" (dest[0]+(y+1)*width), "d" (dest[2]+(y/2)*(width/2)),
+ "0" (width/2)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG, FAKE_PUSH_REG_2
+#endif
+ );
+ }
+ return 1;
+}
+
+static int yuy2_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ if (!(width & 3)) {
+ asm(YUY2_YUV411P
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]),
+ "c" ((width/4)*height)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG, FAKE_PUSH_REG_2
+#endif
+ );
+ } else {
+ int y;
+ for (y = 0; y < height; y++) {
+ int dummy;
+ asm volatile(YUY2_YUV411P
+ : "=c" (dummy)
+ : "S" (src[0]+y*width*2), "D" (dest[0]+y*width),
+ "a" (dest[1]+y*(width/4)), "d" (dest[2]+y*(width/4)),
+ "0" (width/4)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG, FAKE_PUSH_REG_2
+#endif
+ );
+ }
+ }
+ return 1;
+}
+
+static int yuy2_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ if (!(width & 1)) {
+ asm(YUY2_YUV422P
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]),
+ "c" ((width/2)*height)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG
+#endif
+ );
+ } else {
+ int y;
+ for (y = 0; y < height; y++) {
+ int dummy;
+ asm volatile(YUY2_YUV422P
+ : "=c" (dummy)
+ : "S" (src[0]+y*width*2), "D" (dest[0]+y*width),
+ "a" (dest[1]+y*(width/2)), "d" (dest[2]+y*(width/2)),
+ "0" (width/2)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG
+#endif
+ );
+ }
+ }
+ return 1;
+}
+
+static int yuy2_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ if (!(width & 1)) {
+ asm(YUY2_YUV444P
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]),
+ "c" ((width/2)*height)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG
+#endif
+ );
+ } else {
+ int y;
+ for (y = 0; y < height; y++) {
+ int dummy;
+ asm volatile(YUY2_YUV444P
+ : "=c" (dummy)
+ : "S" (src[0]+y*width*2), "D" (dest[0]+y*width),
+ "a" (dest[1]+y*width), "d" (dest[2]+y*width),
+ "0" (width/2)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG
+#endif
+ );
+ }
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int y8_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm(Y8_YUY2
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height)
+ : "eax" COMMA_FAKE_PUSH_REG
+ );
+ return 1;
+}
+
+static int y8_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm(Y8_UYVY
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height)
+ : "eax");
+ return 1;
+}
+
+static int yuy2_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm(YUY2_Y8
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height)
+ : "eax");
+ return 1;
+}
+
+static int uyvy_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm(UYVY_Y8
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height)
+ : "eax");
+ return 1;
+}
+
+/*************************************************************************/
+
+#endif /* HAVE_ASM_SSE2 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Initialization */
+
+int ac_imgconvert_init_yuv_mixed(int accel)
+{
+ if (!register_conversion(IMG_YUV420P, IMG_YUY2, yuv420p_yuy2)
+ || !register_conversion(IMG_YUV411P, IMG_YUY2, yuv411p_yuy2)
+ || !register_conversion(IMG_YUV422P, IMG_YUY2, yuv422p_yuy2)
+ || !register_conversion(IMG_YUV444P, IMG_YUY2, yuv444p_yuy2)
+ || !register_conversion(IMG_Y8, IMG_YUY2, y8_yuy2)
+ || !register_conversion(IMG_YUV420P, IMG_UYVY, yuv420p_uyvy)
+ || !register_conversion(IMG_YUV411P, IMG_UYVY, yuv411p_uyvy)
+ || !register_conversion(IMG_YUV422P, IMG_UYVY, yuv422p_uyvy)
+ || !register_conversion(IMG_YUV444P, IMG_UYVY, yuv444p_uyvy)
+ || !register_conversion(IMG_Y8, IMG_UYVY, y8_uyvy)
+ || !register_conversion(IMG_YUV420P, IMG_YVYU, yuv420p_yvyu)
+ || !register_conversion(IMG_YUV411P, IMG_YVYU, yuv411p_yvyu)
+ || !register_conversion(IMG_YUV422P, IMG_YVYU, yuv422p_yvyu)
+ || !register_conversion(IMG_YUV444P, IMG_YVYU, yuv444p_yvyu)
+ || !register_conversion(IMG_Y8, IMG_YVYU, y8_yuy2)
+
+ || !register_conversion(IMG_YUY2, IMG_YUV420P, yuy2_yuv420p)
+ || !register_conversion(IMG_YUY2, IMG_YUV411P, yuy2_yuv411p)
+ || !register_conversion(IMG_YUY2, IMG_YUV422P, yuy2_yuv422p)
+ || !register_conversion(IMG_YUY2, IMG_YUV444P, yuy2_yuv444p)
+ || !register_conversion(IMG_YUY2, IMG_Y8, yuy2_y8)
+ || !register_conversion(IMG_UYVY, IMG_YUV420P, uyvy_yuv420p)
+ || !register_conversion(IMG_UYVY, IMG_YUV411P, uyvy_yuv411p)
+ || !register_conversion(IMG_UYVY, IMG_YUV422P, uyvy_yuv422p)
+ || !register_conversion(IMG_UYVY, IMG_YUV444P, uyvy_yuv444p)
+ || !register_conversion(IMG_UYVY, IMG_Y8, uyvy_y8)
+ || !register_conversion(IMG_YVYU, IMG_YUV420P, yvyu_yuv420p)
+ || !register_conversion(IMG_YVYU, IMG_YUV411P, yvyu_yuv411p)
+ || !register_conversion(IMG_YVYU, IMG_YUV422P, yvyu_yuv422p)
+ || !register_conversion(IMG_YVYU, IMG_YUV444P, yvyu_yuv444p)
+ || !register_conversion(IMG_YVYU, IMG_Y8, yuy2_y8)
+ ) {
+ return 0;
+ }
+
+#if defined(HAVE_ASM_SSE2)
+ if (accel & AC_SSE2) {
+ if (!register_conversion(IMG_YUV420P, IMG_YUY2, yuv420p_yuy2_sse2)
+ || !register_conversion(IMG_YUV411P, IMG_YUY2, yuv411p_yuy2_sse2)
+ || !register_conversion(IMG_YUV422P, IMG_YUY2, yuv422p_yuy2_sse2)
+ || !register_conversion(IMG_YUV444P, IMG_YUY2, yuv444p_yuy2_sse2)
+ || !register_conversion(IMG_Y8, IMG_YUY2, y8_yuy2_sse2)
+ || !register_conversion(IMG_Y8, IMG_UYVY, y8_uyvy_sse2)
+ || !register_conversion(IMG_Y8, IMG_YVYU, y8_yuy2_sse2)
+
+ || !register_conversion(IMG_YUY2, IMG_YUV420P, yuy2_yuv420p_sse2)
+ || !register_conversion(IMG_YUY2, IMG_YUV411P, yuy2_yuv411p_sse2)
+ || !register_conversion(IMG_YUY2, IMG_YUV422P, yuy2_yuv422p_sse2)
+ || !register_conversion(IMG_YUY2, IMG_YUV444P, yuy2_yuv444p_sse2)
+ || !register_conversion(IMG_YUY2, IMG_Y8, yuy2_y8_sse2)
+ || !register_conversion(IMG_UYVY, IMG_Y8, uyvy_y8_sse2)
+ || !register_conversion(IMG_YVYU, IMG_Y8, yuy2_y8_sse2)
+ ) {
+ return 0;
+ }
+ }
+#endif /* HAVE_ASM_SSE2 */
+
+ return 1;
+}
+
+/*************************************************************************/
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_packed.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_packed.c
new file mode 100644
index 00000000..05357405
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_packed.c
@@ -0,0 +1,290 @@
+/*
+ * img_yuv_packed.c - YUV packed image format conversion routines
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#include "ac.h"
+#include "imgconvert.h"
+#include "img_internal.h"
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Standard C implementations */
+
+/*************************************************************************/
+
+/* Identity transformation, works when src==dest */
+static int yuv16_copy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height*2);
+ return 1;
+}
+
+/* Used for YUY2->UYVY and UYVY->YUY2, works when src==dest */
+static int yuv16_swap16(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ uint16_t *srcp = (uint16_t *)src[0];
+ uint16_t *destp = (uint16_t *)dest[0];
+ int i;
+ for (i = 0; i < width*height; i++)
+ destp[i] = srcp[i]>>8 | srcp[i]<<8;
+ return 1;
+}
+
+/* Used for YUY2->YVYU and YVYU->YUY2, works when src==dest */
+static int yuv16_swapuv(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height/2; i++) {
+ uint8_t tmp = src[0][i*4+1];
+ dest[0][i*4 ] = src[0][i*4 ];
+ dest[0][i*4+1] = src[0][i*4+3];
+ dest[0][i*4+2] = src[0][i*4+2];
+ dest[0][i*4+3] = tmp;
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int uyvy_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height/2; i++) {
+ dest[0][i*4 ] = src[0][i*4+1];
+ dest[0][i*4+1] = src[0][i*4+2];
+ dest[0][i*4+2] = src[0][i*4+3];
+ dest[0][i*4+3] = src[0][i*4 ];
+ }
+ return 1;
+}
+
+static int yvyu_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ for (i = 0; i < width*height/2; i++) {
+ dest[0][i*4 ] = src[0][i*4+3];
+ dest[0][i*4+1] = src[0][i*4 ];
+ dest[0][i*4+2] = src[0][i*4+1];
+ dest[0][i*4+3] = src[0][i*4+2];
+ }
+ return 1;
+}
+
+/*************************************************************************/
+/*************************************************************************/
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+
+/* Common macros/data for x86 code */
+#define DEFINE_MASK_DATA
+#include "img_x86_common.h"
+
+/*************************************************************************/
+
+/* Basic assembly routines */
+
+static int yuv16_swap16_x86(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP16_2_X86(width*height/2);
+    if (width*height % 2)
+ ((uint16_t *)(dest[0]))[width*height-1] =
+ src[0][width*height*2-2]<<8 | src[0][width*height*2-1];
+ return 1;
+}
+
+static int yuv16_swapuv_x86(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP32_13_X86(width*height/2);
+ return 1;
+}
+
+static int uyvy_yvyu_x86(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROR32_X86(width*height/2);
+    if (width*height % 2)
+ ((uint16_t *)(dest[0]))[width*height-1] =
+ src[0][width*height*2-2]<<8 | src[0][width*height*2-1];
+ return 1;
+}
+
+static int yvyu_uyvy_x86(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROL32_X86(width*height/2);
+    if (width*height % 2)
+ ((uint16_t *)(dest[0]))[width*height-1] =
+ src[0][width*height*2-2]<<8 | src[0][width*height*2-1];
+ return 1;
+}
+
+/*************************************************************************/
+
+/* MMX routines */
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */
+
+static int yuv16_swap16_mmx(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP16_2_MMX(width*height/2);
+    if (width*height % 2)
+ ((uint16_t *)(dest[0]))[width*height-1] =
+ src[0][width*height*2-2]<<8 | src[0][width*height*2-1];
+ return 1;
+}
+
+static int yuv16_swapuv_mmx(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP32_13_MMX(width*height/2);
+ return 1;
+}
+
+static int uyvy_yvyu_mmx(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROR32_MMX(width*height/2);
+    if (width*height % 2)
+ ((uint16_t *)(dest[0]))[width*height-1] =
+ src[0][width*height*2-2]<<8 | src[0][width*height*2-1];
+ return 1;
+}
+
+static int yvyu_uyvy_mmx(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROL32_MMX(width*height/2);
+    if (width*height % 2)
+ ((uint16_t *)(dest[0]))[width*height-1] =
+ src[0][width*height*2-2]<<8 | src[0][width*height*2-1];
+ return 1;
+}
+
+#endif /* HAVE_ASM_MMX && ARCH_X86 */
+
+/*************************************************************************/
+
+/* SSE2 routines */
+
+#if defined(HAVE_ASM_SSE2)
+
+static int yuv16_swap16_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP16_2_SSE2(width*height/2);
+    if (width*height % 2)
+ ((uint16_t *)(dest[0]))[width*height-1] =
+ src[0][width*height*2-2]<<8 | src[0][width*height*2-1];
+ return 1;
+}
+
+static int yuv16_swapuv_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_SWAP32_13_SSE2(width*height/2);
+ return 1;
+}
+
+static int uyvy_yvyu_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROR32_SSE2(width*height/2);
+    if (width*height % 2)
+ ((uint16_t *)(dest[0]))[width*height-1] =
+ src[0][width*height*2-2]<<8 | src[0][width*height*2-1];
+ return 1;
+}
+
+static int yvyu_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ASM_ROL32_SSE2(width*height/2);
+    if (width*height % 2)
+ ((uint16_t *)(dest[0]))[width*height-1] =
+ src[0][width*height*2-2]<<8 | src[0][width*height*2-1];
+ return 1;
+}
+
+#endif /* HAVE_ASM_SSE2 */
+
+/*************************************************************************/
+
+#endif /* ARCH_X86 || ARCH_X86_64 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Initialization */
+
+int ac_imgconvert_init_yuv_packed(int accel)
+{
+ if (!register_conversion(IMG_YUY2, IMG_YUY2, yuv16_copy)
+ || !register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16)
+ || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv)
+
+ || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16)
+ || !register_conversion(IMG_UYVY, IMG_UYVY, yuv16_copy)
+ || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu)
+
+ || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv)
+ || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy)
+ || !register_conversion(IMG_YVYU, IMG_YVYU, yuv16_copy)
+ ) {
+ return 0;
+ }
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+ if (accel & (AC_IA32ASM | AC_AMD64ASM)) {
+ if (!register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16_x86)
+ || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv_x86)
+ || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16_x86)
+ || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu_x86)
+ || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv_x86)
+ || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy_x86)
+ ) {
+ return 0;
+ }
+ }
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
+ if (accel & AC_MMX) {
+ if (!register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16_mmx)
+ || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv_mmx)
+ || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16_mmx)
+ || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu_mmx)
+ || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv_mmx)
+ || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy_mmx)
+ ) {
+ return 0;
+ }
+ }
+#endif
+
+#if defined(HAVE_ASM_SSE2)
+ if (accel & AC_SSE2) {
+ if (!register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16_sse2)
+ || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv_sse2)
+ || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16_sse2)
+ || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu_sse2)
+ || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv_sse2)
+ || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy_sse2)
+ ) {
+ return 0;
+ }
+ }
+#endif
+
+#endif /* ARCH_X86 || ARCH_X86_64 */
+
+ return 1;
+}
+
+/*************************************************************************/
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_planar.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_planar.c
new file mode 100644
index 00000000..e510fa4a
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_planar.c
@@ -0,0 +1,788 @@
+/*
+ * img_yuv_planar.c - YUV planar image format conversion routines
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#include "ac.h"
+#include "imgconvert.h"
+#include "img_internal.h"
+
+#include <string.h>
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Standard C implementations */
+
+/*************************************************************************/
+
+/* Identity transformations */
+
+static int yuv420p_copy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ ac_memcpy(dest[1], src[1], (width/2)*(height/2));
+ ac_memcpy(dest[2], src[2], (width/2)*(height/2));
+ return 1;
+}
+
+static int yuv411p_copy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ ac_memcpy(dest[1], src[1], (width/4)*height);
+ ac_memcpy(dest[2], src[2], (width/4)*height);
+ return 1;
+}
+
+static int yuv422p_copy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ ac_memcpy(dest[1], src[1], (width/2)*height);
+ ac_memcpy(dest[2], src[2], (width/2)*height);
+ return 1;
+}
+
+static int yuv444p_copy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ ac_memcpy(dest[1], src[1], width*height);
+ ac_memcpy(dest[2], src[2], width*height);
+ return 1;
+}
+
+static int y8_copy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ return 1;
+}
+
+/*************************************************************************/
+
+static int yuv420p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < (height & ~1); y += 2) {
+ for (x = 0; x < (width/2 & ~1); x += 2) {
+ dest[1][y*(width/4)+x/2] = (src[1][(y/2)*(width/2)+x]
+ + src[1][(y/2)*(width/2)+x+1] + 1) / 2;
+ dest[2][y*(width/4)+x/2] = (src[2][(y/2)*(width/2)+x]
+ + src[2][(y/2)*(width/2)+x+1] + 1) / 2;
+ }
+ ac_memcpy(dest[1]+(y+1)*(width/4), dest[1]+y*(width/4), width/4);
+ ac_memcpy(dest[2]+(y+1)*(width/4), dest[2]+y*(width/4), width/4);
+ }
+ return 1;
+}
+
+static int yuv420p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < (height & ~1); y += 2) {
+ ac_memcpy(dest[1]+(y )*(width/2), src[1]+(y/2)*(width/2), width/2);
+ ac_memcpy(dest[1]+(y+1)*(width/2), src[1]+(y/2)*(width/2), width/2);
+ ac_memcpy(dest[2]+(y )*(width/2), src[2]+(y/2)*(width/2), width/2);
+ ac_memcpy(dest[2]+(y+1)*(width/2), src[2]+(y/2)*(width/2), width/2);
+ }
+ return 1;
+}
+
+static int yuv420p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ ac_memcpy(dest[0], src[0], width*height);
+    for (y = 0; y < (height & ~1); y += 2) {
+ for (x = 0; x < width; x += 2) {
+ dest[1][y*width+x ] =
+ dest[1][y*width+x+1] = src[1][(y/2)*(width/2)+(x/2)];
+ dest[2][y*width+x ] =
+ dest[2][y*width+x+1] = src[2][(y/2)*(width/2)+(x/2)];
+ }
+ ac_memcpy(dest[1]+(y+1)*width, dest[1]+y*width, width);
+ ac_memcpy(dest[2]+(y+1)*width, dest[2]+y*width, width);
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int yuv411p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < (height & ~1); y += 2) {
+ for (x = 0; x < ((width/2) & ~1); x += 2) {
+ dest[1][(y/2)*(width/2)+x] = (src[1][y*(width/4)+x/2]
+ + src[1][(y+1)*(width/4)+x/2] + 1) / 2;
+ dest[2][(y/2)*(width/2)+x] = (src[2][y*(width/4)+x/2]
+ + src[2][(y+1)*(width/4)+x/2] + 1) / 2;
+ dest[1][(y/2)*(width/2)+x+1] = dest[1][(y/2)*(width/2)+x];
+ dest[2][(y/2)*(width/2)+x+1] = dest[2][(y/2)*(width/2)+x];
+ }
+ }
+ return 1;
+}
+
+static int yuv411p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < ((width/2) & ~1); x += 2) {
+ dest[1][y*(width/2)+x ] = src[1][y*(width/4)+x/2];
+ dest[1][y*(width/2)+x+1] = src[1][y*(width/4)+x/2];
+ dest[2][y*(width/2)+x ] = src[2][y*(width/4)+x/2];
+ dest[2][y*(width/2)+x+1] = src[2][y*(width/4)+x/2];
+ }
+ }
+ return 1;
+}
+
+static int yuv411p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < (width & ~3); x += 4) {
+ dest[1][y*width+x ] = src[1][y*(width/4)+x/4];
+ dest[1][y*width+x+1] = src[1][y*(width/4)+x/4];
+ dest[1][y*width+x+2] = src[1][y*(width/4)+x/4];
+ dest[1][y*width+x+3] = src[1][y*(width/4)+x/4];
+ dest[2][y*width+x ] = src[2][y*(width/4)+x/4];
+ dest[2][y*width+x+1] = src[2][y*(width/4)+x/4];
+ dest[2][y*width+x+2] = src[2][y*(width/4)+x/4];
+ dest[2][y*width+x+3] = src[2][y*(width/4)+x/4];
+ }
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int yuv422p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < (height & ~1); y += 2) {
+ for (x = 0; x < width/2; x++) {
+ dest[1][(y/2)*(width/2)+x] = (src[1][y*(width/2)+x]
+ + src[1][(y+1)*(width/2)+x] + 1) / 2;
+ dest[2][(y/2)*(width/2)+x] = (src[2][y*(width/2)+x]
+ + src[2][(y+1)*(width/2)+x] + 1) / 2;
+ }
+ }
+ return 1;
+}
+
+static int yuv422p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < ((width/2) & ~1); x += 2) {
+ dest[1][y*(width/4)+x/2] = (src[1][y*(width/2)+x]
+ + src[1][y*(width/2)+x+1] + 1) / 2;
+ dest[2][y*(width/4)+x/2] = (src[2][y*(width/2)+x]
+ + src[2][y*(width/2)+x+1] + 1) / 2;
+ }
+ }
+ return 1;
+}
+
+static int yuv422p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < (width & ~1); x += 2) {
+ dest[1][y*width+x ] = src[1][y*(width/2)+x/2];
+ dest[1][y*width+x+1] = src[1][y*(width/2)+x/2];
+ dest[2][y*width+x ] = src[2][y*(width/2)+x/2];
+ dest[2][y*width+x+1] = src[2][y*(width/2)+x/2];
+ }
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int yuv444p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < (height & ~1); y += 2) {
+ for (x = 0; x < (width & ~1); x += 2) {
+ dest[1][(y/2)*(width/2)+x/2] = (src[1][y*width+x]
+ + src[1][y*width+x+1]
+ + src[1][(y+1)*width+x]
+ + src[1][(y+1)*width+x+1] + 2) / 4;
+ dest[2][(y/2)*(width/2)+x/2] = (src[2][y*width+x]
+ + src[2][y*width+x+1]
+ + src[2][(y+1)*width+x]
+ + src[2][(y+1)*width+x+1] + 2) / 4;
+ }
+ }
+ return 1;
+}
+
+static int yuv444p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < (width & ~3); x += 4) {
+ dest[1][y*(width/4)+x/4] = (src[1][y*width+x]
+ + src[1][y*width+x+1]
+ + src[1][y*width+x+2]
+ + src[1][y*width+x+3] + 2) / 4;
+ dest[2][y*(width/4)+x/4] = (src[2][y*width+x]
+ + src[2][y*width+x+1]
+ + src[2][y*width+x+2]
+ + src[2][y*width+x+3] + 2) / 4;
+ }
+ }
+ return 1;
+}
+
+static int yuv444p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int x, y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < (width & ~1); x += 2) {
+ dest[1][y*(width/2)+x/2] = (src[1][y*width+x]
+ + src[1][y*width+x+1] + 1) / 2;
+ dest[2][y*(width/2)+x/2] = (src[2][y*width+x]
+ + src[2][y*width+x+1] + 1) / 2;
+ }
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+/* We treat Y8 as a planar format */
+
+static int yuvp_y8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ return 1;
+}
+
+static int y8_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ memset(dest[1], 128, (width/2)*(height/2));
+ memset(dest[2], 128, (width/2)*(height/2));
+ return 1;
+}
+
+static int y8_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ memset(dest[1], 128, (width/4)*height);
+ memset(dest[2], 128, (width/4)*height);
+ return 1;
+}
+
+static int y8_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ memset(dest[1], 128, (width/2)*height);
+ memset(dest[2], 128, (width/2)*height);
+ return 1;
+}
+
+static int y8_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ memset(dest[1], 128, width*height);
+ memset(dest[2], 128, width*height);
+ return 1;
+}
+
+/*************************************************************************/
+/*************************************************************************/
+
+#if defined(HAVE_ASM_SSE2)
+
+/* SSE2 routines. See comments in img_x86_common.h for why we don't bother
+ * unrolling the loops. */
+
+/* Common macros/data for x86 code */
+#include "img_x86_common.h"
+
+/* Average 2 bytes horizontally (e.g. 422P->411P) (unit: 2 source bytes) */
+#define AVG_2H(src,dest,count) do { \
+ int dummy; \
+ asm volatile( \
+ "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" /* XMM7: 0x00FF*8 */ \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 8, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ \
+ "movzbl -2("ESI","ECX",2), %%eax \n\
+ movzbl -1("ESI","ECX",2), %%edx \n\
+ addl %%edx, %%eax \n\
+ shrl $1, %%eax \n\
+ movb %%al, -1("EDI","ECX")", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",2),%%xmm0 #XMM0:FEDCBA9876543210 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: FEDCBA9876543210 \n\
+ pand %%xmm7, %%xmm0 # XMM0: E C A 8 6 4 2 0 \n\
+ psrlw $8, %%xmm1 # XMM1: F D B 9 7 5 3 1 \n\
+ pavgw %%xmm1, %%xmm0 # XMM0: w v u t s r q p (avgs) \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: wvutsrqpwvutsrqp \n\
+ movq %%xmm0, -8("EDI","ECX")", \
+ /* emms */ "emms") \
+ : "=c" (dummy) \
+ : "S" (src), "D" (dest), "0" (count) \
+ : "eax", "edx"); \
+} while (0)
+
+/* Average 4 bytes horizontally (e.g. 444P->411P) (unit: 4 source bytes) */
+#define AVG_4H(src,dest,count) do { \
+ int dummy; \
+ asm volatile( \
+ "pcmpeqd %%xmm7, %%xmm7; psrld $24, %%xmm7;" /* XMM7: 0x000000FF*4 */ \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 4, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ \
+ "movzbl -4("ESI","ECX",4), %%eax \n\
+ movzbl -3("ESI","ECX",4), %%edx \n\
+ addl %%edx, %%eax \n\
+ movzbl -2("ESI","ECX",4), %%edx \n\
+ addl %%edx, %%eax \n\
+ movzbl -1("ESI","ECX",4), %%edx \n\
+ addl %%edx, %%eax \n\
+ shrl $2, %%eax \n\
+ movb %%al, -1("EDI","ECX")", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",4),%%xmm0 #XMM0:FEDCBA9876543210 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: FEDCBA9876543210 \n\
+ movdqa %%xmm0, %%xmm2 # XMM2: FEDCBA9876543210 \n\
+ movdqa %%xmm0, %%xmm3 # XMM3: FEDCBA9876543210 \n\
+ pand %%xmm7, %%xmm0 # XMM0: C 8 4 0 \n\
+ psrld $8, %%xmm1 # XMM1: FED BA9 765 321 \n\
+ pand %%xmm7, %%xmm1 # XMM1: D 9 5 1 \n\
+ psrld $16, %%xmm2 # XMM2: FE BA 76 32 \n\
+ pand %%xmm7, %%xmm2 # XMM2: E A 6 2 \n\
+ psrld $24, %%xmm3 # XMM3: F B 7 3 \n\
+ pavgw %%xmm1, %%xmm0 # XMM0: C+D 8+9 4+5 0+1 (avgs) \n\
+ pavgw %%xmm3, %%xmm2 # XMM2: E+F A+B 6+7 2+3 (avgs) \n\
+ pavgw %%xmm2, %%xmm0 # XMM0: s r q p (avgs) \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: s r q p s r q p \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: srqpsrqpsrqpsrqp \n\
+ movd %%xmm0, -4("EDI","ECX")", \
+ /* emms */ "emms") \
+ : "=c" (dummy) \
+ : "S" (src), "D" (dest), "0" (count) \
+ : "eax", "edx"); \
+} while (0)
+
+/* Repeat 2 bytes horizontally (e.g. 422P->444P) (unit: 1 source byte) */
+#define REP_2H(src,dest,count) do { \
+ int dummy; \
+ asm volatile(SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 8, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ \
+ "movb -1("ESI","ECX"), %%al \n\
+ movb %%al, %%ah \n\
+ movw %%ax, -2("EDI","ECX",2)", \
+ /* main_loop */ \
+ "movq -8("ESI","ECX"), %%xmm0 # XMM0: 76543210 \n\
+ punpcklbw %%xmm0, %%xmm0 # XMM0: 7766554433221100 \n\
+ movdqu %%xmm0, -16("EDI","ECX",2)", \
+ /* emms */ "emms") \
+ : "=c" (dummy) \
+ : "S" (src), "D" (dest), "0" (count) \
+ : "eax"); \
+} while (0)
+
+/* Repeat 4 bytes horizontally (e.g. 411P->444P) (unit: 1 source byte) */
+#define REP_4H(src,dest,count) do { \
+ int dummy; \
+ asm volatile(SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 4, \
+ /* push_regs */ "", \
+ /* pop_regs */ "", \
+ /* small_loop */ \
+ "movzbl -1("ESI","ECX"), %%eax \n\
+ movb %%al, %%ah \n\
+ movl %%eax, %%edx \n\
+ shll $16, %%eax \n\
+ orl %%edx, %%eax \n\
+ movl %%eax, -4("EDI","ECX",4)", \
+ /* main_loop */ \
+ "movd -4("ESI","ECX"), %%xmm0 # XMM0: 3210 \n\
+ punpcklbw %%xmm0, %%xmm0 # XMM0: 33221100 \n\
+ punpcklwd %%xmm0, %%xmm0 # XMM0: 3333222211110000 \n\
+ movdqu %%xmm0, -16("EDI","ECX",4)", \
+ /* emms */ "emms") \
+ : "=c" (dummy) \
+ : "S" (src), "D" (dest), "0" (count) \
+ : "eax", "edx"); \
+} while (0)
+
+/* Average 2 bytes vertically and double horizontally (411P->420P)
+ * (unit: 1 source byte) */
+#define AVG_411_420(src1,src2,dest,count) do { \
+ int dummy; \
+ asm volatile(SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 8, \
+ /* push_regs */ "push "EBX, \
+ /* pop_regs */ "pop "EBX, \
+ /* small_loop */ \
+ "movzbl -1("ESI","ECX"), %%eax \n\
+ movzbl -1("EDX","ECX"), %%ebx \n\
+ addl %%ebx, %%eax \n\
+ shrl $1, %%eax \n\
+ movb %%al, %%ah \n\
+ movw %%ax, -2("EDI","ECX",2)", \
+ /* main_loop */ \
+ "movq -8("ESI","ECX"), %%xmm0 \n\
+ movq -8("EDX","ECX"), %%xmm1 \n\
+ pavgb %%xmm1, %%xmm0 \n\
+ punpcklbw %%xmm0, %%xmm0 \n\
+ movdqu %%xmm0, -16("EDI","ECX",2)", \
+ /* emms */ "emms") \
+ : "=c" (dummy) \
+ : "S" (src1), "d" (src2), "D" (dest), "0" (count) \
+ : "eax"); \
+} while (0)
+
+/* Average 2 bytes vertically (422P->420P) (unit: 1 source byte) */
+#define AVG_422_420(src1,src2,dest,count) do { \
+ int dummy; \
+ asm volatile(SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 16, \
+ /* push_regs */ "push "EBX, \
+ /* pop_regs */ "pop "EBX, \
+ /* small_loop */ \
+ "movzbl -1("ESI","ECX"), %%eax \n\
+ movzbl -1("EDX","ECX"), %%ebx \n\
+ addl %%ebx, %%eax \n\
+ shrl $1, %%eax \n\
+ movb %%al, -1("EDI","ECX")", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX"), %%xmm0 \n\
+ movdqu -16("EDX","ECX"), %%xmm1 \n\
+ pavgb %%xmm1, %%xmm0 \n\
+ movdqu %%xmm0, -16("EDI","ECX")", \
+ /* emms */ "emms") \
+ : "=c" (dummy) \
+ : "S" (src1), "d" (src2), "D" (dest), "0" (count) \
+ : "eax"); \
+} while (0)
+
+/* Average 4 bytes, 2 horizontally and 2 vertically (444P->420P)
+ * (unit: 2 source bytes) */
+#define AVG_444_420(src1,src2,dest,count) do { \
+ int dummy; \
+ asm volatile( \
+ "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" /* XMM7: 0x00FF*8 */ \
+ SIMD_LOOP_WRAPPER( \
+ /* blocksize */ 8, \
+ /* push_regs */ "push "EBX, \
+ /* pop_regs */ "pop "EBX, \
+ /* small_loop */ \
+ "movzbl -2("ESI","ECX",2), %%eax \n\
+ movzbl -1("ESI","ECX",2), %%ebx \n\
+ addl %%ebx, %%eax \n\
+ movzbl -2("EDX","ECX",2), %%ebx \n\
+ addl %%ebx, %%eax \n\
+ movzbl -1("EDX","ECX",2), %%ebx \n\
+ addl %%ebx, %%eax \n\
+ shrl $2, %%eax \n\
+ movb %%al, -1("EDI","ECX")", \
+ /* main_loop */ \
+ "movdqu -16("ESI","ECX",2), %%xmm0 \n\
+ movdqu -16("EDX","ECX",2), %%xmm2 \n\
+ movdqa %%xmm0, %%xmm1 \n\
+ pand %%xmm7, %%xmm0 \n\
+ psrlw $8, %%xmm1 \n\
+ pavgw %%xmm1, %%xmm0 \n\
+ movdqa %%xmm2, %%xmm3 \n\
+ pand %%xmm7, %%xmm2 \n\
+ psrlw $8, %%xmm3 \n\
+ pavgw %%xmm3, %%xmm2 \n\
+ pavgw %%xmm2, %%xmm0 \n\
+ packuswb %%xmm0, %%xmm0 \n\
+ movq %%xmm0, -8("EDI","ECX")", \
+ /* emms */ "emms") \
+ : "=c" (dummy) \
+        : "S" (src1), "d" (src2), "D" (dest), "0" (count));                  \
+} while (0)
+
+/*************************************************************************/
+
+static int yuv420p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < (height & ~1); y += 2) {
+ AVG_2H(src[1]+(y/2)*(width/2), dest[1]+y*(width/4), width/4);
+ ac_memcpy(dest[1]+(y+1)*(width/4), dest[1]+y*(width/4), width/4);
+ AVG_2H(src[2]+(y/2)*(width/2), dest[2]+y*(width/4), width/4);
+ ac_memcpy(dest[2]+(y+1)*(width/4), dest[2]+y*(width/4), width/4);
+ }
+ return 1;
+}
+
+static int yuv420p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int y;
+ ac_memcpy(dest[0], src[0], width*height);
+    for (y = 0; y < (height & ~1); y += 2) {
+ REP_2H(src[1]+(y/2)*(width/2), dest[1]+y*width, width/2);
+ ac_memcpy(dest[1]+(y+1)*width, dest[1]+y*width, width);
+ REP_2H(src[2]+(y/2)*(width/2), dest[2]+y*width, width/2);
+ ac_memcpy(dest[2]+(y+1)*width, dest[2]+y*width, width);
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int yuv411p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < (height & ~1); y += 2) {
+ AVG_411_420(src[1]+y*(width/4), src[1]+(y+1)*(width/4),
+ dest[1]+(y/2)*(width/2), width/4);
+ AVG_411_420(src[2]+y*(width/4), src[2]+(y+1)*(width/4),
+ dest[2]+(y/2)*(width/2), width/4);
+ }
+ return 1;
+}
+
+static int yuv411p_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ if (!(width & 3)) {
+ /* Fast version, no bytes at end of row to skip */
+ REP_2H(src[1], dest[1], (width/4)*height);
+ REP_2H(src[2], dest[2], (width/4)*height);
+ } else {
+ /* Slow version, loop through each row */
+ int y;
+ for (y = 0; y < height; y++) {
+ REP_2H(src[1]+y*(width/4), dest[1]+y*(width/2), width/4);
+ REP_2H(src[2]+y*(width/4), dest[2]+y*(width/2), width/4);
+ }
+ }
+ return 1;
+}
+
+static int yuv411p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ if (!(width & 3)) {
+ /* Fast version, no bytes at end of row to skip */
+ REP_4H(src[1], dest[1], (width/4)*height);
+ REP_4H(src[2], dest[2], (width/4)*height);
+ } else {
+ /* Slow version, loop through each row */
+ int y;
+ for (y = 0; y < height; y++) {
+ REP_4H(src[1]+y*(width/4), dest[1]+y*width, width/4);
+ REP_4H(src[2]+y*(width/4), dest[2]+y*width, width/4);
+ }
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int yuv422p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < (height & ~1); y += 2) {
+ AVG_422_420(src[1]+y*(width/2), src[1]+(y+1)*(width/2),
+ dest[1]+(y/2)*(width/2), width/2);
+ AVG_422_420(src[2]+y*(width/2), src[2]+(y+1)*(width/2),
+ dest[2]+(y/2)*(width/2), width/2);
+ }
+ return 1;
+}
+
+static int yuv422p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ if (!(width & 3)) {
+ /* Fast version, no bytes at end of row to skip */
+ AVG_2H(src[1], dest[1], (width/4)*height);
+ AVG_2H(src[2], dest[2], (width/4)*height);
+ } else {
+ /* Slow version, loop through each row */
+ int y;
+ for (y = 0; y < height; y++) {
+ AVG_2H(src[1]+y*(width/2), dest[1]+y*(width/4), width/4);
+ AVG_2H(src[2]+y*(width/2), dest[2]+y*(width/4), width/4);
+ }
+ }
+ return 1;
+}
+
+static int yuv422p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ if (!(width & 1)) {
+ /* Fast version, no bytes at end of row to skip */
+ REP_2H(src[1], dest[1], (width/2)*height);
+ REP_2H(src[2], dest[2], (width/2)*height);
+ } else {
+ /* Slow version, loop through each row */
+ int y;
+ for (y = 0; y < height; y++) {
+ REP_2H(src[1]+y*(width/2), dest[1]+y*width, width/2);
+ REP_2H(src[2]+y*(width/2), dest[2]+y*width, width/2);
+ }
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+static int yuv444p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int y;
+ ac_memcpy(dest[0], src[0], width*height);
+ for (y = 0; y < (height & ~1); y += 2) {
+ AVG_444_420(src[1]+y*width, src[1]+(y+1)*width,
+ dest[1]+(y/2)*(width/2), width/2);
+ AVG_444_420(src[2]+y*width, src[2]+(y+1)*width,
+ dest[2]+(y/2)*(width/2), width/2);
+ }
+ return 1;
+}
+
+static int yuv444p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ if (!(width & 3)) {
+ /* Fast version, no bytes at end of row to skip */
+ AVG_4H(src[1], dest[1], (width/4)*height);
+ AVG_4H(src[2], dest[2], (width/4)*height);
+ } else {
+ /* Slow version, loop through each row */
+ int y;
+ for (y = 0; y < height; y++) {
+ AVG_4H(src[1]+y*width, dest[1]+y*(width/4), width/4);
+ AVG_4H(src[2]+y*width, dest[2]+y*(width/4), width/4);
+ }
+ }
+ return 1;
+}
+
+static int yuv444p_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ ac_memcpy(dest[0], src[0], width*height);
+ if (!(width & 1)) {
+ /* Fast version, no bytes at end of row to skip */
+ AVG_2H(src[1], dest[1], (width/2)*height);
+ AVG_2H(src[2], dest[2], (width/2)*height);
+ } else {
+ /* Slow version, loop through each row */
+ int y;
+ for (y = 0; y < height; y++) {
+ AVG_2H(src[1]+y*width, dest[1]+y*(width/2), width/2);
+ AVG_2H(src[2]+y*width, dest[2]+y*(width/2), width/2);
+ }
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+#endif /* HAVE_ASM_SSE2 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Initialization */
+
+int ac_imgconvert_init_yuv_planar(int accel)
+{
+ if (!register_conversion(IMG_YUV420P, IMG_YUV420P, yuv420p_copy)
+ || !register_conversion(IMG_YUV420P, IMG_YUV411P, yuv420p_yuv411p)
+ || !register_conversion(IMG_YUV420P, IMG_YUV422P, yuv420p_yuv422p)
+ || !register_conversion(IMG_YUV420P, IMG_YUV444P, yuv420p_yuv444p)
+ || !register_conversion(IMG_YUV420P, IMG_Y8, yuvp_y8)
+
+ || !register_conversion(IMG_YUV411P, IMG_YUV420P, yuv411p_yuv420p)
+ || !register_conversion(IMG_YUV411P, IMG_YUV411P, yuv411p_copy)
+ || !register_conversion(IMG_YUV411P, IMG_YUV422P, yuv411p_yuv422p)
+ || !register_conversion(IMG_YUV411P, IMG_YUV444P, yuv411p_yuv444p)
+ || !register_conversion(IMG_YUV411P, IMG_Y8, yuvp_y8)
+
+ || !register_conversion(IMG_YUV422P, IMG_YUV420P, yuv422p_yuv420p)
+ || !register_conversion(IMG_YUV422P, IMG_YUV411P, yuv422p_yuv411p)
+ || !register_conversion(IMG_YUV422P, IMG_YUV422P, yuv422p_copy)
+ || !register_conversion(IMG_YUV422P, IMG_YUV444P, yuv422p_yuv444p)
+ || !register_conversion(IMG_YUV422P, IMG_Y8, yuvp_y8)
+
+ || !register_conversion(IMG_YUV444P, IMG_YUV420P, yuv444p_yuv420p)
+ || !register_conversion(IMG_YUV444P, IMG_YUV411P, yuv444p_yuv411p)
+ || !register_conversion(IMG_YUV444P, IMG_YUV422P, yuv444p_yuv422p)
+ || !register_conversion(IMG_YUV444P, IMG_YUV444P, yuv444p_copy)
+ || !register_conversion(IMG_YUV444P, IMG_Y8, yuvp_y8)
+
+ || !register_conversion(IMG_Y8, IMG_YUV420P, y8_yuv420p)
+ || !register_conversion(IMG_Y8, IMG_YUV411P, y8_yuv411p)
+ || !register_conversion(IMG_Y8, IMG_YUV422P, y8_yuv422p)
+ || !register_conversion(IMG_Y8, IMG_YUV444P, y8_yuv444p)
+ || !register_conversion(IMG_Y8, IMG_Y8, y8_copy)
+ ) {
+ return 0;
+ }
+
+#if defined(HAVE_ASM_SSE2)
+ if (accel & AC_SSE2) {
+ if (!register_conversion(IMG_YUV420P, IMG_YUV411P, yuv420p_yuv411p_sse2)
+ || !register_conversion(IMG_YUV420P, IMG_YUV444P, yuv420p_yuv444p_sse2)
+
+ || !register_conversion(IMG_YUV411P, IMG_YUV420P, yuv411p_yuv420p_sse2)
+ || !register_conversion(IMG_YUV411P, IMG_YUV422P, yuv411p_yuv422p_sse2)
+ || !register_conversion(IMG_YUV411P, IMG_YUV444P, yuv411p_yuv444p_sse2)
+
+ || !register_conversion(IMG_YUV422P, IMG_YUV420P, yuv422p_yuv420p_sse2)
+ || !register_conversion(IMG_YUV422P, IMG_YUV411P, yuv422p_yuv411p_sse2)
+ || !register_conversion(IMG_YUV422P, IMG_YUV444P, yuv422p_yuv444p_sse2)
+
+ || !register_conversion(IMG_YUV444P, IMG_YUV420P, yuv444p_yuv420p_sse2)
+ || !register_conversion(IMG_YUV444P, IMG_YUV411P, yuv444p_yuv411p_sse2)
+ || !register_conversion(IMG_YUV444P, IMG_YUV422P, yuv444p_yuv422p_sse2)
+ ) {
+ return 0;
+ }
+ }
+#endif  /* HAVE_ASM_SSE2 */
+
+ return 1;
+}
+
+/*************************************************************************/
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c
new file mode 100644
index 00000000..9dc04fcb
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c
@@ -0,0 +1,2410 @@
+/*
+ * img_yuv_rgb.c - YUV<->RGB image format conversion routines
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#include "ac.h"
+#include "ac_internal.h"
+#include "imgconvert.h"
+#include "img_internal.h"
+
+#include <string.h>
+
+#define USE_LOOKUP_TABLES /* for YUV420P->RGB24 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Standard C implementations */
+
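+/* Note: the constants below appear to be the ITU-R BT.601 conversion
+ * coefficients scaled by 2^16 (value ~ coefficient * 65536):
+ *     cY  =  76309 ~ (255/219) * 65536
+ *     crV = 104597 ~  1.596 * 65536
+ *     cgU = -25675 ~ -0.392 * 65536
+ *     cgV = -53279 ~ -0.813 * 65536
+ *     cbU = 132201 ~  2.017 * 65536
+ * so that, e.g., R = clamp((cY*(Y-16) + crV*(V-128) + 32768) >> 16). */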
+const int cY = 76309;
+const int crV = 104597;
+const int cgU = -25675;
+const int cgV = -53279;
+const int cbU = 132201;
+
+/*************************************************************************/
+
+#ifdef USE_LOOKUP_TABLES
+# define TABLE_SCALE 16 /* scale factor for Y */
+static int Ylutbase[768*TABLE_SCALE];
+static int *Ylut = Ylutbase+256*TABLE_SCALE;
+static int rVlut[256];
+static int gUlut[256];
+static int gVlut[256];
+static int bUlut[256];
+static void yuv_create_tables(void) {
+ static int yuv_tables_created = 0;
+ if (!yuv_tables_created) {
+ int i;
+ for (i = -256*TABLE_SCALE; i < 512*TABLE_SCALE; i++) {
+ int v = ((cY*(i-16*TABLE_SCALE)/TABLE_SCALE) + 32768) >> 16;
+ Ylut[i] = v<0 ? 0 : v>255 ? 255 : v;
+ }
+ for (i = 0; i < 256; i++) {
+ rVlut[i] = ((crV * (i-128)) * TABLE_SCALE + cY/2) / cY;
+ gUlut[i] = ((cgU * (i-128)) * TABLE_SCALE + cY/2) / cY;
+ gVlut[i] = ((cgV * (i-128)) * TABLE_SCALE + cY/2) / cY;
+ bUlut[i] = ((cbU * (i-128)) * TABLE_SCALE + cY/2) / cY;
+ }
+ yuv_tables_created = 1;
+ }
+}
+# define YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs) do { \
+ int Y = src[0][y*width+x] * TABLE_SCALE; \
+ int U = src[1][(uvofs)]; \
+ int V = src[2][(uvofs)]; \
+ dest[0][(y*width+x)*rgbsz+rofs] = Ylut[Y+rVlut[V]]; \
+ dest[0][(y*width+x)*rgbsz+gofs] = Ylut[Y+gUlut[U]+gVlut[V]];\
+ dest[0][(y*width+x)*rgbsz+bofs] = Ylut[Y+bUlut[U]]; \
+} while (0)
+# define YUV2RGB_PACKED(yofs,uofs,vofs,rgbsz,rofs,gofs,bofs) do { \
+ int Y = src[0][(y*width+x)*2+yofs] * TABLE_SCALE; \
+ int U = src[0][(y*width+(x&~1))*2+uofs]; \
+ int V = src[0][(y*width+(x&~1))*2+vofs]; \
+ dest[0][(y*width+x)*rgbsz+rofs] = Ylut[Y+rVlut[V]]; \
+ dest[0][(y*width+x)*rgbsz+gofs] = Ylut[Y+gUlut[U]+gVlut[V]];\
+ dest[0][(y*width+x)*rgbsz+bofs] = Ylut[Y+bUlut[U]]; \
+} while (0)
+#else /* !USE_LOOKUP_TABLES */
+# define yuv_create_tables() /*nothing*/
+# define YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs) do { \
+ int Y = cY * (src[0][y*width+x] - 16); \
+ int U = src[1][(uvofs)] - 128; \
+ int V = src[2][(uvofs)] - 128; \
+ int r = (Y + crV*V + 32768) >> 16; \
+ int g = (Y + cgU*U + cgV*V + 32768) >> 16; \
+ int b = (Y + cbU*U + 32768) >> 16; \
+ dest[0][(y*width+x)*rgbsz+rofs] = r<0 ? 0 : r>255 ? 255 : r;\
+ dest[0][(y*width+x)*rgbsz+gofs] = g<0 ? 0 : g>255 ? 255 : g;\
+ dest[0][(y*width+x)*rgbsz+bofs] = b<0 ? 0 : b>255 ? 255 : b;\
+} while (0)
+# define YUV2RGB_PACKED(yofs,uofs,vofs,rgbsz,rofs,gofs,bofs) do { \
+ int Y = cY * (src[0][(y*width+x)*2+yofs] - 16); \
+ int U = src[0][(y*width+(x&~1))*2+uofs] - 128; \
+ int V = src[0][(y*width+(x&~1))*2+vofs] - 128; \
+ int r = (Y + crV*V + 32768) >> 16; \
+ int g = (Y + cgU*U + cgV*V + 32768) >> 16; \
+ int b = (Y + cbU*U + 32768) >> 16; \
+ dest[0][(y*width+x)*rgbsz+rofs] = r<0 ? 0 : r>255 ? 255 : r;\
+ dest[0][(y*width+x)*rgbsz+gofs] = g<0 ? 0 : g>255 ? 255 : g;\
+ dest[0][(y*width+x)*rgbsz+bofs] = b<0 ? 0 : b>255 ? 255 : b;\
+} while (0)
+#endif
+
+#define YUV2RGB_420P(s,r,g,b) YUV2RGB((y/2)*(width/2)+(x/2),s,r,g,b)
+#define YUV2RGB_411P(s,r,g,b) YUV2RGB((y )*(width/4)+(x/4),s,r,g,b)
+#define YUV2RGB_422P(s,r,g,b) YUV2RGB((y )*(width/2)+(x/2),s,r,g,b)
+#define YUV2RGB_444P(s,r,g,b) YUV2RGB((y )*(width )+(x ),s,r,g,b)
+#define YUV2RGB_YUY2(s,r,g,b) YUV2RGB_PACKED(0,1,3, s,r,g,b)
+#define YUV2RGB_UYVY(s,r,g,b) YUV2RGB_PACKED(1,0,2, s,r,g,b)
+#define YUV2RGB_YVYU(s,r,g,b) YUV2RGB_PACKED(0,3,1, s,r,g,b)
+
+#define DEFINE_YUV2RGB(name,op) \
+static int name(uint8_t **src, uint8_t **dest, int width, int height) \
+{ \
+ int x, y; \
+ \
+ yuv_create_tables(); \
+ for (y = 0; y < height; y++) { \
+ for (x = 0; x < width; x++) { \
+ op; \
+ } \
+ } \
+ return 1; \
+}
+
+#define DEFINE_YUV2RGB_SET(rgb,rgbsz,rofs,gofs,bofs) \
+ DEFINE_YUV2RGB(yuv420p_##rgb, YUV2RGB_420P(rgbsz,rofs,gofs,bofs)) \
+ DEFINE_YUV2RGB(yuv411p_##rgb, YUV2RGB_411P(rgbsz,rofs,gofs,bofs)) \
+ DEFINE_YUV2RGB(yuv422p_##rgb, YUV2RGB_422P(rgbsz,rofs,gofs,bofs)) \
+ DEFINE_YUV2RGB(yuv444p_##rgb, YUV2RGB_444P(rgbsz,rofs,gofs,bofs)) \
+ DEFINE_YUV2RGB(yuy2_##rgb, YUV2RGB_YUY2(rgbsz,rofs,gofs,bofs)) \
+ DEFINE_YUV2RGB(uyvy_##rgb, YUV2RGB_UYVY(rgbsz,rofs,gofs,bofs)) \
+ DEFINE_YUV2RGB(yvyu_##rgb, YUV2RGB_YVYU(rgbsz,rofs,gofs,bofs))
+
+DEFINE_YUV2RGB_SET(rgb24, 3,0,1,2)
+DEFINE_YUV2RGB_SET(bgr24, 3,2,1,0)
+DEFINE_YUV2RGB_SET(rgba32, 4,0,1,2)
+DEFINE_YUV2RGB_SET(abgr32, 4,3,2,1)
+DEFINE_YUV2RGB_SET(argb32, 4,1,2,3)
+DEFINE_YUV2RGB_SET(bgra32, 4,2,1,0)
+
+/* Y8->RGB is defined as part of grayscale stuff below */
+
+/*************************************************************************/
+
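+/* The RGB->YUV macros below use what appear to be the inverse BT.601
+ * coefficients scaled by 2^16, rounded via the +32768 term before the
+ * >>16, i.e. approximately:
+ *     Y =  0.257*R + 0.504*G + 0.098*B + 16
+ *     U = -0.148*R - 0.291*G + 0.439*B + 128
+ *     V =  0.439*R - 0.368*G - 0.071*B + 128
+ * (16829/65536 ~ 0.257, 33039/65536 ~ 0.504, 6416/65536 ~ 0.098, etc.) */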
+#define RGB2Y() \
+ (dest[0][y*width+x] = ((16829*r + 33039*g + 6416*b + 32768) >> 16) + 16)
+#define RGB2U(uvofs) \
+ (dest[1][(uvofs)] = ((-9714*r - 19070*g + 28784*b + 32768) >> 16) + 128)
+#define RGB2V(uvofs) \
+ (dest[2][(uvofs)] = ((28784*r - 24103*g - 4681*b + 32768) >> 16) + 128)
+#define RGB2Y_PACKED(ofs) \
+ (dest[0][(y*width+x)*2+(ofs)] = ((16829*r + 33039*g + 6416*b + 32768) >> 16) + 16)
+#define RGB2U_PACKED(ofs) \
+ (dest[0][(y*width+x)*2+(ofs)] = ((-9714*r - 19070*g + 28784*b + 32768) >> 16) + 128)
+#define RGB2V_PACKED(ofs) \
+ (dest[0][(y*width+x)*2+(ofs)] = ((28784*r - 24103*g - 4681*b + 32768) >> 16) + 128)
+
+#define RGB2YUV(utest,vtest,uvofs) \
+ RGB2Y(); if (utest) RGB2U(uvofs); if (vtest) RGB2V(uvofs)
+#define RGB2YUV_PACKED(utest,vtest,yofs,uvofs) \
+ RGB2Y_PACKED(yofs); \
+ if (utest) RGB2U_PACKED(uvofs); \
+ if (vtest) RGB2V_PACKED(uvofs)
+/* YUV420P: take Cb/Cr from opposite corners */
+#define RGB2YUV_420P RGB2YUV(!((x|y) & 1), (x&y) & 1, (y/2)*(width/2)+(x/2))
+/* YUV411P: take Cb/Cr from points 2 pixels apart */
+#define RGB2YUV_411P RGB2YUV(!(x & 3), !((x^2) & 3), y*(width/4)+(x/4))
+/* YUV422P: take Cb/Cr from adjacent pixels */
+#define RGB2YUV_422P RGB2YUV(!(x & 1), x & 1, y*(width/2)+(x/2))
+/* YUV444P: every pixel is sampled */
+#define RGB2YUV_444P RGB2YUV(1, 1, y*width+x)
+/* YUY2/UYVY/YVYU: take Cb/Cr from the corresponding pixel */
+#define RGB2YUV_YUY2 RGB2YUV_PACKED(!(x & 1), x & 1, 0,1)
+#define RGB2YUV_UYVY RGB2YUV_PACKED(!(x & 1), x & 1, 1,0)
+#define RGB2YUV_YVYU RGB2YUV_PACKED(x & 1, !(x & 1), 0,1)
+
+#define DEFINE_RGB2YUV(name,rgbsz,rofs,gofs,bofs,op) \
+static int name(uint8_t **src, uint8_t **dest, int width, int height) \
+{ \
+ int x, y; \
+ \
+ for (y = 0; y < height; y++) { \
+ for (x = 0; x < width; x++) { \
+ int r = src[0][(y*width+x)*rgbsz+rofs]; \
+ int g = src[0][(y*width+x)*rgbsz+gofs]; \
+ int b = src[0][(y*width+x)*rgbsz+bofs]; \
+ op; \
+ } \
+ } \
+ return 1; \
+}
+
+#define DEFINE_RGB2Y8(name,rgbsz,rofs,gofs,bofs) \
+static int name(uint8_t **src, uint8_t **dest, int width, int height) \
+{ \
+ int x, y; \
+ \
+ for (y = 0; y < height; y++) { \
+ for (x = 0; x < width; x++) { \
+ int r = src[0][(y*width+x)*rgbsz+rofs]; \
+ int g = src[0][(y*width+x)*rgbsz+gofs]; \
+ int b = src[0][(y*width+x)*rgbsz+bofs]; \
+ RGB2Y(); \
+ } \
+ } \
+ return 1; \
+}
+
+#define DEFINE_RGB2YUV_SET(rgb,rgbsz,rofs,gofs,bofs) \
+ DEFINE_RGB2YUV(rgb##_yuv420p, rgbsz,rofs,gofs,bofs, RGB2YUV_420P) \
+ DEFINE_RGB2YUV(rgb##_yuv411p, rgbsz,rofs,gofs,bofs, RGB2YUV_411P) \
+ DEFINE_RGB2YUV(rgb##_yuv422p, rgbsz,rofs,gofs,bofs, RGB2YUV_422P) \
+ DEFINE_RGB2YUV(rgb##_yuv444p, rgbsz,rofs,gofs,bofs, RGB2YUV_444P) \
+ DEFINE_RGB2YUV(rgb##_yuy2, rgbsz,rofs,gofs,bofs, RGB2YUV_YUY2) \
+ DEFINE_RGB2YUV(rgb##_uyvy, rgbsz,rofs,gofs,bofs, RGB2YUV_UYVY) \
+ DEFINE_RGB2YUV(rgb##_yvyu, rgbsz,rofs,gofs,bofs, RGB2YUV_YVYU) \
+ DEFINE_RGB2Y8 (rgb##_y8, rgbsz,rofs,gofs,bofs)
+
+DEFINE_RGB2YUV_SET(rgb24, 3,0,1,2)
+DEFINE_RGB2YUV_SET(bgr24, 3,2,1,0)
+DEFINE_RGB2YUV_SET(rgba32, 4,0,1,2)
+DEFINE_RGB2YUV_SET(abgr32, 4,3,2,1)
+DEFINE_RGB2YUV_SET(argb32, 4,1,2,3)
+DEFINE_RGB2YUV_SET(bgra32, 4,2,1,0)
+
+/*************************************************************************/
+
+/* All YUV planar formats convert to grayscale the same way */
+
+#ifdef USE_LOOKUP_TABLES
+static uint8_t graylut[2][256];
+static int graylut_created = 0;
+static void gray8_create_tables(void)
+{
+ if (!graylut_created) {
+ int i;
+ for (i = 0; i < 256; i++) {
+ if (i <= 16)
+ graylut[0][i] = 0;
+ else if (i >= 235)
+ graylut[0][i] = 255;
+ else
+ graylut[0][i] = (i-16) * 255 / 219;
+ graylut[1][i] = 16 + i*219/255;
+ }
+ graylut_created = 1;
+ }
+}
+# define Y2GRAY(val) (graylut[0][(val)])
+# define GRAY2Y(val) (graylut[1][(val)])
+#else
+# define gray8_create_tables() /*nothing*/
+# define Y2GRAY(val) ((val)<16 ? 0 : (val)>=235 ? 255 : ((val)-16)*255/219)
+# define GRAY2Y(val) (16 + (val)*219/255)
+#endif
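+
+/* Y2GRAY expands the BT.601 "studio" luma range [16,235] to full-range
+ * grayscale [0,255], clamping out-of-range input; GRAY2Y maps full-range
+ * gray back into the [16,235] luma range. */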
+
+static int yuvp_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ gray8_create_tables();
+ for (i = 0; i < width*height; i++)
+ dest[0][i] = Y2GRAY(src[0][i]);
+ return 1;
+}
+
+static int yuy2_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ gray8_create_tables();
+ for (i = 0; i < width*height; i++)
+ dest[0][i] = Y2GRAY(src[0][i*2]);
+ return 1;
+}
+
+static int uyvy_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ gray8_create_tables();
+ for (i = 0; i < width*height; i++)
+ dest[0][i] = Y2GRAY(src[0][i*2+1]);
+ return 1;
+}
+
+/*************************************************************************/
+
+static int gray8_y8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ gray8_create_tables();
+ for (i = 0; i < width*height; i++)
+ dest[0][i] = GRAY2Y(src[0][i]);
+ return 1;
+}
+
+static int gray8_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height))
+ return 0;
+ memset(dest[1], 128, (width/2)*(height/2));
+ memset(dest[2], 128, (width/2)*(height/2));
+ return 1;
+}
+
+static int gray8_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height))
+ return 0;
+ memset(dest[1], 128, (width/4)*height);
+ memset(dest[2], 128, (width/4)*height);
+ return 1;
+}
+
+static int gray8_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height))
+ return 0;
+ memset(dest[1], 128, (width/2)*height);
+ memset(dest[2], 128, (width/2)*height);
+ return 1;
+}
+
+static int gray8_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height))
+ return 0;
+ memset(dest[1], 128, width*height);
+ memset(dest[2], 128, width*height);
+ return 1;
+}
+
+static int gray8_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ gray8_create_tables();
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*2 ] = GRAY2Y(src[0][i]);
+ dest[0][i*2+1] = 128;
+ }
+ return 1;
+}
+
+static int gray8_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ gray8_create_tables();
+ for (i = 0; i < width*height; i++) {
+ dest[0][i*2 ] = 128;
+ dest[0][i*2+1] = GRAY2Y(src[0][i]);
+ }
+ return 1;
+}
+
+/*************************************************************************/
+
+/* We only need 3 functions for Y8->RGB (no difference between RGB and BGR) */
+
+static int y8_rgb24(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ gray8_create_tables();
+ for (i = 0; i < width*height; i++)
+ dest[0][i*3] = dest[0][i*3+1] = dest[0][i*3+2] = Y2GRAY(src[0][i]);
+ return 1;
+}
+
+static int y8_rgba32(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ gray8_create_tables();
+ for (i = 0; i < width*height; i++)
+ dest[0][i*4] = dest[0][i*4+1] = dest[0][i*4+2] = Y2GRAY(src[0][i]);
+ return 1;
+}
+
+static int y8_argb32(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ int i;
+ gray8_create_tables();
+ for (i = 0; i < width*height; i++)
+ dest[0][i*4+1] = dest[0][i*4+2] = dest[0][i*4+3] = Y2GRAY(src[0][i]);
+ return 1;
+}
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Accelerated versions of colorspace routines. */
+
+/* Common constant values used in routines: */
+
+#if defined(HAVE_ASM_MMX)
+
+#include "img_x86_common.h"
+
+static const struct { uint16_t n[72]; } __attribute__((aligned(16))) yuv_data = {{
+ 0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF, /* for odd/even */
+ 0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010, /* for Y -16 */
+ 0x0080,0x0080,0x0080,0x0080,0x0080,0x0080,0x0080,0x0080, /* for U/V -128 */
+ 0x2543,0x2543,0x2543,0x2543,0x2543,0x2543,0x2543,0x2543, /* Y constant */
+ 0x3313,0x3313,0x3313,0x3313,0x3313,0x3313,0x3313,0x3313, /* rV constant */
+ 0xF377,0xF377,0xF377,0xF377,0xF377,0xF377,0xF377,0xF377, /* gU constant */
+ 0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC, /* gV constant */
+ 0x408D,0x408D,0x408D,0x408D,0x408D,0x408D,0x408D,0x408D, /* bU constant */
+ 0x0008,0x0008,0x0008,0x0008,0x0008,0x0008,0x0008,0x0008, /* for rounding */
+}};
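+/* The yuv_data constants above appear to be the cY/crV/cgU/cgV/cbU values
+ * from the C code divided by 8, matching the 8.7 fixed-point inputs fed to
+ * pmulhw (e.g. 0x2543 = 9539 ~ 76309/8, 0x3313 = 13075 ~ 104597/8). */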
+/* Note that G->Y exceeds 0x7FFF, so be careful to treat it as unsigned
+ * (the rest of the values are signed) */
+static const struct { uint16_t n[96]; } __attribute__((aligned(16))) rgb_data = {{
+ 0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD, /* R->Y */
+ 0x810F,0x810F,0x810F,0x810F,0x810F,0x810F,0x810F,0x810F, /* G->Y */
+ 0x1910,0x1910,0x1910,0x1910,0x1910,0x1910,0x1910,0x1910, /* B->Y */
+ 0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E, /* R->U */
+ 0xB582,0xB582,0xB582,0xB582,0xB582,0xB582,0xB582,0xB582, /* G->U */
+ 0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070, /* B->U */
+ 0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070, /* R->V */
+ 0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9, /* G->V */
+ 0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7, /* B->V */
+ 0x0420,0x0420,0x0420,0x0420,0x0420,0x0420,0x0420,0x0420, /* Y +16.5 */
+ 0x2020,0x2020,0x2020,0x2020,0x2020,0x2020,0x2020,0x2020, /* U/V +128.5 */
+ 0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF, /* for odd/even */
+}};
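+/* rgb_data holds the same coefficients as the C RGB2Y/U/V macros above
+ * (0x41BD = 16829 for R->Y, 0x810F = 33039 for G->Y, ...), plus rounding
+ * terms and a 0x00FF mask used when splitting packed pixels. */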
+#define Y_GRAY 0x4A85
+#define GRAY_Y 0x36F7
+static const struct { uint16_t n[32]; } __attribute__((aligned(16))) gray_data = {{
+ Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY, /* 255/219 */
+ GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y, /* 219/255 */
+ 0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010, /* Y +/-16 */
+ 0x00FF,0xFF00,0x0000,0x00FF,0xFF00,0x0000,0x0000,0x0000, /* for Y->RGB */
+}};
+
+/* Convert 4 RGB32 pixels in EAX/EBX/ECX/EDX to RGB24 in EAX/EBX/ECX */
+#define IA32_RGB32_TO_RGB24 \
+ "movl %%ebx, %%esi # ESI: 00 B1 G1 R1 \n\
+ shll $24, %%esi # ESI: R1 00 00 00 \n\
+ shrl $8, %%ebx # EBX: 00 00 B1 G1 \n\
+ orl %%esi, %%eax # EAX: R1 B0 G0 R0 \n\
+ movl %%ecx, %%esi # ESI: 00 B2 G2 R2 \n\
+ shll $16, %%esi # ESI: G2 R2 00 00 \n\
+ shrl $16, %%ecx # ECX: 00 00 00 B2 \n\
+ shll $8, %%edx # EDX: B3 G3 R3 00 \n\
+ orl %%esi, %%ebx # EBX: G2 R2 B1 G1 \n\
+ orl %%edx, %%ecx # ECX: B3 G3 R3 B2 \n"
+
+/* Convert 4 RGB24 pixels in EAX/EBX/ECX to RGB32 in EAX/EBX/ECX/EDX */
+#define IA32_RGB24_TO_RGB32 \
+ "movl %%ecx, %%edx # EDX: B3 G3 R3 B2 \n\
+ shrl $8, %%edx # EDX: 00 B3 G3 R3 \n\
+ andl $0xFF, %%ecx # ECX: 00 00 00 B2 \n\
+ movl %%ebx, %%edi # EDI: G2 R2 B1 G1 \n\
+ andl $0xFFFF0000, %%edi # EDI: G2 R2 00 00 \n\
+ orl %%edi, %%ecx # ECX: G2 R2 00 B2 \n\
+ rorl $16, %%ecx # ECX: 00 B2 G2 R2 \n\
+ movl %%eax, %%edi # EDI: R1 B0 G0 R0 \n\
+ andl $0xFF000000, %%edi # EDI: R1 00 00 00 \n\
+ andl $0x0000FFFF, %%ebx # EBX: 00 00 B1 G1 \n\
+ orl %%edi, %%ebx # EBX: R1 00 B1 G1 \n\
+ roll $8, %%ebx # EBX: 00 B1 G1 R1 \n\
+ andl $0x00FFFFFF, %%eax # EAX: 00 B0 G0 R0 \n"
+
+#endif /* HAVE_ASM_MMX */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* MMX routines */
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */
+
+static inline void mmx_yuv42Xp_to_rgb(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV);
+#define mmx_yuv420p_to_rgb mmx_yuv42Xp_to_rgb
+#define mmx_yuv422p_to_rgb mmx_yuv42Xp_to_rgb
+static inline void mmx_store_rgb24(uint8_t *dest);
+static inline void mmx_store_bgr24(uint8_t *dest);
+static inline void mmx_store_rgba32(uint8_t *dest);
+static inline void mmx_store_abgr32(uint8_t *dest);
+static inline void mmx_store_argb32(uint8_t *dest);
+static inline void mmx_store_bgra32(uint8_t *dest);
+
+#define DEFINE_YUV2RGB_MMX(yuv,rgb,uvofs,rgbsz,rofs,gofs,bofs) \
+static int yuv##_##rgb##_mmx(uint8_t **src, uint8_t **dest, \
+ int width, int height) \
+{ \
+ int x, y; \
+ \
+ yuv_create_tables(); \
+ for (y = 0; y < height; y++) { \
+ for (x = 0; x < (width & ~7); x += 8) { \
+ mmx_##yuv##_to_rgb(src[0]+y*width+x, \
+ src[1]+(uvofs), src[2]+(uvofs)); \
+ mmx_store_##rgb(dest[0]+(y*width+x)*rgbsz); \
+ } \
+ while (x < width) { \
+ YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs); \
+ x++; \
+ } \
+ } \
+ asm("emms"); \
+ return 1; \
+}
+
+#define DEFINE_YUV2RGB_MMX_SET(rgb,rgbsz,rofs,gofs,bofs) \
+ DEFINE_YUV2RGB_MMX(yuv420p,rgb,(y/2)*(width/2)+(x/2),rgbsz,rofs,gofs,bofs)\
+ DEFINE_YUV2RGB_MMX(yuv422p,rgb,(y )*(width/2)+(x/2),rgbsz,rofs,gofs,bofs)
+
+DEFINE_YUV2RGB_MMX_SET(rgb24, 3,0,1,2)
+DEFINE_YUV2RGB_MMX_SET(bgr24, 3,2,1,0)
+DEFINE_YUV2RGB_MMX_SET(rgba32, 4,0,1,2)
+DEFINE_YUV2RGB_MMX_SET(abgr32, 4,3,2,1)
+DEFINE_YUV2RGB_MMX_SET(argb32, 4,1,2,3)
+DEFINE_YUV2RGB_MMX_SET(bgra32, 4,2,1,0)
+
+/************************************/
+
+static inline void mmx_yuv42Xp_to_rgb(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV)
+{
+ asm("\
+ # Load data, bias and expand to 16 bits \n\
+ pxor %%mm4, %%mm4 # MM4: 00 00 00 00 00 00 00 00 \n\
+ movq ("EAX"), %%mm6 # MM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ movd ("ECX"), %%mm2 # MM2: U3 U2 U1 U0 \n\
+ movd ("EDX"), %%mm3 # MM3: V3 V2 V1 V0 \n\
+ movq %%mm6, %%mm7 # MM7: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ pand ("ESI"), %%mm6 # MM6: -Y6- -Y4- -Y2- -Y0- \n\
+ psrlw $8, %%mm7 # MM7: -Y7- -Y5- -Y3- -Y1- \n\
+ punpcklbw %%mm4, %%mm2 # MM2: -U3- -U2- -U1- -U0- \n\
+ punpcklbw %%mm4, %%mm3 # MM3: -V3- -V2- -V1- -V0- \n\
+ psubw 16("ESI"), %%mm6 # MM6: subtract 16 \n\
+ psubw 16("ESI"), %%mm7 # MM7: subtract 16 \n\
+ psubw 32("ESI"), %%mm2 # MM2: subtract 128 \n\
+ psubw 32("ESI"), %%mm3 # MM3: subtract 128 \n\
+ psllw $7, %%mm6 # MM6: convert to fixed point 8.7 \n\
+ psllw $7, %%mm7 # MM7: convert to fixed point 8.7 \n\
+ psllw $7, %%mm2 # MM2: convert to fixed point 8.7 \n\
+ psllw $7, %%mm3 # MM3: convert to fixed point 8.7 \n\
+ # Multiply by constants \n\
+ pmulhw 48("ESI"), %%mm6 # MM6: -cY6- -cY4- -cY2- -cY0- \n\
+ pmulhw 48("ESI"), %%mm7 # MM7: -cY7- -cY5- -cY3- -cY1- \n\
+ movq 80("ESI"), %%mm4 # MM4: gU constant \n\
+ movq 96("ESI"), %%mm5 # MM5: gV constant \n\
+ pmulhw %%mm2, %%mm4 # MM4: -gU3- -gU2- -gU1- -gU0- \n\
+ pmulhw %%mm3, %%mm5 # MM5: -gV3- -gV2- -gV1- -gV0- \n\
+ paddw %%mm5, %%mm4 # MM4: -g3- -g2- -g1- -g0- \n\
+ pmulhw 64("ESI"), %%mm3 # MM3: -r3- -r2- -r1- -r0- \n\
+ pmulhw 112("ESI"),%%mm2 # MM2: -b3- -b2- -b1- -b0- \n\
+ movq %%mm3, %%mm0 # MM0: -r3- -r2- -r1- -r0- \n\
+ movq %%mm4, %%mm1 # MM1: -g3- -g2- -g1- -g0- \n\
+ movq %%mm2, %%mm5 # MM5: -b3- -b2- -b1- -b0- \n\
+ # Add intermediate results and round/shift to get R/G/B values \n\
+ paddw 128("ESI"), %%mm6 # Add rounding value (0.5 @ 8.4 fixed) \n\
+ paddw 128("ESI"), %%mm7 \n\
+ paddw %%mm6, %%mm0 # MM0: -R6- -R4- -R2- -R0- \n\
+ psraw $4, %%mm0 # Shift back to 8.0 fixed \n\
+ paddw %%mm6, %%mm1 # MM1: -G6- -G4- -G2- -G0- \n\
+ psraw $4, %%mm1 \n\
+ paddw %%mm6, %%mm2 # MM2: -B6- -B4- -B2- -B0- \n\
+ psraw $4, %%mm2 \n\
+ paddw %%mm7, %%mm3 # MM3: -R7- -R5- -R3- -R1- \n\
+ psraw $4, %%mm3 \n\
+ paddw %%mm7, %%mm4 # MM4: -G7- -G5- -G3- -G1- \n\
+ psraw $4, %%mm4 \n\
+ paddw %%mm7, %%mm5 # MM5: -B7- -B5- -B3- -B1- \n\
+ psraw $4, %%mm5 \n\
+ # Saturate to 0-255 and pack into bytes \n\
+ packuswb %%mm0, %%mm0 # MM0: R6 R4 R2 R0 R6 R4 R2 R0 \n\
+ packuswb %%mm1, %%mm1 # MM1: G6 G4 G2 G0 G6 G4 G2 G0 \n\
+ packuswb %%mm2, %%mm2 # MM2: B6 B4 B2 B0 B6 B4 B2 B0 \n\
+ packuswb %%mm3, %%mm3 # MM3: R7 R5 R3 R1 R7 R5 R3 R1 \n\
+ packuswb %%mm4, %%mm4 # MM4: G7 G5 G3 G1 G7 G5 G3 G1 \n\
+ packuswb %%mm5, %%mm5 # MM5: B7 B5 B3 B1 B7 B5 B3 B1 \n\
+ punpcklbw %%mm3, %%mm0 # MM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
+ punpcklbw %%mm4, %%mm1 # MM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
+ punpcklbw %%mm5, %%mm2 # MM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"
+ : /* no outputs */
+ : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data)
+ );
+}
+
+/************************************/
+
+/* Convert YUV->RGB output to RGBA pixels in MM0..MM3 */
+#define MMX_RGB_TO_RGBA "\
+ pxor %%mm7, %%mm7 # MM7: 00 00 00 00 00 00 00 00 \n\
+ movq %%mm0, %%mm3 # MM3: R7 R6 R5 R4 R3 R2 R1 R0 \n\
+ movq %%mm1, %%mm4 # MM4: G7 G6 G5 G4 G3 G2 G1 G0 \n\
+ movq %%mm2, %%mm5 # MM5: B7 B6 B5 B4 B3 B2 B1 B0 \n\
+ punpcklbw %%mm1, %%mm0 # MM0: G3 R3 G2 R2 G1 R1 G0 R0 \n\
+ punpcklbw %%mm7, %%mm2 # MM2: 00 B3 00 B2 00 B1 00 B0 \n\
+ movq %%mm0, %%mm1 # MM1: G3 R3 G2 R2 G1 R1 G0 R0 \n\
+ punpcklwd %%mm2, %%mm0 # MM0: 00 B1 G1 R1 00 B0 G0 R0 \n\
+ punpckhwd %%mm2, %%mm1 # MM1: 00 B3 G3 R3 00 B2 G2 R2 \n\
+ punpckhbw %%mm4, %%mm3 # MM3: G7 R7 G6 R6 G5 R5 G4 R4 \n\
+ punpckhbw %%mm7, %%mm5 # MM5: 00 B7 00 B6 00 B5 00 B4 \n\
+ movq %%mm3, %%mm2 # MM2: G7 R7 G6 R6 G5 R5 G4 R4 \n\
+ punpckhwd %%mm5, %%mm3 # MM3: 00 B7 G7 R7 00 B6 G6 R6 \n\
+ punpcklwd %%mm5, %%mm2 # MM2: 00 B5 G5 R5 00 B4 G4 R4 \n"
+
+/* Convert YUV->RGB output to BGRA pixels in MM0..MM3 */
+#define MMX_RGB_TO_BGRA "\
+ pxor %%mm7, %%mm7 # MM7: 00 00 00 00 00 00 00 00 \n\
+ movq %%mm0, %%mm5 # MM5: R7 R6 R5 R4 R3 R2 R1 R0 \n\
+ movq %%mm1, %%mm4 # MM4: G7 G6 G5 G4 G3 G2 G1 G0 \n\
+ movq %%mm2, %%mm3 # MM3: B7 B6 B5 B4 B3 B2 B1 B0 \n\
+ punpcklbw %%mm1, %%mm2 # MM2: G3 B3 G2 B2 G1 B1 G0 B0 \n\
+ punpcklbw %%mm7, %%mm0 # MM0: 00 R3 00 R2 00 R1 00 R0 \n\
+ movq %%mm2, %%mm1 # MM1: G3 B3 G2 B2 G1 B1 G0 B0 \n\
+ punpcklwd %%mm0, %%mm2 # MM2: 00 R1 G1 B1 00 R0 G0 B0 \n\
+ punpckhwd %%mm0, %%mm1 # MM1: 00 R3 G3 B3 00 R2 G2 B2 \n\
+ movq %%mm2, %%mm0 # MM0: 00 R1 G1 B1 00 R0 G0 B0 \n\
+ punpckhbw %%mm4, %%mm3 # MM3: G7 B7 G6 B6 G5 B5 G4 B4 \n\
+ punpckhbw %%mm7, %%mm5 # MM5: 00 R7 00 R6 00 R5 00 R4 \n\
+ movq %%mm3, %%mm2 # MM2: G7 B7 G6 B6 G5 B5 G4 B4 \n\
+ punpckhwd %%mm5, %%mm3 # MM3: 00 R7 G7 B7 00 R6 G6 B6 \n\
+ punpcklwd %%mm5, %%mm2 # MM2: 00 R5 G5 B5 00 R4 G4 B4 \n"
+
+
+static inline void mmx_store_rgb24(uint8_t *dest)
+{
+ /* It looks like it's fastest to go to RGB32 first, then shift the
+ * result to merge the 24-bit pixels together. */
+ asm(MMX_RGB_TO_RGBA "\
+ movq %%mm0, %%mm4 # MM4: 00 B1 G1 R1 00 B0 G0 R0 \n\
+ movq %%mm1, %%mm5 # MM5: 00 B3 G3 R3 00 B2 G2 R2 \n\
+ movq %%mm2, %%mm6 # MM6: 00 B5 G5 R5 00 B4 G4 R4 \n\
+ movq %%mm3, %%mm7 # MM7: 00 B7 G7 R7 00 B6 G6 R6 \n\
+ psrlq $32, %%mm4 # MM4: 00 00 00 00 00 B1 G1 R1 \n\
+ psrlq $32, %%mm5 # MM5: 00 00 00 00 00 B3 G3 R3 \n\
+ psrlq $32, %%mm6 # MM6: 00 00 00 00 00 B5 G5 R5 \n\
+ psrlq $32, %%mm7 # MM7: 00 00 00 00 00 B7 G7 R7 \n\
+ push "EBX" \n\
+ movd %%mm0, %%eax # EAX: 00 B0 G0 R0 \n\
+ movd %%mm4, %%ebx # EBX: 00 B1 G1 R1 \n\
+ movd %%mm1, %%ecx # ECX: 00 B2 G2 R2 \n\
+ movd %%mm5, %%edx # EDX: 00 B3 G3 R3 \n\
+ "IA32_RGB32_TO_RGB24" \n\
+ movl %%eax, ("EDI") \n\
+ movl %%ebx, 4("EDI") \n\
+ movl %%ecx, 8("EDI") \n\
+ movd %%mm2, %%eax # EAX: 00 B4 G4 R4 \n\
+ movd %%mm6, %%ebx # EBX: 00 B5 G5 R5 \n\
+ movd %%mm3, %%ecx # ECX: 00 B6 G6 R6 \n\
+ movd %%mm7, %%edx # EDX: 00 B7 G7 R7 \n\
+ "IA32_RGB32_TO_RGB24" \n\
+ movl %%eax, 12("EDI") \n\
+ movl %%ebx, 16("EDI") \n\
+ movl %%ecx, 20("EDI") \n\
+ pop "EBX" \n"
+ : /* no outputs */
+ : "D" (dest)
+ : "eax", "ecx", "edx", "esi"
+ );
+}
+
+static inline void mmx_store_bgr24(uint8_t *dest)
+{
+ asm(MMX_RGB_TO_BGRA "\
+ movq %%mm0, %%mm4 # MM4: 00 R1 G1 B1 00 R0 G0 B0 \n\
+ movq %%mm1, %%mm5 # MM5: 00 R3 G3 B3 00 R2 G2 B2 \n\
+ movq %%mm2, %%mm6 # MM6: 00 R5 G5 B5 00 R4 G4 B4 \n\
+ movq %%mm3, %%mm7 # MM7: 00 R7 G7 B7 00 R6 G6 B6 \n\
+ psrlq $32, %%mm4 # MM4: 00 00 00 00 00 R1 G1 B1 \n\
+ psrlq $32, %%mm5 # MM5: 00 00 00 00 00 R3 G3 B3 \n\
+ psrlq $32, %%mm6 # MM6: 00 00 00 00 00 R5 G5 B5 \n\
+ psrlq $32, %%mm7 # MM7: 00 00 00 00 00 R7 G7 B7 \n\
+ push "EBX" \n\
+ movd %%mm0, %%eax # EAX: 00 R0 G0 B0 \n\
+ movd %%mm4, %%ebx # EBX: 00 R1 G1 B1 \n\
+ movd %%mm1, %%ecx # ECX: 00 R2 G2 B2 \n\
+ movd %%mm5, %%edx # EDX: 00 R3 G3 B3 \n\
+ "IA32_RGB32_TO_RGB24" \n\
+ movl %%eax, ("EDI") \n\
+ movl %%ebx, 4("EDI") \n\
+ movl %%ecx, 8("EDI") \n\
+ movd %%mm2, %%eax # EAX: 00 R4 G4 B4 \n\
+ movd %%mm6, %%ebx # EBX: 00 R5 G5 B5 \n\
+ movd %%mm3, %%ecx # ECX: 00 R6 G6 B6 \n\
+ movd %%mm7, %%edx # EDX: 00 R7 G7 B7 \n\
+ "IA32_RGB32_TO_RGB24" \n\
+ movl %%eax, 12("EDI") \n\
+ movl %%ebx, 16("EDI") \n\
+ movl %%ecx, 20("EDI") \n\
+ pop "EBX" \n"
+ : /* no outputs */
+ : "D" (dest)
+ : "eax", "ecx", "edx", "esi"
+ );
+}
+
+static inline void mmx_store_rgba32(uint8_t *dest)
+{
+ asm(MMX_RGB_TO_RGBA "\
+ movq %%mm0, ("EDI") \n\
+ movq %%mm1, 8("EDI") \n\
+ movq %%mm2, 16("EDI") \n\
+ movq %%mm3, 24("EDI") \n"
+ : /* no outputs */
+ : "D" (dest)
+ );
+}
+
+static inline void mmx_store_abgr32(uint8_t *dest)
+{
+ asm(MMX_RGB_TO_BGRA "\
+ psllq $8, %%mm0 \n\
+ psllq $8, %%mm1 \n\
+ psllq $8, %%mm2 \n\
+ psllq $8, %%mm3 \n\
+ movq %%mm0, ("EDI") \n\
+ movq %%mm1, 8("EDI") \n\
+ movq %%mm2, 16("EDI") \n\
+ movq %%mm3, 24("EDI") \n"
+ : /* no outputs */
+ : "D" (dest)
+ );
+}
+
+static inline void mmx_store_argb32(uint8_t *dest)
+{
+ asm(MMX_RGB_TO_RGBA "\
+ psllq $8, %%mm0 \n\
+ psllq $8, %%mm1 \n\
+ psllq $8, %%mm2 \n\
+ psllq $8, %%mm3 \n\
+ movq %%mm0, ("EDI") \n\
+ movq %%mm1, 8("EDI") \n\
+ movq %%mm2, 16("EDI") \n\
+ movq %%mm3, 24("EDI") \n"
+ : /* no outputs */
+ : "D" (dest)
+ );
+}
+
+static inline void mmx_store_bgra32(uint8_t *dest)
+{
+ asm(MMX_RGB_TO_BGRA "\
+ movq %%mm0, ("EDI") \n\
+ movq %%mm1, 8("EDI") \n\
+ movq %%mm2, 16("EDI") \n\
+ movq %%mm3, 24("EDI") \n"
+ : /* no outputs */
+ : "D" (dest)
+ );
+}
+
+#endif /* HAVE_ASM_MMX && ARCH_X86 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* SSE2 routines */
+
+#if defined(HAVE_ASM_SSE2)
+
+/*************************************************************************/
+
+static inline void sse2_load_yuv420p(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_yuv411p(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_yuv422p(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_yuv444p(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_yuy2(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_uyvy(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_yvyu(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width);
+static inline void sse2_yuv_to_rgb(void);
+static inline void sse2_yuv444_to_rgb(void);
+static inline void sse2_store_rgb24(uint8_t *dest);
+static inline void sse2_store_bgr24(uint8_t *dest);
+static inline void sse2_store_rgba32(uint8_t *dest);
+static inline void sse2_store_abgr32(uint8_t *dest);
+static inline void sse2_store_argb32(uint8_t *dest);
+static inline void sse2_store_bgra32(uint8_t *dest);
+
+#define DEFINE_YUV2RGB_SSE2(yuv,y2r,rgb,rgbsz,slowop) \
+static int yuv##_##rgb##_sse2(uint8_t **src, uint8_t **dest, \
+ int width, int height) \
+{ \
+ int x, y; \
+ \
+ yuv_create_tables(); \
+ for (y = 0; y < height; y++) { \
+ for (x = 0; x < (width & ~15); x += 16) { \
+ sse2_load_##yuv(src[0], src[1], src[2], x, y, width); \
+ sse2_##y2r(); \
+ sse2_store_##rgb(dest[0] + (y*width+x)*rgbsz); \
+ } \
+ while (x < width) { \
+ slowop; \
+ x++; \
+ } \
+ } \
+ asm("emms"); \
+ return 1; \
+}
+
+#define DEFINE_YUV2RGB_SSE2_SET(rgb,sz,r,g,b) \
+ DEFINE_YUV2RGB_SSE2(yuv420p, yuv_to_rgb, rgb,sz, YUV2RGB_420P(sz,r,g,b))\
+ DEFINE_YUV2RGB_SSE2(yuv411p, yuv_to_rgb, rgb,sz, YUV2RGB_411P(sz,r,g,b))\
+ DEFINE_YUV2RGB_SSE2(yuv422p, yuv_to_rgb, rgb,sz, YUV2RGB_422P(sz,r,g,b))\
+ DEFINE_YUV2RGB_SSE2(yuv444p, yuv444_to_rgb,rgb,sz, YUV2RGB_444P(sz,r,g,b))\
+ DEFINE_YUV2RGB_SSE2(yuy2, yuv_to_rgb, rgb,sz, YUV2RGB_YUY2(sz,r,g,b))\
+ DEFINE_YUV2RGB_SSE2(uyvy, yuv_to_rgb, rgb,sz, YUV2RGB_UYVY(sz,r,g,b))\
+ DEFINE_YUV2RGB_SSE2(yvyu, yuv_to_rgb, rgb,sz, YUV2RGB_YVYU(sz,r,g,b))
+
+DEFINE_YUV2RGB_SSE2_SET(rgb24, 3,0,1,2)
+DEFINE_YUV2RGB_SSE2_SET(bgr24, 3,2,1,0)
+DEFINE_YUV2RGB_SSE2_SET(rgba32, 4,0,1,2)
+DEFINE_YUV2RGB_SSE2_SET(abgr32, 4,3,2,1)
+DEFINE_YUV2RGB_SSE2_SET(argb32, 4,1,2,3)
+DEFINE_YUV2RGB_SSE2_SET(bgra32, 4,2,1,0)
+
+/************************************/
+
+static inline void sse2_load_yuv420p(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width)
+{
+ srcY += y*width+x;
+ srcU += (y/2)*(width/2)+(x/2);
+ srcV += (y/2)*(width/2)+(x/2);
+ asm("\
+ # Load data, bias and expand to 16 bits \n\
+ pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\
+ movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\
+ movq ("ECX"), %%xmm2 # XMM2: U7.......U0 \n\
+ movq ("EDX"), %%xmm3 # XMM3: V7.......V0 \n\
+ movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\
+ pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\
+ psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n\
+ punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\
+ punpcklbw %%xmm4,%%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n"
+ : /* no outputs */
+ : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data)
+ );
+}
+
+static inline void sse2_load_yuv411p(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width)
+{
+ srcY += y*width+x;
+ srcU += y*(width/4)+(x/4);
+ srcV += y*(width/4)+(x/4);
+ asm("\
+ # Load data, bias and expand to 16 bits \n\
+ pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\
+ movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\
+ movd ("ECX"), %%xmm2 # XMM2: U3.U0 \n\
+ punpcklbw %%xmm2,%%xmm2 # XMM2: U3 U3.U0 U0 \n\
+ movd ("EDX"), %%xmm3 # XMM3: V3.V0 \n\
+ punpcklbw %%xmm3,%%xmm3 # XMM3: V3 V3.V0 V0 \n\
+ movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\
+ pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\
+ psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n\
+ punpcklbw %%xmm4,%%xmm2 # XMM2: U3 U3 U2 U2 U1 U1 U0 U0 \n\
+ punpcklbw %%xmm4,%%xmm3 # XMM3: V3 V3 V2 V2 V1 V1 V0 V0 \n"
+ : /* no outputs */
+ : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data)
+ );
+}
+
+static inline void sse2_load_yuv422p(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width)
+{
+ srcY += y*width+x;
+ srcU += y*(width/2)+(x/2);
+ srcV += y*(width/2)+(x/2);
+ asm("\
+ # Load data, bias and expand to 16 bits \n\
+ pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\
+ movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\
+ movq ("ECX"), %%xmm2 # XMM2: U7.......U0 \n\
+ movq ("EDX"), %%xmm3 # XMM3: V7.......V0 \n\
+ movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\
+ pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\
+ psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n\
+ punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\
+ punpcklbw %%xmm4,%%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n"
+ : /* no outputs */
+ : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data)
+ );
+}
+
+static inline void sse2_load_yuv444p(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width)
+{
+ srcY += y*width+x;
+ srcU += y*width+x;
+ srcV += y*width+x;
+ asm("\
+ # Load data, bias and expand to 16 bits \n\
+ pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\
+ movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\
+ movdqu ("ECX"), %%xmm2 # XMM2: UF...................U0 \n\
+ movdqu ("EDX"), %%xmm0 # XMM0: VF...................V0 \n\
+ movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\
+ punpcklbw %%xmm4,%%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ punpckhbw %%xmm4,%%xmm7 # XMM7: YF YE YD YC YB YA Y9 Y8 \n\
+ movdqa %%xmm2, %%xmm5 # XMM5: UF...................U0 \n\
+ punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\
+ punpckhbw %%xmm4,%%xmm5 # XMM5: UF UE UD UC UB UA U9 U8 \n\
+ movdqa %%xmm0, %%xmm3 # XMM3: VF...................V0 \n\
+ punpcklbw %%xmm4,%%xmm0 # XMM0: V7 V6 V5 V4 V3 V2 V1 V0 \n\
+ punpckhbw %%xmm4,%%xmm3 # XMM3: VF VE VD VC VB VA V9 V8 \n"
+ : /* no outputs */
+ : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data)
+ );
+}
+
+static inline void sse2_load_yuy2(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width)
+{
+ srcY += (y*width+x)*2;
+ asm("\
+ # Load data, bias and expand to 16 bits \n\
+ pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\
+ movdqu ("EAX"), %%xmm6 # XMM6: V3 Y7.............U0 Y0 \n\
+ movdqu 16("EAX"),%%xmm7 # XMM7: V7 YF.............U4 Y8 \n\
+ movdqa %%xmm6, %%xmm2 # XMM2: V3 Y7.............U0 Y0 \n\
+ psrlw $8, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\
+ pand ("ESI"), %%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ movdqa %%xmm7, %%xmm3 # XMM3: V7 YF.............U4 Y8 \n\
+ psrlw $8, %%xmm3 # XMM3: V7 U7 V6 U6 V5 U5 V4 U4 \n\
+ pand ("ESI"), %%xmm7 # XMM7: YF YE YD YC YB YA Y9 Y8 \n\
+ packuswb %%xmm3, %%xmm2 # XMM2: V7 U7.............V0 U0 \n\
+ movdqa %%xmm2, %%xmm3 # XMM3: V7 U7.............V0 U0 \n\
+ pand ("ESI"), %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\
+ psrlw $8, %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\
+ packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0 \n\
+ movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\
+ pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\
+ psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n"
+ : /* no outputs */
+ : "a" (srcY), "S" (&yuv_data), "m" (yuv_data)
+ );
+}
+
+static inline void sse2_load_uyvy(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width)
+{
+ srcY += (y*width+x)*2;
+ asm("\
+ # Load data, bias and expand to 16 bits \n\
+ pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\
+ movdqu ("EAX"), %%xmm6 # XMM6: Y7 V3.............Y0 U0 \n\
+ movdqu 16("EAX"),%%xmm7 # XMM7: YF V7.............Y8 U4 \n\
+ movdqa %%xmm6, %%xmm2 # XMM2: Y7 V3.............Y0 U0 \n\
+ pand ("ESI"), %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\
+ psrlw $8, %%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ movdqa %%xmm7, %%xmm3 # XMM3: YF V7.............Y8 U4 \n\
+ pand ("ESI"), %%xmm3 # XMM3: V7 U7 V6 U6 V5 U5 V4 U4 \n\
+ psrlw $8, %%xmm7 # XMM7: YF YE YD YC YB YA Y9 Y8 \n\
+ packuswb %%xmm3, %%xmm2 # XMM2: V7 U7.............V0 U0 \n\
+ movdqa %%xmm2, %%xmm3 # XMM3: V7 U7.............V0 U0 \n\
+ pand ("ESI"), %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\
+ psrlw $8, %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\
+ packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0 \n\
+ movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\
+ pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\
+ psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n"
+ : /* no outputs */
+ : "a" (srcY), "S" (&yuv_data), "m" (yuv_data)
+ );
+}
+
+static inline void sse2_load_yvyu(uint8_t *srcY, uint8_t *srcU,
+ uint8_t *srcV, int x, int y, int width)
+{
+ srcY += (y*width+x)*2;
+ asm("\
+ # Load data, bias and expand to 16 bits \n\
+ pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\
+ movdqu ("EAX"), %%xmm6 # XMM6: U3 Y7.............V0 Y0 \n\
+ movdqu 16("EAX"),%%xmm7 # XMM7: U7 YF.............V4 Y8 \n\
+ movdqa %%xmm6, %%xmm2 # XMM2: U3 Y7.............V0 Y0 \n\
+ psrlw $8, %%xmm2 # XMM2: U3 V3 U2 V2 U1 V1 U0 V0 \n\
+ pand ("ESI"), %%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ movdqa %%xmm7, %%xmm3 # XMM3: U7 YF.............V4 Y8 \n\
+ psrlw $8, %%xmm3 # XMM3: U7 V7 U6 V6 U5 V5 U4 V4 \n\
+ pand ("ESI"), %%xmm7 # XMM7: YF YE YD YC YB YA Y9 Y8 \n\
+ packuswb %%xmm3, %%xmm2 # XMM2: U7 V7.............U0 V0 \n\
+ movdqa %%xmm2, %%xmm3 # XMM3: U7 V7.............U0 V0 \n\
+ psrlw $8, %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\
+ pand ("ESI"), %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\
+ packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0 \n\
+ movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\
+ pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\
+ psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n"
+ : /* no outputs */
+ : "a" (srcY), "S" (&yuv_data), "m" (yuv_data)
+ );
+}
+
+/************************************/
+
+/* Standard YUV->RGB (Yodd=XMM7 Yeven=XMM6 U=XMM2 V=XMM3) */
+static inline void sse2_yuv_to_rgb(void)
+{
+ asm("\
+ psubw 16("ESI"), %%xmm6 # XMM6: subtract 16 \n\
+ psllw $7, %%xmm6 # XMM6: convert to fixed point 8.7 \n\
+ psubw 16("ESI"), %%xmm7 # XMM7: subtract 16 \n\
+ psllw $7, %%xmm7 # XMM7: convert to fixed point 8.7 \n\
+ psubw 32("ESI"), %%xmm2 # XMM2: subtract 128 \n\
+ psllw $7, %%xmm2 # XMM2: convert to fixed point 8.7 \n\
+ psubw 32("ESI"), %%xmm3 # XMM3: subtract 128 \n\
+ psllw $7, %%xmm3 # XMM3: convert to fixed point 8.7 \n\
+ # Multiply by constants \n\
+ pmulhw 48("ESI"),%%xmm6 # XMM6: cYE.................cY0 \n\
+ pmulhw 48("ESI"),%%xmm7 # XMM7: cYF.................cY1 \n\
+ movdqa 80("ESI"),%%xmm4 # XMM4: gU constant \n\
+ pmulhw %%xmm2, %%xmm4 # XMM4: gU7.................gU0 \n\
+ movdqa 96("ESI"),%%xmm5 # XMM5: gV constant \n\
+ pmulhw %%xmm3, %%xmm5 # XMM5: gV7.................gV0 \n\
+ paddw %%xmm5, %%xmm4 # XMM4: g7 g6 g5 g4 g3 g2 g1 g0 \n\
+ pmulhw 64("ESI"),%%xmm3 # XMM3: r7 r6 r5 r4 r3 r2 r1 r0 \n\
+ pmulhw 112("ESI"),%%xmm2 #XMM2: b7 b6 b5 b4 b3 b2 b1 b0 \n\
+ movdqa %%xmm3, %%xmm0 # XMM0: r7 r6 r5 r4 r3 r2 r1 r0 \n\
+ movdqa %%xmm4, %%xmm1 # XMM1: g7 g6 g5 g4 g3 g2 g1 g0 \n\
+ movdqa %%xmm2, %%xmm5 # XMM5: b7 b6 b5 b4 b3 b2 b1 b0 \n\
+ # Add intermediate results and round/shift to get R/G/B values \n\
+ paddw 128("ESI"),%%xmm6 # Add rounding value (0.5 @ 8.4 fixed) \n\
+ paddw 128("ESI"),%%xmm7 \n\
+ paddw %%xmm6, %%xmm0 # XMM0: RE RC RA R8 R6 R4 R2 R0 \n\
+ psraw $4, %%xmm0 # Shift back to 8.0 fixed \n\
+ paddw %%xmm6, %%xmm1 # XMM1: GE GC GA G8 G6 G4 G2 G0 \n\
+ psraw $4, %%xmm1 \n\
+ paddw %%xmm6, %%xmm2 # XMM2: BE BC BA B8 B6 B4 B2 B0 \n\
+ psraw $4, %%xmm2 \n\
+ paddw %%xmm7, %%xmm3 # XMM3: RF RD RB R9 R7 R5 R3 R1 \n\
+ psraw $4, %%xmm3 \n\
+ paddw %%xmm7, %%xmm4 # XMM4: GF GD GB G9 G7 G5 G3 G1 \n\
+ psraw $4, %%xmm4 \n\
+ paddw %%xmm7, %%xmm5 # XMM5: BF BD BB B9 B7 B5 B3 B1 \n\
+ psraw $4, %%xmm5 \n\
+ # Saturate to 0-255 and pack into bytes \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: RE.......R0 RE.......R0 \n\
+ packuswb %%xmm1, %%xmm1 # XMM1: GE.......G0 GE.......G0 \n\
+ packuswb %%xmm2, %%xmm2 # XMM2: BE.......B0 BE.......B0 \n\
+ packuswb %%xmm3, %%xmm3 # XMM3: RF.......R1 RF.......R1 \n\
+ packuswb %%xmm4, %%xmm4 # XMM4: GF.......G1 GF.......G1 \n\
+ packuswb %%xmm5, %%xmm5 # XMM5: BF.......B1 BF.......B1 \n\
+ punpcklbw %%xmm3,%%xmm0 # XMM0: RF...................R0 \n\
+ punpcklbw %%xmm4,%%xmm1 # XMM1: GF...................G0 \n\
+ punpcklbw %%xmm5,%%xmm2 # XMM2: BF...................B0 \n"
+ : /* no outputs */
+ : "S" (&yuv_data), "m" (yuv_data)
+ );
+}
+
+/* YUV444 YUV->RGB (Y=XMM7:XMM6 U=XMM5:XMM2 V=XMM3:XMM0) */
+static inline void sse2_yuv444_to_rgb(void)
+{
+ asm("\
+ psubw 16("ESI"), %%xmm6 # XMM6: subtract 16 \n\
+ psllw $7, %%xmm6 # XMM6: convert to fixed point 8.7 \n\
+ psubw 16("ESI"), %%xmm7 # XMM7: subtract 16 \n\
+ psllw $7, %%xmm7 # XMM7: convert to fixed point 8.7 \n\
+ psubw 32("ESI"), %%xmm2 # XMM2: subtract 128 \n\
+ psllw $7, %%xmm2 # XMM2: convert to fixed point 8.7 \n\
+ psubw 32("ESI"), %%xmm5 # XMM5: subtract 128 \n\
+ psllw $7, %%xmm5 # XMM5: convert to fixed point 8.7 \n\
+ psubw 32("ESI"), %%xmm0 # XMM0: subtract 128 \n\
+ psllw $7, %%xmm0 # XMM0: convert to fixed point 8.7 \n\
+ psubw 32("ESI"), %%xmm3 # XMM3: subtract 128 \n\
+ psllw $7, %%xmm3 # XMM3: convert to fixed point 8.7 \n\
+ # Multiply by constants \n\
+ pmulhw 48("ESI"),%%xmm6 # XMM6: cY7.................cY0 \n\
+ movdqa 80("ESI"),%%xmm1 # XMM1: gU constant \n\
+ pmulhw %%xmm2, %%xmm1 # XMM1: gU7.................gU0 \n\
+ movdqa 96("ESI"),%%xmm4 # XMM4: gV constant \n\
+ pmulhw %%xmm0, %%xmm4 # XMM4: gV7.................gV0 \n\
+ paddw %%xmm4, %%xmm1 # XMM1: g7 g6 g5 g4 g3 g2 g1 g0 \n\
+ pmulhw 64("ESI"),%%xmm0 # XMM0: r7 r6 r5 r4 r3 r2 r1 r0 \n\
+ pmulhw 112("ESI"),%%xmm2 #XMM2: b7 b6 b5 b4 b3 b2 b1 b0 \n\
+ # Add intermediate results and round/shift to get R/G/B values \n\
+ paddw 128("ESI"),%%xmm6 # Add rounding value (0.5 @ 8.4 fixed) \n\
+ paddw %%xmm6, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
+ psraw $4, %%xmm0 # Shift back to 8.0 fixed \n\
+ paddw %%xmm6, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
+ psraw $4, %%xmm1 \n\
+ paddw %%xmm6, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n\
+ psraw $4, %%xmm2 \n\
+ # Do it all over again for pixels 8-15 \n\
+ pmulhw 48("ESI"),%%xmm7 # XMM7: cYF.................cY8 \n\
+ movdqa 80("ESI"),%%xmm6 # XMM6: gU constant \n\
+ pmulhw %%xmm5, %%xmm6 # XMM6: gUF.................gU8 \n\
+ movdqa 96("ESI"),%%xmm4 # XMM4: gV constant \n\
+ pmulhw %%xmm3, %%xmm4 # XMM4: gVF.................gV8 \n\
+ paddw %%xmm6, %%xmm4 # XMM4: gF gE gD gC gB gA g9 g8 \n\
+ pmulhw 64("ESI"),%%xmm3 # XMM3: rF rE rD rC rB rA r9 r8 \n\
+ pmulhw 112("ESI"),%%xmm5 #XMM5: bF bE bD bC bB bA b9 b8 \n\
+ paddw 128("ESI"),%%xmm7 # Add rounding value (0.5 @ 8.4 fixed) \n\
+ paddw %%xmm7, %%xmm3 # XMM3: RF RE RD RC RB RA R9 R8 \n\
+ psraw $4, %%xmm3 \n\
+ paddw %%xmm7, %%xmm4 # XMM4: GF GE GD GC GB GA G9 G8 \n\
+ psraw $4, %%xmm4 \n\
+ paddw %%xmm7, %%xmm5 # XMM5: BF BE BD BC BB BA B9 B8 \n\
+ psraw $4, %%xmm5 \n\
+ # Saturate to 0-255 and pack into bytes \n\
+ packuswb %%xmm3, %%xmm0 # XMM0: RF...................R0 \n\
+ packuswb %%xmm4, %%xmm1 # XMM1: GF...................G0 \n\
+ packuswb %%xmm5, %%xmm2 # XMM2: BF...................B0 \n"
+ : /* no outputs */
+ : "S" (&yuv_data), "m" (yuv_data)
+ );
+}
+
+/************************************/
+
+/* Convert YUV->RGB output to RGBA pixels in XMM0..XMM3 */
+#define SSE2_RGB_TO_RGBA "\
+ pxor %%xmm7, %%xmm7 # XMM7: 00 00 00 00 00 00 00 00 \n\
+ movdqa %%xmm0, %%xmm3 # XMM3: RF...................R0 \n\
+ movdqa %%xmm1, %%xmm4 # XMM4: GF...................G0 \n\
+ movdqa %%xmm2, %%xmm5 # XMM5: BF...................B0 \n\
+ punpcklbw %%xmm1,%%xmm0 # XMM0: G7 R7.............G0 R0 \n\
+ punpcklbw %%xmm7,%%xmm2 # XMM2: 00 B7.............00 B0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: G7 R7.............G0 R0 \n\
+ punpcklwd %%xmm2,%%xmm0 # XMM0: 0BGR3 0BGR2 0BGR1 0BGR0 \n\
+ punpckhwd %%xmm2,%%xmm1 # XMM1: 0BGR7 0BGR6 0BGR5 0BGR4 \n\
+ punpckhbw %%xmm4,%%xmm3 # XMM3: GF RF.............G8 R8 \n\
+ punpckhbw %%xmm7,%%xmm5 # XMM5: 00 BF.............00 B8 \n\
+ movdqa %%xmm3, %%xmm2 # XMM2: GF RF.............G8 R8 \n\
+ punpckhwd %%xmm5,%%xmm3 # XMM3: 0BGRF 0BGRE 0BGRD 0BGRC \n\
+ punpcklwd %%xmm5,%%xmm2 # XMM2: 0BGRB 0BGRA 0BGR9 0BGR8 \n"
+
+/* Convert YUV->RGB output to BGRA pixels in XMM0..XMM3 */
+#define SSE2_RGB_TO_BGRA "\
+ pxor %%xmm7, %%xmm7 # XMM7: 00 00 00 00 00 00 00 00 \n\
+ movdqa %%xmm0, %%xmm5 # XMM5: RF...................R0 \n\
+ movdqa %%xmm1, %%xmm4 # XMM4: GF...................G0 \n\
+ movdqa %%xmm2, %%xmm3 # XMM3: BF...................B0 \n\
+ punpcklbw %%xmm1,%%xmm2 # XMM2: G7 B7.............G0 B0 \n\
+ punpcklbw %%xmm7,%%xmm0 # XMM0: 00 R7.............00 R0 \n\
+ movdqa %%xmm2, %%xmm1 # XMM1: G7 B7.............G0 B0 \n\
+ punpcklwd %%xmm0,%%xmm2 # XMM2: 0RGB3 0RGB2 0RGB1 0RGB0 \n\
+ punpckhwd %%xmm0,%%xmm1 # XMM1: 0RGB7 0RGB6 0RGB5 0RGB4 \n\
+ movdqa %%xmm2, %%xmm0 # XMM0: 0RGB3 0RGB2 0RGB1 0RGB0 \n\
+ punpckhbw %%xmm4,%%xmm3 # XMM3: GF BF.............G8 B8 \n\
+ punpckhbw %%xmm7,%%xmm5 # XMM5: 00 RF.............00 R8 \n\
+ movdqa %%xmm3, %%xmm2 # XMM2: GF BF.............G8 B8 \n\
+ punpckhwd %%xmm5,%%xmm3 # XMM3: 0RGBF 0RGBE 0RGBD 0RGBC \n\
+ punpcklwd %%xmm5,%%xmm2 # XMM2: 0RGBB 0RGBA 0RGB9 0RGB8 \n"
+
+/* Convert the 4 RGBA32 (BGRA32) pixels in XMMn to RGB24 (BGR24) and store
+ * at EDI+(12*n) */
+#define SSE2_RGB32_TO_RGB24(n) "\
+ movd %%xmm"#n", %%eax # EAX: 00 B0 G0 R0 \n\
+ psrldq $4, %%xmm"#n" # XMMn: 00000 0BGR3 0BGR2 0BGR1 \n\
+ movd %%xmm"#n", %%ebx # EBX: 00 B1 G1 R1 \n\
+ psrldq $4, %%xmm"#n" # XMMn: 00000 00000 0BGR3 0BGR2 \n\
+ movd %%xmm"#n", %%ecx # ECX: 00 B2 G2 R2 \n\
+ psrldq $4, %%xmm"#n" # XMMn: 00000 00000 00000 0BGR3 \n\
+ movd %%xmm"#n", %%edx # EDX: 00 B3 G3 R3 \n\
+ "IA32_RGB32_TO_RGB24" \n\
+ movl %%eax, 12*"#n"+0("EDI") \n\
+ movl %%ebx, 12*"#n"+4("EDI") \n\
+ movl %%ecx, 12*"#n"+8("EDI") \n"
+
+
+static inline void sse2_store_rgb24(uint8_t *dest)
+{
+ /* It looks like it's fastest to go to RGB32 first, then shift the
+ * result to merge the 24-bit pixels together. */
+ asm(SSE2_RGB_TO_RGBA" \n\
+ "PUSH(EBX)" \n\
+ "SSE2_RGB32_TO_RGB24(0)" \n\
+ "SSE2_RGB32_TO_RGB24(1)" \n\
+ "SSE2_RGB32_TO_RGB24(2)" \n\
+ "SSE2_RGB32_TO_RGB24(3)" \n\
+ "POP(EBX)" \n"
+ : /* no outputs */
+ : "D" (dest)
+ : "eax", "ecx", "edx", "esi" COMMA_FAKE_PUSH_REG
+ );
+}
+
+static inline void sse2_store_bgr24(uint8_t *dest)
+{
+ asm(SSE2_RGB_TO_BGRA "\
+ "PUSH(EBX)" \n\
+ "SSE2_RGB32_TO_RGB24(0)" \n\
+ "SSE2_RGB32_TO_RGB24(1)" \n\
+ "SSE2_RGB32_TO_RGB24(2)" \n\
+ "SSE2_RGB32_TO_RGB24(3)" \n\
+ "POP(EBX)" \n"
+ : /* no outputs */
+ : "D" (dest)
+ : "eax", "ecx", "edx", "esi" COMMA_FAKE_PUSH_REG
+ );
+}
+
+/* It would be nice to be able to use movntdq here for a 50% speedup,
+ * but we're not guaranteed alignment... (think 766x512 for example) */
+static inline void sse2_store_rgba32(uint8_t *dest)
+{
+ asm(SSE2_RGB_TO_RGBA "\
+ movdqu %%xmm0, ("EDI") \n\
+ movdqu %%xmm1, 16("EDI") \n\
+ movdqu %%xmm2, 32("EDI") \n\
+ movdqu %%xmm3, 48("EDI") \n"
+ : /* no outputs */
+ : "D" (dest)
+ );
+}
+
+static inline void sse2_store_abgr32(uint8_t *dest)
+{
+ asm(SSE2_RGB_TO_BGRA "\
+ pslldq $1, %%xmm0 \n\
+ pslldq $1, %%xmm1 \n\
+ pslldq $1, %%xmm2 \n\
+ pslldq $1, %%xmm3 \n\
+ movdqu %%xmm0, ("EDI") \n\
+ movdqu %%xmm1, 16("EDI") \n\
+ movdqu %%xmm2, 32("EDI") \n\
+ movdqu %%xmm3, 48("EDI") \n"
+ : /* no outputs */
+ : "D" (dest)
+ );
+}
+
+static inline void sse2_store_argb32(uint8_t *dest)
+{
+ asm(SSE2_RGB_TO_RGBA "\
+ pslldq $1, %%xmm0 \n\
+ pslldq $1, %%xmm1 \n\
+ pslldq $1, %%xmm2 \n\
+ pslldq $1, %%xmm3 \n\
+ movdqu %%xmm0, ("EDI") \n\
+ movdqu %%xmm1, 16("EDI") \n\
+ movdqu %%xmm2, 32("EDI") \n\
+ movdqu %%xmm3, 48("EDI") \n"
+ : /* no outputs */
+ : "D" (dest)
+ );
+}
+
+static inline void sse2_store_bgra32(uint8_t *dest)
+{
+ asm(SSE2_RGB_TO_BGRA "\
+ movdqu %%xmm0, ("EDI") \n\
+ movdqu %%xmm1, 16("EDI") \n\
+ movdqu %%xmm2, 32("EDI") \n\
+ movdqu %%xmm3, 48("EDI") \n"
+ : /* no outputs */
+ : "D" (dest)
+ );
+}
+
+/*************************************************************************/
+
+static inline void sse2_load_rgb24(uint8_t *src);
+static inline void sse2_load_bgr24(uint8_t *src);
+static inline void sse2_load_rgba32(uint8_t *src);
+static inline void sse2_load_abgr32(uint8_t *src);
+static inline void sse2_load_argb32(uint8_t *src);
+static inline void sse2_load_bgra32(uint8_t *src);
+static inline void sse2_rgb_to_yuv420p(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_yuv411p(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_yuv422p(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_yuv444p(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_yuy2(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_uyvy(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_yvyu(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_y8(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+
+#define DEFINE_RGB2YUV_SSE2(rgb,yuv,rgbsz,rofs,gofs,bofs,slowop) \
+static int rgb##_##yuv##_sse2(uint8_t **src, uint8_t **dest, \
+ int width, int height) \
+{ \
+ int x, y; \
+ \
+ for (y = 0; y < height; y++) { \
+ for (x = 0; x < (width & ~7); x += 8) { \
+ sse2_load_##rgb(src[0]+(y*width+x)*rgbsz); \
+ sse2_rgb_to_##yuv(dest[0], dest[1], dest[2], x, y, width); \
+ } \
+ while (x < width) { \
+ int r = src[0][(y*width+x)*rgbsz+rofs]; \
+ int g = src[0][(y*width+x)*rgbsz+gofs]; \
+ int b = src[0][(y*width+x)*rgbsz+bofs]; \
+ slowop; \
+ x++; \
+ } \
+ } \
+ asm("emms"); \
+ return 1; \
+}
+
+#define DEFINE_RGB2YUV_SSE2_SET(rgb,sz,r,g,b) \
+ DEFINE_RGB2YUV_SSE2(rgb,yuv420p, sz,r,g,b, RGB2YUV_420P) \
+ DEFINE_RGB2YUV_SSE2(rgb,yuv411p, sz,r,g,b, RGB2YUV_411P) \
+ DEFINE_RGB2YUV_SSE2(rgb,yuv422p, sz,r,g,b, RGB2YUV_422P) \
+ DEFINE_RGB2YUV_SSE2(rgb,yuv444p, sz,r,g,b, RGB2YUV_444P) \
+ DEFINE_RGB2YUV_SSE2(rgb,yuy2, sz,r,g,b, RGB2YUV_YUY2) \
+ DEFINE_RGB2YUV_SSE2(rgb,uyvy, sz,r,g,b, RGB2YUV_UYVY) \
+ DEFINE_RGB2YUV_SSE2(rgb,yvyu, sz,r,g,b, RGB2YUV_YVYU) \
+ DEFINE_RGB2YUV_SSE2(rgb,y8, sz,r,g,b, RGB2Y())
+
+DEFINE_RGB2YUV_SSE2_SET(rgb24, 3,0,1,2)
+DEFINE_RGB2YUV_SSE2_SET(bgr24, 3,2,1,0)
+DEFINE_RGB2YUV_SSE2_SET(rgba32, 4,0,1,2)
+DEFINE_RGB2YUV_SSE2_SET(abgr32, 4,3,2,1)
+DEFINE_RGB2YUV_SSE2_SET(argb32, 4,1,2,3)
+DEFINE_RGB2YUV_SSE2_SET(bgra32, 4,2,1,0)
+
+/************************************/
+
+/* Split 8 RGBA pixels in XMMr/XMMb into R/G/B in XMM0/XMM1/XMM2.
+ * r and b are 0 and 2 for RGB, 2 and 0 for BGR */
+#define SSE2_SPLIT_RGB32(r,b) "\
+ movdqa 176("EDI"), %%xmm7 # XMM7: 00FF*8 \n\
+ movdqa %%xmm"#r", %%xmm1 # XMM1: XBGR3 XBGR2 XBGR1 XBGR0 \n\
+ movdqa %%xmm"#b", %%xmm3 # XMM3: XBGR7 XBGR6 XBGR5 XBGR4 \n\
+ pand %%xmm7, %%xmm"#r" # XMMr: B3 R3 B2 R2 B1 R1 B0 R0 \n\
+ psrld $8, %%xmm1 # XMM1: -XBG3 -XBG2 -XBG1 -XBG0 \n\
+ pand %%xmm7, %%xmm"#b" # XMMb: B7 R7 B6 R6 B5 R5 B4 R4 \n\
+ psrld $8, %%xmm3 # XMM3: -XBG7 -XBG6 -XBG5 -XBG4 \n\
+ pand %%xmm7, %%xmm1 # XMM1: XX G3 XX G2 XX G1 XX G0 \n\
+ packuswb %%xmm"#b", %%xmm"#r" # XMMr: B7 R7 ........... B0 R0 \n\
+ pand %%xmm7, %%xmm3 # XMM3: XX G7 XX G6 XX G5 XX G4 \n\
+ movdqa %%xmm"#r", %%xmm"#b" # XMMb: B7 R7 ........... B0 R0 \n\
+ packuswb %%xmm3, %%xmm1 # XMM1: XX G7 ........... XX G0 \n\
+ pand %%xmm7, %%xmm"#r" # XMMr: R7 R6 R5 R4 R3 R2 R1 R0 \n\
+ psrlw $8, %%xmm"#b" # XMMb: B7 B6 B5 B4 B3 B2 B1 B0 \n\
+ pand %%xmm7, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n"
+
+static inline void sse2_load_rgb24(uint8_t *src)
+{
+ asm("\
+ "PUSH(EBX)" \n\
+ # Make stack space for loading XMM registers \n"
+#ifdef ARCH_X86_64
+" sub $24+128, "ESP" \n"
+#else
+" sub $24, "ESP" \n"
+#endif
+" # Copy source pixels to appropriate positions in stack (this \n\
+ # seems to be the fastest way to get them where we want them) \n\
+ movl $8, %%ebx \n\
+ movl $24, %%edx \n\
+ 0: \n\
+ movb -3("ESI","EDX"), %%al \n\
+ movb %%al, 0-1("ESP","EBX") \n\
+ movb -2("ESI","EDX"), %%al \n\
+ movb %%al, 8-1("ESP","EBX") \n\
+ movb -1("ESI","EDX"), %%al \n\
+ movb %%al, 16-1("ESP","EBX") \n\
+ subl $3, %%edx \n\
+ subl $1, %%ebx \n\
+ jnz 0b \n\
+ # Load XMM0-XMM2 with R/G/B values and expand to 16-bit \n\
+ pxor %%xmm7, %%xmm7 \n\
+ movq ("ESP"), %%xmm0 \n\
+ punpcklbw %%xmm7, %%xmm0 \n\
+ movq 8("ESP"), %%xmm1 \n\
+ punpcklbw %%xmm7, %%xmm1 \n\
+ movq 16("ESP"), %%xmm2 \n\
+ punpcklbw %%xmm7, %%xmm2 \n"
+#ifdef ARCH_X86_64
+" add $24+128, "ESP" \n"
+#else
+" add $24, "ESP" \n"
+#endif
+" "POP(EBX)" \n"
+ : /* no outputs */
+ : "S" (src)
+ : "eax", "ecx", "edx", "edi" COMMA_FAKE_PUSH_REG
+ );
+}
+
+static inline void sse2_load_bgr24(uint8_t *src)
+{
+ /* Load as RGB and swap registers */
+ sse2_load_rgb24(src);
+ asm("\
+ movdqa %%xmm0, %%xmm3 \n\
+ movdqa %%xmm2, %%xmm0 \n\
+ movdqa %%xmm3, %%xmm2 \n"
+ : /* no outputs */
+ : /* no inputs */
+ );
+}
+
+static inline void sse2_load_rgba32(uint8_t *src)
+{
+ asm("\
+ movdqu ("ESI"), %%xmm0 # XMM0: XBGR3 XBGR2 XBGR1 XBGR0 \n\
+ movdqu 16("ESI"), %%xmm2 # XMM2: XBGR7 XBGR6 XBGR5 XBGR4 \n\
+ "SSE2_SPLIT_RGB32(0,2)" \n"
+ : /* no outputs */
+ : "S" (src), "D" (&rgb_data), "m" (rgb_data)
+ );
+}
+
+static inline void sse2_load_abgr32(uint8_t *src)
+{
+ asm("\
+ movdqu ("ESI"), %%xmm2 # XMM2: RGBX3 RGBX2 RGBX1 RGBX0 \n\
+ movdqu 16("ESI"), %%xmm0 # XMM0: RGBX7 RGBX6 RGBX5 RGBX4 \n\
+ psrld $8, %%xmm2 # XMM2: -RGB3 -RGB2 -RGB1 -RGB0 \n\
+ psrld $8, %%xmm0 # XMM0: -RGB7 -RGB6 -RGB5 -RGB4 \n\
+ "SSE2_SPLIT_RGB32(2,0)" \n"
+ : /* no outputs */
+ : "S" (src), "D" (&rgb_data), "m" (rgb_data)
+ );
+}
+
+static inline void sse2_load_argb32(uint8_t *src)
+{
+ asm("\
+ movdqu ("ESI"), %%xmm0 # XMM0: BGRX3 BGRX2 BGRX1 BGRX0 \n\
+ movdqu 16("ESI"), %%xmm2 # XMM2: BGRX7 BGRX6 BGRX5 BGRX4 \n\
+ psrld $8, %%xmm0 # XMM0: -BGR3 -BGR2 -BGR1 -BGR0 \n\
+ psrld $8, %%xmm2 # XMM2: -BGR7 -BGR6 -BGR5 -BGR4 \n\
+ "SSE2_SPLIT_RGB32(0,2)" \n"
+ : /* no outputs */
+ : "S" (src), "D" (&rgb_data), "m" (rgb_data)
+ );
+}
+
+static inline void sse2_load_bgra32(uint8_t *src)
+{
+ asm("\
+ movdqu ("ESI"), %%xmm2 # XMM2: XRGB3 XRGB2 XRGB1 XRGB0 \n\
+ movdqu 16("ESI"), %%xmm0 # XMM0: XRGB7 XRGB6 XRGB5 XRGB4 \n\
+ "SSE2_SPLIT_RGB32(2,0)" \n"
+ : /* no outputs */
+ : "S" (src), "D" (&rgb_data), "m" (rgb_data)
+ );
+}
+
+/************************************/
+
+#define SSE2_RGB2Y "\
+ # Make RGB data into 8.6 fixed-point, then create 8.6 \n\
+ # fixed-point Y data in XMM3 \n\
+ psllw $6, %%xmm0 \n\
+ movdqa %%xmm0, %%xmm3 \n\
+ pmulhuw ("EDI"), %%xmm3 \n\
+ psllw $6, %%xmm1 \n\
+ movdqa %%xmm1, %%xmm6 \n\
+ pmulhuw 16("EDI"), %%xmm6 \n\
+ psllw $6, %%xmm2 \n\
+ movdqa %%xmm2, %%xmm7 \n\
+ pmulhuw 32("EDI"), %%xmm7 \n\
+ paddw %%xmm6, %%xmm3 # No possibility of overflow \n\
+ paddw %%xmm7, %%xmm3 \n\
+ paddw 144("EDI"), %%xmm3 \n"
+#define SSE2_RGB2U "\
+ # Create 8.6 fixed-point U data in XMM4 \n\
+ movdqa %%xmm0, %%xmm4 \n\
+ pmulhw 48("EDI"), %%xmm4 \n\
+ movdqa %%xmm1, %%xmm6 \n\
+ pmulhw 64("EDI"), %%xmm6 \n\
+ movdqa %%xmm2, %%xmm7 \n\
+ pmulhw 80("EDI"), %%xmm7 \n\
+ paddw %%xmm6, %%xmm4 \n\
+ paddw %%xmm7, %%xmm4 \n\
+ paddw 160("EDI"), %%xmm4 \n"
+#define SSE2_RGB2U0 "\
+ # Create 8.6 fixed-point U data in XMM0 \n\
+ pmulhw 48("EDI"), %%xmm0 \n\
+ pmulhw 64("EDI"), %%xmm1 \n\
+ pmulhw 80("EDI"), %%xmm2 \n\
+ paddw %%xmm1, %%xmm0 \n\
+ paddw %%xmm2, %%xmm0 \n\
+ paddw 160("EDI"), %%xmm0 \n"
+#define SSE2_RGB2V "\
+ # Create 8.6 fixed-point V data in XMM0 \n\
+ pmulhw 96("EDI"), %%xmm0 \n\
+ pmulhw 112("EDI"), %%xmm1 \n\
+ pmulhw 128("EDI"), %%xmm2 \n\
+ paddw %%xmm1, %%xmm0 \n\
+ paddw %%xmm2, %%xmm0 \n\
+ paddw 160("EDI"), %%xmm0 \n"
+#define SSE2_PACKYU "\
+ # Shift back down to 8-bit values \n\
+ psraw $6, %%xmm3 \n\
+ psraw $6, %%xmm0 \n\
+ # Pack into bytes \n\
+ pxor %%xmm7, %%xmm7 \n\
+ packuswb %%xmm7, %%xmm3 \n\
+ packuswb %%xmm7, %%xmm0 \n"
+#define SSE2_PACKYUV "\
+ # Shift back down to 8-bit values \n\
+ psraw $6, %%xmm3 \n\
+ psraw $6, %%xmm4 \n\
+ psraw $6, %%xmm0 \n\
+ # Pack into bytes \n\
+ pxor %%xmm7, %%xmm7 \n\
+ packuswb %%xmm7, %%xmm3 \n\
+ packuswb %%xmm7, %%xmm4 \n\
+ packuswb %%xmm7, %%xmm0 \n"
+#define SSE2_STRIPU(N) "\
+ # Remove every odd U value \n\
+ pand 176("EDI"), %%xmm"#N" \n\
+ packuswb %%xmm7, %%xmm"#N" \n"
+#define SSE2_STRIPV "\
+ # Remove every even V value \n\
+ psrlw $8, %%xmm0 \n\
+ packuswb %%xmm7, %%xmm0 \n"
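+
+/* For reference, the scalar computation that SSE2_RGB2Y and SSE2_PACKYU above
+ * implement (a sketch only; the coefficient and bias values actually used are
+ * loaded from rgb_data, and the names below are illustrative):
+ *
+ *     y = ((r<<6)*kY_r >> 16) + ((g<<6)*kY_g >> 16) + ((b<<6)*kY_b >> 16)
+ *         + y_bias;                  // 8.6 fixed point (SSE2_RGB2Y)
+ *     Y = clamp_to_byte(y >> 6);     // SSE2_PACKYU / SSE2_PACKYUV
+ */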
+
+static inline void sse2_rgb_to_yuv420p(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+ if (y%2 == 0) {
+ asm("\
+ "SSE2_RGB2Y" \n\
+ "SSE2_RGB2U0" \n\
+ "SSE2_PACKYU" \n\
+ "SSE2_STRIPU(0)" \n\
+ # Store into destination pointers \n\
+ movq %%xmm3, ("EAX") \n\
+ movd %%xmm0, ("ECX") \n"
+ : /* no outputs */
+ : "a" (destY+y*width+x), "c" (destU+(y/2)*(width/2)+(x/2)),
+ "D" (&rgb_data), "m" (rgb_data)
+ );
+ } else {
+ asm("\
+ "SSE2_RGB2Y" \n\
+ "SSE2_RGB2V" \n\
+ "SSE2_PACKYU" \n\
+ "SSE2_STRIPV" \n\
+ # Store into destination pointers \n\
+ movq %%xmm3, ("EAX") \n\
+ movd %%xmm0, ("EDX") \n"
+ : /* no outputs */
+ : "a" (destY+y*width+x), "d" (destV+(y/2)*(width/2)+(x/2)),
+ "D" (&rgb_data), "m" (rgb_data)
+ );
+ }
+}
+
+static inline void sse2_rgb_to_yuv411p(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+ asm("\
+ "SSE2_RGB2Y" \n\
+ "SSE2_RGB2U" \n\
+ "SSE2_RGB2V" \n\
+ "SSE2_PACKYUV" \n\
+ "SSE2_STRIPU(4)" \n\
+ "SSE2_STRIPU(4)" \n\
+ "SSE2_STRIPU(0)" \n\
+ "SSE2_STRIPV" \n\
+ # Store into destination pointers \n\
+ movq %%xmm3, ("EAX") \n\
+ "PUSH(EAX)" # needed because GCC might rely on it later \n\
+ movd %%xmm4, %%eax \n\
+ movw %%ax, ("ECX") \n\
+ movd %%xmm0, %%eax \n\
+ movw %%ax, ("EDX") \n\
+ "POP(EAX)" \n"
+ : /* no outputs */
+ : "a" (destY+y*width+x), "c" (destU+y*(width/4)+(x/4)),
+ "d" (destV+y*(width/4)+(x/4)), "D" (&rgb_data), "m" (rgb_data)
+#ifdef ARCH_X86_64
+ : FAKE_PUSH_REG
+#endif
+ );
+}
+
+static inline void sse2_rgb_to_yuv422p(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+ asm("\
+ "SSE2_RGB2Y" \n\
+ "SSE2_RGB2U" \n\
+ "SSE2_RGB2V" \n\
+ "SSE2_PACKYUV" \n\
+ "SSE2_STRIPU(4)" \n\
+ "SSE2_STRIPV" \n\
+ # Store into destination pointers \n\
+ movq %%xmm3, ("EAX") \n\
+ movd %%xmm4, ("ECX") \n\
+ movd %%xmm0, ("EDX") \n"
+ : /* no outputs */
+ : "a" (destY+y*width+x), "c" (destU+y*(width/2)+(x/2)),
+ "d" (destV+y*(width/2)+(x/2)), "D" (&rgb_data), "m" (rgb_data)
+ );
+}
+
+static inline void sse2_rgb_to_yuv444p(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+ asm("\
+ "SSE2_RGB2Y" \n\
+ "SSE2_RGB2U" \n\
+ "SSE2_RGB2V" \n\
+ "SSE2_PACKYUV" \n\
+ # Store into destination pointers \n\
+ movq %%xmm3, ("EAX") \n\
+ movq %%xmm4, ("ECX") \n\
+ movq %%xmm0, ("EDX") \n"
+ : /* no outputs */
+ : "a" (destY+y*width+x), "c" (destU+y*width+x), "d" (destV+y*width+x),
+ "D" (&rgb_data), "m" (rgb_data)
+ );
+}
+
+static inline void sse2_rgb_to_yuy2(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+ asm("\
+ "SSE2_RGB2Y" \n\
+ "SSE2_RGB2U" \n\
+ "SSE2_RGB2V" \n\
+ "SSE2_PACKYUV" \n\
+ "SSE2_STRIPU(4)" \n\
+ "SSE2_STRIPV" \n\
+ # Interleave Y/U/V \n\
+ punpcklbw %%xmm0, %%xmm4 \n\
+ punpcklbw %%xmm4, %%xmm3 \n\
+ # Store into destination pointer \n\
+ movdqu %%xmm3, ("EAX") \n"
+ : /* no outputs */
+ : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data)
+ );
+}
+
+static inline void sse2_rgb_to_uyvy(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+ asm("\
+ "SSE2_RGB2Y" \n\
+ "SSE2_RGB2U" \n\
+ "SSE2_RGB2V" \n\
+ "SSE2_PACKYUV" \n\
+ "SSE2_STRIPU(4)" \n\
+ "SSE2_STRIPV" \n\
+ # Interleave Y/U/V \n\
+ punpcklbw %%xmm0, %%xmm4 \n\
+ punpcklbw %%xmm3, %%xmm4 \n\
+ # Store into destination pointer \n\
+ movdqu %%xmm4, ("EAX") \n"
+ : /* no outputs */
+ : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data)
+ );
+}
+
+static inline void sse2_rgb_to_yvyu(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+ asm("\
+ "SSE2_RGB2Y" \n\
+ "SSE2_RGB2U" \n\
+ "SSE2_RGB2V" \n\
+ "SSE2_PACKYUV" \n\
+ # Remove every odd V value \n\
+ pand 176("EDI"), %%xmm0 \n\
+ packuswb %%xmm7, %%xmm0 \n\
+ # Remove every even U value \n\
+ psrlw $8, %%xmm4 \n\
+ packuswb %%xmm7, %%xmm4 \n\
+ # Interleave Y/U/V \n\
+ punpcklbw %%xmm4, %%xmm0 \n\
+ punpcklbw %%xmm0, %%xmm3 \n\
+ # Store into destination pointer \n\
+ movdqu %%xmm3, ("EAX") \n"
+ : /* no outputs */
+ : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data)
+ );
+}
+
+static inline void sse2_rgb_to_y8(
+ uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+ asm("\
+ psllw $6, %%xmm0 \n\
+ pmulhuw ("EDI"), %%xmm0 \n\
+ psllw $6, %%xmm1 \n\
+ pmulhuw 16("EDI"), %%xmm1 \n\
+ psllw $6, %%xmm2 \n\
+ pmulhuw 32("EDI"), %%xmm2 \n\
+ paddw %%xmm1, %%xmm0 # No possibility of overflow \n\
+ paddw %%xmm2, %%xmm0 \n\
+ paddw 144("EDI"), %%xmm0 \n\
+ psraw $6, %%xmm0 \n\
+ packuswb %%xmm0, %%xmm0 \n\
+ movq %%xmm0, ("EAX") \n"
+ : /* no outputs */
+ : "a" (destY+y*width+x), "D" (&rgb_data), "m" (rgb_data)
+ );
+}
+
+/*************************************************************************/
+
+static int yuvp_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\
+ movdqa 32("EDX"), %%xmm6 # constant: 16 \n\
+ pxor %%xmm4, %%xmm4 # constant: 0 \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 16,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */ "\
+ movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\
+ subl $16, %%eax # subtract 16 \n\
+ imull %3, %%eax # multiply by 255/219 \n\
+ shrl $14, %%eax # shift down to 8 bits \n\
+ testb %%ah, %%ah # saturate to 0..255 \n\
+ movl $-1, %%edx # (trash EDX, we don't need it \n\
+ cmovnz %%edx, %%eax # anymore) \n\
+ movl $0, %%edx \n\
+ cmovs %%edx, %%eax \n\
+ movb %%al, -1("EDI","ECX") # and store \n",
+ /* main_loop */ "\
+ movdqu -16("ESI","ECX"), %%xmm0 # XMM0: Y15..Y0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: Y15..Y0 \n\
+ punpcklbw %%xmm4, %%xmm0 # XMM0: Y7..Y0 \n\
+ psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\
+ psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\
+ pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\
+        punpckhbw %%xmm4, %%xmm1        # XMM1: Y15..Y8                 \n\
+ psubw %%xmm6, %%xmm1 # XMM1: unbias by 16 \n\
+ psllw $2, %%xmm1 # XMM1: fixed point 8.2 \n\
+ pmulhw %%xmm7, %%xmm1 # XMM1: multiply by 255/219>>2 \n\
+ packuswb %%xmm1, %%xmm0 # XMM0: G15..G0, saturated \n\
+ movdqu %%xmm0, -16("EDI","ECX") \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+ "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+ : "eax");
+ return 1;
+}
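+
+/* The SIMD loop above is equivalent to the scalar mapping used in its own
+ * small_loop: expand studio-range luma (16..235) to full-range gray, i.e.
+ * (assuming Y_GRAY holds 255/219 in 2.14 fixed point):
+ *
+ *     gray = clamp(((y - 16) * Y_GRAY) >> 14, 0, 255);
+ */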
+
+static int yuy2_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\
+ movdqa 32("EDX"), %%xmm6 # constant: 16 \n\
+ pcmpeqd %%xmm5, %%xmm5 \n\
+ psrlw $8, %%xmm5 # constant: 0x00FF \n\
+ pxor %%xmm4, %%xmm4 # constant: 0 \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 8,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */ "\
+ movzbl -2("ESI","ECX",2), %%eax # retrieve Y byte \n\
+ subl $16, %%eax # subtract 16 \n\
+ imull %3, %%eax # multiply by 255/219 \n\
+ shrl $14, %%eax # shift down to 8 bits \n\
+ testb %%ah, %%ah # saturate to 0..255 \n\
+ movl $-1, %%edx # (trash EDX, we don't need it \n\
+ cmovnz %%edx, %%eax # anymore) \n\
+ movl $0, %%edx \n\
+ cmovs %%edx, %%eax \n\
+ movb %%al, -1("EDI","ECX") # and store \n",
+ /* main_loop */ "\
+ movdqu -16("ESI","ECX",2),%%xmm0 #XMM0: V3 Y7..U0 Y0 \n\
+ pand %%xmm5, %%xmm0 # XMM0: Y7..Y0 \n\
+ psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\
+ psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\
+ pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: G7..G0, saturated \n\
+ movq %%xmm0, -8("EDI","ECX") \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+ "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+ : "eax");
+ return 1;
+}
+
+static int uyvy_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\
+ movdqa 32("EDX"), %%xmm6 \n\
+ psllw $2, %%xmm6 # constant: 16<<2 \n\
+ pcmpeqd %%xmm5, %%xmm5 \n\
+ psllw $8, %%xmm5 # constant: 0xFF00 \n\
+ pxor %%xmm4, %%xmm4 # constant: 0 \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 8,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */ "\
+ movzbl -1("ESI","ECX",2), %%eax # retrieve Y byte \n\
+ subl $16, %%eax # subtract 16 \n\
+ imull %3, %%eax # multiply by 255/219 \n\
+ shrl $14, %%eax # shift down to 8 bits \n\
+ testb %%ah, %%ah # saturate to 0..255 \n\
+ movl $-1, %%edx # (trash EDX, we don't need it \n\
+ cmovnz %%edx, %%eax # anymore) \n\
+ movl $0, %%edx \n\
+ cmovs %%edx, %%eax \n\
+ movb %%al, -1("EDI","ECX") # and store \n",
+ /* main_loop */ "\
+ movdqu -16("ESI","ECX",2),%%xmm0 #XMM0: Y7 V3..Y0 U0 \n\
+ pand %%xmm5, %%xmm0 # XMM0: Y7..Y0 << 8 \n\
+ psrlw $6, %%xmm0 # XMM0: fixed point 8.2 \n\
+ psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\
+ pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: G7..G0, saturated \n\
+ movq %%xmm0, -8("EDI","ECX") \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+ "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+ : "eax");
+ return 1;
+}
+
+/*************************************************************************/
+
+static int gray8_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("movdqa 16("EDX"), %%xmm7 # constant: 219/255 \n\
+ movdqa 32("EDX"), %%xmm6 # constant: 16 \n\
+ pxor %%xmm4, %%xmm4 # constant: 0 \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 16,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */ "\
+ movzbl -1("ESI","ECX"), %%eax # retrieve gray byte \n\
+ imull %3, %%eax # multiply by 219/255 \n\
+ shrl $14, %%eax # shift down to 8 bits \n\
+ addl $16, %%eax # add 16 \n\
+ movb %%al, -1("EDI","ECX") # and store \n",
+ /* main_loop */ "\
+ movdqu -16("ESI","ECX"), %%xmm2 # XMM2: G15..G0 \n\
+ movdqa %%xmm4, %%xmm0 \n\
+ punpcklbw %%xmm2, %%xmm0 # XMM0: G7..G0 << 8 \n\
+ pmulhuw %%xmm7, %%xmm0 # XMM0: multiply by 219/255>>2 \n\
+ movdqa %%xmm4, %%xmm1 \n\
+ punpckhbw %%xmm2, %%xmm1 # XMM1: G15..G8 << 8 \n\
+ pmulhuw %%xmm7, %%xmm1 # XMM1: multiply by 219/255>>2 \n\
+ psrlw $6, %%xmm0 # XMM0: shift down to 8 bits \n\
+ paddw %%xmm6, %%xmm0 # XMM0: bias by 16 \n\
+ psrlw $6, %%xmm1 # XMM1: shift down to 8 bits \n\
+ paddw %%xmm6, %%xmm1 # XMM1: bias by 16 \n\
+ packuswb %%xmm1, %%xmm0 # XMM0: Y15..Y0 \n\
+ movdqu %%xmm0, -16("EDI","ECX") \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+ "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data)
+ : "eax");
+ return 1;
+}
+
+static int gray8_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("movdqa 16("EDX"), %%xmm7 # constant: 219/255 \n\
+ movdqa 32("EDX"), %%xmm6 # constant: 16 \n\
+ pcmpeqd %%xmm5, %%xmm5 \n\
+ psllw $15, %%xmm5 # constant: 0x8000 \n\
+ pxor %%xmm4, %%xmm4 # constant: 0 \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 8,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */ "\
+ movzbl -1("ESI","ECX"), %%eax # retrieve gray byte \n\
+ imull %3, %%eax # multiply by 219/255 \n\
+ shrl $14, %%eax # shift down to 8 bits \n\
+ addl $16, %%eax # add 16 \n\
+ movb %%al, -2("EDI","ECX",2) # and store \n\
+ movb $128, -1("EDI","ECX",2) # store 128 in U/V byte \n",
+ /* main_loop */ "\
+        movq -8("ESI","ECX"), %%xmm2    # XMM2: G7..G0                  \n\
+ movdqa %%xmm4, %%xmm0 \n\
+ punpcklbw %%xmm2, %%xmm0 # XMM0: G7..G0 << 8 \n\
+ pmulhuw %%xmm7, %%xmm0 # XMM0: multiply by 219/255>>2 \n\
+ psrlw $6, %%xmm0 # XMM0: shift down to 8 bits \n\
+ paddw %%xmm6, %%xmm0 # XMM0: bias by 16 \n\
+ por %%xmm5, %%xmm0 # XMM0: OR in U/V bytes \n\
+ movdqu %%xmm0, -16("EDI","ECX",2) \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+ "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data)
+ : "eax");
+ return 1;
+}
+
+static int gray8_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("movdqa 16("EDX"), %%xmm7 # constant: 219/255 \n\
+ movdqa 32("EDX"), %%xmm6 \n\
+ psllw $8, %%xmm6 # constant: 16 << 8 \n\
+ pcmpeqd %%xmm5, %%xmm5 \n\
+ psllw $15, %%xmm5 \n\
+ psrlw $8, %%xmm5 # constant: 0x0080 \n\
+ pxor %%xmm4, %%xmm4 # constant: 0 \n\
+ pcmpeqd %%xmm3, %%xmm3 \n\
+ psllw $8, %%xmm3 # constant: 0xFF00 \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 8,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */ "\
+ movzbl -1("ESI","ECX"), %%eax # retrieve gray byte \n\
+ imull %3, %%eax # multiply by 219/255 \n\
+ shrl $14, %%eax # shift down to 8 bits \n\
+ addl $16, %%eax # add 16 \n\
+ movb %%al, -1("EDI","ECX",2) # and store \n\
+ movb $128, -2("EDI","ECX",2) # store 128 in U/V byte \n",
+ /* main_loop */ "\
+        movq -8("ESI","ECX"), %%xmm2    # XMM2: G7..G0                  \n\
+ movdqa %%xmm4, %%xmm0 \n\
+ punpcklbw %%xmm2, %%xmm0 # XMM0: G7..G0 << 8 \n\
+ pmulhuw %%xmm7, %%xmm0 # XMM0: multiply by 219/255>>2 \n\
+ psllw $2, %%xmm0 # XMM0: shift results to hi byte\n\
+ pand %%xmm3, %%xmm0 # XMM0: clear low byte \n\
+ paddw %%xmm6, %%xmm0 # XMM0: bias by 16 \n\
+ por %%xmm5, %%xmm0 # XMM0: OR in U/V bytes \n\
+ movdqu %%xmm0, -16("EDI","ECX",2) \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+ "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data)
+ : "eax");
+ return 1;
+}
+
+/*************************************************************************/
+
+static int y8_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\
+ movdqa 32("EDX"), %%xmm6 # constant: 16 \n\
+ movdqa 48("EDX"), %%xmm5 # constant: bytes 0/3/6/9 mask \n\
+ pxor %%xmm4, %%xmm4 # constant: 0 \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 4,
+ /* push_regs */ "push "EBX,
+ /* pop_regs */ "pop "EBX,
+ /* small_loop */ "\
+ lea ("ECX","ECX",2), "EDX" # 3*count for RGB offset \n\
+ movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\
+ subl $16, %%eax # subtract 16 \n\
+ imull %3, %%eax # multiply by 255/219 \n\
+ shrl $14, %%eax # shift down to 8 bits \n\
+ testb %%ah, %%ah # saturate to 0..255 \n\
+ movl $-1, %%ebx \n\
+ cmovnz %%ebx, %%eax \n\
+ movl $0, %%ebx \n\
+ cmovs %%ebx, %%eax \n\
+ movb %%al, -3("EDI","EDX") # and store \n\
+ movb %%al, -2("EDI","EDX") \n\
+ movb %%al, -1("EDI","EDX") \n",
+ /* main_loop */ "\
+ lea ("ECX","ECX",2), "EDX" \n\
+ movd -4("ESI","ECX"), %%xmm0 # XMM0: Y3..Y0 \n\
+ punpcklbw %%xmm4, %%xmm0 # XMM0: Y3..Y0 in 16 bits \n\
+ psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\
+ psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\
+ pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: G3..G0, saturated \n\
+ pshuflw $0x50, %%xmm0, %%xmm0 # X0.l: G3 G2 G3 G2 G1 G0 G1 G0 \n\
+ pshufhw $0x55, %%xmm0, %%xmm0 # X0.h: G3 G2 G3 G2 G3 G2 G3 G2 \n\
+ pand %%xmm5, %%xmm0 # XMM0: ------3--2--1--0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: ------3--2--1--0 \n\
+ pslldq $1, %%xmm1 # XMM1: -----3--2--1--0- \n\
+ movdqa %%xmm0, %%xmm2 # XMM2: ------3--2--1--0 \n\
+ pslldq $2, %%xmm2 # XMM2: ----3--2--1--0-- \n\
+ por %%xmm1, %%xmm0 # XMM0: -----33-22-11-00 \n\
+ por %%xmm2, %%xmm0 # XMM0: ----333222111000 \n\
+ movd %%xmm0, -12("EDI","EDX") \n\
+ pshufd $0xC9, %%xmm0, %%xmm0 \n\
+ movq %%xmm0, -8("EDI","EDX") \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+ "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+ : "eax");
+ return 1;
+}
+
+/* 4BPP is slightly easier... */
+static int y8_rgba32_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\
+ movdqa 32("EDX"), %%xmm6 # constant: 16 \n\
+ pxor %%xmm4, %%xmm4 # constant: 0 \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 4,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */ "\
+ movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\
+ subl $16, %%eax # subtract 16 \n\
+ imull %3, %%eax # multiply by 255/219 \n\
+ shrl $14, %%eax # shift down to 8 bits \n\
+ testb %%ah, %%ah # saturate to 0..255 \n\
+ movl $-1, %%edx \n\
+ cmovnz %%edx, %%eax \n\
+ movl $0, %%edx \n\
+ cmovs %%edx, %%eax \n\
+ movb %%al, -4("EDI","ECX",4) # and store \n\
+ movb %%al, -3("EDI","ECX",4) \n\
+ movb %%al, -2("EDI","ECX",4) \n",
+ /* main_loop */ "\
+ movd -4("ESI","ECX"), %%xmm0 # XMM0: Y3..Y0 \n\
+ punpcklbw %%xmm4, %%xmm0 # XMM0: Y3..Y0 in 16 bits \n\
+ psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\
+ psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\
+ pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: G3..G0, saturated \n\
+ punpcklbw %%xmm4, %%xmm0 # XMM0: G3..G0 in 16 bits \n\
+ punpcklbw %%xmm4, %%xmm0 # XMM0: ---3---2---1---0 \n\
+ movdqa %%xmm0, %%xmm1 # XMM1: ---3---2---1---0 \n\
+ pslldq $1, %%xmm1 # XMM1: --3---2---1---0- \n\
+ movdqa %%xmm0, %%xmm2 # XMM2: ---3---2---1---0 \n\
+ pslldq $2, %%xmm2 # XMM2: -3---2---1---0-- \n\
+ por %%xmm1, %%xmm0 # XMM0: --33--22--11--00 \n\
+ por %%xmm2, %%xmm0 # XMM0: -333-222-111-000 \n\
+ movntdq %%xmm0, -16("EDI","ECX",4) \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+ "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+ : "eax");
+ return 1;
+}
+
+static int y8_argb32_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+ asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\
+ movdqa 32("EDX"), %%xmm6 # constant: 16 \n\
+ pxor %%xmm4, %%xmm4 # constant: 0 \n"
+ SIMD_LOOP_WRAPPER(
+ /* blocksize */ 4,
+ /* push_regs */ "",
+ /* pop_regs */ "",
+ /* small_loop */ "\
+ movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\
+ subl $16, %%eax # subtract 16 \n\
+ imull %3, %%eax # multiply by 255/219 \n\
+ shrl $14, %%eax # shift down to 8 bits \n\
+ testb %%ah, %%ah # saturate to 0..255 \n\
+ movl $-1, %%edx \n\
+ cmovnz %%edx, %%eax \n\
+ movl $0, %%edx \n\
+ cmovs %%edx, %%eax \n\
+ movb %%al, -3("EDI","ECX",4) # and store \n\
+ movb %%al, -2("EDI","ECX",4) \n\
+ movb %%al, -1("EDI","ECX",4) \n",
+ /* main_loop */ "\
+ movd -4("ESI","ECX"), %%xmm0 # XMM0: Y3..Y0 \n\
+ punpcklbw %%xmm4, %%xmm0 # XMM0: Y3..Y0 in 16 bits \n\
+ psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\
+ psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\
+ pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\
+ packuswb %%xmm0, %%xmm0 # XMM0: G3..G0, saturated \n\
+ punpcklbw %%xmm4, %%xmm0 # XMM0: G3..G0 in 16 bits \n\
+ movdqa %%xmm4, %%xmm3 # XMM3: 0 \n\
+ punpcklbw %%xmm0, %%xmm3 # XMM3: --3---2---1---0- \n\
+ movdqa %%xmm3, %%xmm1 # XMM1: --3---2---1---0- \n\
+ pslldq $1, %%xmm1 # XMM1: -3---2---1---0-- \n\
+ movdqa %%xmm3, %%xmm2 # XMM2: --3---2---1---0- \n\
+ pslldq $2, %%xmm2 # XMM2: 3---2---1---0--- \n\
+ por %%xmm1, %%xmm3 # XMM3: -33--22--11--00- \n\
+ por %%xmm2, %%xmm3 # XMM3: 333-222-111-000- \n\
+ movntdq %%xmm3, -16("EDI","ECX",4) \n",
+ /* emms */ "emms")
+ : /* no outputs */
+ : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+ "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+ : "eax");
+ return 1;
+}
+
+/*************************************************************************/
+
+#endif /* HAVE_ASM_SSE2 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Initialization */
+
+int ac_imgconvert_init_yuv_rgb(int accel)
+{
+ /******** Standard C implementations ********/
+
+ //---- YUV->RGB ----//
+
+ if (!register_conversion(IMG_YUV420P, IMG_RGB24, yuv420p_rgb24)
+ || !register_conversion(IMG_YUV411P, IMG_RGB24, yuv411p_rgb24)
+ || !register_conversion(IMG_YUV422P, IMG_RGB24, yuv422p_rgb24)
+ || !register_conversion(IMG_YUV444P, IMG_RGB24, yuv444p_rgb24)
+ || !register_conversion(IMG_YUY2, IMG_RGB24, yuy2_rgb24)
+ || !register_conversion(IMG_UYVY, IMG_RGB24, uyvy_rgb24)
+ || !register_conversion(IMG_YVYU, IMG_RGB24, yvyu_rgb24)
+ || !register_conversion(IMG_Y8, IMG_RGB24, y8_rgb24)
+
+ || !register_conversion(IMG_YUV420P, IMG_BGR24, yuv420p_bgr24)
+ || !register_conversion(IMG_YUV411P, IMG_BGR24, yuv411p_bgr24)
+ || !register_conversion(IMG_YUV422P, IMG_BGR24, yuv422p_bgr24)
+ || !register_conversion(IMG_YUV444P, IMG_BGR24, yuv444p_bgr24)
+ || !register_conversion(IMG_YUY2, IMG_BGR24, yuy2_bgr24)
+ || !register_conversion(IMG_UYVY, IMG_BGR24, uyvy_bgr24)
+ || !register_conversion(IMG_YVYU, IMG_BGR24, yvyu_bgr24)
+ || !register_conversion(IMG_Y8, IMG_BGR24, y8_rgb24)
+
+ || !register_conversion(IMG_YUV420P, IMG_RGBA32, yuv420p_rgba32)
+ || !register_conversion(IMG_YUV411P, IMG_RGBA32, yuv411p_rgba32)
+ || !register_conversion(IMG_YUV422P, IMG_RGBA32, yuv422p_rgba32)
+ || !register_conversion(IMG_YUV444P, IMG_RGBA32, yuv444p_rgba32)
+ || !register_conversion(IMG_YUY2, IMG_RGBA32, yuy2_rgba32)
+ || !register_conversion(IMG_UYVY, IMG_RGBA32, uyvy_rgba32)
+ || !register_conversion(IMG_YVYU, IMG_RGBA32, yvyu_rgba32)
+ || !register_conversion(IMG_Y8, IMG_RGBA32, y8_rgba32)
+
+ || !register_conversion(IMG_YUV420P, IMG_ABGR32, yuv420p_abgr32)
+ || !register_conversion(IMG_YUV411P, IMG_ABGR32, yuv411p_abgr32)
+ || !register_conversion(IMG_YUV422P, IMG_ABGR32, yuv422p_abgr32)
+ || !register_conversion(IMG_YUV444P, IMG_ABGR32, yuv444p_abgr32)
+ || !register_conversion(IMG_YUY2, IMG_ABGR32, yuy2_abgr32)
+ || !register_conversion(IMG_UYVY, IMG_ABGR32, uyvy_abgr32)
+ || !register_conversion(IMG_YVYU, IMG_ABGR32, yvyu_abgr32)
+ || !register_conversion(IMG_Y8, IMG_ABGR32, y8_argb32)
+
+ || !register_conversion(IMG_YUV420P, IMG_ARGB32, yuv420p_argb32)
+ || !register_conversion(IMG_YUV411P, IMG_ARGB32, yuv411p_argb32)
+ || !register_conversion(IMG_YUV422P, IMG_ARGB32, yuv422p_argb32)
+ || !register_conversion(IMG_YUV444P, IMG_ARGB32, yuv444p_argb32)
+ || !register_conversion(IMG_YUY2, IMG_ARGB32, yuy2_argb32)
+ || !register_conversion(IMG_UYVY, IMG_ARGB32, uyvy_argb32)
+ || !register_conversion(IMG_YVYU, IMG_ARGB32, yvyu_argb32)
+ || !register_conversion(IMG_Y8, IMG_ARGB32, y8_argb32)
+
+ || !register_conversion(IMG_YUV420P, IMG_BGRA32, yuv420p_bgra32)
+ || !register_conversion(IMG_YUV411P, IMG_BGRA32, yuv411p_bgra32)
+ || !register_conversion(IMG_YUV422P, IMG_BGRA32, yuv422p_bgra32)
+ || !register_conversion(IMG_YUV444P, IMG_BGRA32, yuv444p_bgra32)
+ || !register_conversion(IMG_YUY2, IMG_BGRA32, yuy2_bgra32)
+ || !register_conversion(IMG_UYVY, IMG_BGRA32, uyvy_bgra32)
+ || !register_conversion(IMG_YVYU, IMG_BGRA32, yvyu_bgra32)
+ || !register_conversion(IMG_Y8, IMG_BGRA32, y8_rgba32)
+
+ //---- RGB->YUV ----//
+
+ || !register_conversion(IMG_RGB24, IMG_YUV420P, rgb24_yuv420p)
+ || !register_conversion(IMG_RGB24, IMG_YUV411P, rgb24_yuv411p)
+ || !register_conversion(IMG_RGB24, IMG_YUV422P, rgb24_yuv422p)
+ || !register_conversion(IMG_RGB24, IMG_YUV444P, rgb24_yuv444p)
+ || !register_conversion(IMG_RGB24, IMG_YUY2, rgb24_yuy2)
+ || !register_conversion(IMG_RGB24, IMG_UYVY, rgb24_uyvy)
+ || !register_conversion(IMG_RGB24, IMG_YVYU, rgb24_yvyu)
+ || !register_conversion(IMG_RGB24, IMG_Y8, rgb24_y8)
+
+ || !register_conversion(IMG_BGR24, IMG_YUV420P, bgr24_yuv420p)
+ || !register_conversion(IMG_BGR24, IMG_YUV411P, bgr24_yuv411p)
+ || !register_conversion(IMG_BGR24, IMG_YUV422P, bgr24_yuv422p)
+ || !register_conversion(IMG_BGR24, IMG_YUV444P, bgr24_yuv444p)
+ || !register_conversion(IMG_BGR24, IMG_YUY2, bgr24_yuy2)
+ || !register_conversion(IMG_BGR24, IMG_UYVY, bgr24_uyvy)
+ || !register_conversion(IMG_BGR24, IMG_YVYU, bgr24_yvyu)
+ || !register_conversion(IMG_BGR24, IMG_Y8, bgr24_y8)
+
+ || !register_conversion(IMG_RGBA32, IMG_YUV420P, rgba32_yuv420p)
+ || !register_conversion(IMG_RGBA32, IMG_YUV411P, rgba32_yuv411p)
+ || !register_conversion(IMG_RGBA32, IMG_YUV422P, rgba32_yuv422p)
+ || !register_conversion(IMG_RGBA32, IMG_YUV444P, rgba32_yuv444p)
+ || !register_conversion(IMG_RGBA32, IMG_YUY2, rgba32_yuy2)
+ || !register_conversion(IMG_RGBA32, IMG_UYVY, rgba32_uyvy)
+ || !register_conversion(IMG_RGBA32, IMG_YVYU, rgba32_yvyu)
+ || !register_conversion(IMG_RGBA32, IMG_Y8, rgba32_y8)
+
+ || !register_conversion(IMG_ABGR32, IMG_YUV420P, abgr32_yuv420p)
+ || !register_conversion(IMG_ABGR32, IMG_YUV411P, abgr32_yuv411p)
+ || !register_conversion(IMG_ABGR32, IMG_YUV422P, abgr32_yuv422p)
+ || !register_conversion(IMG_ABGR32, IMG_YUV444P, abgr32_yuv444p)
+ || !register_conversion(IMG_ABGR32, IMG_YUY2, abgr32_yuy2)
+ || !register_conversion(IMG_ABGR32, IMG_UYVY, abgr32_uyvy)
+ || !register_conversion(IMG_ABGR32, IMG_YVYU, abgr32_yvyu)
+ || !register_conversion(IMG_ABGR32, IMG_Y8, abgr32_y8)
+
+ || !register_conversion(IMG_ARGB32, IMG_YUV420P, argb32_yuv420p)
+ || !register_conversion(IMG_ARGB32, IMG_YUV411P, argb32_yuv411p)
+ || !register_conversion(IMG_ARGB32, IMG_YUV422P, argb32_yuv422p)
+ || !register_conversion(IMG_ARGB32, IMG_YUV444P, argb32_yuv444p)
+ || !register_conversion(IMG_ARGB32, IMG_YUY2, argb32_yuy2)
+ || !register_conversion(IMG_ARGB32, IMG_UYVY, argb32_uyvy)
+ || !register_conversion(IMG_ARGB32, IMG_YVYU, argb32_yvyu)
+ || !register_conversion(IMG_ARGB32, IMG_Y8, argb32_y8)
+
+ || !register_conversion(IMG_BGRA32, IMG_YUV420P, bgra32_yuv420p)
+ || !register_conversion(IMG_BGRA32, IMG_YUV411P, bgra32_yuv411p)
+ || !register_conversion(IMG_BGRA32, IMG_YUV422P, bgra32_yuv422p)
+ || !register_conversion(IMG_BGRA32, IMG_YUV444P, bgra32_yuv444p)
+ || !register_conversion(IMG_BGRA32, IMG_YUY2, bgra32_yuy2)
+ || !register_conversion(IMG_BGRA32, IMG_UYVY, bgra32_uyvy)
+ || !register_conversion(IMG_BGRA32, IMG_YVYU, bgra32_yvyu)
+ || !register_conversion(IMG_BGRA32, IMG_Y8, bgra32_y8)
+
+ //---- Grayscale ----//
+
+ || !register_conversion(IMG_YUV420P, IMG_GRAY8, yuvp_gray8)
+ || !register_conversion(IMG_YUV411P, IMG_GRAY8, yuvp_gray8)
+ || !register_conversion(IMG_YUV422P, IMG_GRAY8, yuvp_gray8)
+ || !register_conversion(IMG_YUV444P, IMG_GRAY8, yuvp_gray8)
+ || !register_conversion(IMG_YUY2, IMG_GRAY8, yuy2_gray8)
+ || !register_conversion(IMG_UYVY, IMG_GRAY8, uyvy_gray8)
+ || !register_conversion(IMG_YVYU, IMG_GRAY8, yuy2_gray8)
+ || !register_conversion(IMG_Y8, IMG_GRAY8, yuvp_gray8)
+
+ || !register_conversion(IMG_GRAY8, IMG_YUV420P, gray8_yuv420p)
+ || !register_conversion(IMG_GRAY8, IMG_YUV411P, gray8_yuv411p)
+ || !register_conversion(IMG_GRAY8, IMG_YUV422P, gray8_yuv422p)
+ || !register_conversion(IMG_GRAY8, IMG_YUV444P, gray8_yuv444p)
+ || !register_conversion(IMG_GRAY8, IMG_YUY2, gray8_yuy2)
+ || !register_conversion(IMG_GRAY8, IMG_UYVY, gray8_uyvy)
+ || !register_conversion(IMG_GRAY8, IMG_YVYU, gray8_yuy2)
+ || !register_conversion(IMG_GRAY8, IMG_Y8, gray8_y8)
+ ) {
+ return 0;
+ }
+
+ /******** MMX implementations ********/
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
+ if (accel & AC_MMX) {
+
+ //---- YUV->RGB ----//
+
+ if (!register_conversion(IMG_YUV420P, IMG_RGB24, yuv420p_rgb24_mmx)
+ || !register_conversion(IMG_YUV422P, IMG_RGB24, yuv422p_rgb24_mmx)
+ || !register_conversion(IMG_YUV420P, IMG_BGR24, yuv420p_bgr24_mmx)
+ || !register_conversion(IMG_YUV422P, IMG_BGR24, yuv422p_bgr24_mmx)
+ || !register_conversion(IMG_YUV420P, IMG_RGBA32, yuv420p_rgba32_mmx)
+ || !register_conversion(IMG_YUV422P, IMG_RGBA32, yuv422p_rgba32_mmx)
+ || !register_conversion(IMG_YUV420P, IMG_ABGR32, yuv420p_abgr32_mmx)
+ || !register_conversion(IMG_YUV422P, IMG_ABGR32, yuv422p_abgr32_mmx)
+ || !register_conversion(IMG_YUV420P, IMG_ARGB32, yuv420p_argb32_mmx)
+ || !register_conversion(IMG_YUV422P, IMG_ARGB32, yuv422p_argb32_mmx)
+ || !register_conversion(IMG_YUV420P, IMG_BGRA32, yuv420p_bgra32_mmx)
+ || !register_conversion(IMG_YUV422P, IMG_BGRA32, yuv422p_bgra32_mmx)
+ ) {
+ return 0;
+ }
+ }
+#endif
+
+ /******** SSE2 implementations ********/
+
+#if defined(HAVE_ASM_SSE2)
+ if (HAS_ACCEL(accel, AC_SSE2)) {
+
+ //---- YUV->RGB ----//
+
+ if (!register_conversion(IMG_YUV420P, IMG_RGB24, yuv420p_rgb24_sse2)
+ || !register_conversion(IMG_YUV411P, IMG_RGB24, yuv411p_rgb24_sse2)
+ || !register_conversion(IMG_YUV422P, IMG_RGB24, yuv422p_rgb24_sse2)
+ || !register_conversion(IMG_YUV444P, IMG_RGB24, yuv444p_rgb24_sse2)
+ || !register_conversion(IMG_YUY2, IMG_RGB24, yuy2_rgb24_sse2)
+ || !register_conversion(IMG_UYVY, IMG_RGB24, uyvy_rgb24_sse2)
+ || !register_conversion(IMG_YVYU, IMG_RGB24, yvyu_rgb24_sse2)
+ || !register_conversion(IMG_Y8, IMG_RGB24, y8_rgb24_sse2)
+
+ || !register_conversion(IMG_YUV420P, IMG_BGR24, yuv420p_bgr24_sse2)
+ || !register_conversion(IMG_YUV411P, IMG_BGR24, yuv411p_bgr24_sse2)
+ || !register_conversion(IMG_YUV422P, IMG_BGR24, yuv422p_bgr24_sse2)
+ || !register_conversion(IMG_YUV444P, IMG_BGR24, yuv444p_bgr24_sse2)
+ || !register_conversion(IMG_YUY2, IMG_BGR24, yuy2_bgr24_sse2)
+ || !register_conversion(IMG_UYVY, IMG_BGR24, uyvy_bgr24_sse2)
+ || !register_conversion(IMG_YVYU, IMG_BGR24, yvyu_bgr24_sse2)
+ || !register_conversion(IMG_Y8, IMG_BGR24, y8_rgb24_sse2)
+
+ || !register_conversion(IMG_YUV420P, IMG_RGBA32, yuv420p_rgba32_sse2)
+ || !register_conversion(IMG_YUV411P, IMG_RGBA32, yuv411p_rgba32_sse2)
+ || !register_conversion(IMG_YUV422P, IMG_RGBA32, yuv422p_rgba32_sse2)
+ || !register_conversion(IMG_YUV444P, IMG_RGBA32, yuv444p_rgba32_sse2)
+ || !register_conversion(IMG_YUY2, IMG_RGBA32, yuy2_rgba32_sse2)
+ || !register_conversion(IMG_UYVY, IMG_RGBA32, uyvy_rgba32_sse2)
+ || !register_conversion(IMG_YVYU, IMG_RGBA32, yvyu_rgba32_sse2)
+ || !register_conversion(IMG_Y8, IMG_RGBA32, y8_rgba32_sse2)
+
+ || !register_conversion(IMG_YUV420P, IMG_ABGR32, yuv420p_abgr32_sse2)
+ || !register_conversion(IMG_YUV411P, IMG_ABGR32, yuv411p_abgr32_sse2)
+ || !register_conversion(IMG_YUV422P, IMG_ABGR32, yuv422p_abgr32_sse2)
+ || !register_conversion(IMG_YUV444P, IMG_ABGR32, yuv444p_abgr32_sse2)
+ || !register_conversion(IMG_YUY2, IMG_ABGR32, yuy2_abgr32_sse2)
+ || !register_conversion(IMG_UYVY, IMG_ABGR32, uyvy_abgr32_sse2)
+ || !register_conversion(IMG_YVYU, IMG_ABGR32, yvyu_abgr32_sse2)
+ || !register_conversion(IMG_Y8, IMG_ABGR32, y8_argb32_sse2)
+
+ || !register_conversion(IMG_YUV420P, IMG_ARGB32, yuv420p_argb32_sse2)
+ || !register_conversion(IMG_YUV411P, IMG_ARGB32, yuv411p_argb32_sse2)
+ || !register_conversion(IMG_YUV422P, IMG_ARGB32, yuv422p_argb32_sse2)
+ || !register_conversion(IMG_YUV444P, IMG_ARGB32, yuv444p_argb32_sse2)
+ || !register_conversion(IMG_YUY2, IMG_ARGB32, yuy2_argb32_sse2)
+ || !register_conversion(IMG_UYVY, IMG_ARGB32, uyvy_argb32_sse2)
+ || !register_conversion(IMG_YVYU, IMG_ARGB32, yvyu_argb32_sse2)
+ || !register_conversion(IMG_Y8, IMG_ARGB32, y8_argb32_sse2)
+
+ || !register_conversion(IMG_YUV420P, IMG_BGRA32, yuv420p_bgra32_sse2)
+ || !register_conversion(IMG_YUV411P, IMG_BGRA32, yuv411p_bgra32_sse2)
+ || !register_conversion(IMG_YUV422P, IMG_BGRA32, yuv422p_bgra32_sse2)
+ || !register_conversion(IMG_YUV444P, IMG_BGRA32, yuv444p_bgra32_sse2)
+ || !register_conversion(IMG_YUY2, IMG_BGRA32, yuy2_bgra32_sse2)
+ || !register_conversion(IMG_UYVY, IMG_BGRA32, uyvy_bgra32_sse2)
+ || !register_conversion(IMG_YVYU, IMG_BGRA32, yvyu_bgra32_sse2)
+ || !register_conversion(IMG_Y8, IMG_BGRA32, y8_rgba32_sse2)
+
+ //---- RGB->YUV ----//
+
+ || !register_conversion(IMG_RGB24, IMG_YUV420P, rgb24_yuv420p_sse2)
+ || !register_conversion(IMG_RGB24, IMG_YUV411P, rgb24_yuv411p_sse2)
+ || !register_conversion(IMG_RGB24, IMG_YUV422P, rgb24_yuv422p_sse2)
+ || !register_conversion(IMG_RGB24, IMG_YUV444P, rgb24_yuv444p_sse2)
+ || !register_conversion(IMG_RGB24, IMG_YUY2, rgb24_yuy2_sse2)
+ || !register_conversion(IMG_RGB24, IMG_UYVY, rgb24_uyvy_sse2)
+ || !register_conversion(IMG_RGB24, IMG_YVYU, rgb24_yvyu_sse2)
+ || !register_conversion(IMG_RGB24, IMG_Y8, rgb24_y8_sse2)
+
+ || !register_conversion(IMG_BGR24, IMG_YUV420P, bgr24_yuv420p_sse2)
+ || !register_conversion(IMG_BGR24, IMG_YUV411P, bgr24_yuv411p_sse2)
+ || !register_conversion(IMG_BGR24, IMG_YUV422P, bgr24_yuv422p_sse2)
+ || !register_conversion(IMG_BGR24, IMG_YUV444P, bgr24_yuv444p_sse2)
+ || !register_conversion(IMG_BGR24, IMG_YUY2, bgr24_yuy2_sse2)
+ || !register_conversion(IMG_BGR24, IMG_UYVY, bgr24_uyvy_sse2)
+ || !register_conversion(IMG_BGR24, IMG_YVYU, bgr24_yvyu_sse2)
+ || !register_conversion(IMG_BGR24, IMG_Y8, bgr24_y8_sse2)
+
+ || !register_conversion(IMG_RGBA32, IMG_YUV420P, rgba32_yuv420p_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_YUV411P, rgba32_yuv411p_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_YUV422P, rgba32_yuv422p_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_YUV444P, rgba32_yuv444p_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_YUY2, rgba32_yuy2_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_UYVY, rgba32_uyvy_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_YVYU, rgba32_yvyu_sse2)
+ || !register_conversion(IMG_RGBA32, IMG_Y8, rgba32_y8_sse2)
+
+ || !register_conversion(IMG_ABGR32, IMG_YUV420P, abgr32_yuv420p_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_YUV411P, abgr32_yuv411p_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_YUV422P, abgr32_yuv422p_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_YUV444P, abgr32_yuv444p_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_YUY2, abgr32_yuy2_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_UYVY, abgr32_uyvy_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_YVYU, abgr32_yvyu_sse2)
+ || !register_conversion(IMG_ABGR32, IMG_Y8, abgr32_y8_sse2)
+
+ || !register_conversion(IMG_ARGB32, IMG_YUV420P, argb32_yuv420p_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_YUV411P, argb32_yuv411p_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_YUV422P, argb32_yuv422p_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_YUV444P, argb32_yuv444p_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_YUY2, argb32_yuy2_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_UYVY, argb32_uyvy_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_YVYU, argb32_yvyu_sse2)
+ || !register_conversion(IMG_ARGB32, IMG_Y8, argb32_y8_sse2)
+
+ || !register_conversion(IMG_BGRA32, IMG_YUV420P, bgra32_yuv420p_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_YUV411P, bgra32_yuv411p_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_YUV422P, bgra32_yuv422p_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_YUV444P, bgra32_yuv444p_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_YUY2, bgra32_yuy2_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_UYVY, bgra32_uyvy_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_YVYU, bgra32_yvyu_sse2)
+ || !register_conversion(IMG_BGRA32, IMG_Y8, bgra32_y8_sse2)
+
+ //---- Grayscale ----//
+
+ || !register_conversion(IMG_GRAY8, IMG_YUY2, gray8_yuy2_sse2)
+ || !register_conversion(IMG_GRAY8, IMG_UYVY, gray8_uyvy_sse2)
+ || !register_conversion(IMG_GRAY8, IMG_YVYU, gray8_yuy2_sse2)
+ || !register_conversion(IMG_GRAY8, IMG_Y8, gray8_y8_sse2)
+ ) {
+ return 0;
+ }
+ }
+
+ /* YUV->GRAY8 routines use CMOVcc */
+ if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2)) {
+ if (!register_conversion(IMG_YUV420P, IMG_GRAY8, yuvp_gray8_sse2)
+ || !register_conversion(IMG_YUV411P, IMG_GRAY8, yuvp_gray8_sse2)
+ || !register_conversion(IMG_YUV422P, IMG_GRAY8, yuvp_gray8_sse2)
+ || !register_conversion(IMG_YUV444P, IMG_GRAY8, yuvp_gray8_sse2)
+ || !register_conversion(IMG_YUY2, IMG_GRAY8, yuy2_gray8_sse2)
+ || !register_conversion(IMG_UYVY, IMG_GRAY8, uyvy_gray8_sse2)
+ || !register_conversion(IMG_YVYU, IMG_GRAY8, yuy2_gray8_sse2)
+ || !register_conversion(IMG_Y8, IMG_GRAY8, yuvp_gray8_sse2)
+ ) {
+ return 0;
+ }
+ }
+#endif
+
+ return 1;
+}
+
+/*************************************************************************/
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/imgconvert.c b/debian/transcode/transcode-1.1.7/aclib/imgconvert.c
new file mode 100644
index 00000000..cc502977
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/imgconvert.c
@@ -0,0 +1,119 @@
+/*
+ * imgconvert.c - image format conversion routines
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#include "ac.h"
+#include "imgconvert.h"
+#include "img_internal.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/*************************************************************************/
+
+static struct {
+ ImageFormat srcfmt, destfmt;
+ ConversionFunc func;
+} *conversions;
+static int n_conversions = 0;
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Image conversion routine. src and dest are arrays of pointers to planes
+ * (for packed formats with only one plane, just use `&data'); srcfmt and
+ * destfmt specify the source and destination image formats (IMG_*).
+ * width and height are in pixels. Returns 1 on success, 0 on failure. */
+
+int ac_imgconvert(uint8_t **src, ImageFormat srcfmt,
+ uint8_t **dest, ImageFormat destfmt,
+ int width, int height)
+{
+ int i;
+
+ /* Hack to handle YV12 easily, because conversion routines don't get
+ * format tags */
+ uint8_t *newsrc[3], *newdest[3];
+ if (srcfmt == IMG_YV12) {
+ srcfmt = IMG_YUV420P;
+ newsrc[0] = src[0];
+ newsrc[1] = src[2];
+ newsrc[2] = src[1];
+ src = newsrc;
+ }
+ if (destfmt == IMG_YV12) {
+ destfmt = IMG_YUV420P;
+ newdest[0] = dest[0];
+ newdest[1] = dest[2];
+ newdest[2] = dest[1];
+ dest = newdest;
+ }
+
+ for (i = 0; i < n_conversions; i++) {
+ if (conversions[i].srcfmt==srcfmt && conversions[i].destfmt==destfmt)
+ return (*conversions[i].func)(src, dest, width, height);
+ }
+
+ return 0;
+}
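+
+/* A typical call sequence, for illustration only (buffer names are made up,
+ * and the library must already have been initialized):
+ *
+ *     uint8_t *src[3], *dest[3];
+ *     YUV_INIT_PLANES(src, yuv420_buffer, IMG_YUV420P, width, height);
+ *     dest[0] = rgb_buffer;          // packed formats use a single plane
+ *     if (!ac_imgconvert(src, IMG_YUV420P, dest, IMG_RGB24, width, height))
+ *         return -1;                 // no conversion registered
+ */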
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Internal use only! */
+
+int ac_imgconvert_init(int accel)
+{
+ if (!ac_imgconvert_init_yuv_planar(accel)
+ || !ac_imgconvert_init_yuv_packed(accel)
+ || !ac_imgconvert_init_yuv_mixed(accel)
+ || !ac_imgconvert_init_yuv_rgb(accel)
+ || !ac_imgconvert_init_rgb_packed(accel)
+ ) {
+        fprintf(stderr, "ac_imgconvert_init() failed\n");
+ return 0;
+ }
+ return 1;
+}
+
+int register_conversion(ImageFormat srcfmt, ImageFormat destfmt,
+ ConversionFunc function)
+{
+ int i;
+
+ for (i = 0; i < n_conversions; i++) {
+ if (conversions[i].srcfmt==srcfmt && conversions[i].destfmt==destfmt) {
+ conversions[i].func = function;
+ return 1;
+ }
+ }
+
+    {
+        void *newtable = realloc(conversions,
+                                 (n_conversions+1) * sizeof(*conversions));
+        if (!newtable) {  /* leave the existing table intact on failure */
+            fprintf(stderr, "register_conversion(): out of memory\n");
+            return 0;
+        }
+        conversions = newtable;
+    }
+ conversions[n_conversions].srcfmt = srcfmt;
+ conversions[n_conversions].destfmt = destfmt;
+ conversions[n_conversions].func = function;
+ n_conversions++;
+ return 1;
+}
+
+/*************************************************************************/
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/imgconvert.h b/debian/transcode/transcode-1.1.7/aclib/imgconvert.h
new file mode 100644
index 00000000..c02d5a01
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/imgconvert.h
@@ -0,0 +1,105 @@
+/*
+ * imgconvert.h - defines for image format conversion routines
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#ifndef ACLIB_IMGCONVERT_H
+#define ACLIB_IMGCONVERT_H
+
+/*************************************************************************/
+
+/* Image format defines */
+typedef enum {
+ IMG_UNKNOWN = 0, /* Unknown/unset (dummy value, guaranteed to be 0) */
+ /* YUV formats */
+ IMG_YUV_BASE = 0x1000,
+ IMG_YUV420P, /* YUV planar, 1 U/V per 2x2 Y pixels */
+ IMG_YV12, /* YUV420P with U and V reversed */
+ IMG_YUV411P, /* YUV planar, 1 U/V per 4x1 Y pixels */
+ IMG_YUV422P, /* YUV planar, 1 U/V per 2x1 Y pixels */
+ IMG_YUV444P, /* YUV planar, 1 U/V per 1x1 Y pixels */
+ IMG_YUY2, /* YUV packed, 1 U/V per 2x1 Y pixels, Y:U:Y:V */
+ IMG_UYVY, /* YUV packed, 1 U/V per 2x1 Y pixels, U:Y:V:Y */
+ IMG_YVYU, /* YUV packed, 1 U/V per 2x1 Y pixels, Y:V:Y:U */
+ IMG_Y8, /* Y-only 8-bit data */
+ IMG_YUV_LAST,
+ /* RGB formats */
+ IMG_RGB_BASE = 0x2000,
+ IMG_RGB24, /* RGB packed, 8 bits per component, R:G:B */
+ IMG_BGR24, /* RGB packed, 8 bits per component, B:G:R */
+ IMG_RGBA32, /* RGB+alpha packed, 8 bits per component, R:G:B:A */
+ IMG_ABGR32, /* RGB+alpha packed, 8 bits per component, A:B:G:R */
+ IMG_ARGB32, /* RGB+alpha packed, 8 bits per component, A:R:G:B */
+ IMG_BGRA32, /* RGB+alpha packed, 8 bits per component, B:G:R:A */
+ IMG_GRAY8, /* Grayscale 8-bit data */
+ IMG_RGB_LAST,
+} ImageFormat;
+
+/* Alias */
+#define IMG_NONE IMG_UNKNOWN
+
+/* Default YUV and RGB formats */
+#define IMG_YUV_DEFAULT IMG_YUV420P
+#define IMG_RGB_DEFAULT IMG_RGB24
+
+/* Is the given image format a YUV/RGB one? */
+#define IS_YUV_FORMAT(fmt) ((fmt) > IMG_YUV_BASE && (fmt) < IMG_YUV_LAST)
+#define IS_RGB_FORMAT(fmt) ((fmt) > IMG_RGB_BASE && (fmt) < IMG_RGB_LAST)
+
+/* U/V plane size for YUV planar formats (Y plane size is always w*h) */
+#define UV_PLANE_SIZE(fmt,w,h) \
+ ((fmt)==IMG_YUV420P ? ((w)/2)*((h)/2) : \
+ (fmt)==IMG_YV12 ? ((w)/2)*((h)/2) : \
+ (fmt)==IMG_YUV411P ? ((w)/4)* (h) : \
+ (fmt)==IMG_YUV422P ? ((w)/2)* (h) : \
+ (fmt)==IMG_YUV444P ? (w) * (h) : 0)
+
+/* Macro to initialize an array of planes from a buffer */
+#define YUV_INIT_PLANES(planes,buffer,fmt,w,h) \
+ ((planes)[0] = (buffer), \
+ (planes)[1] = (planes)[0] + (w)*(h), \
+ (planes)[2] = (planes)[1] + UV_PLANE_SIZE((fmt),(w),(h)))
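+
+/* Example (illustrative): splitting one contiguous YUV422P buffer into its
+ * three planes.
+ *
+ *     uint8_t *planes[3];
+ *     YUV_INIT_PLANES(planes, buffer, IMG_YUV422P, width, height);
+ *     // planes[0]: Y (w*h bytes); planes[1]/planes[2]: U/V
+ *     //            (UV_PLANE_SIZE(IMG_YUV422P,w,h) bytes each)
+ */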
+
+#if 0
+/* Structure describing an image. FIXME: not currently used--this should
+ * eventually replace the (planes,format) pairs passed to ac_imgconvert. */
+typedef struct {
+ ImageFormat format; /* Format of image data */
+ int width, height; /* Size of image */
+ uint8_t *planes[4]; /* Data planes (use planes[0] for packed data) */
+ int stride[4]; /* Length of one row in each plane, incl. padding */
+} Image;
+#endif
+
+/*************************************************************************/
+
+/* Initialization routine. Returns 1 on success, 0 on failure. */
+extern int ac_imgconvert_init(int accel);
+
+/* Conversion routine. Returns 1 on success, 0 on failure. */
+extern int ac_imgconvert(uint8_t **src, /* Array of source planes */
+ ImageFormat srcfmt, /* Source image format */
+ uint8_t **dest, /* Array of dest planes */
+ ImageFormat destfmt, /* Destination image format */
+ int width, /* Image width in pixels */
+ int height /* Image height in pixels */
+ );
+
+/*************************************************************************/
+
+#endif /* ACLIB_IMGCONVERT_H */
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/memcpy.c b/debian/transcode/transcode-1.1.7/aclib/memcpy.c
new file mode 100644
index 00000000..05cdf41c
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/memcpy.c
@@ -0,0 +1,543 @@
+/*
+ * memcpy.c - optimized memcpy() routines for aclib
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#include "ac.h"
+#include "ac_internal.h"
+#include <string.h>
+
+/* Use memmove because memcpy isn't guaranteed to be ascending */
+static void *(*memcpy_ptr)(void *, const void *, size_t) = memmove;
+
+/*************************************************************************/
+
+/* External interface */
+
+void *ac_memcpy(void *dest, const void *src, size_t size)
+{
+ return (*memcpy_ptr)(dest, src, size);
+}
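+
+/* Usage is the same as memcpy() for non-overlapping buffers; a sketch
+ * (buffer names are made up):
+ *
+ *     ac_memcpy(dest_frame, src_frame, (size_t)width * height * 3);
+ */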
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Note the check for ARCH_X86 here: this is to prevent compilation of this
+ * code on x86_64, since all x86_64 processors support SSE2, and because
+ * this code is not set up to use the 64-bit registers for addressing on
+ * x86_64. */
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
+
+/* MMX-optimized routine, intended for PMMX/PII processors.
+ * Nonstandard instructions used:
+ * (CPUID.MMX) MOVQ
+ */
+
+static void *memcpy_mmx(void *dest, const void *src, size_t bytes)
+{
+ asm("\
+PENTIUM_LINE_SIZE = 32 # PMMX/PII cache line size \n\
+PENTIUM_CACHE_SIZE = 8192 # PMMX/PII total cache size \n\
+# Use only half because writes may touch the cache too (PII) \n\
+PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE) \n\
+ \n\
+ push %%ebx # Save PIC register \n\
+ push %%edi # Save destination for return value \n\
+ cld # MOVS* should ascend \n\
+ \n\
+ mov $64, %%ebx # Constant \n\
+ \n\
+ cmp %%ebx, %%ecx \n\
+ jb mmx.memcpy_last # Just use movs if <64 bytes \n\
+ \n\
+ # First align destination address to a multiple of 8 bytes \n\
+ mov $8, %%eax # EAX <- (8-dest) & 7 \n\
+ sub %%edi, %%eax \n\
+ and $7, %%eax # ... which is the number of bytes to copy\n"
+#ifdef ACLIB_DISABLE_X86_TEXTRELS // Because "lea 0f" requires a textrel
+" xchg %%eax, %%ecx \n\
+ mov %%ecx, %%edx \n\
+ repz movsb \n\
+ mov %%eax, %%ecx \n\
+ mov %%edx, %%eax \n"
+#else
+" lea 0f, %%edx # Use a computed jump--faster than a loop\n\
+ sub %%eax, %%edx \n\
+ jmp *%%edx # Execute 0-7 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n"
+#endif
+"0: sub %%eax, %%ecx # Update count \n\
+ \n\
+ # Now copy data in blocks \n\
+0: mov %%ecx, %%edx # EDX <- ECX >> 6 (cache lines to copy) \n\
+ shr $6, %%edx \n\
+ jz mmx.memcpy_last # <64 bytes left? Skip to end \n\
+ cmp $PENTIUM_CACHE_BLOCK/64, %%edx \n\
+ jb 1f # Limit size of block \n\
+ mov $PENTIUM_CACHE_BLOCK/64, %%edx \n\
+1: mov %%edx, %%eax # EAX <- EDX << 6 (bytes to copy) \n\
+ shl $6, %%eax \n\
+ sub %%eax, %%ecx # Update remaining count \n\
+ add %%eax, %%esi # Point to end of region to be block-copied\n\
+2: test %%eax, -32(%%esi) # Touch each cache line in reverse order\n\
+ test %%eax, -64(%%esi) \n\
+ sub %%ebx, %%esi # Update pointer \n\
+ sub %%ebx, %%eax # And loop \n\
+ jnz 2b \n\
+ # Note that ESI now points to the beginning of the block \n\
+3: movq (%%esi), %%mm0 # Do the actual copy, 64 bytes at a time\n\
+ movq 8(%%esi), %%mm1 \n\
+ movq 16(%%esi), %%mm2 \n\
+ movq 24(%%esi), %%mm3 \n\
+ movq 32(%%esi), %%mm4 \n\
+ movq 40(%%esi), %%mm5 \n\
+ movq 48(%%esi), %%mm6 \n\
+ movq 56(%%esi), %%mm7 \n\
+ movq %%mm0, (%%edi) \n\
+ movq %%mm1, 8(%%edi) \n\
+ movq %%mm2, 16(%%edi) \n\
+ movq %%mm3, 24(%%edi) \n\
+ movq %%mm4, 32(%%edi) \n\
+ movq %%mm5, 40(%%edi) \n\
+ movq %%mm6, 48(%%edi) \n\
+ movq %%mm7, 56(%%edi) \n\
+ add %%ebx, %%esi # Update pointers \n\
+ add %%ebx, %%edi \n\
+ dec %%edx # And loop \n\
+ jnz 3b \n\
+ jmp 0b \n\
+ \n\
+mmx.memcpy_last: \n\
+ # Copy last <64 bytes, using the computed jump trick \n\
+ mov %%ecx, %%eax # EAX <- ECX>>2 \n\
+ shr $2, %%eax \n"
+#ifdef ACLIB_DISABLE_X86_TEXTRELS
+" xchg %%eax, %%ecx \n\
+ repz movsd \n\
+ mov %%eax, %%ecx \n"
+#else
+" lea 0f, %%edx \n\
+ sub %%eax, %%edx \n\
+ jmp *%%edx # Execute 0-15 MOVSD's \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n"
+#endif
+"0: and $3, %%ecx # ECX <- ECX & 3 \n"
+#ifdef ACLIB_DISABLE_X86_TEXTRELS
+" repz movsb \n"
+#else
+" lea 0f, %%edx \n\
+ sub %%ecx, %%edx \n\
+ jmp *%%edx # Execute 0-3 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n"
+#endif
+"0: \n\
+ # All done! \n\
+ emms # Clean up MMX state \n\
+ pop %%edi # Restore destination (return value) \n\
+ pop %%ebx # Restore PIC register \n\
+ " : /* no outputs */
+ : "D" (dest), "S" (src), "c" (bytes)
+ : "%eax", "%edx"
+ );
+ return dest;
+}
+
+#endif /* HAVE_ASM_MMX && ARCH_X86 */
+
+/*************************************************************************/
+
+#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)
+
+/* SSE-optimized routine. Backported from AMD64 routine below.
+ * Nonstandard instructions used:
+ * (CPUID.CMOVE) CMOVA
+ * (CPUID.MMX) MOVQ
+ * (CPUID.SSE) MOVNTQ
+ */
+
+static void *memcpy_sse(void *dest, const void *src, size_t bytes)
+{
+ asm("\
+ push %%ebx # Save PIC register \n\
+ push %%edi # Save destination for return value \n\
+ cld # MOVS* should ascend \n\
+ \n\
+ cmp $64, %%ecx # Skip block copy for small blocks \n\
+ jb sse.memcpy_last \n\
+ \n\
+ mov $128, %%ebx # Constant used later \n\
+ \n\
+ # First align destination address to a multiple of 8 bytes \n\
+ mov $8, %%eax # EAX <- (8-dest) & 7 \n\
+ sub %%edi, %%eax \n\
+ and $7, %%eax # ... which is the number of bytes to copy\n"
+#ifdef ACLIB_DISABLE_X86_TEXTRELS
+" xchg %%eax, %%ecx \n\
+ mov %%ecx, %%edx \n\
+ repz movsb \n\
+ mov %%eax, %%ecx \n\
+ mov %%edx, %%eax \n"
+#else
+" lea 0f, %%edx # Use a computed jump--faster than a loop\n\
+ sub %%eax, %%edx \n\
+ jmp *%%edx # Execute 0-7 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n"
+#endif
+"0: sub %%eax, %%ecx # Update count \n\
+ \n\
+ cmp $0x10040, %%ecx # Is this a large block? (0x10040 is an \n\
+ # arbitrary value where prefetching and \n\
+ # write combining seem to start becoming\n\
+ # faster) \n\
+ jae sse.memcpy_bp # Yup, use prefetch copy \n\
+ \n\
+sse.memcpy_small: # Small block copy routine--no prefetch \n"
+#if 0
+" mov %%ecx, %%edx # EDX <- bytes to copy / 8 \n\
+ shr $3, %%edx \n\
+ mov %%edx, %%eax # Leave remainder in ECX for later \n\
+ shl $3, %%eax \n\
+ sub %%eax, %%ecx \n\
+ .balign 16 \n\
+0: movq (%%esi), %%mm0 # Copy 8 bytes of data \n\
+ movq %%mm0, (%%edi) \n\
+ add $8, %%esi # Update pointers \n\
+ add $8, %%edi \n\
+ dec %%edx # And loop \n\
+ jg 0b \n\
+ jmp sse.memcpy_last # Copy any remaining bytes \n\
+ \n\
+ nop # Align loops below \n"
+#else
+" # It appears that a simple rep movs is faster than cleverness \n\
+ # with movq... \n\
+ mov %%ecx, %%edx # EDX <- ECX & 3 \n\
+ and $3, %%edx \n\
+ shr $2, %%ecx # ECX <- ECX >> 2 \n\
+ rep movsl # Copy away! \n\
+ mov %%edx, %%ecx # Take care of last 0-3 bytes \n\
+ rep movsb \n\
+ jmp sse.memcpy_end # And exit \n\
+ \n\
+ .balign 16 \n\
+ nop \n\
+ nop \n"
+#endif
+"sse.memcpy_bp: # Block prefetch copy routine \n\
+0: mov %%ecx, %%edx # EDX: temp counter \n\
+ shr $6, %%edx # Divide by cache line size (64 bytes) \n\
+ cmp %%ebx, %%edx # ... and cap at 128 (8192 bytes) \n\
+ cmova %%ebx, %%edx \n\
+ shl $3, %%edx # EDX <- cache lines to copy * 8 \n\
+ mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\
+ # (also used as memory offset) \n\
+1: test %%eax, -64(%%esi,%%eax,8) # Preload cache lines in pairs \n\
+ test %%eax, -128(%%esi,%%eax,8) # (going backwards) \n\
+ # (note that test %%eax,... seems to be faster than prefetchnta \n\
+ # on x86) \n\
+ sub $16, %%eax # And loop \n\
+ jg 1b \n\
+ \n\
+ # Then copy--forward, which seems to be faster than reverse for \n\
+ # certain alignments \n\
+ xor %%eax, %%eax \n\
+2: movq (%%esi,%%eax,8), %%mm0 # Copy 8 bytes and loop \n\
+ movntq %%mm0, (%%edi,%%eax,8) \n\
+ inc %%eax \n\
+ cmp %%edx, %%eax \n\
+ jb 2b \n\
+ \n\
+ # Finally, update pointers and count, and loop \n\
+ shl $3, %%edx # EDX <- bytes copied \n\
+ add %%edx, %%esi \n\
+ add %%edx, %%edi \n\
+ sub %%edx, %%ecx \n\
+ cmp $64, %%ecx # At least one cache line left? \n\
+ jae 0b # Yup, loop \n\
+ \n\
+sse.memcpy_last: \n\
+ # Copy last <64 bytes, using the computed jump trick \n\
+ mov %%ecx, %%eax # EAX <- ECX>>2 \n\
+ shr $2, %%eax \n"
+#ifdef ACLIB_DISABLE_X86_TEXTRELS
+" xchg %%eax, %%ecx \n\
+ repz movsd \n\
+ mov %%eax, %%ecx \n"
+#else
+" lea 0f, %%edx \n\
+ sub %%eax, %%edx \n\
+ jmp *%%edx # Execute 0-15 MOVSD's \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n"
+#endif
+"0: and $3, %%ecx # ECX <- ECX & 3 \n"
+#ifdef ACLIB_DISABLE_X86_TEXTRELS
+" repz movsb \n"
+#else
+" lea sse.memcpy_end, %%edx \n\
+ sub %%ecx, %%edx \n\
+ jmp *%%edx # Execute 0-3 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n"
+#endif
+" \n\
+sse.memcpy_end: \n\
+ # All done! \n\
+ emms # Clean up after MMX instructions \n\
+ sfence # Flush the write buffer \n\
+ pop %%edi # Restore destination (return value) \n\
+ pop %%ebx # Restore PIC register \n\
+ " : /* no outputs */
+ : "D" (dest), "S" (src), "c" (bytes)
+ : "%eax", "%edx"
+ );
+ return dest;
+}
+
+#endif /* HAVE_ASM_SSE && ARCH_X86 */
+
+/*************************************************************************/
+
+#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)
+
+/* AMD64-optimized routine, using SSE2. Derived from AMD64 optimization
+ * guide section 5.13: Appropriate Memory Copying Routines.
+ * Nonstandard instructions used:
+ * (CPUID.CMOVE) CMOVA
+ * (CPUID.SSE2) MOVDQA, MOVDQU, MOVNTDQ
+ *
+ * Note that this routine will also run more or less as-is (modulo register
+ * names and label(%%rip) references) on x86 CPUs, but tests have shown the
+ * SSE1 version above to be faster.
+ */
+
+/* The block copying code--macroized because we use two versions of it
+ * depending on whether the source is 16-byte-aligned or not. Pass either
+ * movdqa or movdqu (unquoted) for the parameter. */
+#define AMD64_BLOCK_MEMCPY(movdq) \
+" # First prefetch (note that if we end on an odd number of cache \n\
+ # lines, we skip prefetching the last one--faster that way than \n\
+ # prefetching line by line or treating it as a special case) \n\
+0: mov %%ecx, %%edx # EDX: temp counter (always <32 bits) \n\
+ shr $6, %%edx # Divide by cache line size (64 bytes) \n\
+ cmp %%ebx, %%edx # ... and cap at 128 (8192 bytes) \n\
+ cmova %%ebx, %%edx \n\
+ shl $3, %%edx # EDX <- cache lines to copy * 8 \n\
+ mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\
+ # (also used as memory offset) \n\
+1: prefetchnta -64(%%rsi,%%rax,8) # Preload cache lines in pairs \n\
+ prefetchnta -128(%%rsi,%%rax,8) # (going backwards) \n\
+ sub $16, %%eax # And loop \n\
+ jg 1b \n\
+ \n\
+ # Then copy--forward, which seems to be faster than reverse for \n\
+ # certain alignments \n\
+ xor %%eax, %%eax \n\
+2: " #movdq " (%%rsi,%%rax,8), %%xmm0 # Copy 16 bytes and loop \n\
+ movntdq %%xmm0, (%%rdi,%%rax,8) \n\
+ add $2, %%eax \n\
+ cmp %%edx, %%eax \n\
+ jb 2b \n\
+ \n\
+ # Finally, update pointers and count, and loop \n\
+ shl $3, %%edx # EDX <- bytes copied \n\
+ add %%rdx, %%rsi \n\
+ add %%rdx, %%rdi \n\
+ sub %%rdx, %%rcx \n\
+ cmp $64, %%rcx # At least one cache line left? \n\
+ jae 0b # Yup, loop \n"
+
+static void *memcpy_amd64(void *dest, const void *src, size_t bytes)
+{
+ asm("\
+ push %%rdi # Save destination for return value \n\
+ cld # MOVS* should ascend \n\
+ \n\
+ cmp $64, %%rcx # Skip block copy for small blocks \n\
+ jb amd64.memcpy_last \n\
+ \n\
+ mov $128, %%ebx # Constant used later \n\
+ \n\
+ # First align destination address to a multiple of 16 bytes \n\
+ mov $8, %%eax # EAX <- (8-dest) & 7 \n\
+ sub %%edi, %%eax # (we don't care about the top 32 bits) \n\
+ and $7, %%eax # ... which is the number of bytes to copy\n\
+ lea 0f(%%rip), %%rdx # Use a computed jump--faster than a loop\n\
+ sub %%rax, %%rdx \n\
+ jmp *%%rdx # Execute 0-7 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+0: sub %%rax, %%rcx # Update count \n\
+ test $8, %%edi # Is destination not 16-byte aligned? \n\
+ je 1f \n\
+ movsq # Then move 8 bytes to align it \n\
+ sub $8, %%rcx \n\
+ \n\
+1: cmp $0x38000, %%rcx # Is this a large block? (0x38000 is an \n\
+ # arbitrary value where prefetching and \n\
+ # write combining seem to start becoming\n\
+ # faster) \n\
+ jb amd64.memcpy_small # Nope, use small copy (no prefetch/WC) \n\
+ test $15, %%esi # Is source also 16-byte aligned? \n\
+ # (use ESI to save a REX prefix byte) \n\
+ jnz amd64.memcpy_normal_bp # Nope, use slow copy \n\
+ jmp amd64.memcpy_fast_bp # Yup, use fast copy \n\
+ \n\
+amd64.memcpy_small: # Small block copy routine--no prefetch \n\
+ mov %%ecx, %%edx # EDX <- bytes to copy / 16 \n\
+ shr $4, %%edx # (count known to fit in 32 bits) \n\
+ mov %%edx, %%eax # Leave remainder in ECX for later \n\
+ shl $4, %%eax \n\
+ sub %%eax, %%ecx \n\
+ .balign 16 \n\
+0: movdqu (%%rsi), %%xmm0 # Copy 16 bytes of data \n\
+ movdqa %%xmm0, (%%rdi) \n\
+ add $16, %%rsi # Update pointers \n\
+ add $16, %%rdi \n\
+ dec %%edx # And loop \n\
+ jnz 0b \n\
+ jmp amd64.memcpy_last # Copy any remaining bytes \n\
+ \n\
+ .balign 16 \n\
+ nop \n\
+ nop \n\
+amd64.memcpy_fast_bp: # Fast block prefetch loop \n"
+AMD64_BLOCK_MEMCPY(movdqa)
+" jmp amd64.memcpy_last # Copy any remaining bytes \n\
+ \n\
+ .balign 16 \n\
+ nop \n\
+ nop \n\
+amd64.memcpy_normal_bp: # Normal (unaligned) block prefetch loop\n"
+AMD64_BLOCK_MEMCPY(movdqu)
+" \n\
+amd64.memcpy_last: \n\
+ # Copy last <64 bytes, using the computed jump trick \n\
+ mov %%ecx, %%eax # EAX <- ECX>>3 \n\
+ shr $3, %%eax \n\
+ lea 0f(%%rip), %%rdx \n\
+ add %%eax, %%eax # Watch out, MOVSQ is 2 bytes! \n\
+ sub %%rax, %%rdx \n\
+ jmp *%%rdx # Execute 0-7 MOVSQ's \n\
+ movsq \n\
+ movsq \n\
+ movsq \n\
+ movsq \n\
+ movsq \n\
+ movsq \n\
+ movsq \n\
+0: and $7, %%ecx # ECX <- ECX & 7 \n\
+ lea 0f(%%rip), %%rdx \n\
+ sub %%rcx, %%rdx \n\
+ jmp *%%rdx # Execute 0-7 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+0: \n\
+ # All done! \n\
+ emms # Clean up after MMX instructions \n\
+ sfence # Flush the write buffer \n\
+ pop %%rdi # Restore destination (return value) \n\
+ " : /* no outputs */
+ : "D" (dest), "S" (src), "c" (bytes)
+ : "%rax", "%rbx", "%rdx"
+ );
+ return dest;
+}
+
+#endif /* HAVE_ASM_SSE2 && ARCH_X86_64 */
+
+/*************************************************************************/
+
+/* Initialization routine. */
+
+int ac_memcpy_init(int accel)
+{
+ memcpy_ptr = memmove;
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
+ if (HAS_ACCEL(accel, AC_MMX))
+ memcpy_ptr = memcpy_mmx;
+#endif
+
+#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)
+ if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE))
+ memcpy_ptr = memcpy_sse;
+#endif
+
+#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)
+ if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2))
+ memcpy_ptr = memcpy_amd64;
+#endif
+
+ return 1;
+}
+
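+/* Illustrative use (hypothetical caller; in transcode this is normally
+ * driven by the library's own initialization):
+ *
+ *     ac_memcpy_init(AC_CMOVE | AC_SSE);   // select memcpy_sse on x86
+ *     ac_memcpy(dest, src, nbytes);        // dispatches via memcpy_ptr
+ */
+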
+/*************************************************************************/
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/rescale.c b/debian/transcode/transcode-1.1.7/aclib/rescale.c
new file mode 100644
index 00000000..5a619735
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/rescale.c
@@ -0,0 +1,280 @@
+/*
+ * rescale.c -- take the weighted average of two sets of byte data
+ * Written by Andrew Church <[email protected]>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later). See the file COPYING
+ * for details.
+ */
+
+#include "ac.h"
+#include "ac_internal.h"
+
+static void rescale(const uint8_t *, const uint8_t *, uint8_t *, int,
+ uint32_t, uint32_t);
+static void (*rescale_ptr)(const uint8_t *, const uint8_t *, uint8_t *, int,
+ uint32_t, uint32_t) = rescale;
+
+/*************************************************************************/
+
+/* External interface */
+
+void ac_rescale(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes, uint32_t weight1, uint32_t weight2)
+{
+ if (weight1 >= 0x10000)
+ ac_memcpy(dest, src1, bytes);
+ else if (weight2 >= 0x10000)
+ ac_memcpy(dest, src2, bytes);
+ else
+ (*rescale_ptr)(src1, src2, dest, bytes, weight1, weight2);
+}
+
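+/* The weights are 16-bit fixed point: weight1 + weight2 is normally 0x10000
+ * (i.e. 1.0), and a weight of 0x10000 or more degenerates to a plain copy of
+ * that source.  For example (hypothetical call), a 25%/75% blend of two
+ * buffers would be:
+ *
+ *     ac_rescale(bufA, bufB, out, nbytes, 0x4000, 0xC000);
+ */
+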
+/*************************************************************************/
+/*************************************************************************/
+
+/* Vanilla C version */
+
+static void rescale(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes,
+ uint32_t weight1, uint32_t weight2)
+{
+ int i;
+ for (i = 0; i < bytes; i++)
+ dest[i] = (src1[i]*weight1 + src2[i]*weight2 + 32768) >> 16;
+}
+
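+/* Each output byte is a weighted sum with the weights in units of 1/65536;
+ * the +32768 rounds to nearest before shifting back down to 8 bits.  E.g.
+ * with src1[i]=200, src2[i]=100 and equal weights of 0x8000:
+ * (200*0x8000 + 100*0x8000 + 0x8000) >> 16 = 150. */
+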
+/*************************************************************************/
+
+/* MMX version */
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */
+
+static void rescale_mmx(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes,
+ uint32_t weight1, uint32_t weight2)
+{
+ if (bytes >= 8) {
+ /* First store weights in MM4/MM5 to relieve register pressure;
+ * save time by making 2 copies ahead of time in the general
+ * registers. Note that we divide by 2 for MMX due to the lack
+ * of an unsigned SIMD multiply instruction (PMULHUW). */
+ int half1 = weight1 / 2;
+ int half2 = weight2 / 2;
+ half2 += weight1 & weight2 & 1; // pick up the lost bit here
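+        // (if both weights are odd, half1 + half2 == (weight1+weight2)/2 - 1,
+        //  so the dropped 1 is added back to half2 to preserve the total)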
+ asm("movd %%eax, %%mm4; movd %%edx, %%mm5"
+ : : "a" (half1<<16|half1), "d" (half2<<16|half2));
+ asm("\
+ movq %%mm4, %%mm6 # MM6: 00 00 W1 W1 \n\
+ psllq $32, %%mm4 # MM4: W1 W1 00 00 \n\
+ por %%mm6, %%mm4 # MM4: W1 W1 W1 W1 \n\
+ movq %%mm5, %%mm7 # MM7: 00 00 W2 W2 \n\
+ psllq $32, %%mm5 # MM5: W2 W2 00 00 \n\
+ por %%mm7, %%mm5 # MM5: W2 W2 W2 W2 \n\
+ pxor %%mm7, %%mm7 # MM7: 00 00 00 00 \n\
+ pxor %%mm6, %%mm6 # Put 0x0020*4 in MM6 (rounding)\n\
+ pcmpeqw %%mm3, %%mm3 \n\
+ psubw %%mm3, %%mm6 \n\
+ psllw $5, %%mm6 \n\
+ 0: \n\
+ movq -8(%%esi,%%ecx), %%mm0 \n\
+ movq %%mm0, %%mm1 \n\
+ punpcklbw %%mm7, %%mm0 \n\
+ psllw $7, %%mm0 # 9.7 fixed point \n\
+ pmulhw %%mm4, %%mm0 # Multiply to get 10.6 fixed \n\
+ punpckhbw %%mm7, %%mm1 \n\
+ psllw $7, %%mm1 \n\
+ pmulhw %%mm4, %%mm1 \n\
+ movq -8(%%edx,%%ecx), %%mm2 \n\
+ movq %%mm2, %%mm3 \n\
+ punpcklbw %%mm7, %%mm2 \n\
+ psllw $7, %%mm2 \n\
+ pmulhw %%mm5, %%mm2 \n\
+ punpckhbw %%mm7, %%mm3 \n\
+ psllw $7, %%mm3 \n\
+ pmulhw %%mm5, %%mm3 \n\
+ paddw %%mm2, %%mm0 \n\
+ paddw %%mm6, %%mm0 \n\
+ psrlw $6, %%mm0 \n\
+ paddw %%mm3, %%mm1 \n\
+ paddw %%mm6, %%mm1 \n\
+ psrlw $6, %%mm1 \n\
+ packuswb %%mm1, %%mm0 \n\
+ movq %%mm0, -8(%%edi,%%ecx) \n\
+ subl $8, %%ecx \n\
+ jnz 0b \n\
+ emms"
+ : /* no outputs */
+ : "S" (src1), "d" (src2), "D" (dest), "c" (bytes & ~7));
+ }
+ if (UNLIKELY(bytes & 7)) {
+ rescale(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7),
+ bytes & 7, weight1, weight2);
+ }
+}
+
+#endif /* HAVE_ASM_MMX && ARCH_X86 */
+
+/*************************************************************************/
+
+/* MMXEXT version (also for SSE) */
+
+#if (defined(HAVE_ASM_MMXEXT) || defined(HAVE_ASM_SSE)) && defined(ARCH_X86)
+
+static void rescale_mmxext(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes,
+ uint32_t weight1, uint32_t weight2)
+{
+ if (bytes >= 8) {
+ asm("movd %%eax, %%mm4; movd %%edx, %%mm5"
+ : : "a" (weight1), "d" (weight2));
+ asm("\
+ pshufw $0, %%mm4, %%mm4 # MM4: W1 W1 W1 W1 \n\
+ pshufw $0, %%mm5, %%mm5 # MM5: W2 W2 W2 W2 \n\
+ pxor %%mm6, %%mm6 # Put 0x0080*4 in MM6 (rounding)\n\
+ pcmpeqw %%mm7, %%mm7 \n\
+ psubw %%mm7, %%mm6 \n\
+ psllw $7, %%mm6 \n\
+ 0: \n\
+ movq -8(%%esi,%%ecx), %%mm7 \n\
+ pxor %%mm0, %%mm0 # Load data into high bytes \n\
+ punpcklbw %%mm7, %%mm0 # (gives 8.8 fixed point) \n\
+ pmulhuw %%mm4, %%mm0 # Result: 0000..FF00 \n\
+ pxor %%mm1, %%mm1 \n\
+ punpckhbw %%mm7, %%mm1 \n\
+ pmulhuw %%mm4, %%mm1 \n\
+ movq -8(%%edx,%%ecx), %%mm7 \n\
+ pxor %%mm2, %%mm2 \n\
+ punpcklbw %%mm7, %%mm2 \n\
+ pmulhuw %%mm5, %%mm2 \n\
+ pxor %%mm3, %%mm3 \n\
+ punpckhbw %%mm7, %%mm3 \n\
+ pmulhuw %%mm5, %%mm3 \n\
+ paddw %%mm2, %%mm0 \n\
+ paddw %%mm6, %%mm0 \n\
+ psrlw $8, %%mm0 # Shift back down to 00..FF \n\
+ paddw %%mm3, %%mm1 \n\
+ paddw %%mm6, %%mm1 \n\
+ psrlw $8, %%mm1 \n\
+ packuswb %%mm1, %%mm0 \n\
+ movq %%mm0, -8(%%edi,%%ecx) \n\
+ subl $8, %%ecx \n\
+ jnz 0b \n\
+ emms"
+ : /* no outputs */
+ : "S" (src1), "d" (src2), "D" (dest), "c" (bytes & ~7));
+ }
+ if (UNLIKELY(bytes & 7)) {
+ rescale(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7),
+ bytes & 7, weight1, weight2);
+ }
+}
+
+#endif /* (HAVE_ASM_MMXEXT || HAVE_ASM_SSE) && ARCH_X86 */
+
+/*************************************************************************/
+
+/* SSE2 version */
+
+#if defined(HAVE_ASM_SSE2)
+
+#ifdef ARCH_X86_64
+# define ECX "%%rcx"
+# define EDX "%%rdx"
+# define ESI "%%rsi"
+# define EDI "%%rdi"
+#else
+# define ECX "%%ecx"
+# define EDX "%%edx"
+# define ESI "%%esi"
+# define EDI "%%edi"
+#endif
+
+static void rescale_sse2(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int bytes,
+ uint32_t weight1, uint32_t weight2)
+{
+ if (bytes >= 16) {
+ asm("movd %%eax, %%xmm4; movd %%edx, %%xmm5"
+ : : "a" (weight1<<16|weight1), "d" (weight2<<16|weight2));
+ asm("\
+ pshufd $0, %%xmm4, %%xmm4 # XMM4: W1 W1 W1 W1 W1 W1 W1 W1 \n\
+ pshufd $0, %%xmm5, %%xmm5 # XMM5: W2 W2 W2 W2 W2 W2 W2 W2 \n\
+            pxor %%xmm6, %%xmm6         # Put 0x0080*8 in XMM6 (rounding)\n\
+ pcmpeqw %%xmm7, %%xmm7 \n\
+ psubw %%xmm7, %%xmm6 \n\
+ psllw $7, %%xmm6 \n\
+ 0: \n\
+ movdqu -16("ESI","ECX"), %%xmm7 \n\
+ pxor %%xmm0, %%xmm0 \n\
+ punpcklbw %%xmm7, %%xmm0 \n\
+ pmulhuw %%xmm4, %%xmm0 \n\
+ pxor %%xmm1, %%xmm1 \n\
+ punpckhbw %%xmm7, %%xmm1 \n\
+ pmulhuw %%xmm4, %%xmm1 \n\
+ movdqu -16("EDX","ECX"), %%xmm7 \n\
+ pxor %%xmm2, %%xmm2 \n\
+ punpcklbw %%xmm7, %%xmm2 \n\
+ pmulhuw %%xmm5, %%xmm2 \n\
+ pxor %%xmm3, %%xmm3 \n\
+ punpckhbw %%xmm7, %%xmm3 \n\
+ pmulhuw %%xmm5, %%xmm3 \n\
+ paddw %%xmm2, %%xmm0 \n\
+ paddw %%xmm6, %%xmm0 \n\
+ psrlw $8, %%xmm0 \n\
+ paddw %%xmm3, %%xmm1 \n\
+ paddw %%xmm6, %%xmm1 \n\
+ psrlw $8, %%xmm1 \n\
+ packuswb %%xmm1, %%xmm0 \n\
+ movdqu %%xmm0, -16("EDI","ECX") \n\
+ subl $16, %%ecx \n\
+ jnz 0b \n\
+ emms"
+ : /* no outputs */
+ : "S" (src1), "d" (src2), "D" (dest), "c" (bytes & ~15));
+ }
+ if (UNLIKELY(bytes & 15)) {
+ rescale(src1+(bytes & ~15), src2+(bytes & ~15), dest+(bytes & ~15),
+ bytes & 15, weight1, weight2);
+ }
+}
+
+#endif /* HAVE_ASM_SSE2 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Initialization routine. */
+
+int ac_rescale_init(int accel)
+{
+ rescale_ptr = rescale;
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
+ if (HAS_ACCEL(accel, AC_MMX))
+ rescale_ptr = rescale_mmx;
+#endif
+#if (defined(HAVE_ASM_MMXEXT) || defined(HAVE_ASM_SSE)) && defined(ARCH_X86)
+ if (HAS_ACCEL(accel, AC_MMXEXT) || HAS_ACCEL(accel, AC_SSE))
+ rescale_ptr = rescale_mmxext;
+#endif
+#if defined(HAVE_ASM_SSE2)
+ if (HAS_ACCEL(accel, AC_SSE2))
+ rescale_ptr = rescale_sse2;
+#endif
+
+ return 1;
+}
+
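+/* Illustrative use (hypothetical caller; transcode normally initializes the
+ * aclib routines through its own startup path):
+ *
+ *     ac_rescale_init(0);                  // accel == 0 keeps the C fallback
+ *     ac_rescale(frame_a, frame_b, out,
+ *                width * height,           // one byte per 8-bit sample
+ *                0xC000, 0x4000);          // 75%/25% cross-fade
+ */
+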
+/*************************************************************************/
+
+/*
+ * Local variables:
+ * c-file-style: "stroustrup"
+ * c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/rgb-yuv-conv.pl b/debian/transcode/transcode-1.1.7/aclib/rgb-yuv-conv.pl
new file mode 100755
index 00000000..a2b6257c
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/rgb-yuv-conv.pl
@@ -0,0 +1,48 @@
+#!/usr/bin/perl -w
+# Calculate conversion matrices for RGB<->YUV given Kb and Kr
+
+die "Usage: $0 Kb Kr [scale]\n" if @ARGV < 2;
+$scale = $ARGV[2] || 1;
+$Kb = $ARGV[0];
+$Kr = $ARGV[1];
+$Kg = 1 - $Kr - $Kb;
+$a11 = $Kr;
+$a12 = $Kg;
+$a13 = $Kb;
+$a21 = -$Kr/(1-$Kb)/2;
+$a22 = -$Kg/(1-$Kb)/2;
+$a23 = 1/2;
+$a31 = 1/2;
+$a32 = -$Kg/(1-$Kr)/2;
+$a33 = -$Kb/(1-$Kr)/2;
+print "Y [R] = ".($a11*$scale)."\n";
+print "Y [G] = ".($a12*$scale)."\n";
+print "Y [B] = ".($a13*$scale)."\n";
+print "Cb[R] = ".($a21*$scale)."\n";
+print "Cb[G] = ".($a22*$scale)."\n";
+print "Cb[B] = ".($a23*$scale)."\n";
+print "Cr[R] = ".($a31*$scale)."\n";
+print "Cr[G] = ".($a32*$scale)."\n";
+print "Cr[B] = ".($a33*$scale)."\n";
+$det = $a11*$a22*$a33 - $a11*$a23*$a32
+ + $a12*$a23*$a31 - $a12*$a21*$a33
+ + $a13*$a21*$a32 - $a13*$a22*$a31;
+$b11 = (1/$det)*($a22*$a33-$a23*$a32);
+$b12 = (1/$det)*($a13*$a32-$a12*$a33);
+$b13 = (1/$det)*($a12*$a23-$a13*$a22);
+$b21 = (1/$det)*($a23*$a31-$a21*$a33);
+$b22 = (1/$det)*($a11*$a33-$a13*$a31);
+$b23 = (1/$det)*($a13*$a21-$a11*$a23);
+$b31 = (1/$det)*($a21*$a32-$a22*$a31);
+$b32 = (1/$det)*($a12*$a31-$a11*$a32);
+$b33 = (1/$det)*($a11*$a22-$a12*$a21);
+map {$_ = 0 if abs($_) < 1e-10} ($b11,$b12,$b13,$b21,$b22,$b23,$b31,$b32,$b33);
+print "R[Y ] = ".($b11*$scale)."\n";
+print "R[Cb] = ".($b12*$scale)."\n";
+print "R[Cr] = ".($b13*$scale)."\n";
+print "G[Y ] = ".($b21*$scale)."\n";
+print "G[Cb] = ".($b22*$scale)."\n";
+print "G[Cr] = ".($b23*$scale)."\n";
+print "B[Y ] = ".($b31*$scale)."\n";
+print "B[Cb] = ".($b32*$scale)."\n";
+print "B[Cr] = ".($b33*$scale)."\n";