author | Michele Calgaro <[email protected]> | 2020-09-11 14:38:47 +0900 |
---|---|---|
committer | Michele Calgaro <[email protected]> | 2020-09-11 14:38:47 +0900 |
commit | 884c8093d63402a1ad0b502244b791e3c6782be3 (patch) | |
tree | a600d4ab0d431a2bdfe4c15b70df43c14fbd8dd0 /debian/transcode/transcode-1.1.7/aclib | |
parent | 14e1aa2006796f147f3f4811fb908a6b01e79253 (diff) | |
download | extra-dependencies-884c8093d63402a1ad0b502244b791e3c6782be3.tar.gz extra-dependencies-884c8093d63402a1ad0b502244b791e3c6782be3.zip |
Added debian extra dependency packages.
Signed-off-by: Michele Calgaro <[email protected]>
Diffstat (limited to 'debian/transcode/transcode-1.1.7/aclib')
18 files changed, 8672 insertions, 0 deletions
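The headers added below (ac.h) expose a small runtime-dispatch API: ac_init() selects and installs accelerated implementations, ac_cpuinfo() reports what the CPU supports, and ac_memcpy()/ac_average() are the accelerated entry points. A minimal caller sketch, assuming the aclib/ directory from this patch is on the include path (hypothetical example, not part of the patch itself):

```c
/* Hypothetical caller -- illustrates the ac.h interface added in this commit. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include "aclib/ac.h"   /* assumed include path relative to the transcode tree */

int main(void)
{
    /* Ask for every acceleration; ac_init() masks the request against
     * ac_cpuinfo() and returns 1 on success, 0 on failure. */
    if (!ac_init(AC_ALL)) {
        fprintf(stderr, "aclib initialization failed\n");
        return 1;
    }
    printf("accel: %s\n", ac_flagstotext(ac_cpuinfo()));

    /* Accelerated memcpy and byte-wise averaging, as declared in ac.h. */
    uint8_t a[64], b[64], avg[64], copy[64];
    memset(a, 100, sizeof(a));
    memset(b, 200, sizeof(b));
    ac_memcpy(copy, a, sizeof(copy));
    ac_average(a, b, avg, sizeof(avg));  /* avg[i] == (100 + 200 + 1) / 2 == 150 */
    return 0;
}
```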
diff --git a/debian/transcode/transcode-1.1.7/aclib/Makefile.am b/debian/transcode/transcode-1.1.7/aclib/Makefile.am new file mode 100644 index 00000000..54951ce6 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/Makefile.am @@ -0,0 +1,27 @@ +# # Process this file with automake to produce Makefile.in. + +AM_CPPFLAGS = \ + $(PTHREAD_CFLAGS) \ + -I$(top_srcdir) + +noinst_LTLIBRARIES = libac.la + +libac_la_SOURCES = \ + accore.c \ + average.c \ + imgconvert.c \ + img_rgb_packed.c \ + img_yuv_mixed.c \ + img_yuv_packed.c \ + img_yuv_planar.c \ + img_yuv_rgb.c \ + memcpy.c \ + rescale.c + +EXTRA_DIST = \ + ac.h \ + ac_internal.h \ + imgconvert.h \ + img_internal.h \ + img_x86_common.h \ + rgb-yuv-conv.pl diff --git a/debian/transcode/transcode-1.1.7/aclib/Makefile.in b/debian/transcode/transcode-1.1.7/aclib/Makefile.in new file mode 100644 index 00000000..8f3a132a --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/Makefile.in @@ -0,0 +1,610 @@ +# Makefile.in generated by automake 1.11.1 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, +# Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# # Process this file with automake to produce Makefile.in. + +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +subdir = aclib +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \ + $(top_srcdir)/configure.in +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libac_la_LIBADD = +am_libac_la_OBJECTS = accore.lo average.lo imgconvert.lo \ + img_rgb_packed.lo img_yuv_mixed.lo img_yuv_packed.lo \ + img_yuv_planar.lo img_yuv_rgb.lo memcpy.lo rescale.lo +libac_la_OBJECTS = $(am_libac_la_OBJECTS) +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/autotools/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(CCLD) 
$(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +SOURCES = $(libac_la_SOURCES) +DIST_SOURCES = $(libac_la_SOURCES) +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +A52_CFLAGS = @A52_CFLAGS@ +A52_LIBS = @A52_LIBS@ +ACLIB_LIBS = @ACLIB_LIBS@ +ACLOCAL = @ACLOCAL@ +ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@ +AMTAR = @AMTAR@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AVILIB_LIBS = @AVILIB_LIBS@ +AWK = @AWK@ +BSDAV_CFLAGS = @BSDAV_CFLAGS@ +BSDAV_LIBS = @BSDAV_LIBS@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXXCPP = @CXXCPP@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLDARWIN_CFLAGS = @DLDARWIN_CFLAGS@ +DLDARWIN_LIBS = @DLDARWIN_LIBS@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FAAC_CFLAGS = @FAAC_CFLAGS@ +FAAC_LIBS = @FAAC_LIBS@ +FGREP = @FGREP@ +FREETYPE2_CFLAGS = @FREETYPE2_CFLAGS@ +FREETYPE2_LIBS = @FREETYPE2_LIBS@ +GREP = @GREP@ +IBP_LIBS = @IBP_LIBS@ +ICONV_CFLAGS = @ICONV_CFLAGS@ +ICONV_LIBS = @ICONV_LIBS@ +IMAGEMAGICK_CFLAGS = @IMAGEMAGICK_CFLAGS@ +IMAGEMAGICK_LIBS = @IMAGEMAGICK_LIBS@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LAME_CFLAGS = @LAME_CFLAGS@ +LAME_LIBS = @LAME_LIBS@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBAVCODEC_CFLAGS = @LIBAVCODEC_CFLAGS@ +LIBAVCODEC_LIBS = @LIBAVCODEC_LIBS@ +LIBAVFORMAT_CFLAGS = @LIBAVFORMAT_CFLAGS@ +LIBAVFORMAT_LIBS = @LIBAVFORMAT_LIBS@ +LIBDVDREAD_CFLAGS = @LIBDVDREAD_CFLAGS@ +LIBDVDREAD_LIBS = @LIBDVDREAD_LIBS@ +LIBDV_CFLAGS = @LIBDV_CFLAGS@ +LIBDV_LIBS = @LIBDV_LIBS@ +LIBJPEG_CFLAGS = @LIBJPEG_CFLAGS@ +LIBJPEG_LIBS = @LIBJPEG_LIBS@ +LIBMPEG2CONVERT_CFLAGS = @LIBMPEG2CONVERT_CFLAGS@ +LIBMPEG2CONVERT_LIBS = @LIBMPEG2CONVERT_LIBS@ +LIBMPEG2_CFLAGS = @LIBMPEG2_CFLAGS@ +LIBMPEG2_LIBS = @LIBMPEG2_LIBS@ +LIBOBJS = @LIBOBJS@ +LIBPOSTPROC_CFLAGS = @LIBPOSTPROC_CFLAGS@ +LIBPOSTPROC_LIBS = @LIBPOSTPROC_LIBS@ +LIBQUICKTIME_CFLAGS = @LIBQUICKTIME_CFLAGS@ +LIBQUICKTIME_LIBS = @LIBQUICKTIME_LIBS@ +LIBS = @LIBS@ +LIBTCAUDIO_LIBS = @LIBTCAUDIO_LIBS@ +LIBTCVIDEO_LIBS = @LIBTCVIDEO_LIBS@ +LIBTC_LIBS = @LIBTC_LIBS@ +LIBTOOL = @LIBTOOL@ +LIBV4L2_CFLAGS = @LIBV4L2_CFLAGS@ +LIBV4L2_LIBS = @LIBV4L2_LIBS@ +LIBV4LCONVERT_CFLAGS = @LIBV4LCONVERT_CFLAGS@ +LIBV4LCONVERT_LIBS = @LIBV4LCONVERT_LIBS@ +LIBXML2_CFLAGS = @LIBXML2_CFLAGS@ +LIBXML2_LIBS = @LIBXML2_LIBS@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LZO_CFLAGS = @LZO_CFLAGS@ +LZO_LIBS = @LZO_LIBS@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MJPEGTOOLS_CFLAGS = @MJPEGTOOLS_CFLAGS@ +MJPEGTOOLS_LIBS = @MJPEGTOOLS_LIBS@ +MKDIR_P = @MKDIR_P@ +MOD_PATH = @MOD_PATH@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OGG_CFLAGS = @OGG_CFLAGS@ +OGG_LIBS = @OGG_LIBS@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PATH_TO_AWK = @PATH_TO_AWK@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +PROF_PATH 
= @PROF_PATH@ +PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ +PTHREAD_LIBS = @PTHREAD_LIBS@ +PVM3_CFLAGS = @PVM3_CFLAGS@ +PVM3_LIBS = @PVM3_LIBS@ +PVM3_PVMGS = @PVM3_PVMGS@ +RANLIB = @RANLIB@ +SDL_CFLAGS = @SDL_CFLAGS@ +SDL_LIBS = @SDL_LIBS@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +SIMD_FLAGS = @SIMD_FLAGS@ +STRIP = @STRIP@ +THEORA_CFLAGS = @THEORA_CFLAGS@ +THEORA_LIBS = @THEORA_LIBS@ +USE_DLDARWIN = @USE_DLDARWIN@ +VERSION = @VERSION@ +VORBIS_CFLAGS = @VORBIS_CFLAGS@ +VORBIS_LIBS = @VORBIS_LIBS@ +WAVLIB_LIBS = @WAVLIB_LIBS@ +X264_CFLAGS = @X264_CFLAGS@ +X264_LIBS = @X264_LIBS@ +XIO_CFLAGS = @XIO_CFLAGS@ +XIO_LIBS = @XIO_LIBS@ +XMKMF = @XMKMF@ +XVID_CFLAGS = @XVID_CFLAGS@ +XVID_LIBS = @XVID_LIBS@ +X_CFLAGS = @X_CFLAGS@ +X_EXTRA_LIBS = @X_EXTRA_LIBS@ +X_LIBS = @X_LIBS@ +X_PRE_LIBS = @X_PRE_LIBS@ +a52_config = @a52_config@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +bsdav_config = @bsdav_config@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +faac_config = @faac_config@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +iconv_config = @iconv_config@ +imagemagick_config = @imagemagick_config@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +lame_config = @lame_config@ +libdir = @libdir@ +libdvdread_config = @libdvdread_config@ +libexecdir = @libexecdir@ +libjpeg_config = @libjpeg_config@ +libjpegmmx_config = @libjpegmmx_config@ +localedir = @localedir@ +localstatedir = @localstatedir@ +lzo_config = @lzo_config@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +pvm3_config = @pvm3_config@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +x_includes = @x_includes@ +x_libraries = @x_libraries@ +xvid_config = @xvid_config@ +AM_CPPFLAGS = \ + $(PTHREAD_CFLAGS) \ + -I$(top_srcdir) + +noinst_LTLIBRARIES = libac.la +libac_la_SOURCES = \ + accore.c \ + average.c \ + imgconvert.c \ + img_rgb_packed.c \ + img_yuv_mixed.c \ + img_yuv_packed.c \ + img_yuv_planar.c \ + img_yuv_rgb.c \ + memcpy.c \ + rescale.c + +EXTRA_DIST = \ + ac.h \ + ac_internal.h \ + imgconvert.h \ + img_internal.h \ + img_x86_common.h \ + rgb-yuv-conv.pl + +all: all-am + +.SUFFIXES: +.SUFFIXES: .c .lo .o .obj +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu aclib/Makefile'; 
\ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu aclib/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; for p in $$list; do \ + dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ + test "$$dir" != "$$p" || dir=.; \ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +libac.la: $(libac_la_OBJECTS) $(libac_la_DEPENDENCIES) + $(LINK) $(libac_la_OBJECTS) $(libac_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/accore.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/average.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_rgb_packed.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_mixed.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_packed.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_planar.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/img_yuv_rgb.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/imgconvert.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcpy.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rescale.Plo@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if 
test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + set x; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . 
= "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-noinstLTLIBRARIES ctags distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ + pdf pdf-am ps ps-am tags uninstall uninstall-am + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/debian/transcode/transcode-1.1.7/aclib/ac.h b/debian/transcode/transcode-1.1.7/aclib/ac.h new file mode 100644 index 00000000..d2a542b2 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/ac.h @@ -0,0 +1,107 @@ +/* + * ac.h -- main aclib include + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#ifndef ACLIB_AC_H +#define ACLIB_AC_H + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <stddef.h> +#include <stdint.h> +#include <sys/types.h> + +/*************************************************************************/ + +/* CPU acceleration support flags, for use with ac_init(): */ + +#define AC_IA32ASM 0x0001 /* x86-32: standard assembly (no MMX) */ +#define AC_AMD64ASM 0x0002 /* x86-64: standard assembly (no MMX) */ +#define AC_CMOVE 0x0004 /* x86: CMOVcc instruction */ +#define AC_MMX 0x0008 /* x86: MMX instructions */ +#define AC_MMXEXT 0x0010 /* x86: MMX extended instructions (AMD) */ +#define AC_3DNOW 0x0020 /* x86: 3DNow! instructions (AMD) */ +#define AC_3DNOWEXT 0x0040 /* x86: 3DNow! 
instructions (AMD) */ +#define AC_SSE 0x0080 /* x86: SSE instructions */ +#define AC_SSE2 0x0100 /* x86: SSE2 instructions */ +#define AC_SSE3 0x0200 /* x86: SSE3 instructions */ +#define AC_SSSE3 0x0400 /* x86: SSSE3 instructions */ +#define AC_SSE41 0x0800 /* x86: SSE4.1 instructions */ +#define AC_SSE42 0x1000 /* x86: SSE4.2 instructions (Intel) */ +#define AC_SSE4A 0x2000 /* x86: SSE4a instructions (AMD) */ +#define AC_SSE5 0x4000 /* x86: SSE5 instructions (AMD) */ + +#define AC_NONE 0 /* No acceleration (vanilla C functions) */ +#define AC_ALL (~0) /* All available acceleration */ + + +/* Endianness flag: */ +#define AC_LITTLE_ENDIAN 1 +#define AC_BIG_ENDIAN 2 + +/*************************************************************************/ + +/* Library initialization function--MUST be called before any other aclib + * functions are used! `accel' selects the accelerations to enable: + * AC_NONE, AC_ALL, or a combination of the other AC_* flags above. The + * value will always be masked to the acceleration options available on the + * actual CPU, as returned by ac_cpuinfo(). Returns 1 on success, 0 on + * failure. This function can be called multiple times to change the set + * of acceleration features to be used. */ +extern int ac_init(int accel); + +/* Returns the set of acceleration features supported by this CPU. */ +extern int ac_cpuinfo(void); + +/* Returns the endianness of this CPU (AC_BIG_ENDIAN or AC_LITTLE_ENDIAN). */ +extern int ac_endian(void); + +/* Utility routine to convert a set of flags to a descriptive string. The + * string is stored in a static buffer overwritten each call. */ +extern const char *ac_flagstotext(int accel); + +/* Utility routine to parse a comma-separate descriptive string to the + corrisponding flag. The reverse of ac_flagstotext. + Returns 1 on success, 0 on failure */ +extern int ac_parseflags(const char *text, int *accel); + +/*************************************************************************/ + +/* Acceleration-enabled functions: */ + +/* Optimized memcpy(). The copy direction is guaranteed to be ascending + * (so ac_memcpy(ptr, ptr+1, size) will work). */ +extern void *ac_memcpy(void *dest, const void *src, size_t size); + +/* Average of two sets of data */ +extern void ac_average(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes); + +/* Weighted average of two sets of data (weight1+weight2 should be 65536) */ +extern void ac_rescale(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, + uint32_t weight1, uint32_t weight2); + +/* Image format manipulation is available in aclib/imgconvert.h */ + +/*************************************************************************/ + +#endif /* ACLIB_AC_H */ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/ac_internal.h b/debian/transcode/transcode-1.1.7/aclib/ac_internal.h new file mode 100644 index 00000000..67a9c59f --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/ac_internal.h @@ -0,0 +1,42 @@ +/* + * ac_internal.h -- internal include file for aclib functions + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. 
+ */ + +#ifndef ACLIB_AC_INTERNAL_H +#define ACLIB_AC_INTERNAL_H + + +/* Compiler hint that a condition is unlikely */ +#ifdef __GNUC__ +# define UNLIKELY(x) (__builtin_expect((x) != 0, 0)) +#else +# define UNLIKELY(x) (x) +#endif + +/* Are _all_ of the given acceleration flags (`test') available? */ +#define HAS_ACCEL(accel,test) (((accel) & (test)) == (test)) + +/* Initialization subfunctions */ +extern int ac_average_init(int accel); +extern int ac_imgconvert_init(int accel); +extern int ac_memcpy_init(int accel); +extern int ac_rescale_init(int accel); + + +#endif /* ACLIB_AC_INTERNAL_H */ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/accore.c b/debian/transcode/transcode-1.1.7/aclib/accore.c new file mode 100644 index 00000000..ec7ea2dd --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/accore.c @@ -0,0 +1,320 @@ +/* + * accore.c -- core aclib functions + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "ac_internal.h" +#include "imgconvert.h" + +#include <stdio.h> +#include <string.h> + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static int cpuinfo_x86(void); +#endif + +/*************************************************************************/ + +/* Library initialization function. Determines CPU features, then calls + * all initialization subfunctions with appropriate flags. Returns 1 on + * success, 0 on failure. This function can be called multiple times to + * change the set of acceleration features to be used. */ + +int ac_init(int accel) +{ + accel &= ac_cpuinfo(); + if (!ac_average_init(accel) + || !ac_imgconvert_init(accel) + || !ac_memcpy_init(accel) + || !ac_rescale_init(accel) + ) { + return 0; + } + return 1; +} + +/*************************************************************************/ + +/* Returns the set of acceleration features supported by this CPU. */ + +int ac_cpuinfo(void) +{ +#if defined(ARCH_X86) || defined(ARCH_X86_64) + return cpuinfo_x86(); +#else + return 0; +#endif +} + +/*************************************************************************/ + +/* Returns the endianness of this CPU (AC_BIG_ENDIAN or AC_LITTLE_ENDIAN). */ + +int ac_endian(void) +{ + volatile int test; + + test = 1; + if (*((uint8_t *)&test)) + return AC_LITTLE_ENDIAN; + else + return AC_BIG_ENDIAN; +} + +/*************************************************************************/ + +/* Utility routine to convert a set of flags to a descriptive string. The + * string is stored in a static buffer overwritten each call. `filter' + * selects whether to filter out flags not supported by the CPU. */ + +const char *ac_flagstotext(int accel) +{ + static char retbuf[1000]; + if (!accel) + return "none"; + snprintf(retbuf, sizeof(retbuf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + accel & AC_SSE5 ? " sse5" : "", + accel & AC_SSE4A ? " sse4a" : "", + accel & AC_SSE42 ? " sse42" : "", + accel & AC_SSE41 ? " sse41" : "", + accel & AC_SSSE3 ? " ssse3" : "", + accel & AC_SSE3 ? " sse3" : "", + accel & AC_SSE2 ? " sse2" : "", + accel & AC_SSE ? " sse" : "", + accel & AC_3DNOWEXT ? " 3dnowext" : "", + accel & AC_3DNOW ? 
" 3dnow" : "", + accel & AC_MMXEXT ? " mmxext" : "", + accel & AC_MMX ? " mmx" : "", + accel & AC_CMOVE ? " cmove" : "", + accel & (AC_IA32ASM|AC_AMD64ASM) ? " asm" : ""); + return *retbuf ? retbuf+1 : retbuf; /* skip initial space */ +} + +/* Utility routine to parse a comma-separate descriptive string to the + corrisponding flag. The reverse of ac_flagstotext. + Returns 1 on success, 0 on failure */ + +#define AC_FLAG_LEN 16 + +int ac_parseflags(const char *text, int *accel) +{ + int parsed = 1, done = 0; + if (!text || !accel) + return 0; +#if defined(ARCH_X86) || defined(ARCH_X86_64) + *accel = 0; + + while (parsed && !done) { + char buf[AC_FLAG_LEN + 1] = { '\0' }; + const char *comma = strchr(text, ','); + if (!comma) { + strncpy(buf, text, AC_FLAG_LEN); + done = 1; + } else { + /* parse the remaining and exit*/ + size_t len = (comma - text); + if (len > AC_FLAG_LEN) + len = AC_FLAG_LEN; + strncpy(buf, text, len); + } +//fprintf(stderr, "(%s) buf=[%s]\n", __func__, buf); + if (strcasecmp(buf, "C") == 0) // dummy for "no accel" + *accel |= 0; +#ifdef ARCH_X86 + else if (strcasecmp(buf, "asm" ) == 0) + *accel |= AC_IA32ASM; +#endif +#ifdef ARCH_X86_64 + else if (strcasecmp(buf, "asm" ) == 0) + *accel |= AC_AMD64ASM; +#endif + else if (strcasecmp(buf, "mmx" ) == 0) + *accel |= AC_MMX; + else if (strcasecmp(buf, "mmxext" ) == 0) + *accel |= AC_MMXEXT; + else if (strcasecmp(buf, "3dnow" ) == 0) + *accel |= AC_3DNOW; + else if (strcasecmp(buf, "3dnowext") == 0) + *accel |= AC_3DNOWEXT; + else if (strcasecmp(buf, "sse" ) == 0) + *accel |= AC_SSE; + else if (strcasecmp(buf, "sse2" ) == 0) + *accel |= AC_SSE2; + else if (strcasecmp(buf, "sse3" ) == 0) + *accel |= AC_SSE3; + else if (strcasecmp(buf, "ssse3" ) == 0) + *accel |= AC_SSSE3; + else if (strcasecmp(buf, "sse41" ) == 0) + *accel |= AC_SSE41; + else if (strcasecmp(buf, "sse42" ) == 0) + *accel |= AC_SSE42; + else if (strcasecmp(buf, "sse4a" ) == 0) + *accel |= AC_SSE4A; + else if (strcasecmp(buf, "sse5" ) == 0) + *accel |= AC_SSE5; + else + parsed = 0; + text = comma + 1; + } +#endif + return parsed; +} + +#undef AC_FLAG_LEN + +/*************************************************************************/ +/*************************************************************************/ + +/* Private functions to return acceleration flags corresponding to available + * CPU features for various CPUs. Currently only x86 is supported. */ + +/*************************************************************************/ + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + +#ifdef ARCH_X86_64 +# define EAX "%%rax" +# define EBX "%%rbx" +# define ESI "%%rsi" +# define PUSHF "pushfq" +# define POPF "popfq" +#else +# define EAX "%%eax" +# define EBX "%%ebx" +# define ESI "%%esi" +# define PUSHF "pushfl" +# define POPF "popfl" +#endif + +/* Macro to execute the CPUID instruction with EAX = func. Results are + * placed in ret_a (EAX), ret_b (EBX), ret_c (ECX), and ret_d (EDX), which + * must be lvalues. Note that we save and restore EBX (RBX on x86-64) + * because it is the PIC register. */ +#define CPUID(func,ret_a,ret_b,ret_c,ret_d) \ + asm("mov "EBX", "ESI"; cpuid; xchg "EBX", "ESI \ + : "=a" (ret_a), "=S" (ret_b), "=c" (ret_c), "=d" (ret_d) \ + : "a" (func)) + +/* Various CPUID flags. The second word of the macro name indicates the + * function (1: function 1, X1: function 0x80000001) and register (D: EDX) + * to which the value belongs. 
*/ +#define CPUID_1D_CMOVE (1UL<<15) +#define CPUID_1D_MMX (1UL<<23) +#define CPUID_1D_SSE (1UL<<25) +#define CPUID_1D_SSE2 (1UL<<26) +#define CPUID_1C_SSE3 (1UL<< 0) +#define CPUID_1C_SSSE3 (1UL<< 9) +#define CPUID_1C_SSE41 (1UL<<19) +#define CPUID_1C_SSE42 (1UL<<20) +#define CPUID_X1D_AMD_MMXEXT (1UL<<22) /* AMD only */ +#define CPUID_X1D_AMD_3DNOW (1UL<<31) /* AMD only */ +#define CPUID_X1D_AMD_3DNOWEXT (1UL<<30) /* AMD only */ +#define CPUID_X1D_CYRIX_MMXEXT (1UL<<24) /* Cyrix only */ +#define CPUID_X1C_AMD_SSE4A (1UL<< 6) /* AMD only */ +#define CPUID_X1C_AMD_SSE5 (1UL<<11) /* AMD only */ + +static int cpuinfo_x86(void) +{ + uint32_t eax, ebx, ecx, edx; + uint32_t cpuid_max, cpuid_ext_max; /* Maximum CPUID function numbers */ + union { + char string[13]; + struct { uint32_t ebx, edx, ecx; } regs; + } cpu_vendor; /* 12-byte CPU vendor string + trailing null */ + uint32_t cpuid_1D, cpuid_1C, cpuid_X1C, cpuid_X1D; + int accel; + + /* First see if the CPUID instruction is even available. We try to + * toggle bit 21 (ID) of the flags register; if the bit changes, then + * CPUID is available. */ + asm(PUSHF" \n\ + pop "EAX" \n\ + mov %%eax, %%edx \n\ + xor $0x200000, %%eax \n\ + push "EAX" \n\ + "POPF" \n\ + "PUSHF" \n\ + pop "EAX" \n\ + xor %%edx, %%eax" + : "=a" (eax) : : "edx"); + if (!eax) + return 0; + + /* Determine the maximum function number available, and save the vendor + * string */ + CPUID(0, cpuid_max, ebx, ecx, edx); + cpu_vendor.regs.ebx = ebx; + cpu_vendor.regs.ecx = ecx; + cpu_vendor.regs.edx = edx; + cpu_vendor.string[12] = 0; + cpuid_ext_max = 0; /* FIXME: how do early CPUs respond to 0x80000000? */ + CPUID(0x80000000, cpuid_ext_max, ebx, ecx, edx); + + /* Read available features */ + cpuid_1D = cpuid_1C = cpuid_X1C = cpuid_X1D = 0; + if (cpuid_max >= 1) + CPUID(1, eax, ebx, cpuid_1C, cpuid_1D); + if (cpuid_ext_max >= 0x80000001) + CPUID(0x80000001, eax, ebx, cpuid_X1C, cpuid_X1D); + + /* Convert to acceleration flags */ +#ifdef ARCH_X86_64 + accel = AC_AMD64ASM; /* but not IA32! (register size issues) */ +#else + accel = AC_IA32ASM; +#endif + if (cpuid_1D & CPUID_1D_CMOVE) + accel |= AC_CMOVE; + if (cpuid_1D & CPUID_1D_MMX) + accel |= AC_MMX; + if (cpuid_1D & CPUID_1D_SSE) + accel |= AC_SSE; + if (cpuid_1D & CPUID_1D_SSE2) + accel |= AC_SSE2; + if (cpuid_1C & CPUID_1C_SSE3) + accel |= AC_SSE3; + if (cpuid_1C & CPUID_1C_SSSE3) + accel |= AC_SSSE3; + if (cpuid_1C & CPUID_1C_SSE41) + accel |= AC_SSE41; + if (cpuid_1C & CPUID_1C_SSE42) + accel |= AC_SSE42; + if (strcmp(cpu_vendor.string, "AuthenticAMD") == 0) { + if (cpuid_X1D & CPUID_X1D_AMD_MMXEXT) + accel |= AC_MMXEXT; + if (cpuid_X1D & CPUID_X1D_AMD_3DNOW) + accel |= AC_3DNOW; + if (cpuid_X1D & CPUID_X1D_AMD_3DNOWEXT) + accel |= AC_3DNOWEXT; + if (cpuid_X1C & CPUID_X1C_AMD_SSE4A) + accel |= AC_SSE4A; + if (cpuid_X1C & CPUID_X1C_AMD_SSE5) + accel |= AC_SSE5; + } else if (strcmp(cpu_vendor.string, "CyrixInstead") == 0) { + if (cpuid_X1D & CPUID_X1D_CYRIX_MMXEXT) + accel |= AC_MMXEXT; + } + + /* And return */ + return accel; +} + +#endif /* ARCH_X86 || ARCH_X86_64 */ + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . 
*)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/average.c b/debian/transcode/transcode-1.1.7/aclib/average.c new file mode 100644 index 00000000..517102e6 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/average.c @@ -0,0 +1,243 @@ +/* + * average.c -- average two sets of byte data + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "ac_internal.h" + +static void average(const uint8_t *, const uint8_t *, uint8_t *, int); +static void (*average_ptr)(const uint8_t *, const uint8_t *, uint8_t *, int) + = average; + +/*************************************************************************/ + +/* External interface */ + +void ac_average(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes) +{ + (*average_ptr)(src1, src2, dest, bytes); +} + +/*************************************************************************/ +/*************************************************************************/ + +/* Vanilla C version */ + +static void average(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes) +{ + int i; + for (i = 0; i < bytes; i++) + dest[i] = (src1[i]+src2[i]+1) / 2; +} + +/*************************************************************************/ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */ + +static void average_mmx(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes) +{ + if (bytes >= 8) { + asm("\ + pxor %%mm7, %%mm7 \n\ + movq %%mm7, %%mm6 \n\ + pcmpeqw %%mm5, %%mm5 \n\ + psubw %%mm5, %%mm6 # Put 0x0001*4 in MM6 \n\ + 0: \n\ + movq -8(%%esi,%%eax), %%mm0 \n\ + movq %%mm0, %%mm1 \n\ + punpcklbw %%mm7, %%mm0 \n\ + punpckhbw %%mm7, %%mm1 \n\ + movq -8(%%edx,%%eax), %%mm2 \n\ + movq %%mm2, %%mm3 \n\ + punpcklbw %%mm7, %%mm2 \n\ + punpckhbw %%mm7, %%mm3 \n\ + paddw %%mm2, %%mm0 \n\ + paddw %%mm6, %%mm0 \n\ + psrlw $1, %%mm0 \n\ + paddw %%mm3, %%mm1 \n\ + paddw %%mm6, %%mm1 \n\ + psrlw $1, %%mm1 \n\ + packuswb %%mm1, %%mm0 \n\ + movq %%mm0, -8(%%edi,%%eax) \n\ + subl $8, %%eax \n\ + jnz 0b \n\ + emms" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "a" (bytes & ~7)); + } + if (UNLIKELY(bytes & 7)) { + average(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7), + bytes & 7); + } +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE) && defined(ARCH_X86) + +/* SSE has PAVGB */ + +static void average_sse(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes) +{ + if (bytes >= 8) { + asm("\ + testl $~0x1F, %%eax \n\ + jz 1f \n\ + 0: \n\ + movq -32(%%esi,%%eax), %%mm0 \n\ + movq -24(%%esi,%%eax), %%mm1 \n\ + movq -16(%%esi,%%eax), %%mm2 \n\ + movq -8(%%esi,%%eax), %%mm3 \n\ + movq -32(%%edx,%%eax), %%mm4 \n\ + pavgb %%mm4, %%mm0 \n\ + movq -24(%%edx,%%eax), %%mm5 \n\ + pavgb %%mm5, %%mm1 \n\ + movq -16(%%edx,%%eax), %%mm6 \n\ + pavgb %%mm6, %%mm2 \n\ + movq -8(%%edx,%%eax), %%mm7 \n\ + pavgb %%mm7, %%mm3 \n\ + movntq %%mm0, -32(%%edi,%%eax) \n\ + movntq %%mm1, -24(%%edi,%%eax) \n\ + movntq %%mm2, -16(%%edi,%%eax) \n\ + movntq %%mm3, -8(%%edi,%%eax) \n\ + subl $32, %%eax \n\ + testl $~0x1F, %%eax \n\ + jnz 0b \n\ + testl %%eax, %%eax \n\ + jz 
2f \n\ + 1: \n\ + movq -8(%%esi,%%eax), %%mm0 \n\ + movq -8(%%edx,%%eax), %%mm1 \n\ + pavgb %%mm1, %%mm0 \n\ + movntq %%mm0, -8(%%edi,%%eax) \n\ + subl $8, %%eax \n\ + jnz 1b \n\ + 2: \n\ + emms \n\ + sfence" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "a" (bytes & ~7)); + } + if (UNLIKELY(bytes & 7)) { + average(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7), + bytes & 7); + } +} + +#endif /* HAVE_ASM_SSE && ARCH_X86 */ + +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE2) + +#if defined(ARCH_X86_64) +# define EAX "%%rax" +# define EDX "%%rdx" +# define ESI "%%rsi" +# define EDI "%%rdi" +#else +# define EAX "%%eax" +# define EDX "%%edx" +# define ESI "%%esi" +# define EDI "%%edi" +#endif + +static void average_sse2(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes) +{ + if (bytes >= 8) { + asm("\ + testl $~0x3F, %%eax \n\ + jz 1f \n\ + 0: \n\ + movdqu -64("ESI","EAX"), %%xmm0 \n\ + movdqu -48("ESI","EAX"), %%xmm1 \n\ + movdqu -32("ESI","EAX"), %%xmm2 \n\ + movdqu -16("ESI","EAX"), %%xmm3 \n\ + movdqu -64("EDX","EAX"), %%xmm4 \n\ + pavgb %%xmm4, %%xmm0 \n\ + movdqu -48("EDX","EAX"), %%xmm5 \n\ + pavgb %%xmm5, %%xmm1 \n\ + movdqu -32("EDX","EAX"), %%xmm6 \n\ + pavgb %%xmm6, %%xmm2 \n\ + movdqu -16("EDX","EAX"), %%xmm7 \n\ + pavgb %%xmm7, %%xmm3 \n\ + # Note that movntdq requires 16-byte alignment, which we're \n\ + # not guaranteed \n\ + movdqu %%xmm0, -64("EDI","EAX") \n\ + movdqu %%xmm1, -48("EDI","EAX") \n\ + movdqu %%xmm2, -32("EDI","EAX") \n\ + movdqu %%xmm3, -16("EDI","EAX") \n\ + subl $64, %%eax \n\ + testl $~0x3F, %%eax \n\ + jnz 0b \n\ + testl %%eax, %%eax \n\ + jz 2f \n\ + 1: \n\ + movq -8("ESI","EAX"), %%mm0 \n\ + movq -8("EDX","EAX"), %%mm1 \n\ + pavgb %%mm1, %%mm0 \n\ + movq %%mm0, -8("EDI","EAX") \n\ + subl $8, %%eax \n\ + jnz 1b \n\ + 2: \n\ + emms" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "a" (bytes & ~7)); + } + if (UNLIKELY(bytes & 7)) { + average(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7), + bytes & 7); + } +} + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization routine. */ + +int ac_average_init(int accel) +{ + average_ptr = average; + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + if (HAS_ACCEL(accel, AC_MMX)) + average_ptr = average_mmx; +#endif +#if defined(HAVE_ASM_SSE) && defined(ARCH_X86) + if (HAS_ACCEL(accel, AC_SSE)) + average_ptr = average_sse; +#endif +#if defined(HAVE_ASM_SSE2) + if (HAS_ACCEL(accel, AC_SSE2)) + average_ptr = average_sse2; +#endif + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_internal.h b/debian/transcode/transcode-1.1.7/aclib/img_internal.h new file mode 100644 index 00000000..153a2fb6 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_internal.h @@ -0,0 +1,40 @@ +/* + * img_internal.h - imgconvert internal use header + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. 
+ * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#ifndef ACLIB_IMG_INTERNAL_H +#define ACLIB_IMG_INTERNAL_H + +/* Type of a conversion function */ +typedef int (*ConversionFunc)(uint8_t **src, uint8_t **dest, + int width, int height); + +/* Function to register a conversion */ +extern int register_conversion(ImageFormat srcfmt, ImageFormat destfmt, + ConversionFunc function); + +/* Initialization routines */ +extern int ac_imgconvert_init(int accel); +extern int ac_imgconvert_init_yuv_planar(int accel); +extern int ac_imgconvert_init_yuv_packed(int accel); +extern int ac_imgconvert_init_yuv_mixed(int accel); +extern int ac_imgconvert_init_yuv_rgb(int accel); +extern int ac_imgconvert_init_rgb_packed(int accel); + +#endif /* ACLIB_IMG_INTERNAL_H */ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_rgb_packed.c b/debian/transcode/transcode-1.1.7/aclib/img_rgb_packed.c new file mode 100644 index 00000000..e6d5bf35 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_rgb_packed.c @@ -0,0 +1,1106 @@ +/* + * img_rgb_packed.c - RGB packed image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "imgconvert.h" +#include "img_internal.h" + +/*************************************************************************/ +/*************************************************************************/ + +/* Standard C implementations */ + +/*************************************************************************/ + +/* Identity transformations, all work when src==dest */ + +static int rgb_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height*3); + return 1; +} + +static int rgba_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height*4); + return 1; +} + +static int gray8_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + return 1; +} + +/*************************************************************************/ + +/* Conversions between various 32-bit formats, all usable when src==dest */ + +/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */ +static int rgba_swapall(uint8_t **src, uint8_t **dest, int width, int height) +{ + uint32_t *srcp = (uint32_t *)src[0]; + uint32_t *destp = (uint32_t *)dest[0]; + int i; + for (i = 0; i < width*height; i++) { + /* This shortcut works regardless of CPU endianness */ + destp[i] = srcp[i] >> 24 + | (srcp[i] & 0x00FF0000) >> 8 + | (srcp[i] & 0x0000FF00) << 8 + | srcp[i] << 24; + } + return 1; +} + +/* RGBA<->BGRA: swap bytes 0 and 2 */ +static int rgba_swap02(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + uint8_t tmp = src[0][i*4+2]; + dest[0][i*4+2] = src[0][i*4 ]; + dest[0][i*4 ] = tmp; + dest[0][i*4+1] = src[0][i*4+1]; + dest[0][i*4+3] = src[0][i*4+3]; + } + return 1; +} + +/* ARGB<->ABGR: swap bytes 1 and 3 */ +static int 
rgba_swap13(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + uint8_t tmp = src[0][i*4+3]; + dest[0][i*4+3] = src[0][i*4+1]; + dest[0][i*4+1] = tmp; + dest[0][i*4 ] = src[0][i*4 ]; + dest[0][i*4+2] = src[0][i*4+2]; + } + return 1; +} + +/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */ +static int rgba_alpha30(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + uint8_t tmp = src[0][i*4+3]; + dest[0][i*4+3] = src[0][i*4+2]; + dest[0][i*4+2] = src[0][i*4+1]; + dest[0][i*4+1] = src[0][i*4 ]; + dest[0][i*4 ] = tmp; + } + return 1; +} + +/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */ +static int rgba_alpha03(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + uint8_t tmp = src[0][i*4 ]; + dest[0][i*4 ] = src[0][i*4+1]; + dest[0][i*4+1] = src[0][i*4+2]; + dest[0][i*4+2] = src[0][i*4+3]; + dest[0][i*4+3] = tmp; + } + return 1; +} + +/*************************************************************************/ + +static int rgb24_bgr24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i*3+2]; + dest[0][i*3+1] = src[0][i*3+1]; + dest[0][i*3+2] = src[0][i*3 ]; + } + return 1; +} + +static int rgb24_rgba32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = src[0][i*3 ]; + dest[0][i*4+1] = src[0][i*3+1]; + dest[0][i*4+2] = src[0][i*3+2]; + dest[0][i*4+3] = 0; + } + return 1; +} + +static int rgb24_abgr32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = 0; + dest[0][i*4+1] = src[0][i*3+2]; + dest[0][i*4+2] = src[0][i*3+1]; + dest[0][i*4+3] = src[0][i*3 ]; + } + return 1; +} + +static int rgb24_argb32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = 0; + dest[0][i*4+1] = src[0][i*3 ]; + dest[0][i*4+2] = src[0][i*3+1]; + dest[0][i*4+3] = src[0][i*3+2]; + } + return 1; +} + +static int rgb24_bgra32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = src[0][i*3+2]; + dest[0][i*4+1] = src[0][i*3+1]; + dest[0][i*4+2] = src[0][i*3 ]; + dest[0][i*4+3] = 0; + } + return 1; +} + +static int rgb24_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*3 ]; + int g = src[0][i*3+1]; + int b = src[0][i*3+2]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +static int bgr24_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*3+2]; + int g = src[0][i*3+1]; + int b = src[0][i*3 ]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +/*************************************************************************/ + +static int rgba32_rgb24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i*4 ]; + dest[0][i*3+1] = src[0][i*4+1]; + dest[0][i*3+2] = src[0][i*4+2]; + } + return 1; +} + +static int bgra32_rgb24(uint8_t **src, uint8_t **dest, int width, 
int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i*4+2]; + dest[0][i*3+1] = src[0][i*4+1]; + dest[0][i*3+2] = src[0][i*4 ]; + } + return 1; +} + +static int rgba32_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*4 ]; + int g = src[0][i*4+1]; + int b = src[0][i*4+2]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +static int bgra32_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*4+2]; + int g = src[0][i*4+1]; + int b = src[0][i*4 ]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +/*************************************************************************/ + +static int argb32_rgb24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i*4+1]; + dest[0][i*3+1] = src[0][i*4+2]; + dest[0][i*3+2] = src[0][i*4+3]; + } + return 1; +} + +static int abgr32_rgb24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i*4+3]; + dest[0][i*3+1] = src[0][i*4+2]; + dest[0][i*3+2] = src[0][i*4+1]; + } + return 1; +} + +static int argb32_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*4+1]; + int g = src[0][i*4+2]; + int b = src[0][i*4+3]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +static int abgr32_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + /* Use the Y part of a YUV transformation, scaled to 0..255 */ + int r = src[0][i*4+3]; + int g = src[0][i*4+2]; + int b = src[0][i*4+1]; + dest[0][i] = (19595*r + 38470*g + 7471*b + 32768) >> 16; + } + return 1; +} + +/*************************************************************************/ + +static int gray8_rgb24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*3 ] = src[0][i]; + dest[0][i*3+1] = src[0][i]; + dest[0][i*3+2] = src[0][i]; + } + return 1; +} + +static int gray8_rgba32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = src[0][i]; + dest[0][i*4+1] = src[0][i]; + dest[0][i*4+2] = src[0][i]; + dest[0][i*4+3] = 0; + } + return 1; +} + +static int gray8_argb32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*4 ] = 0; + dest[0][i*4+1] = src[0][i]; + dest[0][i*4+2] = src[0][i]; + dest[0][i*4+3] = src[0][i]; + } + return 1; +} + +/*************************************************************************/ +/*************************************************************************/ + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + +#define DEFINE_MASK_DATA +#include "img_x86_common.h" + +/*************************************************************************/ + +/* Basic assembly routines */ + +/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */ +static int rgba_swapall_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_REV32_X86(width*height); + 
return 1; +} + +/* RGBA<->BGRA: swap bytes 0 and 2 */ +static int rgba_swap02_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_02_X86(width*height); + return 1; +} + +/* ARGB<->ABGR: swap bytes 1 and 3 */ +static int rgba_swap13_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_X86(width*height); + return 1; +} + +/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */ +static int rgba_alpha30_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_X86(width*height); + return 1; +} + +/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */ +static int rgba_alpha03_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_X86(width*height); + return 1; +} + +/*************************************************************************/ + +/* MMX routines */ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */ + +/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */ +static int rgba_swapall_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_REV32_MMX(width*height); + return 1; +} + +/* RGBA<->BGRA: swap bytes 0 and 2 */ +static int rgba_swap02_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_02_MMX(width*height); + return 1; +} + +/* ARGB<->ABGR: swap bytes 1 and 3 */ +static int rgba_swap13_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_MMX(width*height); + return 1; +} + +/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */ +static int rgba_alpha30_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_MMX(width*height); + return 1; +} + +/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */ +static int rgba_alpha03_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_MMX(width*height); + return 1; +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + +/*************************************************************************/ + +/* SSE2 routines */ + +#if defined(HAVE_ASM_SSE2) + +static const struct { uint32_t n[4]; } __attribute__((aligned(16))) rgb_bgr_data = {{ + 0xFF0000FF, 0x00FF0000, 0x0000FF00, 0x00000000 +}}; + +#define SHIFT_RBSWAP \ + "movdqa %%xmm6, %%xmm2 # XMM2: low bytes mask \n\ + pand %%xmm0, %%xmm2 # XMM2: R/B bytes \n\ + pshuflw $0xB1, %%xmm2, %%xmm2 # XMM2: swap R and B (low quad) \n\ + pand %%xmm7, %%xmm0 # XMM0: G bytes \n\ + pshufhw $0xB1, %%xmm2, %%xmm2 # XMM2: swap R and B (high quad)\n\ + por %%xmm2, %%xmm0 # XMM0: data now in BGRA32 \n" + +#define SHIFT_AFIRST \ + "pslldq $1, %%xmm0 # XMM0: move A first \n" + +#define SHIFT_ALAST \ + "psrldq $1, %%xmm0 # XMM0: move A last \n" + +#define RGB24TO32(ROFS,GOFS,BOFS,AOFS,SHIFT) \ + asm("pcmpeqd %%xmm5, %%xmm5 \n\ + movdqa %%xmm5, %%xmm6 \n\ + psrldq $13, %%xmm5 # XMM5: 24-bit mask \n\ + movdqa %%xmm6, %%xmm7 \n\ + psrlw $8, %%xmm6 # XMM6: low bytes mask \n\ + psllw $8, %%xmm7 # XMM7: high bytes mask \n"\ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "lea ("ECX","ECX",2),"EDX" \n\ + movb -3("ESI","EDX"), %%al \n\ + movb %%al, ("#ROFS"-4)("EDI","ECX",4) \n\ + movb -2("ESI","EDX"), %%al \n\ + movb %%al, ("#GOFS"-4)("EDI","ECX",4) \n\ + movb -1("ESI","EDX"), %%al \n\ + movb %%al, ("#BOFS"-4)("EDI","ECX",4) \n\ + movb $0, ("#AOFS"-4)("EDI","ECX",4)", \ + /* main_loop */ \ + "lea ("ECX","ECX",2),"EDX" \n\ + # We can't just movdqu, because we might run over the edge \n\ + movd -12("ESI","EDX"), %%xmm1 \n\ + 
movq -8("ESI","EDX"), %%xmm0 \n\ + pshufd $0xD3, %%xmm0, %%xmm0 # shift left by 4 bytes \n\ + por %%xmm1, %%xmm0 # XMM0: original RGB24 data \n\ + pshufd $0xF3, %%xmm5, %%xmm2 # XMM2: pixel 1 mask \n\ + movdqa %%xmm5, %%xmm1 # XMM1: pixel 0 mask \n\ + pshufd $0xCF, %%xmm5, %%xmm3 # XMM3: pixel 2 mask \n\ + pand %%xmm0, %%xmm1 # XMM1: pixel 0 \n\ + pslldq $1, %%xmm0 \n\ + pand %%xmm0, %%xmm2 # XMM2: pixel 1 \n\ + pshufd $0x3F, %%xmm5, %%xmm4 # XMM4: pixel 3 mask \n\ + por %%xmm2, %%xmm1 # XMM1: pixels 0 and 1 \n\ + pslldq $1, %%xmm0 \n\ + pand %%xmm0, %%xmm3 # XMM3: pixel 2 \n\ + por %%xmm3, %%xmm1 # XMM1: pixels 0, 1, and 2 \n\ + pslldq $1, %%xmm0 \n\ + pand %%xmm4, %%xmm0 # XMM0: pixel 3 \n\ + por %%xmm1, %%xmm0 # XMM0: RGBA32 data \n\ + "SHIFT" # shift bytes to target position\n\ + movdqu %%xmm0, -16("EDI","ECX",4)", \ + /* emms */ "emms") \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), \ + "d" (&rgb_bgr_data), "m" (rgb_bgr_data) \ + : "eax"); + +#define RGB32TO24(ROFS,GOFS,BOFS,AOFS,SHIFT) \ + asm("pcmpeqd %%xmm5, %%xmm5 \n\ + movdqa %%xmm5, %%xmm6 \n\ + psrldq $13, %%xmm5 # 24-bit mask \n\ + movdqa %%xmm6, %%xmm7 \n\ + psrlw $8, %%xmm6 # low bytes mask \n\ + psllw $8, %%xmm7 # high bytes mask \n"\ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "lea ("ECX","ECX",2),"EDX" \n\ + movb ("#ROFS"-4)("ESI","ECX",4), %%al \n\ + movb %%al, -3("EDI","EDX") \n\ + movb ("#GOFS"-4)("ESI","ECX",4), %%al \n\ + movb %%al, -2("EDI","EDX") \n\ + movb ("#BOFS"-4)("ESI","ECX",4), %%al \n\ + movb %%al, -1("EDI","EDX")", \ + /* main_loop */ \ + "lea ("ECX","ECX",2),"EDX" \n\ + movdqu -16("ESI","ECX",4), %%xmm0 \n\ + "SHIFT" # shift source data to RGBA \n\ + pshufd $0xF3, %%xmm5, %%xmm1 # XMM1: pixel 1 mask \n\ + pshufd $0xCF, %%xmm5, %%xmm2 # XMM2: pixel 2 mask \n\ + pshufd $0x3F, %%xmm5, %%xmm3 # XMM3: pixel 3 mask \n\ + pand %%xmm0, %%xmm3 # XMM3: pixel 3 \n\ + psrldq $1, %%xmm3 \n\ + pand %%xmm0, %%xmm2 # XMM2: pixel 2 \n\ + por %%xmm3, %%xmm2 # XMM2: pixels 2 and 3 \n\ + psrldq $1, %%xmm2 \n\ + pand %%xmm0, %%xmm1 # XMM1: pixel 1 \n\ + pand %%xmm5, %%xmm0 # XMM0: pixel 0 \n\ + por %%xmm2, %%xmm1 # XMM1: pixels 1, 2, and 3 \n\ + psrldq $1, %%xmm1 \n\ + por %%xmm1, %%xmm0 # XMM0: RGB24 data \n\ + # We can't just movdqu, because we might run over the edge \n\ + movd %%xmm0, -12("EDI","EDX") # store low 4 bytes \n\ + pshufd $0xF9, %%xmm0, %%xmm0 # shift right 4 bytes \n\ + movq %%xmm0, -8("EDI","EDX") # store high 8 bytes \n",\ + /* emms */ "emms") \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), \ + "d" (&rgb_bgr_data), "m" (rgb_bgr_data) \ + : "eax"); + + +/* RGBA<->ABGR and ARGB<->BGRA: reverse byte order */ +static int rgba_swapall_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_REV32_SSE2(width*height); + return 1; +} + +/* RGBA<->BGRA: swap bytes 0 and 2 */ +static int rgba_swap02_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_02_SSE2(width*height); + return 1; +} + +/* ARGB<->ABGR: swap bytes 1 and 3 */ +static int rgba_swap13_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_SSE2(width*height); + return 1; +} + +/* RGBA->ARGB and BGRA->ABGR: alpha moves from byte 3 to byte 0 */ +static int rgba_alpha30_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_SSE2(width*height); + return 1; +} + +/* ARGB->RGBA and ABGR->BGRA: alpha moves from byte 0 to byte 3 */ +static int 
rgba_alpha03_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_SSE2(width*height); + return 1; +} + +/* RGB<->BGR */ +static int rgb24_bgr24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm5 # byte 0 mask \n\ + pshufd $0xD2, %%xmm5, %%xmm6 # byte 1 mask \n\ + pshufd $0xC9, %%xmm5, %%xmm7 # byte 2 mask \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ + "lea ("ECX","ECX",2),"EDX" \n\ + movb -3("ESI","EDX"), %%al \n\ + movb -2("ESI","EDX"), %%ah \n\ + movb %%ah, -2("EDI","EDX") \n\ + movb -1("ESI","EDX"), %%ah \n\ + movb %%ah, -3("EDI","EDX") \n\ + movb %%al, -1("EDI","EDX")", + /* main_loop */ + "lea ("ECX","ECX",2),"EDX" \n\ + # We can't just movdqu, because we might run over the edge \n\ + movd -12("ESI","EDX"), %%xmm1 \n\ + movq -8("ESI","EDX"), %%xmm0 \n\ + pshufd $0xD3, %%xmm0, %%xmm0 # shift left by 4 bytes \n\ + por %%xmm1, %%xmm0 # XMM0: original data \n\ + movdqa %%xmm5, %%xmm2 \n\ + movdqa %%xmm6, %%xmm3 \n\ + movdqa %%xmm7, %%xmm4 \n\ + pand %%xmm0, %%xmm2 # XMM2: byte 0 \n\ + pslldq $2, %%xmm2 # shift to byte 2 position \n\ + pand %%xmm0, %%xmm3 # XMM3: byte 1 \n\ + pand %%xmm0, %%xmm4 # XMM4: byte 2 \n\ + psrldq $2, %%xmm4 # shift to byte 0 position \n\ + por %%xmm2, %%xmm3 \n\ + por %%xmm4, %%xmm3 # XMM3: reversed data \n\ + movd %%xmm3, -12("EDI","EDX") # avoid running over the edge \n\ + pshufd $0xF9, %%xmm3, %%xmm3 # shift right by 4 bytes \n\ + movq %%xmm3, -8("EDI","EDX")", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "d" (&rgb_bgr_data), "m" (rgb_bgr_data) + : "eax"); + return 1; +} + +/* RGB->RGBA */ +static int rgb24_rgba32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB24TO32(0,1,2,3, ""); + return 1; +} + +/* RGB->ABGR */ +static int rgb24_abgr32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB24TO32(3,2,1,0, SHIFT_RBSWAP SHIFT_AFIRST); + return 1; +} + +/* RGB->ARGB */ +static int rgb24_argb32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB24TO32(1,2,3,0, SHIFT_AFIRST); + return 1; +} + +/* RGB->BGRA */ +static int rgb24_bgra32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB24TO32(2,1,0,3, SHIFT_RBSWAP); + return 1; +} + +/* RGBA->RGB */ +static int rgba32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB32TO24(0,1,2,3, ""); + return 1; +} + +/* ABGR->RGB */ +static int abgr32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB32TO24(3,2,1,0, SHIFT_ALAST SHIFT_RBSWAP); + return 1; +} + +/* ARGB->RGB */ +static int argb32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB32TO24(1,2,3,0, SHIFT_ALAST); + return 1; +} + +/* BGRA->RGB */ +static int bgra32_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + RGB32TO24(2,1,0,3, SHIFT_RBSWAP); + return 1; +} + +/*************************************************************************/ + +#define R_GRAY 19595 +#define G_GRAY 38470 +#define B_GRAY 7471 +#define INIT_GRAY8 \ + "pxor %%xmm4, %%xmm4 # XMM4: all 0's \n\ + movl %3, %%eax \n\ + movd %%eax, %%xmm5 \n\ + pshuflw $0x00, %%xmm5, %%xmm5 \n\ + pshufd $0x00, %%xmm5, %%xmm5 # XMM5: R->gray constant \n\ + movl %4, %%eax \n\ + movd %%eax, %%xmm6 \n\ + pshuflw $0x00, %%xmm6, %%xmm6 \n\ + pshufd $0x00, %%xmm6, %%xmm6 # XMM6: G->gray constant \n\ + movl %5, %%eax \n\ + movd %%eax, %%xmm7 \n\ + pshuflw $0x00, %%xmm7, %%xmm7 \n\ + pshufd 
$0x00, %%xmm7, %%xmm7 # XMM7: B->gray constant \n\ + pcmpeqd %%xmm3, %%xmm3 \n\ + psllw $15, %%xmm3 \n\ + psrlw $8, %%xmm3 # XMM3: 0x0080*8 (for rounding) \n" +#define SINGLE_GRAY8(idx,ofsR,ofsG,ofsB) \ + "movzbl "#ofsR"("ESI","idx"), %%eax # retrieve red byte \n\ + imull %3, %%eax # multiply by red->gray factor \n\ + movzbl "#ofsG"("ESI","idx"), %%edx # retrieve green byte \n\ + imull %4, %%edx # multiply by green->gray factor\n\ + addl %%edx, %%eax # add to total \n\ + movzbl "#ofsB"("ESI","idx"), %%edx # retrieve blue byte \n\ + imull %5, %%edx # multiply by blue->gray factor \n\ + addl %%edx, %%eax # add to total \n\ + addl $0x8000, %%eax # round \n\ + shrl $16, %%eax # shift back down \n\ + movb %%al, -1("EDI","ECX") # and store \n" +#define STORE_GRAY8 \ + "psllw $8, %%xmm0 # XMM0: add 8 bits of precision \n\ + pmulhuw %%xmm5, %%xmm0 # XMM0: r7 r6 r5 r4 r3 r2 r1 r0 \n\ + psllw $8, %%xmm1 # XMM1: add 8 bits of precision \n\ + pmulhuw %%xmm6, %%xmm1 # XMM1: g7 g6 g5 g4 g3 g2 g1 g0 \n\ + paddw %%xmm3, %%xmm0 # XMM0: add rounding constant \n\ + psllw $8, %%xmm2 # XMM2: add 8 bits of precision \n\ + pmulhuw %%xmm7, %%xmm2 # XMM2: b7 b6 b5 b4 b3 b2 b1 b0 \n\ + paddw %%xmm1, %%xmm0 # XMM0: add green part \n\ + paddw %%xmm2, %%xmm0 # XMM0: add blue part \n\ + psrlw $8, %%xmm0 # XMM0: shift back to bytes \n\ + packuswb %%xmm4, %%xmm0 # XMM0: gray7..gray0 packed \n\ + movq %%xmm0, -8("EDI","ECX") \n" + +#define ASM_RGB24_GRAY(ofsR,ofsG,ofsB,load) \ + asm(INIT_GRAY8 \ + PUSH(EBX)" \n\ + lea ("ECX","ECX",2),"EBX" \n"\ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ SINGLE_GRAY8(EBX, ofsR,ofsG,ofsB) "subl $3, %%ebx;",\ + /* main_loop */ load(4) STORE_GRAY8 "subl $24, %%ebx;", \ + /* emms */ "emms") \ + POP(EBX) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), \ + "i" (R_GRAY), "i" (G_GRAY), "i" (B_GRAY) \ + : "eax", "edx" COMMA_FAKE_PUSH_REG \ + ) + +#define ASM_RGB32_GRAY(ofsR,ofsG,ofsB,load) \ + asm(INIT_GRAY8 \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ SINGLE_GRAY8(ECX",4", ofsR,ofsG,ofsB), \ + /* main_loop */ load(4) STORE_GRAY8, \ + /* emms */ "emms") \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), \ + "i" (R_GRAY), "i" (G_GRAY), "i" (B_GRAY) \ + : "eax", "edx") + + +static int rgb24_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB24_GRAY(-3,-2,-1, SSE2_LOAD_RGB24); + return 1; +} + +static int bgr24_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB24_GRAY(-1,-2,-3, SSE2_LOAD_BGR24); + return 1; +} + +static int rgba32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB32_GRAY(-4,-3,-2, SSE2_LOAD_RGBA32); + return 1; +} + +static int bgra32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB32_GRAY(-2,-3,-4, SSE2_LOAD_BGRA32); + return 1; +} + +static int argb32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB32_GRAY(-3,-2,-1, SSE2_LOAD_ARGB32); + return 1; +} + +static int abgr32_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_RGB32_GRAY(-1,-2,-3, SSE2_LOAD_ABGR32); + return 1; +} + +/*************************************************************************/ + +static int gray8_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("# Store all 0's in XMM4 \n\ + pxor %%xmm4, %%xmm4 \n\ + # Generate mask in XMM7 to select 
bytes 0,3,6,9 of an XMM register\n\ + pcmpeqd %%xmm7, %%xmm7 # XMM7: all 1's \n\ + psrlw $8, %%xmm7 # XMM7: 0x00FF * 8 \n\ + pcmpeqd %%xmm6, %%xmm6 # XMM6: all 1's \n\ + psllw $8, %%xmm6 # XMM6: 0xFF00 * 8 \n\ + pslldq $8, %%xmm6 \n\ + psrldq $8, %%xmm7 \n\ + por %%xmm6, %%xmm7 # XMM7: 0xFF00*4, 0x00FF*4 \n\ + pshufd $0xCC, %%xmm7, %%xmm7 # XMM7: {0xFF00*2, 0x00FF*2} * 2\n\ + pshuflw $0xC0, %%xmm7, %%xmm7 # XMM7.l: FF0000FF00FF00FF \n\ + psrldq $4, %%xmm7 # XMM7: 0x00000000FF00FF00 \n\ + # 00FF00FFFF0000FF \n\ + pshufd $0xEC, %%xmm7, %%xmm7 # XMM7: 0x00000000FF00FF00 \n\ + # 00000000FF0000FF \n\ + pshuflw $0x24, %%xmm7, %%xmm7 # XMM7.l: 00FF0000FF0000FF \n\ + pshufhw $0xFC, %%xmm7, %%xmm7 # XMM7.h: 000000000000FF00 \n\ + # Load ECX*3 into EDX ahead of time \n\ + lea ("ECX","ECX",2), "EDX" \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movb -1("ESI","ECX"), %%al # retrieve gray byte \n\ + movb %%al, -3("EDI","EDX") # and store 3 times \n\ + movb %%al, -2("EDI","EDX") \n\ + movb %%al, -1("EDI","EDX") \n\ + subl $3, %%edx \n", + /* main_loop */ "\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: G3..G0 \n\ + pshufd $0xCC, %%xmm0, %%xmm0 # XMM0: {0,0,0,0,G3..G0} * 2 \n\ + pshuflw $0x50, %%xmm0, %%xmm0 # X0.l: G3 G2 G3 G2 G1 G0 G1 G0 \n\ + pshufhw $0x55, %%xmm0, %%xmm0 # X0.h: G3 G2 G3 G2 G3 G2 G3 G2 \n\ + pand %%xmm7, %%xmm0 # XMM0: ------3--2--1--0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: ------3--2--1--0 \n\ + pslldq $1, %%xmm1 # XMM1: -----3--2--1--0- \n\ + movdqa %%xmm0, %%xmm2 # XMM2: ------3--2--1--0 \n\ + pslldq $2, %%xmm2 # XMM2: ----3--2--1--0-- \n\ + por %%xmm1, %%xmm0 # XMM0: -----33-22-11-00 \n\ + por %%xmm2, %%xmm0 # XMM0: ----333222111000 \n\ + movd %%xmm0, -12("EDI","EDX") \n\ + pshufd $0xC9, %%xmm0, %%xmm0 \n\ + movq %%xmm0, -8("EDI","EDX") \n\ + subl $12, %%edx \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax", "edx"); + return 1; +} + +static int gray8_rgba32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("pxor %%xmm4, %%xmm4 # XMM4: all 0's \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movb -1("ESI","ECX"), %%al # retrieve gray byte \n\ + movb %%al, -4("EDI","ECX",4) # and store 3 times \n\ + movb %%al, -3("EDI","ECX",4) \n\ + movb %%al, -2("EDI","ECX",4) \n\ + movb $0, -1("EDI","ECX",4) # clear A byte \n", + /* main_loop */ "\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: 00 00 00 00 G3 G2 G1 G0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: 00 00 00 00 G3 G2 G1 G0 \n\ + punpcklbw %%xmm0, %%xmm0 # XMM0: G3 G3 G2 G2 G1 G1 G0 G0 \n\ + punpcklbw %%xmm4, %%xmm1 # XMM1: 00 G3 00 G2 00 G1 00 G0 \n\ + punpcklbw %%xmm1, %%xmm0 # XMM0: 0GGG3 0GGG2 0GGG1 0GGG0 \n\ + movdqu %%xmm0, -16("EDI","ECX",4) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax"); + return 1; +} + +static int gray8_argb32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("pxor %%xmm4, %%xmm4 # XMM4: all 0's \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movb -1("ESI","ECX"), %%al # retrieve gray byte \n\ + movb %%al, -3("EDI","ECX",4) # and store 3 times \n\ + movb %%al, -2("EDI","ECX",4) \n\ + movb %%al, -1("EDI","ECX",4) \n\ + movb $0, -4("EDI","ECX",4) # clear A byte \n", + /* main_loop */ "\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: 00 00 00 00 G3 G2 G1 G0 \n\ + movdqa %%xmm4, %%xmm1 # XMM1: 00 
00 00 00 00 00 00 00 \n\ + punpcklbw %%xmm0, %%xmm1 # XMM1: G3 00 G2 00 G1 00 G0 00 \n\ + punpcklbw %%xmm0, %%xmm0 # XMM0: G3 G3 G2 G2 G1 G1 G0 G0 \n\ + punpcklbw %%xmm0, %%xmm1 # XMM0: GGG03 GGG02 GGG01 GGG00 \n\ + movdqu %%xmm1, -16("EDI","ECX",4) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax"); + return 1; +} + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ + +#endif /* ARCH_X86 || ARCH_X86_64 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization */ + +int ac_imgconvert_init_rgb_packed(int accel) +{ + if (!register_conversion(IMG_RGB24, IMG_RGB24, rgb_copy) + || !register_conversion(IMG_RGB24, IMG_BGR24, rgb24_bgr24) + || !register_conversion(IMG_RGB24, IMG_RGBA32, rgb24_rgba32) + || !register_conversion(IMG_RGB24, IMG_ABGR32, rgb24_abgr32) + || !register_conversion(IMG_RGB24, IMG_ARGB32, rgb24_argb32) + || !register_conversion(IMG_RGB24, IMG_BGRA32, rgb24_bgra32) + || !register_conversion(IMG_RGB24, IMG_GRAY8, rgb24_gray8) + + || !register_conversion(IMG_BGR24, IMG_BGR24, rgb_copy) + || !register_conversion(IMG_BGR24, IMG_RGB24, rgb24_bgr24) + || !register_conversion(IMG_BGR24, IMG_RGBA32, rgb24_bgra32) + || !register_conversion(IMG_BGR24, IMG_ABGR32, rgb24_argb32) + || !register_conversion(IMG_BGR24, IMG_ARGB32, rgb24_abgr32) + || !register_conversion(IMG_BGR24, IMG_BGRA32, rgb24_rgba32) + || !register_conversion(IMG_BGR24, IMG_GRAY8, bgr24_gray8) + + || !register_conversion(IMG_RGBA32, IMG_RGB24, rgba32_rgb24) + || !register_conversion(IMG_RGBA32, IMG_BGR24, bgra32_rgb24) + || !register_conversion(IMG_RGBA32, IMG_RGBA32, rgba_copy) + || !register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall) + || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30) + || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02) + || !register_conversion(IMG_RGBA32, IMG_GRAY8, rgba32_gray8) + + || !register_conversion(IMG_ABGR32, IMG_RGB24, abgr32_rgb24) + || !register_conversion(IMG_ABGR32, IMG_BGR24, argb32_rgb24) + || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall) + || !register_conversion(IMG_ABGR32, IMG_ABGR32, rgba_copy) + || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13) + || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03) + || !register_conversion(IMG_ABGR32, IMG_GRAY8, abgr32_gray8) + + || !register_conversion(IMG_ARGB32, IMG_RGB24, argb32_rgb24) + || !register_conversion(IMG_ARGB32, IMG_BGR24, abgr32_rgb24) + || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03) + || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13) + || !register_conversion(IMG_ARGB32, IMG_ARGB32, rgba_copy) + || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall) + || !register_conversion(IMG_ARGB32, IMG_GRAY8, argb32_gray8) + + || !register_conversion(IMG_BGRA32, IMG_RGB24, bgra32_rgb24) + || !register_conversion(IMG_BGRA32, IMG_BGR24, rgba32_rgb24) + || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02) + || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30) + || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall) + || !register_conversion(IMG_BGRA32, IMG_BGRA32, rgba_copy) + || !register_conversion(IMG_BGRA32, IMG_GRAY8, bgra32_gray8) + + || !register_conversion(IMG_GRAY8, IMG_RGB24, gray8_rgb24) + || !register_conversion(IMG_GRAY8, IMG_BGR24, gray8_rgb24) + || 
!register_conversion(IMG_GRAY8, IMG_RGBA32, gray8_rgba32) + || !register_conversion(IMG_GRAY8, IMG_ABGR32, gray8_argb32) + || !register_conversion(IMG_GRAY8, IMG_ARGB32, gray8_argb32) + || !register_conversion(IMG_GRAY8, IMG_BGRA32, gray8_rgba32) + || !register_conversion(IMG_GRAY8, IMG_GRAY8, gray8_copy) + ) { + return 0; + } + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + + if (accel & (AC_IA32ASM | AC_AMD64ASM)) { + if (!register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall_x86) + || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30_x86) + || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02_x86) + + || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall_x86) + || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13_x86) + || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03_x86) + + || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03_x86) + || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13_x86) + || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall_x86) + + || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02_x86) + || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30_x86) + || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall_x86) + ) { + return 0; + } + } + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + if (accel & AC_MMX) { + if (!register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall_mmx) + || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30_mmx) + || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02_mmx) + + || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall_mmx) + || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13_mmx) + || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03_mmx) + + || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03_mmx) + || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13_mmx) + || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall_mmx) + + || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02_mmx) + || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30_mmx) + || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall_mmx) + ) { + return 0; + } + } +#endif + +#if defined(HAVE_ASM_SSE2) + if (accel & AC_SSE2) { + if (!register_conversion(IMG_RGB24, IMG_BGR24, rgb24_bgr24_sse2) + || !register_conversion(IMG_RGB24, IMG_RGBA32, rgb24_rgba32_sse2) + || !register_conversion(IMG_RGB24, IMG_ABGR32, rgb24_abgr32_sse2) + || !register_conversion(IMG_RGB24, IMG_ARGB32, rgb24_argb32_sse2) + || !register_conversion(IMG_RGB24, IMG_BGRA32, rgb24_bgra32_sse2) + || !register_conversion(IMG_RGB24, IMG_GRAY8, rgb24_gray8_sse2) + + || !register_conversion(IMG_BGR24, IMG_RGB24, rgb24_bgr24_sse2) + || !register_conversion(IMG_BGR24, IMG_RGBA32, rgb24_bgra32_sse2) + || !register_conversion(IMG_BGR24, IMG_ABGR32, rgb24_argb32_sse2) + || !register_conversion(IMG_BGR24, IMG_ARGB32, rgb24_abgr32_sse2) + || !register_conversion(IMG_BGR24, IMG_BGRA32, rgb24_rgba32_sse2) + || !register_conversion(IMG_BGR24, IMG_GRAY8, bgr24_gray8_sse2) + + || !register_conversion(IMG_RGBA32, IMG_RGB24, rgba32_rgb24_sse2) + || !register_conversion(IMG_RGBA32, IMG_BGR24, bgra32_rgb24_sse2) + || !register_conversion(IMG_RGBA32, IMG_ABGR32, rgba_swapall_sse2) + || !register_conversion(IMG_RGBA32, IMG_ARGB32, rgba_alpha30_sse2) + || !register_conversion(IMG_RGBA32, IMG_BGRA32, rgba_swap02_sse2) + || !register_conversion(IMG_RGBA32, IMG_GRAY8, rgba32_gray8_sse2) + + || !register_conversion(IMG_ABGR32, IMG_RGB24, abgr32_rgb24_sse2) + || 
!register_conversion(IMG_ABGR32, IMG_BGR24, argb32_rgb24_sse2) + || !register_conversion(IMG_ABGR32, IMG_RGBA32, rgba_swapall_sse2) + || !register_conversion(IMG_ABGR32, IMG_ARGB32, rgba_swap13_sse2) + || !register_conversion(IMG_ABGR32, IMG_BGRA32, rgba_alpha03_sse2) + || !register_conversion(IMG_ABGR32, IMG_GRAY8, abgr32_gray8_sse2) + + || !register_conversion(IMG_ARGB32, IMG_RGB24, argb32_rgb24_sse2) + || !register_conversion(IMG_ARGB32, IMG_BGR24, abgr32_rgb24_sse2) + || !register_conversion(IMG_ARGB32, IMG_RGBA32, rgba_alpha03_sse2) + || !register_conversion(IMG_ARGB32, IMG_ABGR32, rgba_swap13_sse2) + || !register_conversion(IMG_ARGB32, IMG_BGRA32, rgba_swapall_sse2) + || !register_conversion(IMG_ARGB32, IMG_GRAY8, argb32_gray8_sse2) + + || !register_conversion(IMG_BGRA32, IMG_RGB24, bgra32_rgb24_sse2) + || !register_conversion(IMG_BGRA32, IMG_BGR24, rgba32_rgb24_sse2) + || !register_conversion(IMG_BGRA32, IMG_RGBA32, rgba_swap02_sse2) + || !register_conversion(IMG_BGRA32, IMG_ABGR32, rgba_alpha30_sse2) + || !register_conversion(IMG_BGRA32, IMG_ARGB32, rgba_swapall_sse2) + || !register_conversion(IMG_BGRA32, IMG_GRAY8, bgra32_gray8_sse2) + + || !register_conversion(IMG_GRAY8, IMG_RGB24, gray8_rgb24_sse2) + || !register_conversion(IMG_GRAY8, IMG_BGR24, gray8_rgb24_sse2) + || !register_conversion(IMG_GRAY8, IMG_RGBA32, gray8_rgba32_sse2) + || !register_conversion(IMG_GRAY8, IMG_ABGR32, gray8_argb32_sse2) + || !register_conversion(IMG_GRAY8, IMG_ARGB32, gray8_argb32_sse2) + || !register_conversion(IMG_GRAY8, IMG_BGRA32, gray8_rgba32_sse2) + ) { + return 0; + } + } +#endif + +#endif /* ARCH_X86 || ARCH_X86_64 */ + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h b/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h new file mode 100644 index 00000000..13ed851f --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h @@ -0,0 +1,613 @@ +/* + * img_x86_common.h - common x86/x86-64 assembly macros + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#ifndef ACLIB_IMG_X86_COMMON_H +#define ACLIB_IMG_X86_COMMON_H + +/*************************************************************************/ + +/* Register names for pointers */ +#ifdef ARCH_X86_64 +# define EAX "%%rax" +# define EBX "%%rbx" +# define ECX "%%rcx" +# define EDX "%%rdx" +# define ESP "%%rsp" +# define EBP "%%rbp" +# define ESI "%%rsi" +# define EDI "%%rdi" +#else +# define EAX "%%eax" +# define EBX "%%ebx" +# define ECX "%%ecx" +# define EDX "%%edx" +# define ESP "%%esp" +# define EBP "%%ebp" +# define ESI "%%esi" +# define EDI "%%edi" +#endif + +/* Macros to push and pop one or two registers within an assembly block. + * The x86-64 ABI allows leaf functions to write to 128 bytes BELOW + * (yes, below) the stack pointer, so we can't just push our own stuff + * there. Argh. 
*/ +#ifdef ARCH_X86_64 +# define FAKE_PUSH_REG "r12" +# define FAKE_PUSH_REG_2 "r13" +# define COMMA_FAKE_PUSH_REG ,FAKE_PUSH_REG +# define PUSH(reg) "mov " reg ", %%" FAKE_PUSH_REG +# define POP(reg) "mov %%" FAKE_PUSH_REG ", " reg +# define PUSH2(reg1,reg2) PUSH(reg1) "; mov " reg2 ", %%" FAKE_PUSH_REG_2 +# define POP2(reg2,reg1) "mov %%" FAKE_PUSH_REG_2 ", " reg2 "; " POP(reg1) +#else +# define COMMA_FAKE_PUSH_REG /*nothing*/ +# define PUSH(reg) "push " reg +# define POP(reg) "pop " reg +# define PUSH2(reg1,reg2) "push " reg1 "; push " reg2 +# define POP2(reg2,reg1) "pop " reg2 "; pop " reg1 +#endif + +/* Data for isolating particular bytes. Used by the SWAP32 macros; if you + * use them, make sure to define DEFINE_MASK_DATA before including this + * file! */ +#ifdef DEFINE_MASK_DATA +static const struct { uint32_t n[64]; } __attribute__((aligned(16))) mask_data = {{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF, + 0x0000FF00, 0x0000FF00, 0x0000FF00, 0x0000FF00, + 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, + 0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000, + 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, + 0x00FFFF00, 0x00FFFF00, 0x00FFFF00, 0x00FFFF00, + 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, + 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, + 0xFF0000FF, 0xFF0000FF, 0xFF0000FF, 0xFF0000FF, + 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, + 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF, + 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, + 0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF, + 0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, +}}; +#endif + +/*************************************************************************/ + +/* Basic assembly macros, used for odd-count loops */ + +/* Swap bytes in pairs of 16-bit values */ +#define X86_SWAP16_2 \ + "movl -4("ESI","ECX",4), %%eax \n\ + movl %%eax, %%edx \n\ + shll $8, %%eax \n\ + andl $0xFF00FF00, %%eax \n\ + shrl $8, %%edx \n\ + andl $0x00FF00FF, %%edx \n\ + orl %%edx, %%eax \n\ + movl %%eax, -4("EDI","ECX",4)" + +/* Swap words in a 32-bit value */ +#define X86_SWAP32 \ + "movl -4("ESI","ECX",4), %%eax \n\ + roll $16, %%eax \n\ + movl %%eax, -4("EDI","ECX",4)" + +/* Swap bytes 0 and 2 of a 32-bit value */ +#define X86_SWAP32_02 \ + "movw -4("ESI","ECX",4), %%ax \n\ + movw -2("ESI","ECX",4), %%dx \n\ + xchg %%dl, %%al \n\ + movw %%ax, -4("EDI","ECX",4) \n\ + movw %%dx, -2("EDI","ECX",4)" + +/* Swap bytes 1 and 3 of a 32-bit value */ +#define X86_SWAP32_13 \ + "movw -4("ESI","ECX",4), %%ax \n\ + movw -2("ESI","ECX",4), %%dx \n\ + xchg %%dh, %%ah \n\ + movw %%ax, -4("EDI","ECX",4) \n\ + movw %%dx, -2("EDI","ECX",4)" + +/* Reverse the order of bytes in a 32-bit value */ +#define X86_REV32 \ + "movl -4("ESI","ECX",4), %%eax \n\ + xchg %%ah, %%al \n\ + roll $16, %%eax \n\ + xchg %%ah, %%al \n\ + movl %%eax, -4("EDI","ECX",4)" + +/* The same, using the BSWAP instruction */ +#define X86_REV32_BSWAP \ + "movl -4("ESI","ECX",4), %%eax \n\ + bswap %%eax \n\ + movl %%eax, -4("EDI","ECX",4)" + +/* Rotate a 32-bit value left 8 bits */ +#define X86_ROL32 \ + "movl -4("ESI","ECX",4), %%eax \n\ + roll $8, %%eax \n\ + movl %%eax, -4("EDI","ECX",4)" + +/* Rotate a 32-bit value right 8 bits */ +#define X86_ROR32 \ + "movl -4("ESI","ECX",4), %%eax \n\ + rorl $8, %%eax \n\ + movl %%eax, -4("EDI","ECX",4)" + +/*************************************************************************/ + +/* Basic assembly routines. 
Sizes are all given in 32-bit units. */ + +#define ASM_SWAP16_2_X86(size) \ + asm("0: "X86_SWAP16_2" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax", "edx") + +#define ASM_SWAP32_X86(size) \ + asm("0: "X86_SWAP32" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax", "edx") + +#define ASM_SWAP32_02_X86(size) \ + asm("0: "X86_SWAP32_02" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax", "edx") + +#define ASM_SWAP32_13_X86(size) \ + asm("0: "X86_SWAP32_13" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax", "edx") + +#define ASM_REV32_X86(size) \ + asm("0: "X86_REV32" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +#define ASM_ROL32_X86(size) \ + asm("0: "X86_ROL32" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +#define ASM_ROR32_X86(size) \ + asm("0: "X86_ROR32" \n\ + subl $1, %%ecx \n\ + jnz 0b" \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +/*************************************************************************/ +/*************************************************************************/ + +/* Wrapper for SIMD loops. This generates the body of an asm() construct + * (the string only, not the input/output/clobber lists) given the data + * block size (number of data units processed per SIMD loop iteration), + * instructions to save and restore unclobberable registers (such as EBX), + * and the bodies of the odd-count and main loops. The data count is + * assumed to be preloaded in ECX. Parameters are: + * blocksize: number of units of data processed per SIMD loop (must be + * a power of 2); can be a constant or a numerical + * expression containing only constants + * push_regs: string constant containing instructions to push registers + * that must be saved over the small loop + * pop_regs: string constant containing instructions to pop registers + * saved by `push_regs' (restored before the main loop) + * small_loop: loop for handling data elements one at a time (when the + * count is not a multiple of `blocksize' + * main_loop: main SIMD loop for processing data + * emms: EMMS/SFENCE instructions to end main loop with, as needed + */ + +#define SIMD_LOOP_WRAPPER(blocksize,push_regs,pop_regs,small_loop,main_loop,emms) \ + /* Check whether the count is a multiple of the blocksize (this \ + * can cause branch mispredicts but seems to be faster overall) */ \ + "testl $(("#blocksize")-1), %%ecx; " \ + "jz 1f; " \ + /* It's not--run the small loop to align the count */ \ + push_regs"; " \ + "0: " \ + small_loop"; " \ + "subl $1, %%ecx; " \ + "testl $(("#blocksize")-1), %%ecx; " \ + "jnz 0b; " \ + pop_regs"; " \ + /* Make sure there's some data left */ \ + "testl %%ecx, %%ecx; " \ + "jz 2f; " \ + /* Now run the main SIMD loop */ \ + "1: " \ + main_loop"; " \ + "subl $("#blocksize"), %%ecx; " \ + "jnz 1b; " \ + /* Clear MMX state and/or SFENCE, as needed */ \ + emms"; " \ + /* Done */ \ + "2: " + +/*************************************************************************/ + +/* MMX- and SSE2-optimized swap/rotate routines. 
These routines are + * identical save for data size, so we use common macros to implement them, + * with register names and data offsets replaced by parameters to the + * macros. */ + +#define ASM_SIMD_MMX(name,size) \ + name((size), 64, \ + "movq", "movq", "movq", "", \ + "%%mm0", "%%mm1", "%%mm2", "%%mm3", \ + "%%mm4", "%%mm5", "%%mm6", "%%mm7") +#define ASM_SIMD_SSE2(name,size) \ + name((size), 128, \ + "movdqu", "movdqa", "movdqu", "", \ + "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3",\ + "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7") +#define ASM_SIMD_SSE2_ALIGNED(name,size) \ + name((size), 128, \ + "movdqa", "movdqa", "movntdq", "sfence",\ + "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3",\ + "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7") + +#define ASM_SWAP16_2_MMX(size) ASM_SIMD_MMX(ASM_SWAP16_2_SIMD,(size)) +#define ASM_SWAP16_2_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP16_2_SIMD,(size)) +#define ASM_SWAP16_2_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP16_2_SIMD,(size)) +#define ASM_SWAP32_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_SIMD,(size)) +#define ASM_SWAP32_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_SIMD,(size)) +#define ASM_SWAP32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_SIMD,(size)) +#define ASM_SWAP32_02_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_02_SIMD,(size)) +#define ASM_SWAP32_02_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_02_SIMD,(size)) +#define ASM_SWAP32_02_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_02_SIMD,(size)) +#define ASM_SWAP32_13_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_13_SIMD,(size)) +#define ASM_SWAP32_13_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_13_SIMD,(size)) +#define ASM_SWAP32_13_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_13_SIMD,(size)) +#define ASM_REV32_MMX(size) ASM_SIMD_MMX(ASM_REV32_SIMD,(size)) +#define ASM_REV32_SSE2(size) ASM_SIMD_SSE2(ASM_REV32_SIMD,(size)) +#define ASM_REV32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_REV32_SIMD,(size)) +#define ASM_ROL32_MMX(size) ASM_SIMD_MMX(ASM_ROL32_SIMD,(size)) +#define ASM_ROL32_SSE2(size) ASM_SIMD_SSE2(ASM_ROL32_SIMD,(size)) +#define ASM_ROL32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_ROL32_SIMD,(size)) +#define ASM_ROR32_MMX(size) ASM_SIMD_MMX(ASM_ROR32_SIMD,(size)) +#define ASM_ROR32_SSE2(size) ASM_SIMD_SSE2(ASM_ROR32_SIMD,(size)) +#define ASM_ROR32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_ROR32_SIMD,(size)) + +/*************************************************************************/ + +/* Actual implementations. Note that unrolling the SIMD loops doesn't seem + * to be a win (only 2-3% improvement at most), and in fact can lose by a + * bit in short loops. 
*/ + +#define ASM_SWAP16_2_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ X86_SWAP16_2, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + psrlw $8, "MM0" # MM0: - 7 - 5 - 3 - 1 \n\ + psllw $8, "MM1" # MM1: 6 - 4 - 2 - 0 - \n\ + por "MM1", "MM0" # MM0: 6 7 4 5 2 3 0 1 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax", "edx") + +#define ASM_SWAP32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ X86_SWAP32, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + psrld $16, "MM0" # MM0: - - 7 6 - - 3 2 \n\ + pslld $16, "MM1" # MM1: 5 4 - - 1 0 - - \n\ + por "MM1", "MM0" # MM0: 5 4 7 6 1 0 3 2 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +#define ASM_SWAP32_02_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "push "EDX, \ + /* pop_regs */ "pop "EDX, \ + /* small_loop */ X86_SWAP32_02, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\ + pand 16("EDX"), "MM1" # MM1: - - - 4 - - - 0 \n\ + pslld $16, "MM1" # MM1: - 4 - - - 0 - - \n\ + pand 64("EDX"), "MM2" # MM2: - 6 - - - 2 - - \n\ + psrld $16, "MM2" # MM2: - - - 6 - - - 2 \n\ + pand 160("EDX"), "MM0" # MM0: 7 - 5 - 3 - 1 - \n\ + por "MM1", "MM0" # MM0: 7 4 5 - 3 0 1 - \n\ + por "MM2", "MM0" # MM0: 7 4 5 6 3 0 1 2 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \ + "m" (mask_data) \ + : "eax") + +#define ASM_SWAP32_13_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "push "EDX, \ + /* pop_regs */ "pop "EDX, \ + /* small_loop */ X86_SWAP32_13, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\ + pand 32("EDX"), "MM1" # MM1: - - 5 - - - 1 - \n\ + pslld $16, "MM1" # MM1: 5 - - - 1 - - - \n\ + pand 128("EDX"), "MM2" # MM2: 7 - - - 3 - - - \n\ + psrld $16, "MM2" # MM2: - - 7 - - - 3 - \n\ + pand 80("EDX"), "MM0" # MM0: - 6 - 4 - 2 - 0 \n\ + por "MM1", "MM0" # MM0: 5 6 - 4 1 2 - 0 \n\ + por "MM2", "MM0" # MM0: 5 6 7 4 1 2 3 0 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \ + "m" (mask_data) \ + : "eax"); + +#define ASM_REV32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ 
"", \ + /* small_loop */ X86_REV32_BSWAP, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM3" # MM3: 7 6 5 4 3 2 1 0 \n\ + psrld $24, "MM0" # MM0: - - - 7 - - - 3 \n\ + pand 32("EDX"), "MM2" # MM2: - - 5 - - - 1 - \n\ + psrld $8, "MM1" # MM1: - 7 6 5 - 3 2 1 \n\ + pand 32("EDX"), "MM1" # MM1: - - 6 - - - 2 - \n\ + pslld $8, "MM2" # MM2: - 5 - - - 1 - - \n\ + pslld $24, "MM3" # MM3: 4 - - - 0 - - - \n\ + por "MM1", "MM0" # MM0: - - 6 7 - - 2 3 \n\ + por "MM2", "MM0" # MM0: - 5 6 7 - 1 2 3 \n\ + por "MM3", "MM0" # MM0: 4 5 6 7 0 1 2 3 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \ + "m" (mask_data) \ + : "eax") + +#define ASM_ROL32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ X86_ROL32, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + pslld $8, "MM0" # MM0: 6 5 4 - 2 1 0 - \n\ + psrld $24, "MM1" # MM1: - - - 7 - - - 3 \n\ + por "MM1", "MM0" # MM0: 6 5 4 7 2 1 0 3 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +#define ASM_ROR32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ X86_ROR32, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + psrld $8, "MM0" # MM0: - 7 6 5 - 3 2 1 \n\ + pslld $24, "MM1" # MM1: 4 - - - 0 - - - \n\ + por "MM1", "MM0" # MM0: 4 7 6 5 0 3 2 1 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +/*************************************************************************/ + +/* SSE2 macros to load 8 24- or 32-bit RGB pixels into XMM0/1/2 (R/G/B) as + * 16-bit values, used for RGB->YUV and RGB->grayscale conversions. + * ZERO is the number of the XMM register containing all zeroes. 
*/ + +#define SSE2_LOAD_RGB24(ZERO) \ + "movl -21("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm0 # XMM0: ----- ----- ----- xBGR1 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR1 ----- ----- ----- \n\ + movl -18("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xBGR1 ----- ----- xBGR2 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR2 xBGR1 ----- ----- \n\ + movl -15("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xBGR2 xBGR1 ----- xBGR3 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR3 xBGR2 xBGR1 ----- \n\ + movl -24("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xBGR3 xBGR2 xBGR1 xBGR0 \n\ + movl -9("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm1 # XMM1: ----- ----- ----- xBGR5 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR5 ----- ----- ----- \n\ + movl -6("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xBGR5 ----- ----- xBGR6 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR6 xBGR5 ----- ----- \n\ + movl -3("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xBGR6 xBGR5 ----- xBGR7 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR7 xBGR6 xBGR5 ----- \n\ + movl -12("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xBGR7 xBGR6 xBGR5 xBGR4 \n"\ + SSE2_MASSAGE_RGBA32(ZERO) + +#define SSE2_LOAD_BGR24(ZERO) \ + "movl -21("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm0 # XMM0: ----- ----- ----- xRGB1 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB1 ----- ----- ----- \n\ + movl -18("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xRGB1 ----- ----- xRGB2 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB2 xRGB1 ----- ----- \n\ + movl -15("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xRGB2 xRGB1 ----- xRGB3 \n\ + pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB3 xRGB2 xRGB1 ----- \n\ + movl -24("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm0 # XMM0: xRGB3 xRGB2 xRGB1 xRGB0 \n\ + movl -9("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm1 # XMM1: ----- ----- ----- xRGB5 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB5 ----- ----- ----- \n\ + movl -6("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xRGB5 ----- ----- xRGB6 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB6 xRGB5 ----- ----- \n\ + movl -3("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xRGB6 xRGB5 ----- xRGB7 \n\ + pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB7 xRGB6 xRGB5 ----- \n\ + movl -12("ESI","EBX"), %%eax \n\ + movd %%eax, %%xmm2 \n\ + por %%xmm2, %%xmm1 # XMM1: xRGB7 xRGB6 xRGB5 xRGB4 \n"\ + SSE2_MASSAGE_BGRA32(ZERO) + +#define SSE2_LOAD_RGBA32(ZERO) "\ + movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ABGR3 ABGR2 ABGR1 ABGR0 \n\ + movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ABGR7 ABGR6 ABGR5 ABGR4 \n"\ + SSE2_MASSAGE_RGBA32(ZERO) +#define SSE2_MASSAGE_RGBA32(ZERO) "\ + movdqa %%xmm0, %%xmm2 # XMM2: ABGR3 ABGR2 ABGR1 ABGR0 \n\ + punpcklbw %%xmm1, %%xmm0 # X0.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\ + punpckhbw %%xmm1, %%xmm2 # X2.l: A6 A2 B6 B2 G6 G2 R6 R2 \n\ + movdqa %%xmm0, %%xmm1 # X1.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\ + punpcklbw %%xmm2, %%xmm0 # X0.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\ + punpckhbw %%xmm2, %%xmm1 # X1.l: G7 G5 G3 G1 R7 R5 R3 R1 \n\ + movdqa %%xmm0, %%xmm2 # X2.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\ + punpcklbw %%xmm1, %%xmm0 # XMM0: G7.......G0 R7.......R0 \n\ + punpckhbw %%xmm1, %%xmm2 # XMM2: A7.......A0 B7.......B0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: G7.......G0 
R7.......R0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + punpckhbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + punpcklbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n" + +#define SSE2_LOAD_BGRA32(ZERO) "\ + movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ARGB3 ARGB2 ARGB1 ARGB0 \n\ + movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ARGB7 ARGB6 ARGB5 ARGB4 \n"\ + SSE2_MASSAGE_BGRA32(ZERO) +#define SSE2_MASSAGE_BGRA32(ZERO) "\ + movdqa %%xmm0, %%xmm2 # XMM2: ARGB3 ARGB2 ARGB1 ARGB0 \n\ + punpcklbw %%xmm1, %%xmm2 # X2.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\ + punpckhbw %%xmm1, %%xmm0 # X0.l: A6 A2 R6 R2 G6 G2 B6 B2 \n\ + movdqa %%xmm2, %%xmm1 # X1.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\ + punpcklbw %%xmm0, %%xmm2 # X2.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\ + punpckhbw %%xmm0, %%xmm1 # X1.l: G7 G5 G3 G1 B7 B5 B3 B1 \n\ + movdqa %%xmm2, %%xmm0 # X0.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\ + punpcklbw %%xmm1, %%xmm2 # XMM2: G7.......G0 B7.......B0 \n\ + punpckhbw %%xmm1, %%xmm0 # XMM0: A7.......A0 R7.......R0 \n\ + movdqa %%xmm2, %%xmm1 # XMM1: G7.......G0 B7.......B0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + punpckhbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + punpcklbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n" + +#define SSE2_LOAD_ARGB32(ZERO) "\ + movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: BGRA3 BGRA2 BGRA1 BGRA0 \n\ + movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: BGRA7 BGRA6 BGRA5 BGRA4 \n"\ + SSE2_MASSAGE_ARGB32(ZERO) +#define SSE2_MASSAGE_ARGB32(ZERO) "\ + movdqa %%xmm0, %%xmm2 # XMM2: BGRA3 BGRA2 BGRA1 BGRA0 \n\ + punpcklbw %%xmm1, %%xmm0 # X0.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\ + punpckhbw %%xmm1, %%xmm2 # X2.l: B6 B2 G6 G2 R6 R2 A6 A2 \n\ + movdqa %%xmm0, %%xmm1 # X1.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\ + punpcklbw %%xmm2, %%xmm0 # X0.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\ + punpckhbw %%xmm2, %%xmm1 # X1.l: R7 R5 R3 R1 A7 A5 A3 A1 \n\ + movdqa %%xmm0, %%xmm2 # X2.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\ + punpcklbw %%xmm1, %%xmm0 # XMM0: R7.......G0 A7.......A0 \n\ + punpckhbw %%xmm1, %%xmm2 # XMM2: B7.......G0 G7.......G0 \n\ + movdqa %%xmm2, %%xmm1 # XMM1: B7.......B0 G7.......G0 \n\ + punpckhbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + punpcklbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + punpckhbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n" + +#define SSE2_LOAD_ABGR32(ZERO) "\ + movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: RGBA3 RGBA2 RGBA1 RGBA0 \n\ + movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: RGBA7 RGBA6 RGBA5 RGBA4 \n"\ + SSE2_MASSAGE_ABGR32(ZERO) +#define SSE2_MASSAGE_ABGR32(ZERO) "\ + movdqa %%xmm0, %%xmm2 # XMM2: RGBA3 RGBA2 RGBA1 RGBA0 \n\ + punpcklbw %%xmm1, %%xmm2 # X2.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\ + punpckhbw %%xmm1, %%xmm0 # X0.l: R6 R2 G6 G2 B6 B2 A6 A2 \n\ + movdqa %%xmm2, %%xmm1 # X1.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\ + punpcklbw %%xmm0, %%xmm2 # X2.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\ + punpckhbw %%xmm0, %%xmm1 # X1.l: B7 B5 B3 B1 A7 A5 A3 A1 \n\ + movdqa %%xmm2, %%xmm0 # X0.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\ + punpcklbw %%xmm1, %%xmm2 # XMM2: B7.......B0 A7.......A0 \n\ + punpckhbw %%xmm1, %%xmm0 # XMM0: R7.......R0 G7.......G0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: R7.......R0 G7.......G0 \n\ + punpckhbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + punpcklbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + punpckhbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n" + +/*************************************************************************/ + +#endif /* ACLIB_IMG_X86_COMMON_H */ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: 
((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_mixed.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_mixed.c new file mode 100644 index 00000000..7f4b8d70 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_mixed.c @@ -0,0 +1,981 @@ +/* + * img_yuv_packed.c - YUV planar<->packed image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "imgconvert.h" +#include "img_internal.h" + +/*************************************************************************/ +/*************************************************************************/ + +/* Standard C implementations */ + +/*************************************************************************/ + +/* Wrappers for UYVY and YVYU */ +/* Note: we rely on YUY2<->{UYVY,YVYU} working for src==dest */ +/* FIXME: when converting from UYVY/YVYU, src is destroyed! */ + +static int uyvy_yvyu_wrapper(uint8_t **src, ImageFormat srcfmt, + uint8_t **dest, ImageFormat destfmt, + int width, int height) +{ + if (srcfmt == IMG_UYVY || srcfmt == IMG_YVYU) + return ac_imgconvert(src, srcfmt, src, IMG_YUY2, width, height) + && ac_imgconvert(src, IMG_YUY2, dest, destfmt, width, height); + else + return ac_imgconvert(src, srcfmt, dest, IMG_YUY2, width, height) + && ac_imgconvert(dest, IMG_YUY2, dest, destfmt, width, height); +} + +static int yuv420p_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV420P, dest, IMG_UYVY, width, height); } + +static int yuv420p_yvyu(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV420P, dest, IMG_YVYU, width, height); } + +static int yuv411p_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV411P, dest, IMG_UYVY, width, height); } + +static int yuv411p_yvyu(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV411P, dest, IMG_YVYU, width, height); } + +static int yuv422p_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV422P, dest, IMG_UYVY, width, height); } + +static int yuv422p_yvyu(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV422P, dest, IMG_YVYU, width, height); } + +static int yuv444p_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV444P, dest, IMG_UYVY, width, height); } + +static int yuv444p_yvyu(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YUV444P, dest, IMG_YVYU, width, height); } + +static int uyvy_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV420P, width, height); } + +static int yvyu_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV420P, width, height); } + +static int uyvy_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV411P, width, height); } + +static int yvyu_yuv411p(uint8_t **src, uint8_t **dest, int 
width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV411P, width, height); } + +static int uyvy_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV422P, width, height); } + +static int yvyu_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV422P, width, height); } + +static int uyvy_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV444P, width, height); } + +static int yvyu_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV444P, width, height); } + +/*************************************************************************/ + +static int yuv420p_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + + for (y = 0; y < (height & ~1); y++) { + for (x = 0; x < (width & ~1); x += 2) { + dest[0][(y*width+x)*2 ] = src[0][y*width+x]; + dest[0][(y*width+x)*2+1] = src[1][(y/2)*(width/2)+x/2]; + dest[0][(y*width+x)*2+2] = src[0][y*width+x+1]; + dest[0][(y*width+x)*2+3] = src[2][(y/2)*(width/2)+x/2]; + } + } + return 1; +} + +static int yuv411p_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + + for (y = 0; y < height; y++) { + for (x = 0; x < (width & ~1); x += 2) { + dest[0][(y*width+x)*2 ] = src[0][y*width+x]; + dest[0][(y*width+x)*2+1] = src[1][y*(width/4)+x/4]; + dest[0][(y*width+x)*2+2] = src[0][y*width+x+1]; + dest[0][(y*width+x)*2+3] = src[2][y*(width/4)+x/4]; + } + } + return 1; +} + +static int yuv422p_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < (width/2)*height; i++) { + dest[0][i*4 ] = src[0][i*2]; + dest[0][i*4+1] = src[1][i]; + dest[0][i*4+2] = src[0][i*2+1]; + dest[0][i*4+3] = src[2][i]; + } + return 1; +} + +static int yuv444p_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < (width/2)*height; i++) { + dest[0][i*4 ] = src[0][i*2]; + dest[0][i*4+1] = (src[1][i*2] + src[1][i*2+1]) / 2; + dest[0][i*4+2] = src[0][i*2+1]; + dest[0][i*4+3] = (src[2][i*2] + src[2][i*2+1]) / 2; + } + return 1; +} + +/*************************************************************************/ + +static int yuy2_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + + for (y = 0; y < (height & ~1); y++) { + for (x = 0; x < (width & ~1); x += 2) { + dest[0][y*width+x ] = src[0][(y*width+x)*2 ]; + dest[0][y*width+x+1] = src[0][(y*width+x)*2+2]; + if (y%2 == 0) { + dest[1][(y/2)*(width/2)+x/2] = src[0][(y*width+x)*2+1]; + dest[2][(y/2)*(width/2)+x/2] = src[0][(y*width+x)*2+3]; + } else { + dest[1][(y/2)*(width/2)+x/2] = + (dest[1][(y/2)*(width/2)+x/2] + src[0][(y*width+x)*2+1] + 1) / 2; + dest[2][(y/2)*(width/2)+x/2] = + (dest[2][(y/2)*(width/2)+x/2] + src[0][(y*width+x)*2+3] + 1) / 2; + } + } + } + return 1; +} + +static int yuy2_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + for (y = 0; y < height; y++) { + for (x = 0; x < (width & ~3); x += 4) { + dest[0][y*width+x] = src[0][(y*width+x)*2 ]; + dest[0][y*width+x+1] = src[0][(y*width+x)*2+2]; + dest[0][y*width+x+2] = src[0][(y*width+x)*2+4]; + dest[0][y*width+x+3] = src[0][(y*width+x)*2+6]; + dest[1][y*(width/4)+x/4] = (src[0][(y*width+x)*2+1] + + src[0][(y*width+x)*2+5] + 1) / 2; + dest[2][y*(width/4)+x/4] = (src[0][(y*width+x)*2+3] + + src[0][(y*width+x)*2+7] + 1) / 2; + } + } + return 1; +} + 
+static int yuy2_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < (width/2)*height; i++) { + dest[0][i*2] = src[0][i*4 ]; + dest[1][i] = src[0][i*4+1]; + dest[0][i*2+1] = src[0][i*4+2]; + dest[2][i] = src[0][i*4+3]; + } + return 1; +} + +static int yuy2_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < (width & ~1)*height; i += 2) { + dest[0][i] = src[0][i*2 ]; + dest[1][i] = src[0][i*2+1]; + dest[1][i+1] = src[0][i*2+1]; + dest[0][i+1] = src[0][i*2+2]; + dest[2][i] = src[0][i*2+3]; + dest[2][i+1] = src[0][i*2+3]; + } + return 1; +} + +/*************************************************************************/ + +static int y8_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*2 ] = src[0][i]; + dest[0][i*2+1] = 128; + } + return 1; +} + +static int y8_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) { + dest[0][i*2 ] = 128; + dest[0][i*2+1] = src[0][i]; + } + return 1; +} + +static int yuy2_y8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) + dest[0][i] = src[0][i*2]; + return 1; +} + +static int uyvy_y8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height; i++) + dest[0][i] = src[0][i*2+1]; + return 1; +} + +/*************************************************************************/ +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE2) + +/* SSE2 routines. See comments in img_x86_common.h for why we don't bother + * unrolling the loops. */ + +/* Common macros/data for x86 code */ +#include "img_x86_common.h" + +/* YUV420P (1 row) or YUV422P -> YUY2 (unit: 2 pixels) */ +#define YUV42XP_YUY2 \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ PUSH(EBX), \ + /* pop_regs */ POP(EBX), \ + /* small_loop */ \ + "movb -1("EDX","ECX"), %%bh \n\ + movb -1("ESI","ECX",2), %%bl \n\ + shll $16, %%ebx \n\ + movb -1("EAX","ECX"), %%bh \n\ + movb -2("ESI","ECX",2), %%bl \n\ + movl %%ebx, -4("EDI","ECX",4)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: YF YE YD ..... Y2 Y1 Y0 \n\ + movq -8("EAX","ECX"), %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + movq -8("EDX","ECX"), %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\ + punpcklbw %%xmm3, %%xmm2 # XMM2: V7 U7 V6 ..... U1 V0 U0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + punpckhbw %%xmm2, %%xmm1 # XMM1: V7 YF U7 ..... Y9 U4 Y8 \n\ + movdqu %%xmm0, -32("EDI","ECX",4) \n\ + movdqu %%xmm1, -16("EDI","ECX",4)", \ + /* emms */ "emms") + +/* YUV411P -> YUY2 (unit: 4 pixels) */ +#define YUV411P_YUY2 \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ PUSH(EBX), \ + /* pop_regs */ POP(EBX), \ + /* small_loop */ \ + "movb -1("EDX","ECX"), %%bh \n\ + movb -1("ESI","ECX",4), %%bl \n\ + shll $16, %%ebx \n\ + movb -1("EAX","ECX"), %%bh \n\ + movb -2("ESI","ECX",4), %%bl \n\ + movl %%ebx, -4("EDI","ECX",8) \n\ + movb -1("EDX","ECX"), %%bh \n\ + movb -3("ESI","ECX",4), %%bl \n\ + shll $16, %%ebx \n\ + movb -1("EAX","ECX"), %%bh \n\ + movb -4("ESI","ECX",4), %%bl \n\ + movl %%ebx, -8("EDI","ECX",8)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: YF YE YD ..... 
Y2 Y1 Y0 \n\ + movd -4("EAX","ECX"), %%xmm2 # XMM2: U3 U2 U1 U0 \n\ + punpcklbw %%xmm2, %%xmm2 # XMM2: U3 U3 U2 U2 U1 U1 U0 U0 \n\ + movd -4("EDX","ECX"), %%xmm3 # XMM3: V3 V2 V1 V0 \n\ + punpcklbw %%xmm3, %%xmm3 # XMM3: V3 V3 V2 V2 V1 V1 V0 V0 \n\ + punpcklbw %%xmm3, %%xmm2 # XMM2: V3 U3 V3 ..... U0 V0 U0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: V1 Y7 U1 ..... Y1 U0 Y0 \n\ + punpckhbw %%xmm2, %%xmm1 # XMM1: V3 YF U3 ..... Y9 U2 Y8 \n\ + movdqu %%xmm0, -32("EDI","ECX",8) \n\ + movdqu %%xmm1, -16("EDI","ECX",8)", \ + /* emms */ "emms") + +/* YUV444P -> YUY2 (unit: 2 pixels) */ +#define YUV444P_YUY2 \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ PUSH2(EBX,EBP), \ + /* pop_regs */ POP2(EBP,EBX), \ + /* small_loop */ \ + "movzbl -1("EDX","ECX",2), %%ebx \n\ + movzbl -2("EDX","ECX",2), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -1("EDI","ECX",4) \n\ + movb -1("ESI","ECX",2), %%bl \n\ + movb %%bl, -2("EDI","ECX",4) \n\ + movzbl -1("EAX","ECX",2), %%ebx \n\ + movzbl -2("EAX","ECX",2), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -3("EDI","ECX",4) \n\ + movb -2("ESI","ECX",2), %%bl \n\ + movb %%bl, -4("EDI","ECX",4)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: YF YE YD ..... Y2 Y1 Y0 \n\ + movdqu -16("EAX","ECX",2), %%xmm2 #XM2: UF UE UD ..... U2 U1 U0 \n\ + movdqu -16("EDX","ECX",2), %%xmm3 #XM3: VF VE VD ..... V2 V1 V0 \n\ + movdqa %%xmm2, %%xmm4 # XMM4: UF UE UD ..... U2 U1 U0 \n\ + pand %%xmm7, %%xmm2 # XMM2: -- UE -- ..... U2 -- U0 \n\ + psrlw $8, %%xmm4 # XMM4: -- UF -- ..... U3 -- U1 \n\ + pavgw %%xmm4, %%xmm2 # XMM2: -- u7 -- ..... u1 -- u0 \n\ + movdqa %%xmm3, %%xmm5 # XMM4: UF UE UD ..... U2 U1 U0 \n\ + pand %%xmm7, %%xmm3 # XMM3: -- VE -- ..... V2 -- V0 \n\ + psrlw $8, %%xmm5 # XMM5: -- VF -- ..... V3 -- V1 \n\ + pavgw %%xmm5, %%xmm3 # XMM3: -- v7 -- ..... v1 -- v0 \n\ + psllw $8, %%xmm3 # XMM3: v7 -- v6 ..... -- v0 -- \n\ + por %%xmm3, %%xmm2 # XMM2: v7 u7 v6 ..... u1 v0 u0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: v3 Y7 u3 ..... Y1 u0 Y0 \n\ + punpckhbw %%xmm2, %%xmm1 # XMM1: v7 YF u7 ..... Y9 u4 Y8 \n\ + movdqu %%xmm0, -32("EDI","ECX",4) \n\ + movdqu %%xmm1, -16("EDI","ECX",4)", \ + /* emms */ "emms") + +/* YUY2 -> YUV420P (U row) (unit: 2 pixels) */ +#define YUY2_YUV420P_U \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ PUSH2(EBX,EBP), \ + /* pop_regs */ POP2(EBP,EBX), \ + /* small_loop */ \ + "movb -4("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EDI","ECX",2) \n\ + movb -2("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EDI","ECX",2) \n\ + movzbl -3("ESI","ECX",4), %%ebx \n\ + movzbl -3("EAX","ECX",4), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -1("EDX","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqu -16("EAX","ECX",4),%%xmm2 #XMM2: Vd Yh Ud ..... Yb Ua Ya \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- Vd -- ..... Va -- Ua \n\ + pavgw %%xmm2, %%xmm1 # XMM1: -- v3 -- ..... 
v0 -- u0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: v3 u3 v2 u2 v1 u1 v0 u0 \n\ + pand %%xmm7, %%xmm1 # XMM1: -- u3 -- u2 -- u1 -- u0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: u3 u2 u1 u0 \n\ + movq %%xmm0, -8("EDI","ECX",2) \n\ + movd %%xmm1, -4("EDX","ECX")", \ + /* emms */ "emms") + +/* YUY2 -> YUV420P (V row) (unit: 2 pixels) */ +#define YUY2_YUV420P_V \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ PUSH2(EBX,EBP), \ + /* pop_regs */ POP2(EBP,EBX), \ + /* small_loop */ \ + "movb -4("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EDI","ECX",2) \n\ + movb -2("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EDI","ECX",2) \n\ + movzbl -1("ESI","ECX",4), %%ebx \n\ + movzbl -1("EAX","ECX",4), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -1("EDX","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqu -16("EAX","ECX",4),%%xmm2 #XMM2: Vd Yh Ud ..... Yb Ua Ya \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- Vd -- ..... Va -- Ua \n\ + pavgw %%xmm1, %%xmm2 # XMM2: -- v3 -- ..... v0 -- u0 \n\ + packuswb %%xmm2, %%xmm2 # XMM2: v3 u3 v2 u2 v1 u1 v0 u0 \n\ + psrlw $8, %%xmm2 # XMM2: -- v3 -- v2 -- v1 -- v0 \n\ + packuswb %%xmm2, %%xmm2 # XMM2: v3 v2 v1 v0 \n\ + movq %%xmm0, -8("EDI","ECX",2) \n\ + movd %%xmm2, -4("EDX","ECX")", \ + /* emms */ "emms") + +/* YUY2 -> YUV411P (unit: 4 pixels) */ +#define YUY2_YUV411P \ + /* Load 0x000..000FFFFFFFF into XMM6, 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm6, %%xmm6; psrldq $12, %%xmm6;" \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 2, \ + /* push_regs */ PUSH2(EBX,EBP), \ + /* pop_regs */ POP2(EBP,EBX), \ + /* small_loop */ \ + "movb -8("ESI","ECX",8), %%bl \n\ + movb %%bl, -4("EDI","ECX",4) \n\ + movb -6("ESI","ECX",8), %%bl \n\ + movb %%bl, -3("EDI","ECX",4) \n\ + movb -4("ESI","ECX",8), %%bl \n\ + movb %%bl, -2("EDI","ECX",4) \n\ + movb -2("ESI","ECX",8), %%bl \n\ + movb %%bl, -1("EDI","ECX",4) \n\ + movzbl -7("ESI","ECX",8), %%ebx \n\ + movzbl -3("ESI","ECX",8), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -1("EAX","ECX") \n\ + movzbl -5("ESI","ECX",8), %%ebx \n\ + movzbl -1("ESI","ECX",8), %%ebp \n\ + addl %%ebp, %%ebx \n\ + shrl $1, %%ebx \n\ + movb %%bl, -1("EDX","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",8),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... 
V0 -- U0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: U3 U2 U1 U0 \n\ + packuswb %%xmm2, %%xmm2 # XMM2: V3 V2 V1 V0 \n\ + pand %%xmm6, %%xmm1 # XMM1: -- -- -- -- U3 U2 U1 U0 \n\ + psllq $32, %%xmm2 # XMM2: V3 V2 V1 V0 -- -- -- -- \n\ + por %%xmm1, %%xmm2 # XMM2: V3 V2 V1 V0 U3 U2 U1 U0 \n\ + movdqa %%xmm2, %%xmm1 # XMM1: V3 V2 V1 V0 U3 U2 U1 U0 \n\ + pand %%xmm7, %%xmm1 # XMM1: -- V2 -- V0 -- U2 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- V3 -- V1 -- U3 -- U1 \n\ + pavgw %%xmm2, %%xmm1 # XMM1: -- v1 -- v0 -- u1 -- u0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: v1 v0 u1 u0 \n\ + movq %%xmm0, -8("EDI","ECX",4) \n\ + movd %%xmm1, %%ebx \n\ + movw %%bx, -2("EAX","ECX") \n\ + shrl $16, %%ebx; \n\ + movw %%bx, -2("EDX","ECX")", \ + /* emms */ "emms") + +/* YUY2 -> YUV422P (unit: 2 pixels) */ +#define YUY2_YUV422P \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ PUSH(EBX), \ + /* pop_regs */ POP(EBX), \ + /* small_loop */ \ + "movb -4("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EDI","ECX",2) \n\ + movb -2("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EDI","ECX",2) \n\ + movb -3("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EAX","ECX") \n\ + movb -1("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EDX","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: U3 U2 U1 U0 \n\ + packuswb %%xmm2, %%xmm2 # XMM2: V3 V2 V1 V0 \n\ + movq %%xmm0, -8("EDI","ECX",2) \n\ + movd %%xmm1, -4("EAX","ECX") \n\ + movd %%xmm2, -4("EDX","ECX")", \ + /* emms */ "emms") + +/* YUY2 -> YUV444P (unit: 2 pixels) */ +#define YUY2_YUV444P \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ PUSH(EBX), \ + /* pop_regs */ POP(EBX), \ + /* small_loop */ \ + "movb -4("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EDI","ECX",2) \n\ + movb -2("ESI","ECX",4), %%bl \n\ + movb %%bl, -1("EDI","ECX",2) \n\ + movb -3("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EAX","ECX",2) \n\ + movb %%bl, -1("EAX","ECX",2) \n\ + movb -1("ESI","ECX",4), %%bl \n\ + movb %%bl, -2("EDX","ECX",2) \n\ + movb %%bl, -1("EDX","ECX",2)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... 
V0 -- U0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\ + psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\ + movdqa %%xmm1, %%xmm3 # XMM3: -- U3 -- U2 -- U1 -- U0 \n\ + psllw $8, %%xmm3 # XMM3: U3 -- U2 -- U1 -- U0 -- \n\ + por %%xmm3, %%xmm1 # XMM1: U3 U3 U2 U2 U1 U1 U0 U0 \n\ + movdqa %%xmm2, %%xmm3 # XMM3: -- V3 -- V2 -- V1 -- V0 \n\ + psllw $8, %%xmm3 # XMM3: V3 -- V2 -- V1 -- V0 -- \n\ + por %%xmm3, %%xmm2 # XMM1: V3 V3 V2 V2 V1 V1 V0 V0 \n\ + movq %%xmm0, -8("EDI","ECX",2) \n\ + movq %%xmm1, -8("EAX","ECX",2) \n\ + movq %%xmm2, -8("EDX","ECX",2)", \ + /* emms */ "emms") + + +/* Y8 -> YUY2/YVYU (unit: 1 pixel) */ +#define Y8_YUY2 \ + /* Load 0x80*16 into XMM7 for interlacing U/V */ \ + "pcmpeqd %%xmm7, %%xmm7; psllw $7, %%xmm7; packsswb %%xmm7, %%xmm7;"\ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 16, \ + /* push_regs */ PUSH(EBX), \ + /* pop_regs */ POP(EBX), \ + /* small_loop */ \ + "movb -1("ESI","ECX"), %%al \n\ + movb %%al, -2("EDI","ECX",2) \n\ + movb $0x80, -1("EDI","ECX",2)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX"),%%xmm0 # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\ + punpcklbw %%xmm7, %%xmm0 # XMM0: 80 Y7 80 ..... Y1 80 Y0 \n\ + movdqu %%xmm0, -32("EDI","ECX",2) \n\ + punpckhbw %%xmm7, %%xmm1 # XMM1: 80 YF 80 ..... Y9 80 Y8 \n\ + movdqu %%xmm1, -16("EDI","ECX",2)", \ + /* emms */ "emms") + +/* Y8 -> UYVY (unit: 1 pixel) */ +#define Y8_UYVY \ + /* Load 0x80*16 into XMM7 for interlacing U/V */ \ + "pcmpeqd %%xmm7, %%xmm7; psllw $7, %%xmm7; packsswb %%xmm7, %%xmm7;"\ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 16, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movb -1("ESI","ECX"), %%al \n\ + movb %%al, -1("EDI","ECX",2) \n\ + movb $0x80, -2("EDI","ECX",2)", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX"),%%xmm0 # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\ + movdqa %%xmm7, %%xmm1 # XMM1: 80 80 80 ..... 80 80 80 \n\ + punpcklbw %%xmm0, %%xmm1 # XMM1: Y7 80 Y6 ..... 80 Y0 80 \n\ + movdqu %%xmm1, -32("EDI","ECX",2) \n\ + movdqa %%xmm7, %%xmm2 # XMM2: 80 80 80 ..... 80 80 80 \n\ + punpckhbw %%xmm0, %%xmm2 # XMM0: YF 80 YE ..... 80 Y8 80 \n\ + movdqu %%xmm2, -16("EDI","ECX",2)", \ + /* emms */ "emms") + +/* YUY2/YVYU -> Y8 (unit: 1 pixel) */ +#define YUY2_Y8 \ + /* Load 0x00FF*8 into XMM7 for masking */ \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movb -2("ESI","ECX",2), %%al \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\ + pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movq %%xmm0, -8("EDI","ECX")", \ + /* emms */ "emms") + +/* UYVY -> Y8 (unit: 1 pixel) */ +#define UYVY_Y8 \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movb -1("ESI","ECX",2), %%al \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2),%%xmm0 #XM0: Y7 V3 Y6 ..... V0 Y0 U0 \n\ + psrlw $8, %%xmm0 # XMM0: -- Y7 -- ..... 
Y1 -- Y0 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movq %%xmm0, -8("EDI","ECX")", \ + /* emms */ "emms") + +/*************************************************************************/ + +static int yuv420p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + for (y = 0; y < (height & ~1); y++) { + int dummy; + asm volatile(YUV42XP_YUY2 + : "=c" (dummy) // Ensure GCC reloads ECX each time through + : "S" (src[0]+y*width), "a" (src[1]+(y/2)*(width/2)), + "d" (src[2]+(y/2)*(width/2)), "D" (dest[0]+y*width*2), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } + return 1; +} + +static int yuv411p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 3)) { + asm(YUV411P_YUY2 + : /* no outputs */ + : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]), + "c" ((width/4)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } else { + int y; + for (y = 0; y < height; y++) { + int dummy; + asm volatile(YUV411P_YUY2 + : "=c" (dummy) + : "S" (src[0]+y*width), "a" (src[1]+y*(width/4)), + "d" (src[2]+y*(width/4)), "D" (dest[0]+y*width*2), + "0" (width/4) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } + } + return 1; +} + +static int yuv422p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 1)) { + asm(YUV42XP_YUY2 + : /* no outputs */ + : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]), + "c" ((width/2)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } else { + int y; + for (y = 0; y < height; y++) { + int dummy; + asm volatile(YUV42XP_YUY2 + : "=c" (dummy) + : "S" (src[0]+y*width), "a" (src[1]+y*(width/2)), + "d" (src[2]+y*(width/2)), "D" (dest[0]+y*width*2), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } + } + return 1; +} + +static int yuv444p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 1)) { + asm(YUV444P_YUY2 + : /* no outputs */ + : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]), + "c" ((width/2)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + } else { + int y; + for (y = 0; y < height; y++) { + int dummy; + asm volatile(YUV444P_YUY2 + : "=c" (dummy) + : "S" (src[0]+y*width), "a" (src[1]+y*(width/2)), + "d" (src[2]+y*(width/2)), "D" (dest[0]+y*width*2), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + } + } + return 1; +} + +/*************************************************************************/ + +static int yuy2_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + + for (y = 0; y < (height & ~1); y += 2) { + int dummy; + asm volatile(YUY2_YUV420P_U + : "=c" (dummy) + : "S" (src[0]+y*width*2), "a" (src[0]+(y+1)*width*2), + "D" (dest[0]+y*width), "d" (dest[1]+(y/2)*(width/2)), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + asm volatile(YUY2_YUV420P_V + : "=c" (dummy) + : "S" (src[0]+(y+1)*width*2), "a" (src[0]+y*width*2), + "D" (dest[0]+(y+1)*width), "d" (dest[2]+(y/2)*(width/2)), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + } + return 1; +} + +static int yuy2_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 3)) { + asm(YUY2_YUV411P + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]), + "c" ((width/4)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + } else { + int y; + for (y = 0; y < 
height; y++) { + int dummy; + asm volatile(YUY2_YUV411P + : "=c" (dummy) + : "S" (src[0]+y*width*2), "D" (dest[0]+y*width), + "a" (dest[1]+y*(width/4)), "d" (dest[2]+y*(width/4)), + "0" (width/4) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG, FAKE_PUSH_REG_2 +#endif + ); + } + } + return 1; +} + +static int yuy2_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 1)) { + asm(YUY2_YUV422P + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]), + "c" ((width/2)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } else { + int y; + for (y = 0; y < height; y++) { + int dummy; + asm volatile(YUY2_YUV422P + : "=c" (dummy) + : "S" (src[0]+y*width*2), "D" (dest[0]+y*width), + "a" (dest[1]+y*(width/2)), "d" (dest[2]+y*(width/2)), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } + } + return 1; +} + +static int yuy2_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!(width & 1)) { + asm(YUY2_YUV444P + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]), + "c" ((width/2)*height) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } else { + int y; + for (y = 0; y < height; y++) { + int dummy; + asm volatile(YUY2_YUV444P + : "=c" (dummy) + : "S" (src[0]+y*width*2), "D" (dest[0]+y*width), + "a" (dest[1]+y*width), "d" (dest[2]+y*width), + "0" (width/2) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); + } + } + return 1; +} + +/*************************************************************************/ + +static int y8_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm(Y8_YUY2 + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax" COMMA_FAKE_PUSH_REG + ); + return 1; +} + +static int y8_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm(Y8_UYVY + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax"); + return 1; +} + +static int yuy2_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm(YUY2_Y8 + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax"); + return 1; +} + +static int uyvy_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm(UYVY_Y8 + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height) + : "eax"); + return 1; +} + +/*************************************************************************/ + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization */ + +int ac_imgconvert_init_yuv_mixed(int accel) +{ + if (!register_conversion(IMG_YUV420P, IMG_YUY2, yuv420p_yuy2) + || !register_conversion(IMG_YUV411P, IMG_YUY2, yuv411p_yuy2) + || !register_conversion(IMG_YUV422P, IMG_YUY2, yuv422p_yuy2) + || !register_conversion(IMG_YUV444P, IMG_YUY2, yuv444p_yuy2) + || !register_conversion(IMG_Y8, IMG_YUY2, y8_yuy2) + || !register_conversion(IMG_YUV420P, IMG_UYVY, yuv420p_uyvy) + || !register_conversion(IMG_YUV411P, IMG_UYVY, yuv411p_uyvy) + || !register_conversion(IMG_YUV422P, IMG_UYVY, yuv422p_uyvy) + || !register_conversion(IMG_YUV444P, IMG_UYVY, yuv444p_uyvy) + || !register_conversion(IMG_Y8, IMG_UYVY, y8_uyvy) + || !register_conversion(IMG_YUV420P, IMG_YVYU, yuv420p_yvyu) + || !register_conversion(IMG_YUV411P, IMG_YVYU, yuv411p_yvyu) + || !register_conversion(IMG_YUV422P, IMG_YVYU, yuv422p_yvyu) + || !register_conversion(IMG_YUV444P, 
IMG_YVYU, yuv444p_yvyu) + || !register_conversion(IMG_Y8, IMG_YVYU, y8_yuy2) + + || !register_conversion(IMG_YUY2, IMG_YUV420P, yuy2_yuv420p) + || !register_conversion(IMG_YUY2, IMG_YUV411P, yuy2_yuv411p) + || !register_conversion(IMG_YUY2, IMG_YUV422P, yuy2_yuv422p) + || !register_conversion(IMG_YUY2, IMG_YUV444P, yuy2_yuv444p) + || !register_conversion(IMG_YUY2, IMG_Y8, yuy2_y8) + || !register_conversion(IMG_UYVY, IMG_YUV420P, uyvy_yuv420p) + || !register_conversion(IMG_UYVY, IMG_YUV411P, uyvy_yuv411p) + || !register_conversion(IMG_UYVY, IMG_YUV422P, uyvy_yuv422p) + || !register_conversion(IMG_UYVY, IMG_YUV444P, uyvy_yuv444p) + || !register_conversion(IMG_UYVY, IMG_Y8, uyvy_y8) + || !register_conversion(IMG_YVYU, IMG_YUV420P, yvyu_yuv420p) + || !register_conversion(IMG_YVYU, IMG_YUV411P, yvyu_yuv411p) + || !register_conversion(IMG_YVYU, IMG_YUV422P, yvyu_yuv422p) + || !register_conversion(IMG_YVYU, IMG_YUV444P, yvyu_yuv444p) + || !register_conversion(IMG_YVYU, IMG_Y8, yuy2_y8) + ) { + return 0; + } + +#if defined(HAVE_ASM_SSE2) + if (accel & AC_SSE2) { + if (!register_conversion(IMG_YUV420P, IMG_YUY2, yuv420p_yuy2_sse2) + || !register_conversion(IMG_YUV411P, IMG_YUY2, yuv411p_yuy2_sse2) + || !register_conversion(IMG_YUV422P, IMG_YUY2, yuv422p_yuy2_sse2) + || !register_conversion(IMG_YUV444P, IMG_YUY2, yuv444p_yuy2_sse2) + || !register_conversion(IMG_Y8, IMG_YUY2, y8_yuy2_sse2) + || !register_conversion(IMG_Y8, IMG_UYVY, y8_uyvy_sse2) + || !register_conversion(IMG_Y8, IMG_YVYU, y8_yuy2_sse2) + + || !register_conversion(IMG_YUY2, IMG_YUV420P, yuy2_yuv420p_sse2) + || !register_conversion(IMG_YUY2, IMG_YUV411P, yuy2_yuv411p_sse2) + || !register_conversion(IMG_YUY2, IMG_YUV422P, yuy2_yuv422p_sse2) + || !register_conversion(IMG_YUY2, IMG_YUV444P, yuy2_yuv444p_sse2) + || !register_conversion(IMG_YUY2, IMG_Y8, yuy2_y8_sse2) + || !register_conversion(IMG_UYVY, IMG_Y8, uyvy_y8_sse2) + || !register_conversion(IMG_YVYU, IMG_Y8, yuy2_y8_sse2) + ) { + return 0; + } + } +#endif /* HAVE_ASM_SSE2 */ + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_packed.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_packed.c new file mode 100644 index 00000000..05357405 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_packed.c @@ -0,0 +1,290 @@ +/* + * img_yuv_packed.c - YUV packed image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. 
+ */ + +#include "ac.h" +#include "imgconvert.h" +#include "img_internal.h" + +/*************************************************************************/ +/*************************************************************************/ + +/* Standard C implementations */ + +/*************************************************************************/ + +/* Identity transformation, works when src==dest */ +static int yuv16_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height*2); + return 1; +} + +/* Used for YUY2->UYVY and UYVY->YUY2, works when src==dest */ +static int yuv16_swap16(uint8_t **src, uint8_t **dest, int width, int height) +{ + uint16_t *srcp = (uint16_t *)src[0]; + uint16_t *destp = (uint16_t *)dest[0]; + int i; + for (i = 0; i < width*height; i++) + destp[i] = srcp[i]>>8 | srcp[i]<<8; + return 1; +} + +/* Used for YUY2->YVYU and YVYU->YUY2, works when src==dest */ +static int yuv16_swapuv(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height/2; i++) { + uint8_t tmp = src[0][i*4+1]; + dest[0][i*4 ] = src[0][i*4 ]; + dest[0][i*4+1] = src[0][i*4+3]; + dest[0][i*4+2] = src[0][i*4+2]; + dest[0][i*4+3] = tmp; + } + return 1; +} + +/*************************************************************************/ + +static int uyvy_yvyu(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height/2; i++) { + dest[0][i*4 ] = src[0][i*4+1]; + dest[0][i*4+1] = src[0][i*4+2]; + dest[0][i*4+2] = src[0][i*4+3]; + dest[0][i*4+3] = src[0][i*4 ]; + } + return 1; +} + +static int yvyu_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + for (i = 0; i < width*height/2; i++) { + dest[0][i*4 ] = src[0][i*4+3]; + dest[0][i*4+1] = src[0][i*4 ]; + dest[0][i*4+2] = src[0][i*4+1]; + dest[0][i*4+3] = src[0][i*4+2]; + } + return 1; +} + +/*************************************************************************/ +/*************************************************************************/ + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + +/* Common macros/data for x86 code */ +#define DEFINE_MASK_DATA +#include "img_x86_common.h" + +/*************************************************************************/ + +/* Basic assembly routines */ + +static int yuv16_swap16_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP16_2_X86(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yuv16_swapuv_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_X86(width*height/2); + return 1; +} + +static int uyvy_yvyu_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_X86(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yvyu_uyvy_x86(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_X86(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +/*************************************************************************/ + +/* MMX routines */ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. 
not x86_64 */ + +static int yuv16_swap16_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP16_2_MMX(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yuv16_swapuv_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_MMX(width*height/2); + return 1; +} + +static int uyvy_yvyu_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_MMX(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yvyu_uyvy_mmx(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_MMX(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + +/*************************************************************************/ + +/* SSE2 routines */ + +#if defined(HAVE_ASM_SSE2) + +static int yuv16_swap16_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP16_2_SSE2(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yuv16_swapuv_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_SWAP32_13_SSE2(width*height/2); + return 1; +} + +static int uyvy_yvyu_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROR32_SSE2(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +static int yvyu_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ASM_ROL32_SSE2(width*height/2); + if (width*height % 1) + ((uint16_t *)(dest[0]))[width*height-1] = + src[0][width*height*2-2]<<8 | src[0][width*height*2-1]; + return 1; +} + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ + +#endif /* ARCH_X86 || ARCH_X86_64 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization */ + +int ac_imgconvert_init_yuv_packed(int accel) +{ + if (!register_conversion(IMG_YUY2, IMG_YUY2, yuv16_copy) + || !register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16) + || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv) + + || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16) + || !register_conversion(IMG_UYVY, IMG_UYVY, yuv16_copy) + || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu) + + || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv) + || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy) + || !register_conversion(IMG_YVYU, IMG_YVYU, yuv16_copy) + ) { + return 0; + } + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + if (accel & (AC_IA32ASM | AC_AMD64ASM)) { + if (!register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16_x86) + || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv_x86) + || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16_x86) + || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu_x86) + || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv_x86) + || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy_x86) + ) { + return 0; + } + } + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + if (accel & AC_MMX) { + if 
(!register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16_mmx) + || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv_mmx) + || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16_mmx) + || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu_mmx) + || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv_mmx) + || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy_mmx) + ) { + return 0; + } + } +#endif + +#if defined(HAVE_ASM_SSE2) + if (accel & AC_SSE2) { + if (!register_conversion(IMG_YUY2, IMG_UYVY, yuv16_swap16_sse2) + || !register_conversion(IMG_YUY2, IMG_YVYU, yuv16_swapuv_sse2) + || !register_conversion(IMG_UYVY, IMG_YUY2, yuv16_swap16_sse2) + || !register_conversion(IMG_UYVY, IMG_YVYU, uyvy_yvyu_sse2) + || !register_conversion(IMG_YVYU, IMG_YUY2, yuv16_swapuv_sse2) + || !register_conversion(IMG_YVYU, IMG_UYVY, yvyu_uyvy_sse2) + ) { + return 0; + } + } +#endif + +#endif /* ARCH_X86 || ARCH_X86_64 */ + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_planar.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_planar.c new file mode 100644 index 00000000..e510fa4a --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_planar.c @@ -0,0 +1,788 @@ +/* + * img_yuv_planar.c - YUV planar image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. 
+ */ + +#include "ac.h" +#include "imgconvert.h" +#include "img_internal.h" + +#include <string.h> + +/*************************************************************************/ +/*************************************************************************/ + +/* Standard C implementations */ + +/*************************************************************************/ + +/* Identity transformations */ + +static int yuv420p_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + ac_memcpy(dest[1], src[1], (width/2)*(height/2)); + ac_memcpy(dest[2], src[2], (width/2)*(height/2)); + return 1; +} + +static int yuv411p_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + ac_memcpy(dest[1], src[1], (width/4)*height); + ac_memcpy(dest[2], src[2], (width/4)*height); + return 1; +} + +static int yuv422p_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + ac_memcpy(dest[1], src[1], (width/2)*height); + ac_memcpy(dest[2], src[2], (width/2)*height); + return 1; +} + +static int yuv444p_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + ac_memcpy(dest[1], src[1], width*height); + ac_memcpy(dest[2], src[2], width*height); + return 1; +} + +static int y8_copy(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + return 1; +} + +/*************************************************************************/ + +static int yuv420p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + for (x = 0; x < (width/2 & ~1); x += 2) { + dest[1][y*(width/4)+x/2] = (src[1][(y/2)*(width/2)+x] + + src[1][(y/2)*(width/2)+x+1] + 1) / 2; + dest[2][y*(width/4)+x/2] = (src[2][(y/2)*(width/2)+x] + + src[2][(y/2)*(width/2)+x+1] + 1) / 2; + } + ac_memcpy(dest[1]+(y+1)*(width/4), dest[1]+y*(width/4), width/4); + ac_memcpy(dest[2]+(y+1)*(width/4), dest[2]+y*(width/4), width/4); + } + return 1; +} + +static int yuv420p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + ac_memcpy(dest[1]+(y )*(width/2), src[1]+(y/2)*(width/2), width/2); + ac_memcpy(dest[1]+(y+1)*(width/2), src[1]+(y/2)*(width/2), width/2); + ac_memcpy(dest[2]+(y )*(width/2), src[2]+(y/2)*(width/2), width/2); + ac_memcpy(dest[2]+(y+1)*(width/2), src[2]+(y/2)*(width/2), width/2); + } + return 1; +} + +static int yuv420p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y += 2) { + for (x = 0; x < width; x += 2) { + dest[1][y*width+x ] = + dest[1][y*width+x+1] = src[1][(y/2)*(width/2)+(x/2)]; + dest[2][y*width+x ] = + dest[2][y*width+x+1] = src[2][(y/2)*(width/2)+(x/2)]; + } + ac_memcpy(dest[1]+(y+1)*width, dest[1]+y*width, width); + ac_memcpy(dest[2]+(y+1)*width, dest[2]+y*width, width); + } + return 1; +} + +/*************************************************************************/ + +static int yuv411p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + for (x = 0; x < ((width/2) & ~1); x += 2) { + dest[1][(y/2)*(width/2)+x] = (src[1][y*(width/4)+x/2] + + 
src[1][(y+1)*(width/4)+x/2] + 1) / 2; + dest[2][(y/2)*(width/2)+x] = (src[2][y*(width/4)+x/2] + + src[2][(y+1)*(width/4)+x/2] + 1) / 2; + dest[1][(y/2)*(width/2)+x+1] = dest[1][(y/2)*(width/2)+x]; + dest[2][(y/2)*(width/2)+x+1] = dest[2][(y/2)*(width/2)+x]; + } + } + return 1; +} + +static int yuv411p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x < ((width/2) & ~1); x += 2) { + dest[1][y*(width/2)+x ] = src[1][y*(width/4)+x/2]; + dest[1][y*(width/2)+x+1] = src[1][y*(width/4)+x/2]; + dest[2][y*(width/2)+x ] = src[2][y*(width/4)+x/2]; + dest[2][y*(width/2)+x+1] = src[2][y*(width/4)+x/2]; + } + } + return 1; +} + +static int yuv411p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x < (width & ~3); x += 4) { + dest[1][y*width+x ] = src[1][y*(width/4)+x/4]; + dest[1][y*width+x+1] = src[1][y*(width/4)+x/4]; + dest[1][y*width+x+2] = src[1][y*(width/4)+x/4]; + dest[1][y*width+x+3] = src[1][y*(width/4)+x/4]; + dest[2][y*width+x ] = src[2][y*(width/4)+x/4]; + dest[2][y*width+x+1] = src[2][y*(width/4)+x/4]; + dest[2][y*width+x+2] = src[2][y*(width/4)+x/4]; + dest[2][y*width+x+3] = src[2][y*(width/4)+x/4]; + } + } + return 1; +} + +/*************************************************************************/ + +static int yuv422p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + for (x = 0; x < width/2; x++) { + dest[1][(y/2)*(width/2)+x] = (src[1][y*(width/2)+x] + + src[1][(y+1)*(width/2)+x] + 1) / 2; + dest[2][(y/2)*(width/2)+x] = (src[2][y*(width/2)+x] + + src[2][(y+1)*(width/2)+x] + 1) / 2; + } + } + return 1; +} + +static int yuv422p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x < ((width/2) & ~1); x += 2) { + dest[1][y*(width/4)+x/2] = (src[1][y*(width/2)+x] + + src[1][y*(width/2)+x+1] + 1) / 2; + dest[2][y*(width/4)+x/2] = (src[2][y*(width/2)+x] + + src[2][y*(width/2)+x+1] + 1) / 2; + } + } + return 1; +} + +static int yuv422p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x < (width & ~1); x += 2) { + dest[1][y*width+x ] = src[1][y*(width/2)+x/2]; + dest[1][y*width+x+1] = src[1][y*(width/2)+x/2]; + dest[2][y*width+x ] = src[2][y*(width/2)+x/2]; + dest[2][y*width+x+1] = src[2][y*(width/2)+x/2]; + } + } + return 1; +} + +/*************************************************************************/ + +static int yuv444p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + for (x = 0; x < (width & ~1); x += 2) { + dest[1][(y/2)*(width/2)+x/2] = (src[1][y*width+x] + + src[1][y*width+x+1] + + src[1][(y+1)*width+x] + + src[1][(y+1)*width+x+1] + 2) / 4; + dest[2][(y/2)*(width/2)+x/2] = (src[2][y*width+x] + + src[2][y*width+x+1] + + src[2][(y+1)*width+x] + + src[2][(y+1)*width+x+1] + 2) / 4; + } + } + return 1; +} + +static int yuv444p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x 
< (width & ~3); x += 4) { + dest[1][y*(width/4)+x/4] = (src[1][y*width+x] + + src[1][y*width+x+1] + + src[1][y*width+x+2] + + src[1][y*width+x+3] + 2) / 4; + dest[2][y*(width/4)+x/4] = (src[2][y*width+x] + + src[2][y*width+x+1] + + src[2][y*width+x+2] + + src[2][y*width+x+3] + 2) / 4; + } + } + return 1; +} + +static int yuv444p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + int x, y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y++) { + for (x = 0; x < (width & ~1); x += 2) { + dest[1][y*(width/2)+x/2] = (src[1][y*width+x] + + src[1][y*width+x+1] + 1) / 2; + dest[2][y*(width/2)+x/2] = (src[2][y*width+x] + + src[2][y*width+x+1] + 1) / 2; + } + } + return 1; +} + +/*************************************************************************/ + +/* We treat Y8 as a planar format */ + +static int yuvp_y8(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + return 1; +} + +static int y8_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + memset(dest[1], 128, (width/2)*(height/2)); + memset(dest[2], 128, (width/2)*(height/2)); + return 1; +} + +static int y8_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + memset(dest[1], 128, (width/4)*height); + memset(dest[2], 128, (width/4)*height); + return 1; +} + +static int y8_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + memset(dest[1], 128, (width/2)*height); + memset(dest[2], 128, (width/2)*height); + return 1; +} + +static int y8_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + memset(dest[1], 128, width*height); + memset(dest[2], 128, width*height); + return 1; +} + +/*************************************************************************/ +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE2) + +/* SSE2 routines. See comments in img_x86_common.h for why we don't bother + * unrolling the loops. */ + +/* Common macros/data for x86 code */ +#include "img_x86_common.h" + +/* Average 2 bytes horizontally (e.g. 422P->411P) (unit: 2 source bytes) */ +#define AVG_2H(src,dest,count) do { \ + int dummy; \ + asm volatile( \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" /* XMM7: 0x00FF*8 */ \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movzbl -2("ESI","ECX",2), %%eax \n\ + movzbl -1("ESI","ECX",2), %%edx \n\ + addl %%edx, %%eax \n\ + shrl $1, %%eax \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2),%%xmm0 #XMM0:FEDCBA9876543210 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: FEDCBA9876543210 \n\ + pand %%xmm7, %%xmm0 # XMM0: E C A 8 6 4 2 0 \n\ + psrlw $8, %%xmm1 # XMM1: F D B 9 7 5 3 1 \n\ + pavgw %%xmm1, %%xmm0 # XMM0: w v u t s r q p (avgs) \n\ + packuswb %%xmm0, %%xmm0 # XMM0: wvutsrqpwvutsrqp \n\ + movq %%xmm0, -8("EDI","ECX")", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src), "D" (dest), "0" (count) \ + : "eax", "edx"); \ +} while (0) + +/* Average 4 bytes horizontally (e.g. 
444P->411P) (unit: 4 source bytes) */ +#define AVG_4H(src,dest,count) do { \ + int dummy; \ + asm volatile( \ + "pcmpeqd %%xmm7, %%xmm7; psrld $24, %%xmm7;" /* XMM7: 0x000000FF*4 */ \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movzbl -4("ESI","ECX",4), %%eax \n\ + movzbl -3("ESI","ECX",4), %%edx \n\ + addl %%edx, %%eax \n\ + movzbl -2("ESI","ECX",4), %%edx \n\ + addl %%edx, %%eax \n\ + movzbl -1("ESI","ECX",4), %%edx \n\ + addl %%edx, %%eax \n\ + shrl $2, %%eax \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",4),%%xmm0 #XMM0:FEDCBA9876543210 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: FEDCBA9876543210 \n\ + movdqa %%xmm0, %%xmm2 # XMM2: FEDCBA9876543210 \n\ + movdqa %%xmm0, %%xmm3 # XMM3: FEDCBA9876543210 \n\ + pand %%xmm7, %%xmm0 # XMM0: C 8 4 0 \n\ + psrld $8, %%xmm1 # XMM1: FED BA9 765 321 \n\ + pand %%xmm7, %%xmm1 # XMM1: D 9 5 1 \n\ + psrld $16, %%xmm2 # XMM2: FE BA 76 32 \n\ + pand %%xmm7, %%xmm2 # XMM2: E A 6 2 \n\ + psrld $24, %%xmm3 # XMM3: F B 7 3 \n\ + pavgw %%xmm1, %%xmm0 # XMM0: C+D 8+9 4+5 0+1 (avgs) \n\ + pavgw %%xmm3, %%xmm2 # XMM2: E+F A+B 6+7 2+3 (avgs) \n\ + pavgw %%xmm2, %%xmm0 # XMM0: s r q p (avgs) \n\ + packuswb %%xmm0, %%xmm0 # XMM0: s r q p s r q p \n\ + packuswb %%xmm0, %%xmm0 # XMM0: srqpsrqpsrqpsrqp \n\ + movd %%xmm0, -4("EDI","ECX")", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src), "D" (dest), "0" (count) \ + : "eax", "edx"); \ +} while (0) + +/* Repeat 2 bytes horizontally (e.g. 422P->444P) (unit: 1 source byte) */ +#define REP_2H(src,dest,count) do { \ + int dummy; \ + asm volatile(SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movb -1("ESI","ECX"), %%al \n\ + movb %%al, %%ah \n\ + movw %%ax, -2("EDI","ECX",2)", \ + /* main_loop */ \ + "movq -8("ESI","ECX"), %%xmm0 # XMM0: 76543210 \n\ + punpcklbw %%xmm0, %%xmm0 # XMM0: 7766554433221100 \n\ + movdqu %%xmm0, -16("EDI","ECX",2)", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src), "D" (dest), "0" (count) \ + : "eax"); \ +} while (0) + +/* Repeat 4 bytes horizontally (e.g. 
411P->444P) (unit: 1 source byte) */ +#define REP_4H(src,dest,count) do { \ + int dummy; \ + asm volatile(SIMD_LOOP_WRAPPER( \ + /* blocksize */ 4, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ \ + "movzbl -1("ESI","ECX"), %%eax \n\ + movb %%al, %%ah \n\ + movl %%eax, %%edx \n\ + shll $16, %%eax \n\ + orl %%edx, %%eax \n\ + movl %%eax, -4("EDI","ECX",4)", \ + /* main_loop */ \ + "movd -4("ESI","ECX"), %%xmm0 # XMM0: 3210 \n\ + punpcklbw %%xmm0, %%xmm0 # XMM0: 33221100 \n\ + punpcklwd %%xmm0, %%xmm0 # XMM0: 3333222211110000 \n\ + movdqu %%xmm0, -16("EDI","ECX",4)", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src), "D" (dest), "0" (count) \ + : "eax", "edx"); \ +} while (0) + +/* Average 2 bytes vertically and double horizontally (411P->420P) + * (unit: 1 source byte) */ +#define AVG_411_420(src1,src2,dest,count) do { \ + int dummy; \ + asm volatile(SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "push "EBX, \ + /* pop_regs */ "pop "EBX, \ + /* small_loop */ \ + "movzbl -1("ESI","ECX"), %%eax \n\ + movzbl -1("EDX","ECX"), %%ebx \n\ + addl %%ebx, %%eax \n\ + shrl $1, %%eax \n\ + movb %%al, %%ah \n\ + movw %%ax, -2("EDI","ECX",2)", \ + /* main_loop */ \ + "movq -8("ESI","ECX"), %%xmm0 \n\ + movq -8("EDX","ECX"), %%xmm1 \n\ + pavgb %%xmm1, %%xmm0 \n\ + punpcklbw %%xmm0, %%xmm0 \n\ + movdqu %%xmm0, -16("EDI","ECX",2)", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src1), "d" (src2), "D" (dest), "0" (count) \ + : "eax"); \ +} while (0) + +/* Average 2 bytes vertically (422P->420P) (unit: 1 source byte) */ +#define AVG_422_420(src1,src2,dest,count) do { \ + int dummy; \ + asm volatile(SIMD_LOOP_WRAPPER( \ + /* blocksize */ 16, \ + /* push_regs */ "push "EBX, \ + /* pop_regs */ "pop "EBX, \ + /* small_loop */ \ + "movzbl -1("ESI","ECX"), %%eax \n\ + movzbl -1("EDX","ECX"), %%ebx \n\ + addl %%ebx, %%eax \n\ + shrl $1, %%eax \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX"), %%xmm0 \n\ + movdqu -16("EDX","ECX"), %%xmm1 \n\ + pavgb %%xmm1, %%xmm0 \n\ + movdqu %%xmm0, -16("EDI","ECX")", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src1), "d" (src2), "D" (dest), "0" (count) \ + : "eax"); \ +} while (0) + +/* Average 4 bytes, 2 horizontally and 2 vertically (444P->420P) + * (unit: 2 source bytes) */ +#define AVG_444_420(src1,src2,dest,count) do { \ + int dummy; \ + asm volatile( \ + "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" /* XMM7: 0x00FF*8 */ \ + SIMD_LOOP_WRAPPER( \ + /* blocksize */ 8, \ + /* push_regs */ "push "EBX, \ + /* pop_regs */ "pop "EBX, \ + /* small_loop */ \ + "movzbl -2("ESI","ECX",2), %%eax \n\ + movzbl -1("ESI","ECX",2), %%ebx \n\ + addl %%ebx, %%eax \n\ + movzbl -2("EDX","ECX",2), %%ebx \n\ + addl %%ebx, %%eax \n\ + movzbl -1("EDX","ECX",2), %%ebx \n\ + addl %%ebx, %%eax \n\ + shrl $2, %%eax \n\ + movb %%al, -1("EDI","ECX")", \ + /* main_loop */ \ + "movdqu -16("ESI","ECX",2), %%xmm0 \n\ + movdqu -16("EDX","ECX",2), %%xmm2 \n\ + movdqa %%xmm0, %%xmm1 \n\ + pand %%xmm7, %%xmm0 \n\ + psrlw $8, %%xmm1 \n\ + pavgw %%xmm1, %%xmm0 \n\ + movdqa %%xmm2, %%xmm3 \n\ + pand %%xmm7, %%xmm2 \n\ + psrlw $8, %%xmm3 \n\ + pavgw %%xmm3, %%xmm2 \n\ + pavgw %%xmm2, %%xmm0 \n\ + packuswb %%xmm0, %%xmm0 \n\ + movq %%xmm0, -8("EDI","ECX")", \ + /* emms */ "emms") \ + : "=c" (dummy) \ + : "S" (src1), "d" (src2), "D" (dest), "c" (count)); \ +} while (0) + +/*************************************************************************/ + +static int yuv420p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int 
height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + AVG_2H(src[1]+(y/2)*(width/2), dest[1]+y*(width/4), width/4); + ac_memcpy(dest[1]+(y+1)*(width/4), dest[1]+y*(width/4), width/4); + AVG_2H(src[2]+(y/2)*(width/2), dest[2]+y*(width/4), width/4); + ac_memcpy(dest[2]+(y+1)*(width/4), dest[2]+y*(width/4), width/4); + } + return 1; +} + +static int yuv420p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < height; y += 2) { + REP_2H(src[1]+(y/2)*(width/2), dest[1]+y*width, width/2); + ac_memcpy(dest[1]+(y+1)*width, dest[1]+y*width, width); + REP_2H(src[2]+(y/2)*(width/2), dest[2]+y*width, width/2); + ac_memcpy(dest[2]+(y+1)*width, dest[2]+y*width, width); + } + return 1; +} + +/*************************************************************************/ + +static int yuv411p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + AVG_411_420(src[1]+y*(width/4), src[1]+(y+1)*(width/4), + dest[1]+(y/2)*(width/2), width/4); + AVG_411_420(src[2]+y*(width/4), src[2]+(y+1)*(width/4), + dest[2]+(y/2)*(width/2), width/4); + } + return 1; +} + +static int yuv411p_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 3)) { + /* Fast version, no bytes at end of row to skip */ + REP_2H(src[1], dest[1], (width/4)*height); + REP_2H(src[2], dest[2], (width/4)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + REP_2H(src[1]+y*(width/4), dest[1]+y*(width/2), width/4); + REP_2H(src[2]+y*(width/4), dest[2]+y*(width/2), width/4); + } + } + return 1; +} + +static int yuv411p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 3)) { + /* Fast version, no bytes at end of row to skip */ + REP_4H(src[1], dest[1], (width/4)*height); + REP_4H(src[2], dest[2], (width/4)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + REP_4H(src[1]+y*(width/4), dest[1]+y*width, width/4); + REP_4H(src[2]+y*(width/4), dest[2]+y*width, width/4); + } + } + return 1; +} + +/*************************************************************************/ + +static int yuv422p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + AVG_422_420(src[1]+y*(width/2), src[1]+(y+1)*(width/2), + dest[1]+(y/2)*(width/2), width/2); + AVG_422_420(src[2]+y*(width/2), src[2]+(y+1)*(width/2), + dest[2]+(y/2)*(width/2), width/2); + } + return 1; +} + +static int yuv422p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 3)) { + /* Fast version, no bytes at end of row to skip */ + AVG_2H(src[1], dest[1], (width/4)*height); + AVG_2H(src[2], dest[2], (width/4)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + AVG_2H(src[1]+y*(width/2), dest[1]+y*(width/4), width/4); + AVG_2H(src[2]+y*(width/2), dest[2]+y*(width/4), width/4); + } + } + return 1; +} + +static int yuv422p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 1)) { + 
/* Fast version, no bytes at end of row to skip */ + REP_2H(src[1], dest[1], (width/2)*height); + REP_2H(src[2], dest[2], (width/2)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + REP_2H(src[1]+y*(width/2), dest[1]+y*width, width/2); + REP_2H(src[2]+y*(width/2), dest[2]+y*width, width/2); + } + } + return 1; +} + +/*************************************************************************/ + +static int yuv444p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int y; + ac_memcpy(dest[0], src[0], width*height); + for (y = 0; y < (height & ~1); y += 2) { + AVG_444_420(src[1]+y*width, src[1]+(y+1)*width, + dest[1]+(y/2)*(width/2), width/2); + AVG_444_420(src[2]+y*width, src[2]+(y+1)*width, + dest[2]+(y/2)*(width/2), width/2); + } + return 1; +} + +static int yuv444p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 3)) { + /* Fast version, no bytes at end of row to skip */ + AVG_4H(src[1], dest[1], (width/4)*height); + AVG_4H(src[2], dest[2], (width/4)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + AVG_4H(src[1]+y*width, dest[1]+y*(width/4), width/4); + AVG_4H(src[2]+y*width, dest[2]+y*(width/4), width/4); + } + } + return 1; +} + +static int yuv444p_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + ac_memcpy(dest[0], src[0], width*height); + if (!(width & 1)) { + /* Fast version, no bytes at end of row to skip */ + AVG_2H(src[1], dest[1], (width/2)*height); + AVG_2H(src[2], dest[2], (width/2)*height); + } else { + /* Slow version, loop through each row */ + int y; + for (y = 0; y < height; y++) { + AVG_2H(src[1]+y*width, dest[1]+y*(width/2), width/2); + AVG_2H(src[2]+y*width, dest[2]+y*(width/2), width/2); + } + } + return 1; +} + +/*************************************************************************/ + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization */ + +int ac_imgconvert_init_yuv_planar(int accel) +{ + if (!register_conversion(IMG_YUV420P, IMG_YUV420P, yuv420p_copy) + || !register_conversion(IMG_YUV420P, IMG_YUV411P, yuv420p_yuv411p) + || !register_conversion(IMG_YUV420P, IMG_YUV422P, yuv420p_yuv422p) + || !register_conversion(IMG_YUV420P, IMG_YUV444P, yuv420p_yuv444p) + || !register_conversion(IMG_YUV420P, IMG_Y8, yuvp_y8) + + || !register_conversion(IMG_YUV411P, IMG_YUV420P, yuv411p_yuv420p) + || !register_conversion(IMG_YUV411P, IMG_YUV411P, yuv411p_copy) + || !register_conversion(IMG_YUV411P, IMG_YUV422P, yuv411p_yuv422p) + || !register_conversion(IMG_YUV411P, IMG_YUV444P, yuv411p_yuv444p) + || !register_conversion(IMG_YUV411P, IMG_Y8, yuvp_y8) + + || !register_conversion(IMG_YUV422P, IMG_YUV420P, yuv422p_yuv420p) + || !register_conversion(IMG_YUV422P, IMG_YUV411P, yuv422p_yuv411p) + || !register_conversion(IMG_YUV422P, IMG_YUV422P, yuv422p_copy) + || !register_conversion(IMG_YUV422P, IMG_YUV444P, yuv422p_yuv444p) + || !register_conversion(IMG_YUV422P, IMG_Y8, yuvp_y8) + + || !register_conversion(IMG_YUV444P, IMG_YUV420P, yuv444p_yuv420p) + || !register_conversion(IMG_YUV444P, IMG_YUV411P, yuv444p_yuv411p) + || !register_conversion(IMG_YUV444P, IMG_YUV422P, yuv444p_yuv422p) + || !register_conversion(IMG_YUV444P, IMG_YUV444P, yuv444p_copy) + || !register_conversion(IMG_YUV444P, IMG_Y8, 
yuvp_y8) + + || !register_conversion(IMG_Y8, IMG_YUV420P, y8_yuv420p) + || !register_conversion(IMG_Y8, IMG_YUV411P, y8_yuv411p) + || !register_conversion(IMG_Y8, IMG_YUV422P, y8_yuv422p) + || !register_conversion(IMG_Y8, IMG_YUV444P, y8_yuv444p) + || !register_conversion(IMG_Y8, IMG_Y8, y8_copy) + ) { + return 0; + } + +#if defined(HAVE_ASM_SSE2) + if (accel & AC_SSE2) { + if (!register_conversion(IMG_YUV420P, IMG_YUV411P, yuv420p_yuv411p_sse2) + || !register_conversion(IMG_YUV420P, IMG_YUV444P, yuv420p_yuv444p_sse2) + + || !register_conversion(IMG_YUV411P, IMG_YUV420P, yuv411p_yuv420p_sse2) + || !register_conversion(IMG_YUV411P, IMG_YUV422P, yuv411p_yuv422p_sse2) + || !register_conversion(IMG_YUV411P, IMG_YUV444P, yuv411p_yuv444p_sse2) + + || !register_conversion(IMG_YUV422P, IMG_YUV420P, yuv422p_yuv420p_sse2) + || !register_conversion(IMG_YUV422P, IMG_YUV411P, yuv422p_yuv411p_sse2) + || !register_conversion(IMG_YUV422P, IMG_YUV444P, yuv422p_yuv444p_sse2) + + || !register_conversion(IMG_YUV444P, IMG_YUV420P, yuv444p_yuv420p_sse2) + || !register_conversion(IMG_YUV444P, IMG_YUV411P, yuv444p_yuv411p_sse2) + || !register_conversion(IMG_YUV444P, IMG_YUV422P, yuv444p_yuv422p_sse2) + ) { + return 0; + } + } +#endif /* ARCH_X86 || ARCH_X86_64 */ + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c new file mode 100644 index 00000000..9dc04fcb --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c @@ -0,0 +1,2410 @@ +/* + * img_yuv_rgb.c - YUV<->RGB image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "ac_internal.h" +#include "imgconvert.h" +#include "img_internal.h" + +#include <string.h> + +#define USE_LOOKUP_TABLES /* for YUV420P->RGB24 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Standard C implementations */ + +const int cY = 76309; +const int crV = 104597; +const int cgU = -25675; +const int cgV = -53279; +const int cbU = 132201; + +/*************************************************************************/ + +#ifdef USE_LOOKUP_TABLES +# define TABLE_SCALE 16 /* scale factor for Y */ +static int Ylutbase[768*TABLE_SCALE]; +static int *Ylut = Ylutbase+256*TABLE_SCALE; +static int rVlut[256]; +static int gUlut[256]; +static int gVlut[256]; +static int bUlut[256]; +static void yuv_create_tables(void) { + static int yuv_tables_created = 0; + if (!yuv_tables_created) { + int i; + for (i = -256*TABLE_SCALE; i < 512*TABLE_SCALE; i++) { + int v = ((cY*(i-16*TABLE_SCALE)/TABLE_SCALE) + 32768) >> 16; + Ylut[i] = v<0 ? 0 : v>255 ? 
255 : v; + } + for (i = 0; i < 256; i++) { + rVlut[i] = ((crV * (i-128)) * TABLE_SCALE + cY/2) / cY; + gUlut[i] = ((cgU * (i-128)) * TABLE_SCALE + cY/2) / cY; + gVlut[i] = ((cgV * (i-128)) * TABLE_SCALE + cY/2) / cY; + bUlut[i] = ((cbU * (i-128)) * TABLE_SCALE + cY/2) / cY; + } + yuv_tables_created = 1; + } +} +# define YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs) do { \ + int Y = src[0][y*width+x] * TABLE_SCALE; \ + int U = src[1][(uvofs)]; \ + int V = src[2][(uvofs)]; \ + dest[0][(y*width+x)*rgbsz+rofs] = Ylut[Y+rVlut[V]]; \ + dest[0][(y*width+x)*rgbsz+gofs] = Ylut[Y+gUlut[U]+gVlut[V]];\ + dest[0][(y*width+x)*rgbsz+bofs] = Ylut[Y+bUlut[U]]; \ +} while (0) +# define YUV2RGB_PACKED(yofs,uofs,vofs,rgbsz,rofs,gofs,bofs) do { \ + int Y = src[0][(y*width+x)*2+yofs] * TABLE_SCALE; \ + int U = src[0][(y*width+(x&~1))*2+uofs]; \ + int V = src[0][(y*width+(x&~1))*2+vofs]; \ + dest[0][(y*width+x)*rgbsz+rofs] = Ylut[Y+rVlut[V]]; \ + dest[0][(y*width+x)*rgbsz+gofs] = Ylut[Y+gUlut[U]+gVlut[V]];\ + dest[0][(y*width+x)*rgbsz+bofs] = Ylut[Y+bUlut[U]]; \ +} while (0) +#else /* !USE_LOOKUP_TABLES */ +# define yuv_create_tables() /*nothing*/ +# define YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs) do { \ + int Y = cY * (src[0][y*width+x] - 16); \ + int U = src[1][(uvofs)] - 128; \ + int V = src[2][(uvofs)] - 128; \ + int r = (Y + crV*V + 32768) >> 16; \ + int g = (Y + cgU*U + cgV*V + 32768) >> 16; \ + int b = (Y + cbU*U + 32768) >> 16; \ + dest[0][(y*width+x)*rgbsz+rofs] = r<0 ? 0 : r>255 ? 255 : r;\ + dest[0][(y*width+x)*rgbsz+gofs] = g<0 ? 0 : g>255 ? 255 : g;\ + dest[0][(y*width+x)*rgbsz+bofs] = b<0 ? 0 : b>255 ? 255 : b;\ +} while (0) +# define YUV2RGB_PACKED(yofs,uofs,vofs,rgbsz,rofs,gofs,bofs) do { \ + int Y = cY * (src[0][(y*width+x)*2+yofs] - 16); \ + int U = src[0][(y*width+(x&~1))*2+uofs] - 128; \ + int V = src[0][(y*width+(x&~1))*2+vofs] - 128; \ + int r = (Y + crV*V + 32768) >> 16; \ + int g = (Y + cgU*U + cgV*V + 32768) >> 16; \ + int b = (Y + cbU*U + 32768) >> 16; \ + dest[0][(y*width+x)*rgbsz+rofs] = r<0 ? 0 : r>255 ? 255 : r;\ + dest[0][(y*width+x)*rgbsz+gofs] = g<0 ? 0 : g>255 ? 255 : g;\ + dest[0][(y*width+x)*rgbsz+bofs] = b<0 ? 0 : b>255 ? 
255 : b;\ +} while (0) +#endif + +#define YUV2RGB_420P(s,r,g,b) YUV2RGB((y/2)*(width/2)+(x/2),s,r,g,b) +#define YUV2RGB_411P(s,r,g,b) YUV2RGB((y )*(width/4)+(x/4),s,r,g,b) +#define YUV2RGB_422P(s,r,g,b) YUV2RGB((y )*(width/2)+(x/2),s,r,g,b) +#define YUV2RGB_444P(s,r,g,b) YUV2RGB((y )*(width )+(x ),s,r,g,b) +#define YUV2RGB_YUY2(s,r,g,b) YUV2RGB_PACKED(0,1,3, s,r,g,b) +#define YUV2RGB_UYVY(s,r,g,b) YUV2RGB_PACKED(1,0,2, s,r,g,b) +#define YUV2RGB_YVYU(s,r,g,b) YUV2RGB_PACKED(0,3,1, s,r,g,b) + +#define DEFINE_YUV2RGB(name,op) \ +static int name(uint8_t **src, uint8_t **dest, int width, int height) \ +{ \ + int x, y; \ + \ + yuv_create_tables(); \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < width; x++) { \ + op; \ + } \ + } \ + return 1; \ +} + +#define DEFINE_YUV2RGB_SET(rgb,rgbsz,rofs,gofs,bofs) \ + DEFINE_YUV2RGB(yuv420p_##rgb, YUV2RGB_420P(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(yuv411p_##rgb, YUV2RGB_411P(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(yuv422p_##rgb, YUV2RGB_422P(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(yuv444p_##rgb, YUV2RGB_444P(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(yuy2_##rgb, YUV2RGB_YUY2(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(uyvy_##rgb, YUV2RGB_UYVY(rgbsz,rofs,gofs,bofs)) \ + DEFINE_YUV2RGB(yvyu_##rgb, YUV2RGB_YVYU(rgbsz,rofs,gofs,bofs)) + +DEFINE_YUV2RGB_SET(rgb24, 3,0,1,2) +DEFINE_YUV2RGB_SET(bgr24, 3,2,1,0) +DEFINE_YUV2RGB_SET(rgba32, 4,0,1,2) +DEFINE_YUV2RGB_SET(abgr32, 4,3,2,1) +DEFINE_YUV2RGB_SET(argb32, 4,1,2,3) +DEFINE_YUV2RGB_SET(bgra32, 4,2,1,0) + +/* Y8->RGB is defined as part of grayscale stuff below */ + +/*************************************************************************/ + +#define RGB2Y() \ + (dest[0][y*width+x] = ((16829*r + 33039*g + 6416*b + 32768) >> 16) + 16) +#define RGB2U(uvofs) \ + (dest[1][(uvofs)] = ((-9714*r - 19070*g + 28784*b + 32768) >> 16) + 128) +#define RGB2V(uvofs) \ + (dest[2][(uvofs)] = ((28784*r - 24103*g - 4681*b + 32768) >> 16) + 128) +#define RGB2Y_PACKED(ofs) \ + (dest[0][(y*width+x)*2+(ofs)] = ((16829*r + 33039*g + 6416*b + 32768) >> 16) + 16) +#define RGB2U_PACKED(ofs) \ + (dest[0][(y*width+x)*2+(ofs)] = ((-9714*r - 19070*g + 28784*b + 32768) >> 16) + 128) +#define RGB2V_PACKED(ofs) \ + (dest[0][(y*width+x)*2+(ofs)] = ((28784*r - 24103*g - 4681*b + 32768) >> 16) + 128) + +#define RGB2YUV(utest,vtest,uvofs) \ + RGB2Y(); if (utest) RGB2U(uvofs); if (vtest) RGB2V(uvofs) +#define RGB2YUV_PACKED(utest,vtest,yofs,uvofs) \ + RGB2Y_PACKED(yofs); \ + if (utest) RGB2U_PACKED(uvofs); \ + if (vtest) RGB2V_PACKED(uvofs) +/* YUV420P: take Cb/Cr from opposite corners */ +#define RGB2YUV_420P RGB2YUV(!((x|y) & 1), (x&y) & 1, (y/2)*(width/2)+(x/2)) +/* YUV411P: take Cb/Cr from points 2 pixels apart */ +#define RGB2YUV_411P RGB2YUV(!(x & 3), !((x^2) & 3), y*(width/4)+(x/4)) +/* YUV422P: take Cb/Cr from adjacent pixels */ +#define RGB2YUV_422P RGB2YUV(!(x & 1), x & 1, y*(width/2)+(x/2)) +/* YUV444P: every pixel is sampled */ +#define RGB2YUV_444P RGB2YUV(1, 1, y*width+x) +/* YUY2/UYVY/YVYU: take Cb/Cr from the corresponding pixel */ +#define RGB2YUV_YUY2 RGB2YUV_PACKED(!(x & 1), x & 1, 0,1) +#define RGB2YUV_UYVY RGB2YUV_PACKED(!(x & 1), x & 1, 1,0) +#define RGB2YUV_YVYU RGB2YUV_PACKED(x & 1, !(x & 1), 0,1) + +#define DEFINE_RGB2YUV(name,rgbsz,rofs,gofs,bofs,op) \ +static int name(uint8_t **src, uint8_t **dest, int width, int height) \ +{ \ + int x, y; \ + \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < width; x++) { \ + int r = src[0][(y*width+x)*rgbsz+rofs]; \ + int g = src[0][(y*width+x)*rgbsz+gofs]; \ + int b = 
src[0][(y*width+x)*rgbsz+bofs]; \ + op; \ + } \ + } \ + return 1; \ +} + +#define DEFINE_RGB2Y8(name,rgbsz,rofs,gofs,bofs) \ +static int name(uint8_t **src, uint8_t **dest, int width, int height) \ +{ \ + int x, y; \ + \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < width; x++) { \ + int r = src[0][(y*width+x)*rgbsz+rofs]; \ + int g = src[0][(y*width+x)*rgbsz+gofs]; \ + int b = src[0][(y*width+x)*rgbsz+bofs]; \ + RGB2Y(); \ + } \ + } \ + return 1; \ +} + +#define DEFINE_RGB2YUV_SET(rgb,rgbsz,rofs,gofs,bofs) \ + DEFINE_RGB2YUV(rgb##_yuv420p, rgbsz,rofs,gofs,bofs, RGB2YUV_420P) \ + DEFINE_RGB2YUV(rgb##_yuv411p, rgbsz,rofs,gofs,bofs, RGB2YUV_411P) \ + DEFINE_RGB2YUV(rgb##_yuv422p, rgbsz,rofs,gofs,bofs, RGB2YUV_422P) \ + DEFINE_RGB2YUV(rgb##_yuv444p, rgbsz,rofs,gofs,bofs, RGB2YUV_444P) \ + DEFINE_RGB2YUV(rgb##_yuy2, rgbsz,rofs,gofs,bofs, RGB2YUV_YUY2) \ + DEFINE_RGB2YUV(rgb##_uyvy, rgbsz,rofs,gofs,bofs, RGB2YUV_UYVY) \ + DEFINE_RGB2YUV(rgb##_yvyu, rgbsz,rofs,gofs,bofs, RGB2YUV_YVYU) \ + DEFINE_RGB2Y8 (rgb##_y8, rgbsz,rofs,gofs,bofs) + +DEFINE_RGB2YUV_SET(rgb24, 3,0,1,2) +DEFINE_RGB2YUV_SET(bgr24, 3,2,1,0) +DEFINE_RGB2YUV_SET(rgba32, 4,0,1,2) +DEFINE_RGB2YUV_SET(abgr32, 4,3,2,1) +DEFINE_RGB2YUV_SET(argb32, 4,1,2,3) +DEFINE_RGB2YUV_SET(bgra32, 4,2,1,0) + +/*************************************************************************/ + +/* All YUV planar formats convert to grayscale the same way */ + +#ifdef USE_LOOKUP_TABLES +static uint8_t graylut[2][256]; +static int graylut_created = 0; +static void gray8_create_tables(void) +{ + if (!graylut_created) { + int i; + for (i = 0; i < 256; i++) { + if (i <= 16) + graylut[0][i] = 0; + else if (i >= 235) + graylut[0][i] = 255; + else + graylut[0][i] = (i-16) * 255 / 219; + graylut[1][i] = 16 + i*219/255; + } + graylut_created = 1; + } +} +# define Y2GRAY(val) (graylut[0][(val)]) +# define GRAY2Y(val) (graylut[1][(val)]) +#else +# define gray8_create_tables() /*nothing*/ +# define Y2GRAY(val) ((val)<16 ? 0 : (val)>=235 ? 
255 : ((val)-16)*256/219) +# define GRAY2Y(val) (16 + (val)*219/255) +#endif + +static int yuvp_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i] = Y2GRAY(src[0][i]); + return 1; +} + +static int yuy2_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i] = Y2GRAY(src[0][i*2]); + return 1; +} + +static int uyvy_gray8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i] = Y2GRAY(src[0][i*2+1]); + return 1; +} + +/*************************************************************************/ + +static int gray8_y8(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i] = GRAY2Y(src[0][i]); + return 1; +} + +static int gray8_yuv420p(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height)) + return 0; + memset(dest[1], 128, (width/2)*(height/2)); + memset(dest[2], 128, (width/2)*(height/2)); + return 1; +} + +static int gray8_yuv411p(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height)) + return 0; + memset(dest[1], 128, (width/4)*height); + memset(dest[2], 128, (width/4)*height); + return 1; +} + +static int gray8_yuv422p(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height)) + return 0; + memset(dest[1], 128, (width/2)*height); + memset(dest[2], 128, (width/2)*height); + return 1; +} + +static int gray8_yuv444p(uint8_t **src, uint8_t **dest, int width, int height) +{ + if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height)) + return 0; + memset(dest[1], 128, width*height); + memset(dest[2], 128, width*height); + return 1; +} + +static int gray8_yuy2(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) { + dest[0][i*2 ] = GRAY2Y(src[0][i]); + dest[0][i*2+1] = 128; + } + return 1; +} + +static int gray8_uyvy(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) { + dest[0][i*2 ] = 128; + dest[0][i*2+1] = GRAY2Y(src[0][i]); + } + return 1; +} + +/*************************************************************************/ + +/* We only need 3 functions for Y8->RGB (no difference between RGB and BGR) */ + +static int y8_rgb24(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i*3] = dest[0][i*3+1] = dest[0][i*3+2] = Y2GRAY(src[0][i]); + return 1; +} + +static int y8_rgba32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i*4] = dest[0][i*4+1] = dest[0][i*4+2] = Y2GRAY(src[0][i]); + return 1; +} + +static int y8_argb32(uint8_t **src, uint8_t **dest, int width, int height) +{ + int i; + gray8_create_tables(); + for (i = 0; i < width*height; i++) + dest[0][i*4+1] = dest[0][i*4+2] = dest[0][i*4+3] = Y2GRAY(src[0][i]); + return 1; +} + +/*************************************************************************/ +/*************************************************************************/ + +/* Accelerated versions of 
colorspace routines. */ + +/* Common constant values used in routines: */ + +#if defined(HAVE_ASM_MMX) + +#include "img_x86_common.h" + +static const struct { uint16_t n[72]; } __attribute__((aligned(16))) yuv_data = {{ + 0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF, /* for odd/even */ + 0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010, /* for Y -16 */ + 0x0080,0x0080,0x0080,0x0080,0x0080,0x0080,0x0080,0x0080, /* for U/V -128 */ + 0x2543,0x2543,0x2543,0x2543,0x2543,0x2543,0x2543,0x2543, /* Y constant */ + 0x3313,0x3313,0x3313,0x3313,0x3313,0x3313,0x3313,0x3313, /* rV constant */ + 0xF377,0xF377,0xF377,0xF377,0xF377,0xF377,0xF377,0xF377, /* gU constant */ + 0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC, /* gV constant */ + 0x408D,0x408D,0x408D,0x408D,0x408D,0x408D,0x408D,0x408D, /* bU constant */ + 0x0008,0x0008,0x0008,0x0008,0x0008,0x0008,0x0008,0x0008, /* for rounding */ +}}; +/* Note that G->Y exceeds 0x7FFF, so be careful to treat it as unsigned + * (the rest of the values are signed) */ +static const struct { uint16_t n[96]; } __attribute__((aligned(16))) rgb_data = {{ + 0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD, /* R->Y */ + 0x810F,0x810F,0x810F,0x810F,0x810F,0x810F,0x810F,0x810F, /* G->Y */ + 0x1910,0x1910,0x1910,0x1910,0x1910,0x1910,0x1910,0x1910, /* B->Y */ + 0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E, /* R->U */ + 0xB582,0xB582,0xB582,0xB582,0xB582,0xB582,0xB582,0xB582, /* G->U */ + 0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070, /* B->U */ + 0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070, /* R->V */ + 0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9, /* G->V */ + 0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7, /* B->V */ + 0x0420,0x0420,0x0420,0x0420,0x0420,0x0420,0x0420,0x0420, /* Y +16.5 */ + 0x2020,0x2020,0x2020,0x2020,0x2020,0x2020,0x2020,0x2020, /* U/V +128.5 */ + 0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF, /* for odd/even */ +}}; +#define Y_GRAY 0x4A85 +#define GRAY_Y 0x36F7 +static const struct { uint16_t n[32]; } __attribute__((aligned(16))) gray_data = {{ + Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY, /* 255/219 */ + GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y, /* 219/255 */ + 0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010, /* Y +/-16 */ + 0x00FF,0xFF00,0x0000,0x00FF,0xFF00,0x0000,0x0000,0x0000, /* for Y->RGB */ +}}; + +/* Convert 4 RGB32 pixels in EAX/EBX/ECX/EDX to RGB24 in EAX/EBX/ECX */ +#define IA32_RGB32_TO_RGB24 \ + "movl %%ebx, %%esi # ESI: 00 B1 G1 R1 \n\ + shll $24, %%esi # ESI: R1 00 00 00 \n\ + shrl $8, %%ebx # EBX: 00 00 B1 G1 \n\ + orl %%esi, %%eax # EAX: R1 B0 G0 R0 \n\ + movl %%ecx, %%esi # ESI: 00 B2 G2 R2 \n\ + shll $16, %%esi # ESI: G2 R2 00 00 \n\ + shrl $16, %%ecx # ECX: 00 00 00 B2 \n\ + shll $8, %%edx # EDX: B3 G3 R3 00 \n\ + orl %%esi, %%ebx # EBX: G2 R2 B1 G1 \n\ + orl %%edx, %%ecx # ECX: B3 G3 R3 B2 \n" + +/* Convert 4 RGB24 pixels in EAX/EBX/ECX to RGB32 in EAX/EBX/ECX/EDX */ +#define IA32_RGB24_TO_RGB32 \ + "movl %%ecx, %%edx # EDX: B3 G3 R3 B2 \n\ + shrl $8, %%edx # EDX: 00 B3 G3 R3 \n\ + andl $0xFF, %%ecx # ECX: 00 00 00 B2 \n\ + movl %%ebx, %%edi # EDI: G2 R2 B1 G1 \n\ + andl $0xFFFF0000, %%edi # EDI: G2 R2 00 00 \n\ + orl %%edi, %%ecx # ECX: G2 R2 00 B2 \n\ + rorl $16, %%ecx # ECX: 00 B2 G2 R2 \n\ + movl %%eax, %%edi # EDI: R1 B0 G0 R0 \n\ + andl $0xFF000000, %%edi # EDI: R1 00 00 00 \n\ + andl $0x0000FFFF, %%ebx # EBX: 00 00 B1 G1 \n\ + orl %%edi, %%ebx # EBX: R1 00 B1 G1 \n\ + roll $8, %%ebx # EBX: 00 B1 
G1 R1 \n\ + andl $0x00FFFFFF, %%eax # EAX: 00 B0 G0 R0 \n" + +#endif /* HAVE_ASM_MMX */ + +/*************************************************************************/ +/*************************************************************************/ + +/* MMX routines */ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */ + +static inline void mmx_yuv42Xp_to_rgb(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV); +#define mmx_yuv420p_to_rgb mmx_yuv42Xp_to_rgb +#define mmx_yuv422p_to_rgb mmx_yuv42Xp_to_rgb +static inline void mmx_store_rgb24(uint8_t *dest); +static inline void mmx_store_bgr24(uint8_t *dest); +static inline void mmx_store_rgba32(uint8_t *dest); +static inline void mmx_store_abgr32(uint8_t *dest); +static inline void mmx_store_argb32(uint8_t *dest); +static inline void mmx_store_bgra32(uint8_t *dest); + +#define DEFINE_YUV2RGB_MMX(yuv,rgb,uvofs,rgbsz,rofs,gofs,bofs) \ +static int yuv##_##rgb##_mmx(uint8_t **src, uint8_t **dest, \ + int width, int height) \ +{ \ + int x, y; \ + \ + yuv_create_tables(); \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < (width & ~7); x += 8) { \ + mmx_##yuv##_to_rgb(src[0]+y*width+x, \ + src[1]+(uvofs), src[2]+(uvofs)); \ + mmx_store_##rgb(dest[0]+(y*width+x)*rgbsz); \ + } \ + while (x < width) { \ + YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs); \ + x++; \ + } \ + } \ + asm("emms"); \ + return 1; \ +} + +#define DEFINE_YUV2RGB_MMX_SET(rgb,rgbsz,rofs,gofs,bofs) \ + DEFINE_YUV2RGB_MMX(yuv420p,rgb,(y/2)*(width/2)+(x/2),rgbsz,rofs,gofs,bofs)\ + DEFINE_YUV2RGB_MMX(yuv422p,rgb,(y )*(width/2)+(x/2),rgbsz,rofs,gofs,bofs) + +DEFINE_YUV2RGB_MMX_SET(rgb24, 3,0,1,2) +DEFINE_YUV2RGB_MMX_SET(bgr24, 3,2,1,0) +DEFINE_YUV2RGB_MMX_SET(rgba32, 4,0,1,2) +DEFINE_YUV2RGB_MMX_SET(abgr32, 4,3,2,1) +DEFINE_YUV2RGB_MMX_SET(argb32, 4,1,2,3) +DEFINE_YUV2RGB_MMX_SET(bgra32, 4,2,1,0) + +/************************************/ + +static inline void mmx_yuv42Xp_to_rgb(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV) +{ + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%mm4, %%mm4 # MM4: 00 00 00 00 00 00 00 00 \n\ + movq ("EAX"), %%mm6 # MM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movd ("ECX"), %%mm2 # MM2: U3 U2 U1 U0 \n\ + movd ("EDX"), %%mm3 # MM3: V3 V2 V1 V0 \n\ + movq %%mm6, %%mm7 # MM7: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + pand ("ESI"), %%mm6 # MM6: -Y6- -Y4- -Y2- -Y0- \n\ + psrlw $8, %%mm7 # MM7: -Y7- -Y5- -Y3- -Y1- \n\ + punpcklbw %%mm4, %%mm2 # MM2: -U3- -U2- -U1- -U0- \n\ + punpcklbw %%mm4, %%mm3 # MM3: -V3- -V2- -V1- -V0- \n\ + psubw 16("ESI"), %%mm6 # MM6: subtract 16 \n\ + psubw 16("ESI"), %%mm7 # MM7: subtract 16 \n\ + psubw 32("ESI"), %%mm2 # MM2: subtract 128 \n\ + psubw 32("ESI"), %%mm3 # MM3: subtract 128 \n\ + psllw $7, %%mm6 # MM6: convert to fixed point 8.7 \n\ + psllw $7, %%mm7 # MM7: convert to fixed point 8.7 \n\ + psllw $7, %%mm2 # MM2: convert to fixed point 8.7 \n\ + psllw $7, %%mm3 # MM3: convert to fixed point 8.7 \n\ + # Multiply by constants \n\ + pmulhw 48("ESI"), %%mm6 # MM6: -cY6- -cY4- -cY2- -cY0- \n\ + pmulhw 48("ESI"), %%mm7 # MM6: -cY7- -cY5- -cY3- -cY1- \n\ + movq 80("ESI"), %%mm4 # MM4: gU constant \n\ + movq 96("ESI"), %%mm5 # MM5: gV constant \n\ + pmulhw %%mm2, %%mm4 # MM4: -gU3- -gU2- -gU1- -gU0- \n\ + pmulhw %%mm3, %%mm5 # MM5: -gV3- -gV2- -gV1- -gV0- \n\ + paddw %%mm5, %%mm4 # MM4: -g3- -g2- -g1- -g0- \n\ + pmulhw 64("ESI"), %%mm3 # MM3: -r3- -r2- -r1- -r0- \n\ + pmulhw 112("ESI"),%%mm2 # MM2: -b3- -b2- -b1- -b0- \n\ + movq %%mm3, %%mm0 # MM0: -r3- -r2- -r1- -r0- \n\ + movq %%mm4, %%mm1 # MM1: -g3- -g2- -g1- -g0- \n\ + movq %%mm2, 
%%mm5 # MM5: -b3- -b2- -b1- -b0- \n\ + # Add intermediate results and round/shift to get R/G/B values \n\ + paddw 128("ESI"), %%mm6 # Add rounding value (0.5 @ 8.4 fixed) \n\ + paddw 128("ESI"), %%mm7 \n\ + paddw %%mm6, %%mm0 # MM0: -R6- -R4- -R2- -R0- \n\ + psraw $4, %%mm0 # Shift back to 8.0 fixed \n\ + paddw %%mm6, %%mm1 # MM1: -G6- -G4- -G2- -G0- \n\ + psraw $4, %%mm1 \n\ + paddw %%mm6, %%mm2 # MM2: -B6- -B4- -B2- -B0- \n\ + psraw $4, %%mm2 \n\ + paddw %%mm7, %%mm3 # MM3: -R7- -R5- -R3- -R1- \n\ + psraw $4, %%mm3 \n\ + paddw %%mm7, %%mm4 # MM4: -G7- -G5- -G3- -G1- \n\ + psraw $4, %%mm4 \n\ + paddw %%mm7, %%mm5 # MM5: -B7- -B5- -B3- -B1- \n\ + psraw $4, %%mm5 \n\ + # Saturate to 0-255 and pack into bytes \n\ + packuswb %%mm0, %%mm0 # MM0: R6 R4 R2 R0 R6 R4 R2 R0 \n\ + packuswb %%mm1, %%mm1 # MM1: G6 G4 G2 G0 G6 G4 G2 G0 \n\ + packuswb %%mm2, %%mm2 # MM2: B6 B4 B2 B0 B6 B4 B2 B0 \n\ + packuswb %%mm3, %%mm3 # MM3: R7 R5 R3 R1 R7 R5 R3 R1 \n\ + packuswb %%mm4, %%mm4 # MM4: G7 G5 G3 G1 G7 G5 G3 G1 \n\ + packuswb %%mm5, %%mm5 # MM5: B7 B5 B3 B1 B7 B5 B3 B1 \n\ + punpcklbw %%mm3, %%mm0 # MM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + punpcklbw %%mm4, %%mm1 # MM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + punpcklbw %%mm5, %%mm2 # MM2: B7 B6 B5 B4 B3 B2 B1 B0 \n" + : /* no outputs */ + : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data) + ); +} + +/************************************/ + +/* Convert YUV->RGB output to RGBA pixels in MM0..MM3 */ +#define MMX_RGB_TO_RGBA "\ + pxor %%mm7, %%mm7 # MM7: 00 00 00 00 00 00 00 00 \n\ + movq %%mm0, %%mm3 # MM3: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + movq %%mm1, %%mm4 # MM4: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + movq %%mm2, %%mm5 # MM5: B7 B6 B5 B4 B3 B2 B1 B0 \n\ + punpcklbw %%mm1, %%mm0 # MM0: G3 R3 G2 R2 G1 R1 G0 R0 \n\ + punpcklbw %%mm7, %%mm2 # MM2: 00 B3 00 B2 00 B1 00 B0 \n\ + movq %%mm0, %%mm1 # MM1: G3 R3 G2 R2 G1 R1 G0 R0 \n\ + punpcklwd %%mm2, %%mm0 # MM0: 00 B1 G1 R1 00 B0 G0 R0 \n\ + punpckhwd %%mm2, %%mm1 # MM1: 00 B3 G3 R3 00 B2 G2 R2 \n\ + punpckhbw %%mm4, %%mm3 # MM3: G7 R7 G6 R6 G5 R5 G4 R4 \n\ + punpckhbw %%mm7, %%mm5 # MM5: 00 B7 00 B6 00 B5 00 B4 \n\ + movq %%mm3, %%mm2 # MM2: G7 R7 G6 R6 G5 R5 G4 R4 \n\ + punpckhwd %%mm5, %%mm3 # MM3: 00 B7 G7 R7 00 B6 G6 R6 \n\ + punpcklwd %%mm5, %%mm2 # MM2: 00 B5 G5 R5 00 B4 G4 R4 \n" + +/* Convert YUV->RGB output to BGRA pixels in MM0..MM3 */ +#define MMX_RGB_TO_BGRA "\ + pxor %%mm7, %%mm7 # MM7: 00 00 00 00 00 00 00 00 \n\ + movq %%mm0, %%mm5 # MM5: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + movq %%mm1, %%mm4 # MM4: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + movq %%mm2, %%mm3 # MM3: B7 B6 B5 B4 B3 B2 B1 B0 \n\ + punpcklbw %%mm1, %%mm2 # MM2: G3 B3 G2 B2 G1 B1 G0 B0 \n\ + punpcklbw %%mm7, %%mm0 # MM0: 00 R3 00 R2 00 R1 00 R0 \n\ + movq %%mm2, %%mm1 # MM1: G3 B3 G2 B2 G1 B1 G0 B0 \n\ + punpcklwd %%mm0, %%mm2 # MM2: 00 R1 G1 B1 00 R0 G0 B0 \n\ + punpckhwd %%mm0, %%mm1 # MM1: 00 R3 G3 B3 00 R2 G2 B2 \n\ + movq %%mm2, %%mm0 # MM0: 00 R1 G1 B1 00 R0 G0 B0 \n\ + punpckhbw %%mm4, %%mm3 # MM3: G7 B7 G6 B6 G5 B5 G4 B4 \n\ + punpckhbw %%mm7, %%mm5 # MM5: 00 R7 00 R6 00 R5 00 R4 \n\ + movq %%mm3, %%mm2 # MM2: G7 B7 G6 B6 G5 B5 G4 B4 \n\ + punpckhwd %%mm5, %%mm3 # MM3: 00 R7 G7 B7 00 R6 G6 B6 \n\ + punpcklwd %%mm5, %%mm2 # MM2: 00 R5 G5 B5 00 R4 G4 B4 \n" + + +static inline void mmx_store_rgb24(uint8_t *dest) +{ + /* It looks like it's fastest to go to RGB32 first, then shift the + * result to merge the 24-bit pixels together. 
*/ + asm(MMX_RGB_TO_RGBA "\ + movq %%mm0, %%mm4 # MM4: 00 B1 G1 R1 00 B0 G0 R0 \n\ + movq %%mm1, %%mm5 # MM5: 00 B3 G3 R3 00 B2 G2 R2 \n\ + movq %%mm2, %%mm6 # MM6: 00 B5 G5 R5 00 B4 G4 R4 \n\ + movq %%mm3, %%mm7 # MM7: 00 B7 G7 R7 00 B6 G6 R6 \n\ + psrlq $32, %%mm4 # MM4: 00 00 00 00 00 B1 G1 R1 \n\ + psrlq $32, %%mm5 # MM5: 00 00 00 00 00 B3 G3 R3 \n\ + psrlq $32, %%mm6 # MM6: 00 00 00 00 00 B5 G5 R5 \n\ + psrlq $32, %%mm7 # MM7: 00 00 00 00 00 B7 G7 R7 \n\ + push "EBX" \n\ + movd %%mm0, %%eax # EAX: 00 B0 G0 R0 \n\ + movd %%mm4, %%ebx # EBX: 00 B1 G1 R1 \n\ + movd %%mm1, %%ecx # ECX: 00 B2 G2 R2 \n\ + movd %%mm5, %%edx # EDX: 00 B3 G3 R3 \n\ + "IA32_RGB32_TO_RGB24" \n\ + movl %%eax, ("EDI") \n\ + movl %%ebx, 4("EDI") \n\ + movl %%ecx, 8("EDI") \n\ + movd %%mm2, %%eax # EAX: 00 B4 G4 R4 \n\ + movd %%mm6, %%ebx # EBX: 00 B5 G5 R5 \n\ + movd %%mm3, %%ecx # ECX: 00 B6 G6 R6 \n\ + movd %%mm7, %%edx # EDX: 00 B7 G7 R7 \n\ + "IA32_RGB32_TO_RGB24" \n\ + movl %%eax, 12("EDI") \n\ + movl %%ebx, 16("EDI") \n\ + movl %%ecx, 20("EDI") \n\ + pop "EBX" \n" + : /* no outputs */ + : "D" (dest) + : "eax", "ecx", "edx", "esi" + ); +} + +static inline void mmx_store_bgr24(uint8_t *dest) +{ + asm(MMX_RGB_TO_BGRA "\ + movq %%mm0, %%mm4 # MM4: 00 B1 G1 R1 00 B0 G0 R0 \n\ + movq %%mm1, %%mm5 # MM5: 00 B3 G3 R3 00 B2 G2 R2 \n\ + movq %%mm2, %%mm6 # MM6: 00 B5 G5 R5 00 B4 G4 R4 \n\ + movq %%mm3, %%mm7 # MM7: 00 B7 G7 R7 00 B6 G6 R6 \n\ + psrlq $32, %%mm4 # MM4: 00 00 00 00 00 B1 G1 R1 \n\ + psrlq $32, %%mm5 # MM5: 00 00 00 00 00 B3 G3 R3 \n\ + psrlq $32, %%mm6 # MM6: 00 00 00 00 00 B5 G5 R5 \n\ + psrlq $32, %%mm7 # MM7: 00 00 00 00 00 B7 G7 R7 \n\ + push "EBX" \n\ + movd %%mm0, %%eax # EAX: 00 B0 G0 R0 \n\ + movd %%mm4, %%ebx # EBX: 00 B1 G1 R1 \n\ + movd %%mm1, %%ecx # ECX: 00 B2 G2 R2 \n\ + movd %%mm5, %%edx # EDX: 00 B3 G3 R3 \n\ + "IA32_RGB32_TO_RGB24" \n\ + movl %%eax, ("EDI") \n\ + movl %%ebx, 4("EDI") \n\ + movl %%ecx, 8("EDI") \n\ + movd %%mm2, %%eax # EAX: 00 B4 G4 R4 \n\ + movd %%mm6, %%ebx # EBX: 00 B5 G5 R5 \n\ + movd %%mm3, %%ecx # ECX: 00 B6 G6 R6 \n\ + movd %%mm7, %%edx # EDX: 00 B7 G7 R7 \n\ + "IA32_RGB32_TO_RGB24" \n\ + movl %%eax, 12("EDI") \n\ + movl %%ebx, 16("EDI") \n\ + movl %%ecx, 20("EDI") \n\ + pop "EBX" \n" + : /* no outputs */ + : "D" (dest) + : "eax", "ecx", "edx", "esi" + ); +} + +static inline void mmx_store_rgba32(uint8_t *dest) +{ + asm(MMX_RGB_TO_RGBA "\ + movq %%mm0, ("EDI") \n\ + movq %%mm1, 8("EDI") \n\ + movq %%mm2, 16("EDI") \n\ + movq %%mm3, 24("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void mmx_store_abgr32(uint8_t *dest) +{ + asm(MMX_RGB_TO_BGRA "\ + psllq $8, %%mm0 \n\ + psllq $8, %%mm1 \n\ + psllq $8, %%mm2 \n\ + psllq $8, %%mm3 \n\ + movq %%mm0, ("EDI") \n\ + movq %%mm1, 8("EDI") \n\ + movq %%mm2, 16("EDI") \n\ + movq %%mm3, 24("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void mmx_store_argb32(uint8_t *dest) +{ + asm(MMX_RGB_TO_RGBA "\ + psllq $8, %%mm0 \n\ + psllq $8, %%mm1 \n\ + psllq $8, %%mm2 \n\ + psllq $8, %%mm3 \n\ + movq %%mm0, ("EDI") \n\ + movq %%mm1, 8("EDI") \n\ + movq %%mm2, 16("EDI") \n\ + movq %%mm3, 24("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void mmx_store_bgra32(uint8_t *dest) +{ + asm(MMX_RGB_TO_BGRA "\ + movq %%mm0, ("EDI") \n\ + movq %%mm1, 8("EDI") \n\ + movq %%mm2, 16("EDI") \n\ + movq %%mm3, 24("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + 
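For reference, the arithmetic that the scalar YUV2RGB macros above and the MMX/SSE2 paths below both implement is the BT.601 "studio swing" conversion driven by the cY/crV/cgU/cgV/cbU constants declared near the top of this file. The following standalone sketch is editorial, not part of aclib, and the function names are illustrative only; it restates the same 16.16 fixed-point math for a single pixel so the SIMD register comments are easier to follow.

#include <stdint.h>

/* Clamp an intermediate result into the 0..255 byte range, as the
 * packuswb instructions do in the SIMD paths. */
static inline uint8_t clamp255(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* One-pixel YUV->RGB using the file's 16.16 fixed-point constants
 * (cY = 76309, crV = 104597, cgU = -25675, cgV = -53279, cbU = 132201).
 * The MMX/SSE2 routines compute exactly this, 8 or 16 pixels at a time. */
static inline void yuv_to_rgb_pixel(uint8_t y, uint8_t u, uint8_t v,
                                    uint8_t *r, uint8_t *g, uint8_t *b)
{
    int Y = 76309 * (y - 16);           /* cY * (Y - 16)          */
    int U = u - 128, V = v - 128;
    *r = clamp255((Y + 104597*V             + 32768) >> 16);  /* crV      */
    *g = clamp255((Y -  25675*U -  53279*V  + 32768) >> 16);  /* cgU, cgV */
    *b = clamp255((Y + 132201*U             + 32768) >> 16);  /* cbU      */
}

The reverse direction uses the coefficients from the RGB2Y/RGB2U/RGB2V macros, e.g. Y = ((16829*R + 33039*G + 6416*B + 32768) >> 16) + 16, which is what the SSE2_RGB2Y block later in the file evaluates in 8.6 fixed point.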
+/*************************************************************************/ +/*************************************************************************/ + +/* SSE2 routines */ + +#if defined(HAVE_ASM_SSE2) + +/*************************************************************************/ + +static inline void sse2_load_yuv420p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_yuv411p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_yuv422p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_yuv444p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_yuy2(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_uyvy(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_load_yvyu(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width); +static inline void sse2_yuv_to_rgb(void); +static inline void sse2_yuv444_to_rgb(void); +static inline void sse2_store_rgb24(uint8_t *dest); +static inline void sse2_store_bgr24(uint8_t *dest); +static inline void sse2_store_rgba32(uint8_t *dest); +static inline void sse2_store_abgr32(uint8_t *dest); +static inline void sse2_store_argb32(uint8_t *dest); +static inline void sse2_store_bgra32(uint8_t *dest); + +#define DEFINE_YUV2RGB_SSE2(yuv,y2r,rgb,rgbsz,slowop) \ +static int yuv##_##rgb##_sse2(uint8_t **src, uint8_t **dest, \ + int width, int height) \ +{ \ + int x, y; \ + \ + yuv_create_tables(); \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < (width & ~15); x += 16) { \ + sse2_load_##yuv(src[0], src[1], src[2], x, y, width); \ + sse2_##y2r(); \ + sse2_store_##rgb(dest[0] + (y*width+x)*rgbsz); \ + } \ + while (x < width) { \ + slowop; \ + x++; \ + } \ + } \ + asm("emms"); \ + return 1; \ +} + +#define DEFINE_YUV2RGB_SSE2_SET(rgb,sz,r,g,b) \ + DEFINE_YUV2RGB_SSE2(yuv420p, yuv_to_rgb, rgb,sz, YUV2RGB_420P(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(yuv411p, yuv_to_rgb, rgb,sz, YUV2RGB_411P(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(yuv422p, yuv_to_rgb, rgb,sz, YUV2RGB_422P(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(yuv444p, yuv444_to_rgb,rgb,sz, YUV2RGB_444P(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(yuy2, yuv_to_rgb, rgb,sz, YUV2RGB_YUY2(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(uyvy, yuv_to_rgb, rgb,sz, YUV2RGB_UYVY(sz,r,g,b))\ + DEFINE_YUV2RGB_SSE2(yvyu, yuv_to_rgb, rgb,sz, YUV2RGB_YVYU(sz,r,g,b)) + +DEFINE_YUV2RGB_SSE2_SET(rgb24, 3,0,1,2) +DEFINE_YUV2RGB_SSE2_SET(bgr24, 3,2,1,0) +DEFINE_YUV2RGB_SSE2_SET(rgba32, 4,0,1,2) +DEFINE_YUV2RGB_SSE2_SET(abgr32, 4,3,2,1) +DEFINE_YUV2RGB_SSE2_SET(argb32, 4,1,2,3) +DEFINE_YUV2RGB_SSE2_SET(bgra32, 4,2,1,0) + +/************************************/ + +static inline void sse2_load_yuv420p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += y*width+x; + srcU += (y/2)*(width/2)+(x/2); + srcV += (y/2)*(width/2)+(x/2); + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\ + movq ("ECX"), %%xmm2 # XMM2: U7.......U0 \n\ + movq ("EDX"), %%xmm3 # XMM3: V7.......V0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n\ + punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + 
punpcklbw %%xmm4,%%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n" + : /* no outputs */ + : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_yuv411p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += y*width+x; + srcU += y*(width/4)+(x/4); + srcV += y*(width/4)+(x/4); + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\ + movd ("ECX"), %%xmm2 # XMM2: U3.U0 \n\ + punpcklbw %%xmm2,%%xmm2 # XMM2: U3 U3.U0 U0 \n\ + movd ("EDX"), %%xmm3 # XMM3: V3.V0 \n\ + punpcklbw %%xmm3,%%xmm3 # XMM2: V3 V3.V0 V0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n\ + punpcklbw %%xmm4,%%xmm2 # XMM2: U3 U3 U2 U2 U1 U1 U0 U0 \n\ + punpcklbw %%xmm4,%%xmm3 # XMM3: V3 V3 V2 V2 V1 V1 V0 V0 \n" + : /* no outputs */ + : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_yuv422p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += y*width+x; + srcU += y*(width/2)+(x/2); + srcV += y*(width/2)+(x/2); + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\ + movq ("ECX"), %%xmm2 # XMM2: U7.......U0 \n\ + movq ("EDX"), %%xmm3 # XMM3: V7.......V0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n\ + punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + punpcklbw %%xmm4,%%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n" + : /* no outputs */ + : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_yuv444p(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += y*width+x; + srcU += y*width+x; + srcV += y*width+x; + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: YF...................Y0 \n\ + movdqu ("ECX"), %%xmm2 # XMM2: UF...................U0 \n\ + movdqu ("EDX"), %%xmm0 # XMM0: VF...................V0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + punpcklbw %%xmm4,%%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + punpckhbw %%xmm4,%%xmm7 # XMM7: YF YE YD YC YB YA Y9 Y8 \n\ + movdqa %%xmm2, %%xmm5 # XMM5: UF...................U0 \n\ + punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + punpckhbw %%xmm4,%%xmm5 # XMM5: UF UE UD UC UB UA U9 U8 \n\ + movdqa %%xmm0, %%xmm3 # XMM3: VF...................V0 \n\ + punpcklbw %%xmm4,%%xmm0 # XMM0: V7 V6 V5 V4 V3 V2 V1 V0 \n\ + punpckhbw %%xmm4,%%xmm3 # XMM3: VF VE VD VC VB VA V9 V8 \n" + : /* no outputs */ + : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_yuy2(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += (y*width+x)*2; + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: V3 Y7.............U0 Y0 \n\ + movdqu 16("EAX"),%%xmm7 # XMM7: V7 YF.............U4 Y8 \n\ + movdqa %%xmm6, %%xmm2 # XMM2: V3 Y7.............U0 Y0 \n\ + psrlw $8, %%xmm2 # 
XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + pand ("ESI"), %%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movdqa %%xmm7, %%xmm3 # XMM3: V7 YF.............U4 Y8 \n\ + psrlw $8, %%xmm3 # XMM3: V7 U7 V6 U6 V5 U5 V4 U4 \n\ + pand ("ESI"), %%xmm7 # XMM6: YF YE YD YC YB YA Y9 Y8 \n\ + packuswb %%xmm3, %%xmm2 # XMM2: V7 U7.............V0 U0 \n\ + movdqa %%xmm2, %%xmm3 # XMM3: V7 U7.............V0 U0 \n\ + pand ("ESI"), %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + psrlw $8, %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\ + packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n" + : /* no outputs */ + : "a" (srcY), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_uyvy(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += (y*width+x)*2; + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: Y7 V3.............Y0 00 \n\ + movdqu 16("EAX"),%%xmm7 # XMM7: YF V7.............Y8 U4 \n\ + movdqa %%xmm6, %%xmm2 # XMM2: Y7 V3.............Y0 U0 \n\ + pand ("ESI"), %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\ + psrlw $8, %%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movdqa %%xmm7, %%xmm3 # XMM3: YF V7.............Y8 U4 \n\ + pand ("ESI"), %%xmm3 # XMM3: V7 U7 V6 U6 V5 U5 V4 U4 \n\ + psrlw $8, %%xmm7 # XMM6: YF YE YD YC YB YA Y9 Y8 \n\ + packuswb %%xmm3, %%xmm2 # XMM2: V7 U7.............V0 U0 \n\ + movdqa %%xmm2, %%xmm3 # XMM3: V7 U7.............V0 U0 \n\ + pand ("ESI"), %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + psrlw $8, %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\ + packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n" + : /* no outputs */ + : "a" (srcY), "S" (&yuv_data), "m" (yuv_data) + ); +} + +static inline void sse2_load_yvyu(uint8_t *srcY, uint8_t *srcU, + uint8_t *srcV, int x, int y, int width) +{ + srcY += (y*width+x)*2; + asm("\ + # Load data, bias and expand to 16 bits \n\ + pxor %%xmm4, %%xmm4 # XMM4: 00 00 00 00 00 00 00 00 \n\ + movdqu ("EAX"), %%xmm6 # XMM6: U3 Y7.............V0 Y0 \n\ + movdqu 16("EAX"),%%xmm7 # XMM7: U7 YF.............V4 Y8 \n\ + movdqa %%xmm6, %%xmm2 # XMM2: U3 Y7.............V0 Y0 \n\ + psrlw $8, %%xmm2 # XMM2: U3 V3 U2 V2 U1 V1 U0 V0 \n\ + pand ("ESI"), %%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ + movdqa %%xmm7, %%xmm3 # XMM3: U7 YF.............V4 Y8 \n\ + psrlw $8, %%xmm3 # XMM3: U7 V7 U6 V6 U5 V5 U4 V4 \n\ + pand ("ESI"), %%xmm7 # XMM6: YF YE YD YC YB YA Y9 Y8 \n\ + packuswb %%xmm3, %%xmm2 # XMM2: U7 V7.............U0 V0 \n\ + movdqa %%xmm2, %%xmm3 # XMM3: U7 V7.............U0 V0 \n\ + psrlw $8, %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\ + pand ("ESI"), %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\ + packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0 \n\ + movdqa %%xmm6, %%xmm7 # XMM7: YF...................Y0 \n\ + pand ("ESI"), %%xmm6 # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0 \n\ + psrlw $8, %%xmm7 # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1 \n" + : /* no outputs */ + : "a" (srcY), "S" (&yuv_data), "m" (yuv_data) + ); +} + +/************************************/ + +/* Standard YUV->RGB (Yodd=XMM7 Yeven=XMM6 U=XMM2 V=XMM3) */ +static inline void sse2_yuv_to_rgb(void) +{ + asm("\ + psubw 16("ESI"), %%xmm6 # XMM6: 
subtract 16 \n\ + psllw $7, %%xmm6 # XMM6: convert to fixed point 8.7 \n\ + psubw 16("ESI"), %%xmm7 # XMM7: subtract 16 \n\ + psllw $7, %%xmm7 # XMM7: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm2 # XMM2: subtract 128 \n\ + psllw $7, %%xmm2 # XMM2: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm3 # XMM3: subtract 128 \n\ + psllw $7, %%xmm3 # XMM3: convert to fixed point 8.7 \n\ + # Multiply by constants \n\ + pmulhw 48("ESI"),%%xmm6 # XMM6: cYE.................cY0 \n\ + pmulhw 48("ESI"),%%xmm7 # XMM7: cYF.................cY1 \n\ + movdqa 80("ESI"),%%xmm4 # XMM4: gU constant \n\ + pmulhw %%xmm2, %%xmm4 # XMM4: gU7.................gU0 \n\ + movdqa 96("ESI"),%%xmm5 # XMM5: gV constant \n\ + pmulhw %%xmm3, %%xmm5 # XMM5: gV7.................gV0 \n\ + paddw %%xmm5, %%xmm4 # XMM4: g7 g6 g5 g4 g3 g2 g1 g0 \n\ + pmulhw 64("ESI"),%%xmm3 # XMM3: r7 r6 r5 r4 r3 r2 r1 r0 \n\ + pmulhw 112("ESI"),%%xmm2 #XMM2: b7 b6 b5 b4 b3 b2 b1 b0 \n\ + movdqa %%xmm3, %%xmm0 # XMM0: r7 r6 r5 r4 r3 r2 r1 r0 \n\ + movdqa %%xmm4, %%xmm1 # XMM1: g7 g6 g5 g4 g3 g2 g1 g0 \n\ + movdqa %%xmm2, %%xmm5 # XMM5: b7 b6 b5 b4 b3 b2 b1 b0 \n\ + # Add intermediate results and round/shift to get R/G/B values \n\ + paddw 128("ESI"),%%xmm6 # Add rounding value (0.5 @ 8.4 fixed) \n\ + paddw 128("ESI"),%%xmm7 \n\ + paddw %%xmm6, %%xmm0 # XMM0: RE RC RA R8 R6 R4 R2 R0 \n\ + psraw $4, %%xmm0 # Shift back to 8.0 fixed \n\ + paddw %%xmm6, %%xmm1 # XMM1: GE GC GA G8 G6 G4 G2 G0 \n\ + psraw $4, %%xmm1 \n\ + paddw %%xmm6, %%xmm2 # XMM2: BE BC BA B8 B6 B4 B2 B0 \n\ + psraw $4, %%xmm2 \n\ + paddw %%xmm7, %%xmm3 # XMM3: RF RD RB R9 R7 R5 R3 R1 \n\ + psraw $4, %%xmm3 \n\ + paddw %%xmm7, %%xmm4 # XMM4: GF GD GB G9 G7 G5 G3 G1 \n\ + psraw $4, %%xmm4 \n\ + paddw %%xmm7, %%xmm5 # XMM5: BF BD BB B9 B7 B5 B3 B1 \n\ + psraw $4, %%xmm5 \n\ + # Saturate to 0-255 and pack into bytes \n\ + packuswb %%xmm0, %%xmm0 # XMM0: RE.......R0 RE.......R0 \n\ + packuswb %%xmm1, %%xmm1 # XMM1: GE.......G0 GE.......G0 \n\ + packuswb %%xmm2, %%xmm2 # XMM2: BE.......B0 BE.......B0 \n\ + packuswb %%xmm3, %%xmm3 # XMM3: RF.......R1 RF.......R1 \n\ + packuswb %%xmm4, %%xmm4 # XMM4: GF.......G1 GF.......G1 \n\ + packuswb %%xmm5, %%xmm5 # XMM5: BF.......B1 BF.......B1 \n\ + punpcklbw %%xmm3,%%xmm0 # XMM0: RF...................R0 \n\ + punpcklbw %%xmm4,%%xmm1 # XMM1: GF...................G0 \n\ + punpcklbw %%xmm5,%%xmm2 # XMM2: BF...................B0 \n" + : /* no outputs */ + : "S" (&yuv_data), "m" (yuv_data) + ); +} + +/* YUV444 YUV->RGB (Y=XMM7:XMM6 U=XMM5:XMM2 V=XMM3:XMM0) */ +static inline void sse2_yuv444_to_rgb(void) +{ + asm("\ + psubw 16("ESI"), %%xmm6 # XMM6: subtract 16 \n\ + psllw $7, %%xmm6 # XMM6: convert to fixed point 8.7 \n\ + psubw 16("ESI"), %%xmm7 # XMM7: subtract 16 \n\ + psllw $7, %%xmm7 # XMM7: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm2 # XMM2: subtract 128 \n\ + psllw $7, %%xmm2 # XMM2: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm5 # XMM5: subtract 128 \n\ + psllw $7, %%xmm5 # XMM5: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm0 # XMM0: subtract 128 \n\ + psllw $7, %%xmm0 # XMM0: convert to fixed point 8.7 \n\ + psubw 32("ESI"), %%xmm3 # XMM3: subtract 128 \n\ + psllw $7, %%xmm3 # XMM3: convert to fixed point 8.7 \n\ + # Multiply by constants \n\ + pmulhw 48("ESI"),%%xmm6 # XMM6: cY7.................cY0 \n\ + movdqa 80("ESI"),%%xmm1 # XMM1: gU constant \n\ + pmulhw %%xmm2, %%xmm1 # XMM1: gU7.................gU0 \n\ + movdqa 96("ESI"),%%xmm4 # XMM4: gV constant \n\ + pmulhw %%xmm0, %%xmm4 # XMM4: 
gV7.................gV0 \n\ + paddw %%xmm4, %%xmm1 # XMM1: g7 g6 g5 g4 g3 g2 g1 g0 \n\ + pmulhw 64("ESI"),%%xmm0 # XMM0: r7 r6 r5 r4 r3 r2 r1 r0 \n\ + pmulhw 112("ESI"),%%xmm2 #XMM2: b7 b6 b5 b4 b3 b2 b1 b0 \n\ + # Add intermediate results and round/shift to get R/G/B values \n\ + paddw 128("ESI"),%%xmm6 # Add rounding value (0.5 @ 8.4 fixed) \n\ + paddw %%xmm6, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + psraw $4, %%xmm0 # Shift back to 8.0 fixed \n\ + paddw %%xmm6, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\ + psraw $4, %%xmm1 \n\ + paddw %%xmm6, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n\ + psraw $4, %%xmm2 \n\ + # Do it all over again for pixels 8-15 \n\ + pmulhw 48("ESI"),%%xmm7 # XMM7: cYF.................cY8 \n\ + movdqa 80("ESI"),%%xmm6 # XMM6: gU constant \n\ + pmulhw %%xmm5, %%xmm6 # XMM6: gUF.................gU8 \n\ + movdqa 96("ESI"),%%xmm4 # XMM4: gV constant \n\ + pmulhw %%xmm3, %%xmm4 # XMM4: gVF.................gV8 \n\ + paddw %%xmm6, %%xmm4 # XMM4: gF gE gD gC gB gA g9 g8 \n\ + pmulhw 64("ESI"),%%xmm3 # XMM3: rF rE rD rC rB rA r9 r8 \n\ + pmulhw 112("ESI"),%%xmm5 #XMM5: bF bE bD bC bB bA b9 b8 \n\ + paddw 128("ESI"),%%xmm7 # Add rounding value (0.5 @ 8.4 fixed) \n\ + paddw %%xmm7, %%xmm3 # XMM3: RF RE RD RC RB RA R9 R8 \n\ + psraw $4, %%xmm3 \n\ + paddw %%xmm7, %%xmm4 # XMM4: GF GE GD GC GB GA G9 G8 \n\ + psraw $4, %%xmm4 \n\ + paddw %%xmm7, %%xmm5 # XMM5: BF BE BD BC BB BA B9 B8 \n\ + psraw $4, %%xmm5 \n\ + # Saturate to 0-255 and pack into bytes \n\ + packuswb %%xmm3, %%xmm0 # XMM0: RF...................R0 \n\ + packuswb %%xmm4, %%xmm1 # XMM1: GF...................G0 \n\ + packuswb %%xmm5, %%xmm2 # XMM2: BF...................B0 \n" + : /* no outputs */ + : "S" (&yuv_data), "m" (yuv_data) + ); +} + +/************************************/ + +/* Convert YUV->RGB output to RGBA pixels in XMM0..XMM3 */ +#define SSE2_RGB_TO_RGBA "\ + pxor %%xmm7, %%xmm7 # XMM7: 00 00 00 00 00 00 00 00 \n\ + movdqa %%xmm0, %%xmm3 # XMM3: RF...................R0 \n\ + movdqa %%xmm1, %%xmm4 # XMM4: GF...................G0 \n\ + movdqa %%xmm2, %%xmm5 # XMM5: BF...................B0 \n\ + punpcklbw %%xmm1,%%xmm0 # XMM0: G7 R7.............G0 R0 \n\ + punpcklbw %%xmm7,%%xmm2 # XMM2: 00 B7.............00 B0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: G7 R7.............G0 R0 \n\ + punpcklwd %%xmm2,%%xmm0 # XMM0: 0BGR3 0BGR2 0BGR1 0BGR0 \n\ + punpckhwd %%xmm2,%%xmm1 # XMM1: 0BGR7 0BGR6 0BGR5 0BGR4 \n\ + punpckhbw %%xmm4,%%xmm3 # XMM3: GF RF.............G8 R8 \n\ + punpckhbw %%xmm7,%%xmm5 # XMM5: 00 BF.............00 B8 \n\ + movdqa %%xmm3, %%xmm2 # XMM2: GF RF.............G8 R8 \n\ + punpckhwd %%xmm5,%%xmm3 # XMM3: 0BGRF 0BGRE 0BGRD 0BGRC \n\ + punpcklwd %%xmm5,%%xmm2 # XMM2: 0BGRB 0BGRA 0BGR9 0BGR8 \n" + +/* Convert YUV->RGB output to BGRA pixels in XMM0..XMM3 */ +#define SSE2_RGB_TO_BGRA "\ + pxor %%xmm7, %%xmm7 # XMM7: 00 00 00 00 00 00 00 00 \n\ + movdqa %%xmm0, %%xmm5 # XMM5: RF...................R0 \n\ + movdqa %%xmm1, %%xmm4 # XMM4: GF...................G0 \n\ + movdqa %%xmm2, %%xmm3 # XMM3: BF...................B0 \n\ + punpcklbw %%xmm1,%%xmm2 # XMM0: G7 B7.............G0 B0 \n\ + punpcklbw %%xmm7,%%xmm0 # XMM2: 00 R7.............00 R0 \n\ + movdqa %%xmm2, %%xmm1 # XMM1: G7 B7.............G0 B0 \n\ + punpcklwd %%xmm0,%%xmm2 # XMM2: 0RGB3 0RGB2 0RGB1 0RGB0 \n\ + punpckhwd %%xmm0,%%xmm1 # XMM1: 0RGB7 0RGB6 0RGB5 0RGB4 \n\ + movdqa %%xmm2, %%xmm0 # XMM0: 0RGB3 0RGB2 0RGB1 0RGB0 \n\ + punpckhbw %%xmm4,%%xmm3 # XMM3: GF BF.............G8 B8 \n\ + punpckhbw %%xmm7,%%xmm5 # XMM5: 00 RF.............00 R8 \n\ + 
movdqa %%xmm3, %%xmm2 # XMM2: GF BF.............G8 B8 \n\ + punpckhwd %%xmm5,%%xmm3 # XMM3: 0RGBF 0RGBE 0RGBD 0RGBC \n\ + punpcklwd %%xmm5,%%xmm2 # XMM2: 0RGBB 0RGBA 0RGB9 0RGB8 \n" + +/* Convert and 4 RGBA32 (BGRA32) pixels in XMMn to RGB24 (BGR24) and store + * at EDI+(12*n) */ +#define SSE2_RGB32_TO_RGB24(n) "\ + movd %%xmm"#n", %%eax # EAX: 00 B0 G0 R0 \n\ + psrldq $4, %%xmm"#n" # XMMn: 00000 0BGR3 0BGR2 0BGR1 \n\ + movd %%xmm"#n", %%ebx # EBX: 00 B1 G1 R1 \n\ + psrldq $4, %%xmm"#n" # XMMn: 00000 00000 0BGR3 0BGR2 \n\ + movd %%xmm"#n", %%ecx # ECX: 00 B2 G2 R2 \n\ + psrldq $4, %%xmm"#n" # XMMn: 00000 00000 00000 0BGR3 \n\ + movd %%xmm"#n", %%edx # EDX: 00 B3 G3 R3 \n\ + "IA32_RGB32_TO_RGB24" \n\ + movl %%eax, 12*"#n"+0("EDI") \n\ + movl %%ebx, 12*"#n"+4("EDI") \n\ + movl %%ecx, 12*"#n"+8("EDI") \n" + + +static inline void sse2_store_rgb24(uint8_t *dest) +{ + /* It looks like it's fastest to go to RGB32 first, then shift the + * result to merge the 24-bit pixels together. */ + asm(SSE2_RGB_TO_RGBA" \n\ + "PUSH(EBX)" \n\ + "SSE2_RGB32_TO_RGB24(0)" \n\ + "SSE2_RGB32_TO_RGB24(1)" \n\ + "SSE2_RGB32_TO_RGB24(2)" \n\ + "SSE2_RGB32_TO_RGB24(3)" \n\ + "POP(EBX)" \n" + : /* no outputs */ + : "D" (dest) + : "eax", "ecx", "edx", "esi" COMMA_FAKE_PUSH_REG + ); +} + +static inline void sse2_store_bgr24(uint8_t *dest) +{ + asm(SSE2_RGB_TO_BGRA "\ + "PUSH(EBX)" \n\ + "SSE2_RGB32_TO_RGB24(0)" \n\ + "SSE2_RGB32_TO_RGB24(1)" \n\ + "SSE2_RGB32_TO_RGB24(2)" \n\ + "SSE2_RGB32_TO_RGB24(3)" \n\ + "POP(EBX)" \n" + : /* no outputs */ + : "D" (dest) + : "eax", "ecx", "edx", "esi" COMMA_FAKE_PUSH_REG + ); +} + +/* It would be nice to be able to use movntdq here for a 50% speedup, + * but we're not guaranteed alignment... (think 766x512 for example) */ +static inline void sse2_store_rgba32(uint8_t *dest) +{ + asm(SSE2_RGB_TO_RGBA "\ + movdqu %%xmm0, ("EDI") \n\ + movdqu %%xmm1, 16("EDI") \n\ + movdqu %%xmm2, 32("EDI") \n\ + movdqu %%xmm3, 48("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void sse2_store_abgr32(uint8_t *dest) +{ + asm(SSE2_RGB_TO_BGRA "\ + pslldq $1, %%xmm0 \n\ + pslldq $1, %%xmm1 \n\ + pslldq $1, %%xmm2 \n\ + pslldq $1, %%xmm3 \n\ + movdqu %%xmm0, ("EDI") \n\ + movdqu %%xmm1, 16("EDI") \n\ + movdqu %%xmm2, 32("EDI") \n\ + movdqu %%xmm3, 48("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void sse2_store_argb32(uint8_t *dest) +{ + asm(SSE2_RGB_TO_RGBA "\ + pslldq $1, %%xmm0 \n\ + pslldq $1, %%xmm1 \n\ + pslldq $1, %%xmm2 \n\ + pslldq $1, %%xmm3 \n\ + movdqu %%xmm0, ("EDI") \n\ + movdqu %%xmm1, 16("EDI") \n\ + movdqu %%xmm2, 32("EDI") \n\ + movdqu %%xmm3, 48("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +static inline void sse2_store_bgra32(uint8_t *dest) +{ + asm(SSE2_RGB_TO_BGRA "\ + movdqu %%xmm0, ("EDI") \n\ + movdqu %%xmm1, 16("EDI") \n\ + movdqu %%xmm2, 32("EDI") \n\ + movdqu %%xmm3, 48("EDI") \n" + : /* no outputs */ + : "D" (dest) + ); +} + +/*************************************************************************/ + +static inline void sse2_load_rgb24(uint8_t *src); +static inline void sse2_load_bgr24(uint8_t *src); +static inline void sse2_load_rgba32(uint8_t *src); +static inline void sse2_load_abgr32(uint8_t *src); +static inline void sse2_load_argb32(uint8_t *src); +static inline void sse2_load_bgra32(uint8_t *src); +static inline void sse2_rgb_to_yuv420p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_yuv411p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, 
int y, int width); +static inline void sse2_rgb_to_yuv422p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_yuv444p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_yuy2( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_uyvy( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_yvyu( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); +static inline void sse2_rgb_to_y8( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width); + +#define DEFINE_RGB2YUV_SSE2(rgb,yuv,rgbsz,rofs,gofs,bofs,slowop) \ +static int rgb##_##yuv##_sse2(uint8_t **src, uint8_t **dest, \ + int width, int height) \ +{ \ + int x, y; \ + \ + for (y = 0; y < height; y++) { \ + for (x = 0; x < (width & ~7); x += 8) { \ + sse2_load_##rgb(src[0]+(y*width+x)*rgbsz); \ + sse2_rgb_to_##yuv(dest[0], dest[1], dest[2], x, y, width); \ + } \ + while (x < width) { \ + int r = src[0][(y*width+x)*rgbsz+rofs]; \ + int g = src[0][(y*width+x)*rgbsz+gofs]; \ + int b = src[0][(y*width+x)*rgbsz+bofs]; \ + slowop; \ + x++; \ + } \ + } \ + asm("emms"); \ + return 1; \ +} + +#define DEFINE_RGB2YUV_SSE2_SET(rgb,sz,r,g,b) \ + DEFINE_RGB2YUV_SSE2(rgb,yuv420p, sz,r,g,b, RGB2YUV_420P) \ + DEFINE_RGB2YUV_SSE2(rgb,yuv411p, sz,r,g,b, RGB2YUV_411P) \ + DEFINE_RGB2YUV_SSE2(rgb,yuv422p, sz,r,g,b, RGB2YUV_422P) \ + DEFINE_RGB2YUV_SSE2(rgb,yuv444p, sz,r,g,b, RGB2YUV_444P) \ + DEFINE_RGB2YUV_SSE2(rgb,yuy2, sz,r,g,b, RGB2YUV_YUY2) \ + DEFINE_RGB2YUV_SSE2(rgb,uyvy, sz,r,g,b, RGB2YUV_UYVY) \ + DEFINE_RGB2YUV_SSE2(rgb,yvyu, sz,r,g,b, RGB2YUV_YVYU) \ + DEFINE_RGB2YUV_SSE2(rgb,y8, sz,r,g,b, RGB2Y()) + +DEFINE_RGB2YUV_SSE2_SET(rgb24, 3,0,1,2) +DEFINE_RGB2YUV_SSE2_SET(bgr24, 3,2,1,0) +DEFINE_RGB2YUV_SSE2_SET(rgba32, 4,0,1,2) +DEFINE_RGB2YUV_SSE2_SET(abgr32, 4,3,2,1) +DEFINE_RGB2YUV_SSE2_SET(argb32, 4,1,2,3) +DEFINE_RGB2YUV_SSE2_SET(bgra32, 4,2,1,0) + +/************************************/ + +/* Split 8 RGBA pixels in XMMr/XMMb into R/G/B in XMM0/XMM1/XMM2. + * r and b are 0 and 2 for RGB, 2 and 0 for BGR */ +#define SSE2_SPLIT_RGB32(r,b) "\ + movdqa 176("EDI"), %%xmm7 # XMM7: 00FF*8 \n\ + movdqa %%xmm"#r", %%xmm1 # XMM1: XBGR3 XBGR2 XBGR1 XBGR0 \n\ + movdqa %%xmm"#b", %%xmm3 # XMM3: XBGR7 XBGR6 XBGR5 XBGR4 \n\ + pand %%xmm7, %%xmm"#r" # XMMr: B3 R3 B2 R2 B1 R1 B0 R0 \n\ + psrld $8, %%xmm1 # XMM1: -XBG3 -XBG2 -XBG1 -XBG0 \n\ + pand %%xmm7, %%xmm"#b" # XMMb: B7 R7 B6 R6 B5 R5 B4 R4 \n\ + psrld $8, %%xmm3 # XMM3: -XBG7 -XBG6 -XBG5 -XBG4 \n\ + pand %%xmm7, %%xmm1 # XMM1: XX G3 XX G2 XX G1 XX G0 \n\ + packuswb %%xmm"#b", %%xmm"#r" # XMMr: B7 R7 ........... B0 R0 \n\ + pand %%xmm7, %%xmm3 # XMM3: XX G7 XX G6 XX G5 XX G4 \n\ + movdqa %%xmm"#r", %%xmm"#b" # XMMb: B7 R7 ........... B0 R0 \n\ + packuswb %%xmm3, %%xmm1 # XMM1: XX G7 ........... 
XX G0 \n\ + pand %%xmm7, %%xmm"#r" # XMMr: R7 R6 R5 R4 R3 R2 R1 R0 \n\ + psrlw $8, %%xmm"#b" # XMMb: B7 B6 B5 B4 B3 B2 B1 B0 \n\ + pand %%xmm7, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n" + +static inline void sse2_load_rgb24(uint8_t *src) +{ + asm("\ + "PUSH(EBX)" \n\ + # Make stack space for loading XMM registers \n" +#ifdef ARCH_X86_64 +" sub $24+128, "ESP" \n" +#else +" sub $24, "ESP" \n" +#endif +" # Copy source pixels to appropriate positions in stack (this \n\ + # seems to be the fastest way to get them where we want them) \n\ + movl $8, %%ebx \n\ + movl $24, %%edx \n\ + 0: \n\ + movb -3("ESI","EDX"), %%al \n\ + movb %%al, 0-1("ESP","EBX") \n\ + movb -2("ESI","EDX"), %%al \n\ + movb %%al, 8-1("ESP","EBX") \n\ + movb -1("ESI","EDX"), %%al \n\ + movb %%al, 16-1("ESP","EBX") \n\ + subl $3, %%edx \n\ + subl $1, %%ebx \n\ + jnz 0b \n\ + # Load XMM0-XMM2 with R/G/B values and expand to 16-bit \n\ + pxor %%xmm7, %%xmm7 \n\ + movq ("ESP"), %%xmm0 \n\ + punpcklbw %%xmm7, %%xmm0 \n\ + movq 8("ESP"), %%xmm1 \n\ + punpcklbw %%xmm7, %%xmm1 \n\ + movq 16("ESP"), %%xmm2 \n\ + punpcklbw %%xmm7, %%xmm2 \n" +#ifdef ARCH_X86_64 +" add $24+128, "ESP" \n" +#else +" add $24, "ESP" \n" +#endif +" "POP(EBX)" \n" + : /* no outputs */ + : "S" (src) + : "eax", "ecx", "edx", "edi" COMMA_FAKE_PUSH_REG + ); +} + +static inline void sse2_load_bgr24(uint8_t *src) +{ + /* Load as RGB and swap registers */ + sse2_load_rgb24(src); + asm("\ + movdqa %%xmm0, %%xmm3 \n\ + movdqa %%xmm2, %%xmm0 \n\ + movdqa %%xmm3, %%xmm2 \n" + : /* no outputs */ + : /* no inputs */ + ); +} + +static inline void sse2_load_rgba32(uint8_t *src) +{ + asm("\ + movdqu ("ESI"), %%xmm0 # XMM0: XBGR3 XBGR2 XBGR1 XBGR0 \n\ + movdqu 16("ESI"), %%xmm2 # XMM2: XBGR7 XBGR6 XBGR5 XBGR4 \n\ + "SSE2_SPLIT_RGB32(0,2)" \n" + : /* no outputs */ + : "S" (src), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_load_abgr32(uint8_t *src) +{ + asm("\ + movdqu ("ESI"), %%xmm2 # XMM2: RGBX3 RGBX2 RGBX1 RGBX0 \n\ + movdqu 16("ESI"), %%xmm0 # XMM0: RGBX7 RGBX6 RGBX5 RGBX4 \n\ + psrld $8, %%xmm2 # XMM2: -RGB3 -RGB2 -RGB1 -RGB0 \n\ + psrld $8, %%xmm0 # XMM0: -RGB7 -RGB6 -RGB5 -RGB4 \n\ + "SSE2_SPLIT_RGB32(2,0)" \n" + : /* no outputs */ + : "S" (src), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_load_argb32(uint8_t *src) +{ + asm("\ + movdqu ("ESI"), %%xmm0 # XMM0: BGRX3 BGRX2 BGRX1 BGRX0 \n\ + movdqu 16("ESI"), %%xmm2 # XMM2: BGRX7 BGRX6 BGRX5 BGRX4 \n\ + psrld $8, %%xmm0 # XMM0: -BGR3 -BGR2 -BGR1 -BGR0 \n\ + psrld $8, %%xmm2 # XMM2: -BGR7 -BGR6 -BGR5 -BGR4 \n\ + "SSE2_SPLIT_RGB32(0,2)" \n" + : /* no outputs */ + : "S" (src), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_load_bgra32(uint8_t *src) +{ + asm("\ + movdqu ("ESI"), %%xmm2 # XMM2: XRGB3 XRGB2 XRGB1 XRGB0 \n\ + movdqu 16("ESI"), %%xmm0 # XMM0: XRGB7 XRGB6 XRGB5 XRGB4 \n\ + "SSE2_SPLIT_RGB32(2,0)" \n" + : /* no outputs */ + : "S" (src), "D" (&rgb_data), "m" (rgb_data) + ); +} + +/************************************/ + +#define SSE2_RGB2Y "\ + # Make RGB data into 8.6 fixed-point, then create 8.6 \n\ + # fixed-point Y data in XMM3 \n\ + psllw $6, %%xmm0 \n\ + movdqa %%xmm0, %%xmm3 \n\ + pmulhuw ("EDI"), %%xmm3 \n\ + psllw $6, %%xmm1 \n\ + movdqa %%xmm1, %%xmm6 \n\ + pmulhuw 16("EDI"), %%xmm6 \n\ + psllw $6, %%xmm2 \n\ + movdqa %%xmm2, %%xmm7 \n\ + pmulhuw 32("EDI"), %%xmm7 \n\ + paddw %%xmm6, %%xmm3 # No possibility of overflow \n\ + paddw %%xmm7, %%xmm3 \n\ + paddw 144("EDI"), %%xmm3 \n" +#define SSE2_RGB2U "\ + # Create 8.6 fixed-point U data in 
XMM4 \n\ + movdqa %%xmm0, %%xmm4 \n\ + pmulhw 48("EDI"), %%xmm4 \n\ + movdqa %%xmm1, %%xmm6 \n\ + pmulhw 64("EDI"), %%xmm6 \n\ + movdqa %%xmm2, %%xmm7 \n\ + pmulhw 80("EDI"), %%xmm7 \n\ + paddw %%xmm6, %%xmm4 \n\ + paddw %%xmm7, %%xmm4 \n\ + paddw 160("EDI"), %%xmm4 \n" +#define SSE2_RGB2U0 "\ + # Create 8.6 fixed-point U data in XMM0 \n\ + pmulhw 48("EDI"), %%xmm0 \n\ + pmulhw 64("EDI"), %%xmm1 \n\ + pmulhw 80("EDI"), %%xmm2 \n\ + paddw %%xmm1, %%xmm0 \n\ + paddw %%xmm2, %%xmm0 \n\ + paddw 160("EDI"), %%xmm0 \n" +#define SSE2_RGB2V "\ + # Create 8.6 fixed-point V data in XMM0 \n\ + pmulhw 96("EDI"), %%xmm0 \n\ + pmulhw 112("EDI"), %%xmm1 \n\ + pmulhw 128("EDI"), %%xmm2 \n\ + paddw %%xmm1, %%xmm0 \n\ + paddw %%xmm2, %%xmm0 \n\ + paddw 160("EDI"), %%xmm0 \n" +#define SSE2_PACKYU "\ + # Shift back down to 8-bit values \n\ + psraw $6, %%xmm3 \n\ + psraw $6, %%xmm0 \n\ + # Pack into bytes \n\ + pxor %%xmm7, %%xmm7 \n\ + packuswb %%xmm7, %%xmm3 \n\ + packuswb %%xmm7, %%xmm0 \n" +#define SSE2_PACKYUV "\ + # Shift back down to 8-bit values \n\ + psraw $6, %%xmm3 \n\ + psraw $6, %%xmm4 \n\ + psraw $6, %%xmm0 \n\ + # Pack into bytes \n\ + pxor %%xmm7, %%xmm7 \n\ + packuswb %%xmm7, %%xmm3 \n\ + packuswb %%xmm7, %%xmm4 \n\ + packuswb %%xmm7, %%xmm0 \n" +#define SSE2_STRIPU(N) "\ + # Remove every odd U value \n\ + pand 176("EDI"), %%xmm"#N" \n\ + packuswb %%xmm7, %%xmm"#N" \n" +#define SSE2_STRIPV "\ + # Remove every even V value \n\ + psrlw $8, %%xmm0 \n\ + packuswb %%xmm7, %%xmm0 \n" + +static inline void sse2_rgb_to_yuv420p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + if (y%2 == 0) { + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U0" \n\ + "SSE2_PACKYU" \n\ + "SSE2_STRIPU(0)" \n\ + # Store into destination pointers \n\ + movq %%xmm3, ("EAX") \n\ + movd %%xmm0, ("ECX") \n" + : /* no outputs */ + : "a" (destY+y*width+x), "c" (destU+(y/2)*(width/2)+(x/2)), + "D" (&rgb_data), "m" (rgb_data) + ); + } else { + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYU" \n\ + "SSE2_STRIPV" \n\ + # Store into destination pointers \n\ + movq %%xmm3, ("EAX") \n\ + movd %%xmm0, ("EDX") \n" + : /* no outputs */ + : "a" (destY+y*width+x), "d" (destV+(y/2)*(width/2)+(x/2)), + "D" (&rgb_data), "m" (rgb_data) + ); + } +} + +static inline void sse2_rgb_to_yuv411p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + "SSE2_STRIPU(4)" \n\ + "SSE2_STRIPU(4)" \n\ + "SSE2_STRIPU(0)" \n\ + "SSE2_STRIPV" \n\ + # Store into destination pointers \n\ + movq %%xmm3, ("EAX") \n\ + "PUSH(EAX)" # needed because GCC might rely on it later \n\ + movd %%xmm4, %%eax \n\ + movw %%ax, ("ECX") \n\ + movd %%xmm0, %%eax \n\ + movw %%ax, ("EDX") \n\ + "POP(EAX)" \n" + : /* no outputs */ + : "a" (destY+y*width+x), "c" (destU+y*(width/4)+(x/4)), + "d" (destV+y*(width/4)+(x/4)), "D" (&rgb_data), "m" (rgb_data) +#ifdef ARCH_X86_64 + : FAKE_PUSH_REG +#endif + ); +} + +static inline void sse2_rgb_to_yuv422p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + "SSE2_STRIPU(4)" \n\ + "SSE2_STRIPV" \n\ + # Store into destination pointers \n\ + movq %%xmm3, ("EAX") \n\ + movd %%xmm4, ("ECX") \n\ + movd %%xmm0, ("EDX") \n" + : /* no outputs */ + : "a" (destY+y*width+x), "c" (destU+y*(width/2)+(x/2)), + "d" (destV+y*(width/2)+(x/2)), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void 
sse2_rgb_to_yuv444p( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + # Store into destination pointers \n\ + movq %%xmm3, ("EAX") \n\ + movq %%xmm4, ("ECX") \n\ + movq %%xmm0, ("EDX") \n" + : /* no outputs */ + : "a" (destY+y*width+x), "c" (destU+y*width+x), "d" (destV+y*width+x), + "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_rgb_to_yuy2( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + "SSE2_STRIPU(4)" \n\ + "SSE2_STRIPV" \n\ + # Interleave Y/U/V \n\ + punpcklbw %%xmm0, %%xmm4 \n\ + punpcklbw %%xmm4, %%xmm3 \n\ + # Store into destination pointer \n\ + movdqu %%xmm3, ("EAX") \n" + : /* no outputs */ + : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_rgb_to_uyvy( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + "SSE2_STRIPU(4)" \n\ + "SSE2_STRIPV" \n\ + # Interleave Y/U/V \n\ + punpcklbw %%xmm0, %%xmm4 \n\ + punpcklbw %%xmm3, %%xmm4 \n\ + # Store into destination pointer \n\ + movdqu %%xmm4, ("EAX") \n" + : /* no outputs */ + : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_rgb_to_yvyu( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + "SSE2_RGB2Y" \n\ + "SSE2_RGB2U" \n\ + "SSE2_RGB2V" \n\ + "SSE2_PACKYUV" \n\ + # Remove every odd V value \n\ + pand 176("EDI"), %%xmm0 \n\ + packuswb %%xmm7, %%xmm0 \n\ + # Remove every even U value \n\ + psrlw $8, %%xmm4 \n\ + packuswb %%xmm7, %%xmm4 \n\ + # Interleave Y/U/V \n\ + punpcklbw %%xmm4, %%xmm0 \n\ + punpcklbw %%xmm0, %%xmm3 \n\ + # Store into destination pointer \n\ + movdqu %%xmm3, ("EAX") \n" + : /* no outputs */ + : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data) + ); +} + +static inline void sse2_rgb_to_y8( + uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width) +{ + asm("\ + psllw $6, %%xmm0 \n\ + pmulhuw ("EDI"), %%xmm0 \n\ + psllw $6, %%xmm1 \n\ + pmulhuw 16("EDI"), %%xmm1 \n\ + psllw $6, %%xmm2 \n\ + pmulhuw 32("EDI"), %%xmm2 \n\ + paddw %%xmm1, %%xmm0 # No possibility of overflow \n\ + paddw %%xmm2, %%xmm0 \n\ + paddw 144("EDI"), %%xmm0 \n\ + psraw $6, %%xmm0 \n\ + packuswb %%xmm0, %%xmm0 \n\ + movq %%xmm0, ("EAX") \n" + : /* no outputs */ + : "a" (destY+y*width+x), "D" (&rgb_data), "m" (rgb_data) + ); +} + +/*************************************************************************/ + +static int yuvp_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 16, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%edx # (trash EDX, we don't need it \n\ + cmovnz %%edx, %%eax # anymore) \n\ + movl $0, %%edx \n\ + cmovs %%edx, %%eax \n\ + movb %%al, -1("EDI","ECX") # and store \n", + /* main_loop */ "\ + movdqu -16("ESI","ECX"), %%xmm0 # XMM0: Y15..Y0 \n\ + movdqa %%xmm0, %%xmm1 # 
XMM1: Y15..Y0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: Y7..Y0 \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + punpckhbw %%xmm4, %%xmm1 # XMM1: Y15..Y8 << 8 \n\ + psubw %%xmm6, %%xmm1 # XMM1: unbias by 16 \n\ + psllw $2, %%xmm1 # XMM1: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm1 # XMM1: multiply by 255/219>>2 \n\ + packuswb %%xmm1, %%xmm0 # XMM0: G15..G0, saturated \n\ + movdqu %%xmm0, -16("EDI","ECX") \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +static int yuy2_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pcmpeqd %%xmm5, %%xmm5 \n\ + psrlw $8, %%xmm5 # constant: 0x00FF \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 8, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -2("ESI","ECX",2), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%edx # (trash EDX, we don't need it \n\ + cmovnz %%edx, %%eax # anymore) \n\ + movl $0, %%edx \n\ + cmovs %%edx, %%eax \n\ + movb %%al, -1("EDI","ECX") # and store \n", + /* main_loop */ "\ + movdqu -16("ESI","ECX",2),%%xmm0 #XMM0: V3 Y7..U0 Y0 \n\ + pand %%xmm5, %%xmm0 # XMM0: Y7..Y0 \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: G7..G0, saturated \n\ + movq %%xmm0, -8("EDI","ECX") \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +static int uyvy_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 \n\ + psllw $2, %%xmm6 # constant: 16<<2 \n\ + pcmpeqd %%xmm5, %%xmm5 \n\ + psllw $8, %%xmm5 # constant: 0xFF00 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 8, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX",2), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%edx # (trash EDX, we don't need it \n\ + cmovnz %%edx, %%eax # anymore) \n\ + movl $0, %%edx \n\ + cmovs %%edx, %%eax \n\ + movb %%al, -1("EDI","ECX") # and store \n", + /* main_loop */ "\ + movdqu -16("ESI","ECX",2),%%xmm0 #XMM0: Y7 V3..Y0 U0 \n\ + pand %%xmm5, %%xmm0 # XMM0: Y7..Y0 << 8 \n\ + psrlw $6, %%xmm0 # XMM0: fixed point 8.2 \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: G7..G0, saturated \n\ + movq %%xmm0, -8("EDI","ECX") \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +/*************************************************************************/ + +static int gray8_y8_sse2(uint8_t **src, uint8_t 
**dest, int width, int height) +{ + asm("movdqa 16("EDX"), %%xmm7 # constant: 219/255 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 16, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve gray byte \n\ + imull %3, %%eax # multiply by 219/255 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + addl $16, %%eax # add 16 \n\ + movb %%al, -1("EDI","ECX") # and store \n", + /* main_loop */ "\ + movdqu -16("ESI","ECX"), %%xmm2 # XMM2: G15..G0 \n\ + movdqa %%xmm4, %%xmm0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: G7..G0 << 8 \n\ + pmulhuw %%xmm7, %%xmm0 # XMM0: multiply by 219/255>>2 \n\ + movdqa %%xmm4, %%xmm1 \n\ + punpckhbw %%xmm2, %%xmm1 # XMM1: G15..G8 << 8 \n\ + pmulhuw %%xmm7, %%xmm1 # XMM1: multiply by 219/255>>2 \n\ + psrlw $6, %%xmm0 # XMM0: shift down to 8 bits \n\ + paddw %%xmm6, %%xmm0 # XMM0: bias by 16 \n\ + psrlw $6, %%xmm1 # XMM1: shift down to 8 bits \n\ + paddw %%xmm6, %%xmm1 # XMM1: bias by 16 \n\ + packuswb %%xmm1, %%xmm0 # XMM0: Y15..Y0 \n\ + movdqu %%xmm0, -16("EDI","ECX") \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +static int gray8_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa 16("EDX"), %%xmm7 # constant: 219/255 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pcmpeqd %%xmm5, %%xmm5 \n\ + psllw $15, %%xmm5 # constant: 0x8000 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 8, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve gray byte \n\ + imull %3, %%eax # multiply by 219/255 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + addl $16, %%eax # add 16 \n\ + movb %%al, -2("EDI","ECX",2) # and store \n\ + movb $128, -1("EDI","ECX",2) # store 128 in U/V byte \n", + /* main_loop */ "\ + movq -8("ESI","ECX"), %%xmm2 # XMM2: G5..G0 \n\ + movdqa %%xmm4, %%xmm0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: G7..G0 << 8 \n\ + pmulhuw %%xmm7, %%xmm0 # XMM0: multiply by 219/255>>2 \n\ + psrlw $6, %%xmm0 # XMM0: shift down to 8 bits \n\ + paddw %%xmm6, %%xmm0 # XMM0: bias by 16 \n\ + por %%xmm5, %%xmm0 # XMM0: OR in U/V bytes \n\ + movdqu %%xmm0, -16("EDI","ECX",2) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +static int gray8_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height) { + asm("movdqa 16("EDX"), %%xmm7 # constant: 219/255 \n\ + movdqa 32("EDX"), %%xmm6 \n\ + psllw $8, %%xmm6 # constant: 16 << 8 \n\ + pcmpeqd %%xmm5, %%xmm5 \n\ + psllw $15, %%xmm5 \n\ + psrlw $8, %%xmm5 # constant: 0x0080 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n\ + pcmpeqd %%xmm3, %%xmm3 \n\ + psllw $8, %%xmm3 # constant: 0xFF00 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 8, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve gray byte \n\ + imull %3, %%eax # multiply by 219/255 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + addl $16, %%eax # add 16 \n\ + movb %%al, -1("EDI","ECX",2) # and store \n\ + movb $128, -2("EDI","ECX",2) # store 128 in U/V byte \n", + /* main_loop */ "\ + movq -8("ESI","ECX"), %%xmm2 # XMM2: G5..G0 \n\ + movdqa %%xmm4, %%xmm0 \n\ + punpcklbw %%xmm2, %%xmm0 # XMM0: G7..G0 << 8 \n\ + pmulhuw 
%%xmm7, %%xmm0 # XMM0: multiply by 219/255>>2 \n\ + psllw $2, %%xmm0 # XMM0: shift results to hi byte\n\ + pand %%xmm3, %%xmm0 # XMM0: clear low byte \n\ + paddw %%xmm6, %%xmm0 # XMM0: bias by 16 \n\ + por %%xmm5, %%xmm0 # XMM0: OR in U/V bytes \n\ + movdqu %%xmm0, -16("EDI","ECX",2) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +/*************************************************************************/ + +static int y8_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + movdqa 48("EDX"), %%xmm5 # constant: bytes 0/3/6/9 mask \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "push "EBX, + /* pop_regs */ "pop "EBX, + /* small_loop */ "\ + lea ("ECX","ECX",2), "EDX" # 3*count for RGB offset \n\ + movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%ebx \n\ + cmovnz %%ebx, %%eax \n\ + movl $0, %%ebx \n\ + cmovs %%ebx, %%eax \n\ + movb %%al, -3("EDI","EDX") # and store \n\ + movb %%al, -2("EDI","EDX") \n\ + movb %%al, -1("EDI","EDX") \n", + /* main_loop */ "\ + lea ("ECX","ECX",2), "EDX" \n\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: Y3..Y0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: Y3..Y0 in 16 bits \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: G3..G0, saturated \n\ + pshuflw $0x50, %%xmm0, %%xmm0 # X0.l: G3 G2 G3 G2 G1 G0 G1 G0 \n\ + pshufhw $0x55, %%xmm0, %%xmm0 # X0.h: G3 G2 G3 G2 G3 G2 G3 G2 \n\ + pand %%xmm5, %%xmm0 # XMM0: ------3--2--1--0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: ------3--2--1--0 \n\ + pslldq $1, %%xmm1 # XMM1: -----3--2--1--0- \n\ + movdqa %%xmm0, %%xmm2 # XMM2: ------3--2--1--0 \n\ + pslldq $2, %%xmm2 # XMM2: ----3--2--1--0-- \n\ + por %%xmm1, %%xmm0 # XMM0: -----33-22-11-00 \n\ + por %%xmm2, %%xmm0 # XMM0: ----333222111000 \n\ + movd %%xmm0, -12("EDI","EDX") \n\ + pshufd $0xC9, %%xmm0, %%xmm0 \n\ + movq %%xmm0, -8("EDI","EDX") \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +/* 4BPP is slightly easier... 
*/ +static int y8_rgba32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%edx \n\ + cmovnz %%edx, %%eax \n\ + movl $0, %%edx \n\ + cmovs %%edx, %%eax \n\ + movb %%al, -4("EDI","ECX",4) # and store \n\ + movb %%al, -3("EDI","ECX",4) \n\ + movb %%al, -2("EDI","ECX",4) \n", + /* main_loop */ "\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: Y3..Y0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: Y3..Y0 in 16 bits \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: G3..G0, saturated \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: G3..G0 in 16 bits \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: ---3---2---1---0 \n\ + movdqa %%xmm0, %%xmm1 # XMM1: ---3---2---1---0 \n\ + pslldq $1, %%xmm1 # XMM1: --3---2---1---0- \n\ + movdqa %%xmm0, %%xmm2 # XMM2: ---3---2---1---0 \n\ + pslldq $2, %%xmm2 # XMM2: -3---2---1---0-- \n\ + por %%xmm1, %%xmm0 # XMM0: --33--22--11--00 \n\ + por %%xmm2, %%xmm0 # XMM0: -333-222-111-000 \n\ + movntdq %%xmm0, -16("EDI","ECX",4) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + +static int y8_argb32_sse2(uint8_t **src, uint8_t **dest, int width, int height) +{ + asm("movdqa ("EDX"), %%xmm7 # constant: 255/219 \n\ + movdqa 32("EDX"), %%xmm6 # constant: 16 \n\ + pxor %%xmm4, %%xmm4 # constant: 0 \n" + SIMD_LOOP_WRAPPER( + /* blocksize */ 4, + /* push_regs */ "", + /* pop_regs */ "", + /* small_loop */ "\ + movzbl -1("ESI","ECX"), %%eax # retrieve Y byte \n\ + subl $16, %%eax # subtract 16 \n\ + imull %3, %%eax # multiply by 255/219 \n\ + shrl $14, %%eax # shift down to 8 bits \n\ + testb %%ah, %%ah # saturate to 0..255 \n\ + movl $-1, %%edx \n\ + cmovnz %%edx, %%eax \n\ + movl $0, %%edx \n\ + cmovs %%edx, %%eax \n\ + movb %%al, -3("EDI","ECX",4) # and store \n\ + movb %%al, -2("EDI","ECX",4) \n\ + movb %%al, -1("EDI","ECX",4) \n", + /* main_loop */ "\ + movd -4("ESI","ECX"), %%xmm0 # XMM0: Y3..Y0 \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: Y3..Y0 in 16 bits \n\ + psubw %%xmm6, %%xmm0 # XMM0: unbias by 16 \n\ + psllw $2, %%xmm0 # XMM0: fixed point 8.2 \n\ + pmulhw %%xmm7, %%xmm0 # XMM0: multiply by 255/219>>2 \n\ + packuswb %%xmm0, %%xmm0 # XMM0: G3..G0, saturated \n\ + punpcklbw %%xmm4, %%xmm0 # XMM0: G3..G0 in 16 bits \n\ + movdqa %%xmm4, %%xmm3 # XMM3: 0 \n\ + punpcklbw %%xmm0, %%xmm3 # XMM3: --3---2---1---0- \n\ + movdqa %%xmm3, %%xmm1 # XMM1: --3---2---1---0- \n\ + pslldq $1, %%xmm1 # XMM1: -3---2---1---0-- \n\ + movdqa %%xmm3, %%xmm2 # XMM2: --3---2---1---0- \n\ + pslldq $2, %%xmm2 # XMM2: 3---2---1---0--- \n\ + por %%xmm1, %%xmm3 # XMM3: -33--22--11--00- \n\ + por %%xmm2, %%xmm3 # XMM3: 333-222-111-000- \n\ + movntdq %%xmm3, -16("EDI","ECX",4) \n", + /* emms */ "emms") + : /* no outputs */ + : "S" (src[0]), "D" (dest[0]), "c" (width*height), + "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data) + : "eax"); + return 1; +} + 
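For readers tracing the fixed-point arithmetic in the Y8/GRAY8 routines above, the following is a minimal scalar sketch of the same full-range (0..255) versus BT.601 limited-range (16..235) luma scaling, assuming 2.14 fixed-point constants as implied by the `shrl $14` in the small-loop fallbacks; the helper names are illustrative and not part of aclib, and the exact rounding of the constants may differ from the tables aclib loads from gray_data:

#include <stdint.h>

/* Scalar sketch of the Y8 <-> GRAY8 scaling used by the SIMD code above:
 * multiply by 255/219 (or 219/255) in 2.14 fixed point, shift right by 14,
 * then saturate or bias as appropriate. */
static uint8_t y8_to_gray8(uint8_t y)
{
    int v = ((y - 16) * ((255 << 14) / 219)) >> 14;
    if (v < 0)   v = 0;      /* saturation, as the cmov/packuswb sequences do */
    if (v > 255) v = 255;
    return (uint8_t)v;
}

static uint8_t gray8_to_y8(uint8_t g)
{
    return (uint8_t)(((g * ((219 << 14) / 255)) >> 14) + 16);
}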
+/*************************************************************************/ + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization */ + +int ac_imgconvert_init_yuv_rgb(int accel) +{ + /******** Standard C implementations ********/ + + //---- YUV->RGB ----// + + if (!register_conversion(IMG_YUV420P, IMG_RGB24, yuv420p_rgb24) + || !register_conversion(IMG_YUV411P, IMG_RGB24, yuv411p_rgb24) + || !register_conversion(IMG_YUV422P, IMG_RGB24, yuv422p_rgb24) + || !register_conversion(IMG_YUV444P, IMG_RGB24, yuv444p_rgb24) + || !register_conversion(IMG_YUY2, IMG_RGB24, yuy2_rgb24) + || !register_conversion(IMG_UYVY, IMG_RGB24, uyvy_rgb24) + || !register_conversion(IMG_YVYU, IMG_RGB24, yvyu_rgb24) + || !register_conversion(IMG_Y8, IMG_RGB24, y8_rgb24) + + || !register_conversion(IMG_YUV420P, IMG_BGR24, yuv420p_bgr24) + || !register_conversion(IMG_YUV411P, IMG_BGR24, yuv411p_bgr24) + || !register_conversion(IMG_YUV422P, IMG_BGR24, yuv422p_bgr24) + || !register_conversion(IMG_YUV444P, IMG_BGR24, yuv444p_bgr24) + || !register_conversion(IMG_YUY2, IMG_BGR24, yuy2_bgr24) + || !register_conversion(IMG_UYVY, IMG_BGR24, uyvy_bgr24) + || !register_conversion(IMG_YVYU, IMG_BGR24, yvyu_bgr24) + || !register_conversion(IMG_Y8, IMG_BGR24, y8_rgb24) + + || !register_conversion(IMG_YUV420P, IMG_RGBA32, yuv420p_rgba32) + || !register_conversion(IMG_YUV411P, IMG_RGBA32, yuv411p_rgba32) + || !register_conversion(IMG_YUV422P, IMG_RGBA32, yuv422p_rgba32) + || !register_conversion(IMG_YUV444P, IMG_RGBA32, yuv444p_rgba32) + || !register_conversion(IMG_YUY2, IMG_RGBA32, yuy2_rgba32) + || !register_conversion(IMG_UYVY, IMG_RGBA32, uyvy_rgba32) + || !register_conversion(IMG_YVYU, IMG_RGBA32, yvyu_rgba32) + || !register_conversion(IMG_Y8, IMG_RGBA32, y8_rgba32) + + || !register_conversion(IMG_YUV420P, IMG_ABGR32, yuv420p_abgr32) + || !register_conversion(IMG_YUV411P, IMG_ABGR32, yuv411p_abgr32) + || !register_conversion(IMG_YUV422P, IMG_ABGR32, yuv422p_abgr32) + || !register_conversion(IMG_YUV444P, IMG_ABGR32, yuv444p_abgr32) + || !register_conversion(IMG_YUY2, IMG_ABGR32, yuy2_abgr32) + || !register_conversion(IMG_UYVY, IMG_ABGR32, uyvy_abgr32) + || !register_conversion(IMG_YVYU, IMG_ABGR32, yvyu_abgr32) + || !register_conversion(IMG_Y8, IMG_ABGR32, y8_argb32) + + || !register_conversion(IMG_YUV420P, IMG_ARGB32, yuv420p_argb32) + || !register_conversion(IMG_YUV411P, IMG_ARGB32, yuv411p_argb32) + || !register_conversion(IMG_YUV422P, IMG_ARGB32, yuv422p_argb32) + || !register_conversion(IMG_YUV444P, IMG_ARGB32, yuv444p_argb32) + || !register_conversion(IMG_YUY2, IMG_ARGB32, yuy2_argb32) + || !register_conversion(IMG_UYVY, IMG_ARGB32, uyvy_argb32) + || !register_conversion(IMG_YVYU, IMG_ARGB32, yvyu_argb32) + || !register_conversion(IMG_Y8, IMG_ARGB32, y8_argb32) + + || !register_conversion(IMG_YUV420P, IMG_BGRA32, yuv420p_bgra32) + || !register_conversion(IMG_YUV411P, IMG_BGRA32, yuv411p_bgra32) + || !register_conversion(IMG_YUV422P, IMG_BGRA32, yuv422p_bgra32) + || !register_conversion(IMG_YUV444P, IMG_BGRA32, yuv444p_bgra32) + || !register_conversion(IMG_YUY2, IMG_BGRA32, yuy2_bgra32) + || !register_conversion(IMG_UYVY, IMG_BGRA32, uyvy_bgra32) + || !register_conversion(IMG_YVYU, IMG_BGRA32, yvyu_bgra32) + || !register_conversion(IMG_Y8, IMG_BGRA32, y8_rgba32) + + //---- RGB->YUV ----// + + || !register_conversion(IMG_RGB24, IMG_YUV420P, rgb24_yuv420p) + || 
!register_conversion(IMG_RGB24, IMG_YUV411P, rgb24_yuv411p) + || !register_conversion(IMG_RGB24, IMG_YUV422P, rgb24_yuv422p) + || !register_conversion(IMG_RGB24, IMG_YUV444P, rgb24_yuv444p) + || !register_conversion(IMG_RGB24, IMG_YUY2, rgb24_yuy2) + || !register_conversion(IMG_RGB24, IMG_UYVY, rgb24_uyvy) + || !register_conversion(IMG_RGB24, IMG_YVYU, rgb24_yvyu) + || !register_conversion(IMG_RGB24, IMG_Y8, rgb24_y8) + + || !register_conversion(IMG_BGR24, IMG_YUV420P, bgr24_yuv420p) + || !register_conversion(IMG_BGR24, IMG_YUV411P, bgr24_yuv411p) + || !register_conversion(IMG_BGR24, IMG_YUV422P, bgr24_yuv422p) + || !register_conversion(IMG_BGR24, IMG_YUV444P, bgr24_yuv444p) + || !register_conversion(IMG_BGR24, IMG_YUY2, bgr24_yuy2) + || !register_conversion(IMG_BGR24, IMG_UYVY, bgr24_uyvy) + || !register_conversion(IMG_BGR24, IMG_YVYU, bgr24_yvyu) + || !register_conversion(IMG_BGR24, IMG_Y8, bgr24_y8) + + || !register_conversion(IMG_RGBA32, IMG_YUV420P, rgba32_yuv420p) + || !register_conversion(IMG_RGBA32, IMG_YUV411P, rgba32_yuv411p) + || !register_conversion(IMG_RGBA32, IMG_YUV422P, rgba32_yuv422p) + || !register_conversion(IMG_RGBA32, IMG_YUV444P, rgba32_yuv444p) + || !register_conversion(IMG_RGBA32, IMG_YUY2, rgba32_yuy2) + || !register_conversion(IMG_RGBA32, IMG_UYVY, rgba32_uyvy) + || !register_conversion(IMG_RGBA32, IMG_YVYU, rgba32_yvyu) + || !register_conversion(IMG_RGBA32, IMG_Y8, rgba32_y8) + + || !register_conversion(IMG_ABGR32, IMG_YUV420P, abgr32_yuv420p) + || !register_conversion(IMG_ABGR32, IMG_YUV411P, abgr32_yuv411p) + || !register_conversion(IMG_ABGR32, IMG_YUV422P, abgr32_yuv422p) + || !register_conversion(IMG_ABGR32, IMG_YUV444P, abgr32_yuv444p) + || !register_conversion(IMG_ABGR32, IMG_YUY2, abgr32_yuy2) + || !register_conversion(IMG_ABGR32, IMG_UYVY, abgr32_uyvy) + || !register_conversion(IMG_ABGR32, IMG_YVYU, abgr32_yvyu) + || !register_conversion(IMG_ABGR32, IMG_Y8, abgr32_y8) + + || !register_conversion(IMG_ARGB32, IMG_YUV420P, argb32_yuv420p) + || !register_conversion(IMG_ARGB32, IMG_YUV411P, argb32_yuv411p) + || !register_conversion(IMG_ARGB32, IMG_YUV422P, argb32_yuv422p) + || !register_conversion(IMG_ARGB32, IMG_YUV444P, argb32_yuv444p) + || !register_conversion(IMG_ARGB32, IMG_YUY2, argb32_yuy2) + || !register_conversion(IMG_ARGB32, IMG_UYVY, argb32_uyvy) + || !register_conversion(IMG_ARGB32, IMG_YVYU, argb32_yvyu) + || !register_conversion(IMG_ARGB32, IMG_Y8, argb32_y8) + + || !register_conversion(IMG_BGRA32, IMG_YUV420P, bgra32_yuv420p) + || !register_conversion(IMG_BGRA32, IMG_YUV411P, bgra32_yuv411p) + || !register_conversion(IMG_BGRA32, IMG_YUV422P, bgra32_yuv422p) + || !register_conversion(IMG_BGRA32, IMG_YUV444P, bgra32_yuv444p) + || !register_conversion(IMG_BGRA32, IMG_YUY2, bgra32_yuy2) + || !register_conversion(IMG_BGRA32, IMG_UYVY, bgra32_uyvy) + || !register_conversion(IMG_BGRA32, IMG_YVYU, bgra32_yvyu) + || !register_conversion(IMG_BGRA32, IMG_Y8, bgra32_y8) + + //---- Grayscale ----// + + || !register_conversion(IMG_YUV420P, IMG_GRAY8, yuvp_gray8) + || !register_conversion(IMG_YUV411P, IMG_GRAY8, yuvp_gray8) + || !register_conversion(IMG_YUV422P, IMG_GRAY8, yuvp_gray8) + || !register_conversion(IMG_YUV444P, IMG_GRAY8, yuvp_gray8) + || !register_conversion(IMG_YUY2, IMG_GRAY8, yuy2_gray8) + || !register_conversion(IMG_UYVY, IMG_GRAY8, uyvy_gray8) + || !register_conversion(IMG_YVYU, IMG_GRAY8, yuy2_gray8) + || !register_conversion(IMG_Y8, IMG_GRAY8, yuvp_gray8) + + || !register_conversion(IMG_GRAY8, IMG_YUV420P, gray8_yuv420p) + || 
!register_conversion(IMG_GRAY8, IMG_YUV411P, gray8_yuv411p) + || !register_conversion(IMG_GRAY8, IMG_YUV422P, gray8_yuv422p) + || !register_conversion(IMG_GRAY8, IMG_YUV444P, gray8_yuv444p) + || !register_conversion(IMG_GRAY8, IMG_YUY2, gray8_yuy2) + || !register_conversion(IMG_GRAY8, IMG_UYVY, gray8_uyvy) + || !register_conversion(IMG_GRAY8, IMG_YVYU, gray8_yuy2) + || !register_conversion(IMG_GRAY8, IMG_Y8, gray8_y8) + ) { + return 0; + } + + /******** MMX implementations ********/ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + if (accel & AC_MMX) { + + //---- YUV->RGB ----// + + if (!register_conversion(IMG_YUV420P, IMG_RGB24, yuv420p_rgb24_mmx) + || !register_conversion(IMG_YUV422P, IMG_RGB24, yuv422p_rgb24_mmx) + || !register_conversion(IMG_YUV420P, IMG_BGR24, yuv420p_bgr24_mmx) + || !register_conversion(IMG_YUV422P, IMG_BGR24, yuv422p_bgr24_mmx) + || !register_conversion(IMG_YUV420P, IMG_RGBA32, yuv420p_rgba32_mmx) + || !register_conversion(IMG_YUV422P, IMG_RGBA32, yuv422p_rgba32_mmx) + || !register_conversion(IMG_YUV420P, IMG_ABGR32, yuv420p_abgr32_mmx) + || !register_conversion(IMG_YUV422P, IMG_ABGR32, yuv422p_abgr32_mmx) + || !register_conversion(IMG_YUV420P, IMG_ARGB32, yuv420p_argb32_mmx) + || !register_conversion(IMG_YUV422P, IMG_ARGB32, yuv422p_argb32_mmx) + || !register_conversion(IMG_YUV420P, IMG_BGRA32, yuv420p_bgra32_mmx) + || !register_conversion(IMG_YUV422P, IMG_BGRA32, yuv422p_bgra32_mmx) + ) { + return 0; + } + } +#endif + + /******** SSE2 implementations ********/ + +#if defined(HAVE_ASM_SSE2) + if (HAS_ACCEL(accel, AC_SSE2)) { + + //---- YUV->RGB ----// + + if (!register_conversion(IMG_YUV420P, IMG_RGB24, yuv420p_rgb24_sse2) + || !register_conversion(IMG_YUV411P, IMG_RGB24, yuv411p_rgb24_sse2) + || !register_conversion(IMG_YUV422P, IMG_RGB24, yuv422p_rgb24_sse2) + || !register_conversion(IMG_YUV444P, IMG_RGB24, yuv444p_rgb24_sse2) + || !register_conversion(IMG_YUY2, IMG_RGB24, yuy2_rgb24_sse2) + || !register_conversion(IMG_UYVY, IMG_RGB24, uyvy_rgb24_sse2) + || !register_conversion(IMG_YVYU, IMG_RGB24, yvyu_rgb24_sse2) + || !register_conversion(IMG_Y8, IMG_RGB24, y8_rgb24_sse2) + + || !register_conversion(IMG_YUV420P, IMG_BGR24, yuv420p_bgr24_sse2) + || !register_conversion(IMG_YUV411P, IMG_BGR24, yuv411p_bgr24_sse2) + || !register_conversion(IMG_YUV422P, IMG_BGR24, yuv422p_bgr24_sse2) + || !register_conversion(IMG_YUV444P, IMG_BGR24, yuv444p_bgr24_sse2) + || !register_conversion(IMG_YUY2, IMG_BGR24, yuy2_bgr24_sse2) + || !register_conversion(IMG_UYVY, IMG_BGR24, uyvy_bgr24_sse2) + || !register_conversion(IMG_YVYU, IMG_BGR24, yvyu_bgr24_sse2) + || !register_conversion(IMG_Y8, IMG_BGR24, y8_rgb24_sse2) + + || !register_conversion(IMG_YUV420P, IMG_RGBA32, yuv420p_rgba32_sse2) + || !register_conversion(IMG_YUV411P, IMG_RGBA32, yuv411p_rgba32_sse2) + || !register_conversion(IMG_YUV422P, IMG_RGBA32, yuv422p_rgba32_sse2) + || !register_conversion(IMG_YUV444P, IMG_RGBA32, yuv444p_rgba32_sse2) + || !register_conversion(IMG_YUY2, IMG_RGBA32, yuy2_rgba32_sse2) + || !register_conversion(IMG_UYVY, IMG_RGBA32, uyvy_rgba32_sse2) + || !register_conversion(IMG_YVYU, IMG_RGBA32, yvyu_rgba32_sse2) + || !register_conversion(IMG_Y8, IMG_RGBA32, y8_rgba32_sse2) + + || !register_conversion(IMG_YUV420P, IMG_ABGR32, yuv420p_abgr32_sse2) + || !register_conversion(IMG_YUV411P, IMG_ABGR32, yuv411p_abgr32_sse2) + || !register_conversion(IMG_YUV422P, IMG_ABGR32, yuv422p_abgr32_sse2) + || !register_conversion(IMG_YUV444P, IMG_ABGR32, yuv444p_abgr32_sse2) + || !register_conversion(IMG_YUY2, 
IMG_ABGR32, yuy2_abgr32_sse2) + || !register_conversion(IMG_UYVY, IMG_ABGR32, uyvy_abgr32_sse2) + || !register_conversion(IMG_YVYU, IMG_ABGR32, yvyu_abgr32_sse2) + || !register_conversion(IMG_Y8, IMG_ABGR32, y8_argb32_sse2) + + || !register_conversion(IMG_YUV420P, IMG_ARGB32, yuv420p_argb32_sse2) + || !register_conversion(IMG_YUV411P, IMG_ARGB32, yuv411p_argb32_sse2) + || !register_conversion(IMG_YUV422P, IMG_ARGB32, yuv422p_argb32_sse2) + || !register_conversion(IMG_YUV444P, IMG_ARGB32, yuv444p_argb32_sse2) + || !register_conversion(IMG_YUY2, IMG_ARGB32, yuy2_argb32_sse2) + || !register_conversion(IMG_UYVY, IMG_ARGB32, uyvy_argb32_sse2) + || !register_conversion(IMG_YVYU, IMG_ARGB32, yvyu_argb32_sse2) + || !register_conversion(IMG_Y8, IMG_ARGB32, y8_argb32_sse2) + + || !register_conversion(IMG_YUV420P, IMG_BGRA32, yuv420p_bgra32_sse2) + || !register_conversion(IMG_YUV411P, IMG_BGRA32, yuv411p_bgra32_sse2) + || !register_conversion(IMG_YUV422P, IMG_BGRA32, yuv422p_bgra32_sse2) + || !register_conversion(IMG_YUV444P, IMG_BGRA32, yuv444p_bgra32_sse2) + || !register_conversion(IMG_YUY2, IMG_BGRA32, yuy2_bgra32_sse2) + || !register_conversion(IMG_UYVY, IMG_BGRA32, uyvy_bgra32_sse2) + || !register_conversion(IMG_YVYU, IMG_BGRA32, yvyu_bgra32_sse2) + || !register_conversion(IMG_Y8, IMG_BGRA32, y8_rgba32_sse2) + + //---- RGB->YUV ----// + + || !register_conversion(IMG_RGB24, IMG_YUV420P, rgb24_yuv420p_sse2) + || !register_conversion(IMG_RGB24, IMG_YUV411P, rgb24_yuv411p_sse2) + || !register_conversion(IMG_RGB24, IMG_YUV422P, rgb24_yuv422p_sse2) + || !register_conversion(IMG_RGB24, IMG_YUV444P, rgb24_yuv444p_sse2) + || !register_conversion(IMG_RGB24, IMG_YUY2, rgb24_yuy2_sse2) + || !register_conversion(IMG_RGB24, IMG_UYVY, rgb24_uyvy_sse2) + || !register_conversion(IMG_RGB24, IMG_YVYU, rgb24_yvyu_sse2) + || !register_conversion(IMG_RGB24, IMG_Y8, rgb24_y8_sse2) + + || !register_conversion(IMG_BGR24, IMG_YUV420P, bgr24_yuv420p_sse2) + || !register_conversion(IMG_BGR24, IMG_YUV411P, bgr24_yuv411p_sse2) + || !register_conversion(IMG_BGR24, IMG_YUV422P, bgr24_yuv422p_sse2) + || !register_conversion(IMG_BGR24, IMG_YUV444P, bgr24_yuv444p_sse2) + || !register_conversion(IMG_BGR24, IMG_YUY2, bgr24_yuy2_sse2) + || !register_conversion(IMG_BGR24, IMG_UYVY, bgr24_uyvy_sse2) + || !register_conversion(IMG_BGR24, IMG_YVYU, bgr24_yvyu_sse2) + || !register_conversion(IMG_BGR24, IMG_Y8, bgr24_y8_sse2) + + || !register_conversion(IMG_RGBA32, IMG_YUV420P, rgba32_yuv420p_sse2) + || !register_conversion(IMG_RGBA32, IMG_YUV411P, rgba32_yuv411p_sse2) + || !register_conversion(IMG_RGBA32, IMG_YUV422P, rgba32_yuv422p_sse2) + || !register_conversion(IMG_RGBA32, IMG_YUV444P, rgba32_yuv444p_sse2) + || !register_conversion(IMG_RGBA32, IMG_YUY2, rgba32_yuy2_sse2) + || !register_conversion(IMG_RGBA32, IMG_UYVY, rgba32_uyvy_sse2) + || !register_conversion(IMG_RGBA32, IMG_YVYU, rgba32_yvyu_sse2) + || !register_conversion(IMG_RGBA32, IMG_Y8, rgba32_y8_sse2) + + || !register_conversion(IMG_ABGR32, IMG_YUV420P, abgr32_yuv420p_sse2) + || !register_conversion(IMG_ABGR32, IMG_YUV411P, abgr32_yuv411p_sse2) + || !register_conversion(IMG_ABGR32, IMG_YUV422P, abgr32_yuv422p_sse2) + || !register_conversion(IMG_ABGR32, IMG_YUV444P, abgr32_yuv444p_sse2) + || !register_conversion(IMG_ABGR32, IMG_YUY2, abgr32_yuy2_sse2) + || !register_conversion(IMG_ABGR32, IMG_UYVY, abgr32_uyvy_sse2) + || !register_conversion(IMG_ABGR32, IMG_YVYU, abgr32_yvyu_sse2) + || !register_conversion(IMG_ABGR32, IMG_Y8, abgr32_y8_sse2) + + || 
!register_conversion(IMG_ARGB32, IMG_YUV420P, argb32_yuv420p_sse2) + || !register_conversion(IMG_ARGB32, IMG_YUV411P, argb32_yuv411p_sse2) + || !register_conversion(IMG_ARGB32, IMG_YUV422P, argb32_yuv422p_sse2) + || !register_conversion(IMG_ARGB32, IMG_YUV444P, argb32_yuv444p_sse2) + || !register_conversion(IMG_ARGB32, IMG_YUY2, argb32_yuy2_sse2) + || !register_conversion(IMG_ARGB32, IMG_UYVY, argb32_uyvy_sse2) + || !register_conversion(IMG_ARGB32, IMG_YVYU, argb32_yvyu_sse2) + || !register_conversion(IMG_ARGB32, IMG_Y8, argb32_y8_sse2) + + || !register_conversion(IMG_BGRA32, IMG_YUV420P, bgra32_yuv420p_sse2) + || !register_conversion(IMG_BGRA32, IMG_YUV411P, bgra32_yuv411p_sse2) + || !register_conversion(IMG_BGRA32, IMG_YUV422P, bgra32_yuv422p_sse2) + || !register_conversion(IMG_BGRA32, IMG_YUV444P, bgra32_yuv444p_sse2) + || !register_conversion(IMG_BGRA32, IMG_YUY2, bgra32_yuy2_sse2) + || !register_conversion(IMG_BGRA32, IMG_UYVY, bgra32_uyvy_sse2) + || !register_conversion(IMG_BGRA32, IMG_YVYU, bgra32_yvyu_sse2) + || !register_conversion(IMG_BGRA32, IMG_Y8, bgra32_y8_sse2) + + //---- Grayscale ----// + + || !register_conversion(IMG_GRAY8, IMG_YUY2, gray8_yuy2_sse2) + || !register_conversion(IMG_GRAY8, IMG_UYVY, gray8_uyvy_sse2) + || !register_conversion(IMG_GRAY8, IMG_YVYU, gray8_yuy2_sse2) + || !register_conversion(IMG_GRAY8, IMG_Y8, gray8_y8_sse2) + ) { + return 0; + } + } + + /* YUV->GRAY8 routines use CMOVcc */ + if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2)) { + if (!register_conversion(IMG_YUV420P, IMG_GRAY8, yuvp_gray8_sse2) + || !register_conversion(IMG_YUV411P, IMG_GRAY8, yuvp_gray8_sse2) + || !register_conversion(IMG_YUV422P, IMG_GRAY8, yuvp_gray8_sse2) + || !register_conversion(IMG_YUV444P, IMG_GRAY8, yuvp_gray8_sse2) + || !register_conversion(IMG_YUY2, IMG_GRAY8, yuy2_gray8_sse2) + || !register_conversion(IMG_UYVY, IMG_GRAY8, uyvy_gray8_sse2) + || !register_conversion(IMG_YVYU, IMG_GRAY8, yuy2_gray8_sse2) + || !register_conversion(IMG_Y8, IMG_GRAY8, yuvp_gray8_sse2) + ) { + return 0; + } + } +#endif + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/imgconvert.c b/debian/transcode/transcode-1.1.7/aclib/imgconvert.c new file mode 100644 index 00000000..cc502977 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/imgconvert.c @@ -0,0 +1,119 @@ +/* + * imgconvert.c - image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "imgconvert.h" +#include "img_internal.h" + +#include <stdio.h> +#include <stdlib.h> + +/*************************************************************************/ + +static struct { + ImageFormat srcfmt, destfmt; + ConversionFunc func; +} *conversions; +static int n_conversions = 0; + +/*************************************************************************/ +/*************************************************************************/ + +/* Image conversion routine. 
src and dest are arrays of pointers to planes + * (for packed formats with only one plane, just use `&data'); srcfmt and + * destfmt specify the source and destination image formats (IMG_*). + * width and height are in pixels. Returns 1 on success, 0 on failure. */ + +int ac_imgconvert(uint8_t **src, ImageFormat srcfmt, + uint8_t **dest, ImageFormat destfmt, + int width, int height) +{ + int i; + + /* Hack to handle YV12 easily, because conversion routines don't get + * format tags */ + uint8_t *newsrc[3], *newdest[3]; + if (srcfmt == IMG_YV12) { + srcfmt = IMG_YUV420P; + newsrc[0] = src[0]; + newsrc[1] = src[2]; + newsrc[2] = src[1]; + src = newsrc; + } + if (destfmt == IMG_YV12) { + destfmt = IMG_YUV420P; + newdest[0] = dest[0]; + newdest[1] = dest[2]; + newdest[2] = dest[1]; + dest = newdest; + } + + for (i = 0; i < n_conversions; i++) { + if (conversions[i].srcfmt==srcfmt && conversions[i].destfmt==destfmt) + return (*conversions[i].func)(src, dest, width, height); + } + + return 0; +} + +/*************************************************************************/ +/*************************************************************************/ + +/* Internal use only! */ + +int ac_imgconvert_init(int accel) +{ + if (!ac_imgconvert_init_yuv_planar(accel) + || !ac_imgconvert_init_yuv_packed(accel) + || !ac_imgconvert_init_yuv_mixed(accel) + || !ac_imgconvert_init_yuv_rgb(accel) + || !ac_imgconvert_init_rgb_packed(accel) + ) { + fprintf(stderr, "ac_imgconvert_init() failed"); + return 0; + } + return 1; +} + +int register_conversion(ImageFormat srcfmt, ImageFormat destfmt, + ConversionFunc function) +{ + int i; + + for (i = 0; i < n_conversions; i++) { + if (conversions[i].srcfmt==srcfmt && conversions[i].destfmt==destfmt) { + conversions[i].func = function; + return 1; + } + } + + if (!(conversions = realloc(conversions, + (n_conversions+1) * sizeof(*conversions)))) { + fprintf(stderr, "register_conversion(): out of memory\n"); + return 0; + } + conversions[n_conversions].srcfmt = srcfmt; + conversions[n_conversions].destfmt = destfmt; + conversions[n_conversions].func = function; + n_conversions++; + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/imgconvert.h b/debian/transcode/transcode-1.1.7/aclib/imgconvert.h new file mode 100644 index 00000000..c02d5a01 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/imgconvert.h @@ -0,0 +1,105 @@ +/* + * imgconvert.h - defines for image format conversion routines + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. 
+ */ + +#ifndef ACLIB_IMGCONVERT_H +#define ACLIB_IMGCONVERT_H + +/*************************************************************************/ + +/* Image format defines */ +typedef enum { + IMG_UNKNOWN = 0, /* Unknown/unset (dummy value, guaranteed to be 0) */ + /* YUV formats */ + IMG_YUV_BASE = 0x1000, + IMG_YUV420P, /* YUV planar, 1 U/V per 2x2 Y pixels */ + IMG_YV12, /* YUV420P with U and V reversed */ + IMG_YUV411P, /* YUV planar, 1 U/V per 4x1 Y pixels */ + IMG_YUV422P, /* YUV planar, 1 U/V per 2x1 Y pixels */ + IMG_YUV444P, /* YUV planar, 1 U/V per 1x1 Y pixels */ + IMG_YUY2, /* YUV packed, 1 U/V per 2x1 Y pixels, Y:U:Y:V */ + IMG_UYVY, /* YUV packed, 1 U/V per 2x1 Y pixels, U:Y:V:Y */ + IMG_YVYU, /* YUV packed, 1 U/V per 2x1 Y pixels, Y:V:Y:U */ + IMG_Y8, /* Y-only 8-bit data */ + IMG_YUV_LAST, + /* RGB formats */ + IMG_RGB_BASE = 0x2000, + IMG_RGB24, /* RGB packed, 8 bits per component, R:G:B */ + IMG_BGR24, /* RGB packed, 8 bits per component, B:G:R */ + IMG_RGBA32, /* RGB+alpha packed, 8 bits per component, R:G:B:A */ + IMG_ABGR32, /* RGB+alpha packed, 8 bits per component, A:B:G:R */ + IMG_ARGB32, /* RGB+alpha packed, 8 bits per component, A:R:G:B */ + IMG_BGRA32, /* RGB+alpha packed, 8 bits per component, B:G:R:A */ + IMG_GRAY8, /* Grayscale 8-bit data */ + IMG_RGB_LAST, +} ImageFormat; + +/* Alias */ +#define IMG_NONE IMG_UNKNOWN + +/* Default YUV and RGB formats */ +#define IMG_YUV_DEFAULT IMG_YUV420P +#define IMG_RGB_DEFAULT IMG_RGB24 + +/* Is the given image format a YUV/RGB one? */ +#define IS_YUV_FORMAT(fmt) ((fmt) > IMG_YUV_BASE && (fmt) < IMG_YUV_LAST) +#define IS_RGB_FORMAT(fmt) ((fmt) > IMG_RGB_BASE && (fmt) < IMG_RGB_LAST) + +/* U/V plane size for YUV planar formats (Y plane size is always w*h) */ +#define UV_PLANE_SIZE(fmt,w,h) \ + ((fmt)==IMG_YUV420P ? ((w)/2)*((h)/2) : \ + (fmt)==IMG_YV12 ? ((w)/2)*((h)/2) : \ + (fmt)==IMG_YUV411P ? ((w)/4)* (h) : \ + (fmt)==IMG_YUV422P ? ((w)/2)* (h) : \ + (fmt)==IMG_YUV444P ? (w) * (h) : 0) + +/* Macro to initialize an array of planes from a buffer */ +#define YUV_INIT_PLANES(planes,buffer,fmt,w,h) \ + ((planes)[0] = (buffer), \ + (planes)[1] = (planes)[0] + (w)*(h), \ + (planes)[2] = (planes)[1] + UV_PLANE_SIZE((fmt),(w),(h))) + +#if 0 +/* Structure describing an image. FIXME: not currently used--this should + * eventually replace the (planes,format) pairs passed to ac_imgconvert. */ +typedef struct { + ImageFormat format; /* Format of image data */ + int width, height; /* Size of image */ + uint8_t *planes[4]; /* Data planes (use planes[0] for packed data) */ + int stride[4]; /* Length of one row in each plane, incl. padding */ +} Image; +#endif + +/*************************************************************************/ + +/* Initialization routine. Returns 1 on success, 0 on failure. */ +extern int ac_imgconvert_init(int accel); + +/* Conversion routine. Returns 1 on success, 0 on failure. */ +extern int ac_imgconvert(uint8_t **src, /* Array of source planes */ + ImageFormat srcfmt, /* Source image format */ + uint8_t **dest, /* Array of dest planes */ + ImageFormat destfmt, /* Destination image format */ + int width, /* Image width in pixels */ + int height /* Image height in pixels */ + ); + +/*************************************************************************/ + +#endif /* ACLIB_IMGCONVERT_H */ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . 
*)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/memcpy.c b/debian/transcode/transcode-1.1.7/aclib/memcpy.c new file mode 100644 index 00000000..05cdf41c --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/memcpy.c @@ -0,0 +1,543 @@ +/* + * memcpy.c - optimized memcpy() routines for aclib + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. + */ + +#include "ac.h" +#include "ac_internal.h" +#include <string.h> + +/* Use memmove because memcpy isn't guaranteed to be ascending */ +static void *(*memcpy_ptr)(void *, const void *, size_t) = memmove; + +/*************************************************************************/ + +/* External interface */ + +void *ac_memcpy(void *dest, const void *src, size_t size) +{ + return (*memcpy_ptr)(dest, src, size); +} + +/*************************************************************************/ +/*************************************************************************/ + +/* Note the check for ARCH_X86 here: this is to prevent compilation of this + * code on x86_64, since all x86_64 processors support SSE2, and because + * this code is not set up to use the 64-bit registers for addressing on + * x86_64. */ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + +/* MMX-optimized routine, intended for PMMX/PII processors. + * Nonstandard instructions used: + * (CPUID.MMX) MOVQ + */ + +static void *memcpy_mmx(void *dest, const void *src, size_t bytes) +{ + asm("\ +PENTIUM_LINE_SIZE = 32 # PMMX/PII cache line size \n\ +PENTIUM_CACHE_SIZE = 8192 # PMMX/PII total cache size \n\ +# Use only half because writes may touch the cache too (PII) \n\ +PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE) \n\ + \n\ + push %%ebx # Save PIC register \n\ + push %%edi # Save destination for return value \n\ + cld # MOVS* should ascend \n\ + \n\ + mov $64, %%ebx # Constant \n\ + \n\ + cmp %%ebx, %%ecx \n\ + jb mmx.memcpy_last # Just use movs if <64 bytes \n\ + \n\ + # First align destination address to a multiple of 8 bytes \n\ + mov $8, %%eax # EAX <- (8-dest) & 7 \n\ + sub %%edi, %%eax \n\ + and $7, %%eax # ... which is the number of bytes to copy\n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS // Because "lea 0f" requires a textrel +" xchg %%eax, %%ecx \n\ + mov %%ecx, %%edx \n\ + repz movsb \n\ + mov %%eax, %%ecx \n\ + mov %%edx, %%eax \n" +#else +" lea 0f, %%edx # Use a computed jump--faster than a loop\n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n" +#endif +"0: sub %%eax, %%ecx # Update count \n\ + \n\ + # Now copy data in blocks \n\ +0: mov %%ecx, %%edx # EDX <- ECX >> 6 (cache lines to copy) \n\ + shr $6, %%edx \n\ + jz mmx.memcpy_last # <64 bytes left? 
Skip to end \n\ + cmp $PENTIUM_CACHE_BLOCK/64, %%edx \n\ + jb 1f # Limit size of block \n\ + mov $PENTIUM_CACHE_BLOCK/64, %%edx \n\ +1: mov %%edx, %%eax # EAX <- EDX << 6 (bytes to copy) \n\ + shl $6, %%eax \n\ + sub %%eax, %%ecx # Update remaining count \n\ + add %%eax, %%esi # Point to end of region to be block-copied\n\ +2: test %%eax, -32(%%esi) # Touch each cache line in reverse order\n\ + test %%eax, -64(%%esi) \n\ + sub %%ebx, %%esi # Update pointer \n\ + sub %%ebx, %%eax # And loop \n\ + jnz 2b \n\ + # Note that ESI now points to the beginning of the block \n\ +3: movq (%%esi), %%mm0 # Do the actual copy, 64 bytes at a time\n\ + movq 8(%%esi), %%mm1 \n\ + movq 16(%%esi), %%mm2 \n\ + movq 24(%%esi), %%mm3 \n\ + movq 32(%%esi), %%mm4 \n\ + movq 40(%%esi), %%mm5 \n\ + movq 48(%%esi), %%mm6 \n\ + movq 56(%%esi), %%mm7 \n\ + movq %%mm0, (%%edi) \n\ + movq %%mm1, 8(%%edi) \n\ + movq %%mm2, 16(%%edi) \n\ + movq %%mm3, 24(%%edi) \n\ + movq %%mm4, 32(%%edi) \n\ + movq %%mm5, 40(%%edi) \n\ + movq %%mm6, 48(%%edi) \n\ + movq %%mm7, 56(%%edi) \n\ + add %%ebx, %%esi # Update pointers \n\ + add %%ebx, %%edi \n\ + dec %%edx # And loop \n\ + jnz 3b \n\ + jmp 0b \n\ + \n\ +mmx.memcpy_last: \n\ + # Copy last <64 bytes, using the computed jump trick \n\ + mov %%ecx, %%eax # EAX <- ECX>>2 \n\ + shr $2, %%eax \n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS +" xchg %%eax, %%ecx \n\ + repz movsd \n\ + mov %%eax, %%ecx \n" +#else +" lea 0f, %%edx \n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-15 MOVSD's \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n" +#endif +"0: and $3, %%ecx # ECX <- ECX & 3 \n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS +" repz movsb \n" +#else +" lea 0f, %%edx \n\ + sub %%ecx, %%edx \n\ + jmp *%%edx # Execute 0-3 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n" +#endif +"0: \n\ + # All done! \n\ + emms # Clean up MMX state \n\ + pop %%edi # Restore destination (return value) \n\ + pop %%ebx # Restore PIC register \n\ + " : /* no outputs */ + : "D" (dest), "S" (src), "c" (bytes) + : "%eax", "%edx" + ); + return dest; +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE) && defined(ARCH_X86) + +/* SSE-optimized routine. Backported from AMD64 routine below. + * Nonstandard instructions used: + * (CPUID.CMOVE) CMOVA + * (CPUID.MMX) MOVQ + * (CPUID.SSE) MOVNTQ + */ + +static void *memcpy_sse(void *dest, const void *src, size_t bytes) +{ + asm("\ + push %%ebx # Save PIC register \n\ + push %%edi # Save destination for return value \n\ + cld # MOVS* should ascend \n\ + \n\ + cmp $64, %%ecx # Skip block copy for small blocks \n\ + jb sse.memcpy_last \n\ + \n\ + mov $128, %%ebx # Constant used later \n\ + \n\ + # First align destination address to a multiple of 8 bytes \n\ + mov $8, %%eax # EAX <- (8-dest) & 7 \n\ + sub %%edi, %%eax \n\ + and $7, %%eax # ... which is the number of bytes to copy\n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS +" xchg %%eax, %%ecx \n\ + mov %%ecx, %%edx \n\ + repz movsb \n\ + mov %%eax, %%ecx \n\ + mov %%edx, %%eax \n" +#else +" lea 0f, %%edx # Use a computed jump--faster than a loop\n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n" +#endif +"0: sub %%eax, %%ecx # Update count \n\ + \n\ + cmp $0x10040, %%ecx # Is this a large block? 
(0x10040 is an \n\ + # arbitrary value where prefetching and \n\ + # write combining seem to start becoming\n\ + # faster) \n\ + jae sse.memcpy_bp # Yup, use prefetch copy \n\ + \n\ +sse.memcpy_small: # Small block copy routine--no prefetch \n" +#if 0 +" mov %%ecx, %%edx # EDX <- bytes to copy / 8 \n\ + shr $3, %%edx \n\ + mov %%edx, %%eax # Leave remainder in ECX for later \n\ + shl $3, %%eax \n\ + sub %%eax, %%ecx \n\ + .balign 16 \n\ +0: movq (%%esi), %%mm0 # Copy 8 bytes of data \n\ + movq %%mm0, (%%edi) \n\ + add $8, %%esi # Update pointers \n\ + add $8, %%edi \n\ + dec %%edx # And loop \n\ + jg 0b \n\ + jmp sse.memcpy_last # Copy any remaining bytes \n\ + \n\ + nop # Align loops below \n" +#else +" # It appears that a simple rep movs is faster than cleverness \n\ + # with movq... \n\ + mov %%ecx, %%edx # EDX <- ECX & 3 \n\ + and $3, %%edx \n\ + shr $2, %%ecx # ECX <- ECX >> 2 \n\ + rep movsl # Copy away! \n\ + mov %%edx, %%ecx # Take care of last 0-3 bytes \n\ + rep movsb \n\ + jmp sse.memcpy_end # And exit \n\ + \n\ + .balign 16 \n\ + nop \n\ + nop \n" +#endif +"sse.memcpy_bp: # Block prefetch copy routine \n\ +0: mov %%ecx, %%edx # EDX: temp counter \n\ + shr $6, %%edx # Divide by cache line size (64 bytes) \n\ + cmp %%ebx, %%edx # ... and cap at 128 (8192 bytes) \n\ + cmova %%ebx, %%edx \n\ + shl $3, %%edx # EDX <- cache lines to copy * 8 \n\ + mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\ + # (also used as memory offset) \n\ +1: test %%eax, -64(%%esi,%%eax,8) # Preload cache lines in pairs \n\ + test %%eax, -128(%%esi,%%eax,8) # (going backwards) \n\ + # (note that test %%eax,... seems to be faster than prefetchnta \n\ + # on x86) \n\ + sub $16, %%eax # And loop \n\ + jg 1b \n\ + \n\ + # Then copy--forward, which seems to be faster than reverse for \n\ + # certain alignments \n\ + xor %%eax, %%eax \n\ +2: movq (%%esi,%%eax,8), %%mm0 # Copy 8 bytes and loop \n\ + movntq %%mm0, (%%edi,%%eax,8) \n\ + inc %%eax \n\ + cmp %%edx, %%eax \n\ + jb 2b \n\ + \n\ + # Finally, update pointers and count, and loop \n\ + shl $3, %%edx # EDX <- bytes copied \n\ + add %%edx, %%esi \n\ + add %%edx, %%edi \n\ + sub %%edx, %%ecx \n\ + cmp $64, %%ecx # At least one cache line left? \n\ + jae 0b # Yup, loop \n\ + \n\ +sse.memcpy_last: \n\ + # Copy last <64 bytes, using the computed jump trick \n\ + mov %%ecx, %%eax # EAX <- ECX>>2 \n\ + shr $2, %%eax \n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS +" xchg %%eax, %%ecx \n\ + repz movsd \n\ + mov %%eax, %%ecx \n" +#else +" lea 0f, %%edx \n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-15 MOVSD's \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n" +#endif +"0: and $3, %%ecx # ECX <- ECX & 3 \n" +#ifdef ACLIB_DISABLE_X86_TEXTRELS +" repz movsb \n" +#else +" lea sse.memcpy_end, %%edx \n\ + sub %%ecx, %%edx \n\ + jmp *%%edx # Execute 0-3 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n" +#endif +" \n\ +sse.memcpy_end: \n\ + # All done! \n\ + emms # Clean up after MMX instructions \n\ + sfence # Flush the write buffer \n\ + pop %%edi # Restore destination (return value) \n\ + pop %%ebx # Restore PIC register \n\ + " : /* no outputs */ + : "D" (dest), "S" (src), "c" (bytes) + : "%eax", "%edx" + ); + return dest; +} + +#endif /* HAVE_ASM_SSE && ARCH_X86 */ + +/*************************************************************************/ + +#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64) + +/* AMD64-optimized routine, using SSE2. 
Derived from AMD64 optimization + * guide section 5.13: Appropriate Memory Copying Routines. + * Nonstandard instructions used: + * (CPUID.CMOVE) CMOVA + * (CPUID.SSE2) MOVDQA, MOVDQU, MOVNTDQ + * + * Note that this routine will also run more or less as-is (modulo register + * names and label(%%rip) references) on x86 CPUs, but tests have shown the + * SSE1 version above to be faster. + */ + +/* The block copying code--macroized because we use two versions of it + * depending on whether the source is 16-byte-aligned or not. Pass either + * movdqa or movdqu (unquoted) for the parameter. */ +#define AMD64_BLOCK_MEMCPY(movdq) \ +" # First prefetch (note that if we end on an odd number of cache \n\ + # lines, we skip prefetching the last one--faster that way than \n\ + # prefetching line by line or treating it as a special case) \n\ +0: mov %%ecx, %%edx # EDX: temp counter (always <32 bits) \n\ + shr $6, %%edx # Divide by cache line size (64 bytes) \n\ + cmp %%ebx, %%edx # ... and cap at 128 (8192 bytes) \n\ + cmova %%ebx, %%edx \n\ + shl $3, %%edx # EDX <- cache lines to copy * 8 \n\ + mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\ + # (also used as memory offset) \n\ +1: prefetchnta -64(%%rsi,%%rax,8) # Preload cache lines in pairs \n\ + prefetchnta -128(%%rsi,%%rax,8) # (going backwards) \n\ + sub $16, %%eax # And loop \n\ + jg 1b \n\ + \n\ + # Then copy--forward, which seems to be faster than reverse for \n\ + # certain alignments \n\ + xor %%eax, %%eax \n\ +2: " #movdq " (%%rsi,%%rax,8), %%xmm0 # Copy 16 bytes and loop \n\ + movntdq %%xmm0, (%%rdi,%%rax,8) \n\ + add $2, %%eax \n\ + cmp %%edx, %%eax \n\ + jb 2b \n\ + \n\ + # Finally, update pointers and count, and loop \n\ + shl $3, %%edx # EDX <- bytes copied \n\ + add %%rdx, %%rsi \n\ + add %%rdx, %%rdi \n\ + sub %%rdx, %%rcx \n\ + cmp $64, %%rcx # At least one cache line left? \n\ + jae 0b # Yup, loop \n" + +static void *memcpy_amd64(void *dest, const void *src, size_t bytes) +{ + asm("\ + push %%rdi # Save destination for return value \n\ + cld # MOVS* should ascend \n\ + \n\ + cmp $64, %%rcx # Skip block copy for small blocks \n\ + jb amd64.memcpy_last \n\ + \n\ + mov $128, %%ebx # Constant used later \n\ + \n\ + # First align destination address to a multiple of 16 bytes \n\ + mov $8, %%eax # EAX <- (8-dest) & 7 \n\ + sub %%edi, %%eax # (we don't care about the top 32 bits) \n\ + and $7, %%eax # ... which is the number of bytes to copy\n\ + lea 0f(%%rip), %%rdx # Use a computed jump--faster than a loop\n\ + sub %%rax, %%rdx \n\ + jmp *%%rdx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ +0: sub %%rax, %%rcx # Update count \n\ + test $8, %%edi # Is destination not 16-byte aligned? \n\ + je 1f \n\ + movsq # Then move 8 bytes to align it \n\ + sub $8, %%rcx \n\ + \n\ +1: cmp $0x38000, %%rcx # Is this a large block? (0x38000 is an \n\ + # arbitrary value where prefetching and \n\ + # write combining seem to start becoming\n\ + # faster) \n\ + jb amd64.memcpy_small # Nope, use small copy (no prefetch/WC) \n\ + test $15, %%esi # Is source also 16-byte aligned? 
\n\ + # (use ESI to save a REX prefix byte) \n\ + jnz amd64.memcpy_normal_bp # Nope, use slow copy \n\ + jmp amd64.memcpy_fast_bp # Yup, use fast copy \n\ + \n\ +amd64.memcpy_small: # Small block copy routine--no prefetch \n\ + mov %%ecx, %%edx # EDX <- bytes to copy / 16 \n\ + shr $4, %%edx # (count known to fit in 32 bits) \n\ + mov %%edx, %%eax # Leave remainder in ECX for later \n\ + shl $4, %%eax \n\ + sub %%eax, %%ecx \n\ + .balign 16 \n\ +0: movdqu (%%rsi), %%xmm0 # Copy 16 bytes of data \n\ + movdqa %%xmm0, (%%rdi) \n\ + add $16, %%rsi # Update pointers \n\ + add $16, %%rdi \n\ + dec %%edx # And loop \n\ + jnz 0b \n\ + jmp amd64.memcpy_last # Copy any remaining bytes \n\ + \n\ + .balign 16 \n\ + nop \n\ + nop \n\ +amd64.memcpy_fast_bp: # Fast block prefetch loop \n" +AMD64_BLOCK_MEMCPY(movdqa) +" jmp amd64.memcpy_last # Copy any remaining bytes \n\ + \n\ + .balign 16 \n\ + nop \n\ + nop \n\ +amd64.memcpy_normal_bp: # Normal (unaligned) block prefetch loop\n" +AMD64_BLOCK_MEMCPY(movdqu) +" \n\ +amd64.memcpy_last: \n\ + # Copy last <64 bytes, using the computed jump trick \n\ + mov %%ecx, %%eax # EAX <- ECX>>3 \n\ + shr $3, %%eax \n\ + lea 0f(%%rip), %%rdx \n\ + add %%eax, %%eax # Watch out, MOVSQ is 2 bytes! \n\ + sub %%rax, %%rdx \n\ + jmp *%%rdx # Execute 0-7 MOVSQ's \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ +0: and $7, %%ecx # ECX <- ECX & 7 \n\ + lea 0f(%%rip), %%rdx \n\ + sub %%rcx, %%rdx \n\ + jmp *%%rdx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ +0: \n\ + # All done! \n\ + emms # Clean up after MMX instructions \n\ + sfence # Flush the write buffer \n\ + pop %%rdi # Restore destination (return value) \n\ + " : /* no outputs */ + : "D" (dest), "S" (src), "c" (bytes) + : "%rax", "%rbx", "%rdx" + ); + return dest; +} + +#endif /* HAVE_ASM_SSE2 && ARCH_X86_64 */ + +/*************************************************************************/ + +/* Initialization routine. */ + +int ac_memcpy_init(int accel) +{ + memcpy_ptr = memmove; + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) + if (HAS_ACCEL(accel, AC_MMX)) + memcpy_ptr = memcpy_mmx; +#endif + +#if defined(HAVE_ASM_SSE) && defined(ARCH_X86) + if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE)) + memcpy_ptr = memcpy_sse; +#endif + +#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64) + if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2)) + memcpy_ptr = memcpy_amd64; +#endif + + return 1; +} + +/*************************************************************************/ + +/* + * Local variables: + * c-file-style: "stroustrup" + * c-file-offsets: ((case-label . *) (statement-case-intro . *)) + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff --git a/debian/transcode/transcode-1.1.7/aclib/rescale.c b/debian/transcode/transcode-1.1.7/aclib/rescale.c new file mode 100644 index 00000000..5a619735 --- /dev/null +++ b/debian/transcode/transcode-1.1.7/aclib/rescale.c @@ -0,0 +1,280 @@ +/* + * rescale.c -- take the weighted average of two sets of byte data + * Written by Andrew Church <[email protected]> + * + * This file is part of transcode, a video stream processing tool. + * transcode is free software, distributable under the terms of the GNU + * General Public License (version 2 or later). See the file COPYING + * for details. 
+ */ + +#include "ac.h" +#include "ac_internal.h" + +static void rescale(const uint8_t *, const uint8_t *, uint8_t *, int, + uint32_t, uint32_t); +static void (*rescale_ptr)(const uint8_t *, const uint8_t *, uint8_t *, int, + uint32_t, uint32_t) = rescale; + +/*************************************************************************/ + +/* External interface */ + +void ac_rescale(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, uint32_t weight1, uint32_t weight2) +{ + if (weight1 >= 0x10000) + ac_memcpy(dest, src1, bytes); + else if (weight2 >= 0x10000) + ac_memcpy(dest, src2, bytes); + else + (*rescale_ptr)(src1, src2, dest, bytes, weight1, weight2); +} + +/*************************************************************************/ +/*************************************************************************/ + +/* Vanilla C version */ + +static void rescale(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, + uint32_t weight1, uint32_t weight2) +{ + int i; + for (i = 0; i < bytes; i++) + dest[i] = (src1[i]*weight1 + src2[i]*weight2 + 32768) >> 16; +} + +/*************************************************************************/ + +/* MMX version */ + +#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */ + +static void rescale_mmx(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, + uint32_t weight1, uint32_t weight2) +{ + if (bytes >= 8) { + /* First store weights in MM4/MM5 to relieve register pressure; + * save time by making 2 copies ahead of time in the general + * registers. Note that we divide by 2 for MMX due to the lack + * of an unsigned SIMD multiply instruction (PMULHUW). */ + int half1 = weight1 / 2; + int half2 = weight2 / 2; + half2 += weight1 & weight2 & 1; // pick up the lost bit here + asm("movd %%eax, %%mm4; movd %%edx, %%mm5" + : : "a" (half1<<16|half1), "d" (half2<<16|half2)); + asm("\ + movq %%mm4, %%mm6 # MM6: 00 00 W1 W1 \n\ + psllq $32, %%mm4 # MM4: W1 W1 00 00 \n\ + por %%mm6, %%mm4 # MM4: W1 W1 W1 W1 \n\ + movq %%mm5, %%mm7 # MM7: 00 00 W2 W2 \n\ + psllq $32, %%mm5 # MM5: W2 W2 00 00 \n\ + por %%mm7, %%mm5 # MM5: W2 W2 W2 W2 \n\ + pxor %%mm7, %%mm7 # MM7: 00 00 00 00 \n\ + pxor %%mm6, %%mm6 # Put 0x0020*4 in MM6 (rounding)\n\ + pcmpeqw %%mm3, %%mm3 \n\ + psubw %%mm3, %%mm6 \n\ + psllw $5, %%mm6 \n\ + 0: \n\ + movq -8(%%esi,%%ecx), %%mm0 \n\ + movq %%mm0, %%mm1 \n\ + punpcklbw %%mm7, %%mm0 \n\ + psllw $7, %%mm0 # 9.7 fixed point \n\ + pmulhw %%mm4, %%mm0 # Multiply to get 10.6 fixed \n\ + punpckhbw %%mm7, %%mm1 \n\ + psllw $7, %%mm1 \n\ + pmulhw %%mm4, %%mm1 \n\ + movq -8(%%edx,%%ecx), %%mm2 \n\ + movq %%mm2, %%mm3 \n\ + punpcklbw %%mm7, %%mm2 \n\ + psllw $7, %%mm2 \n\ + pmulhw %%mm5, %%mm2 \n\ + punpckhbw %%mm7, %%mm3 \n\ + psllw $7, %%mm3 \n\ + pmulhw %%mm5, %%mm3 \n\ + paddw %%mm2, %%mm0 \n\ + paddw %%mm6, %%mm0 \n\ + psrlw $6, %%mm0 \n\ + paddw %%mm3, %%mm1 \n\ + paddw %%mm6, %%mm1 \n\ + psrlw $6, %%mm1 \n\ + packuswb %%mm1, %%mm0 \n\ + movq %%mm0, -8(%%edi,%%ecx) \n\ + subl $8, %%ecx \n\ + jnz 0b \n\ + emms" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "c" (bytes & ~7)); + } + if (UNLIKELY(bytes & 7)) { + rescale(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7), + bytes & 7, weight1, weight2); + } +} + +#endif /* HAVE_ASM_MMX && ARCH_X86 */ + +/*************************************************************************/ + +/* MMXEXT version (also for SSE) */ + +#if (defined(HAVE_ASM_MMXEXT) || defined(HAVE_ASM_SSE)) && defined(ARCH_X86) + +static void 
rescale_mmxext(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, + uint32_t weight1, uint32_t weight2) +{ + if (bytes >= 8) { + asm("movd %%eax, %%mm4; movd %%edx, %%mm5" + : : "a" (weight1), "d" (weight2)); + asm("\ + pshufw $0, %%mm4, %%mm4 # MM4: W1 W1 W1 W1 \n\ + pshufw $0, %%mm5, %%mm5 # MM5: W2 W2 W2 W2 \n\ + pxor %%mm6, %%mm6 # Put 0x0080*4 in MM6 (rounding)\n\ + pcmpeqw %%mm7, %%mm7 \n\ + psubw %%mm7, %%mm6 \n\ + psllw $7, %%mm6 \n\ + 0: \n\ + movq -8(%%esi,%%ecx), %%mm7 \n\ + pxor %%mm0, %%mm0 # Load data into high bytes \n\ + punpcklbw %%mm7, %%mm0 # (gives 8.8 fixed point) \n\ + pmulhuw %%mm4, %%mm0 # Result: 0000..FF00 \n\ + pxor %%mm1, %%mm1 \n\ + punpckhbw %%mm7, %%mm1 \n\ + pmulhuw %%mm4, %%mm1 \n\ + movq -8(%%edx,%%ecx), %%mm7 \n\ + pxor %%mm2, %%mm2 \n\ + punpcklbw %%mm7, %%mm2 \n\ + pmulhuw %%mm5, %%mm2 \n\ + pxor %%mm3, %%mm3 \n\ + punpckhbw %%mm7, %%mm3 \n\ + pmulhuw %%mm5, %%mm3 \n\ + paddw %%mm2, %%mm0 \n\ + paddw %%mm6, %%mm0 \n\ + psrlw $8, %%mm0 # Shift back down to 00..FF \n\ + paddw %%mm3, %%mm1 \n\ + paddw %%mm6, %%mm1 \n\ + psrlw $8, %%mm1 \n\ + packuswb %%mm1, %%mm0 \n\ + movq %%mm0, -8(%%edi,%%ecx) \n\ + subl $8, %%ecx \n\ + jnz 0b \n\ + emms" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "c" (bytes & ~7)); + } + if (UNLIKELY(bytes & 7)) { + rescale(src1+(bytes & ~7), src2+(bytes & ~7), dest+(bytes & ~7), + bytes & 7, weight1, weight2); + } +} + +#endif /* (HAVE_ASM_MMXEXT || HAVE_ASM_SSE) && ARCH_X86 */ + +/*************************************************************************/ + +/* SSE2 version */ + +#if defined(HAVE_ASM_SSE2) + +#ifdef ARCH_X86_64 +# define ECX "%%rcx" +# define EDX "%%rdx" +# define ESI "%%rsi" +# define EDI "%%rdi" +#else +# define ECX "%%ecx" +# define EDX "%%edx" +# define ESI "%%esi" +# define EDI "%%edi" +#endif + +static void rescale_sse2(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int bytes, + uint32_t weight1, uint32_t weight2) +{ + if (bytes >= 16) { + asm("movd %%eax, %%xmm4; movd %%edx, %%xmm5" + : : "a" (weight1<<16|weight1), "d" (weight2<<16|weight2)); + asm("\ + pshufd $0, %%xmm4, %%xmm4 # XMM4: W1 W1 W1 W1 W1 W1 W1 W1 \n\ + pshufd $0, %%xmm5, %%xmm5 # XMM5: W2 W2 W2 W2 W2 W2 W2 W2 \n\ + pxor %%xmm6, %%xmm6 # Put 0x0080*4 in XMM6 (rounding)\n\ + pcmpeqw %%xmm7, %%xmm7 \n\ + psubw %%xmm7, %%xmm6 \n\ + psllw $7, %%xmm6 \n\ + 0: \n\ + movdqu -16("ESI","ECX"), %%xmm7 \n\ + pxor %%xmm0, %%xmm0 \n\ + punpcklbw %%xmm7, %%xmm0 \n\ + pmulhuw %%xmm4, %%xmm0 \n\ + pxor %%xmm1, %%xmm1 \n\ + punpckhbw %%xmm7, %%xmm1 \n\ + pmulhuw %%xmm4, %%xmm1 \n\ + movdqu -16("EDX","ECX"), %%xmm7 \n\ + pxor %%xmm2, %%xmm2 \n\ + punpcklbw %%xmm7, %%xmm2 \n\ + pmulhuw %%xmm5, %%xmm2 \n\ + pxor %%xmm3, %%xmm3 \n\ + punpckhbw %%xmm7, %%xmm3 \n\ + pmulhuw %%xmm5, %%xmm3 \n\ + paddw %%xmm2, %%xmm0 \n\ + paddw %%xmm6, %%xmm0 \n\ + psrlw $8, %%xmm0 \n\ + paddw %%xmm3, %%xmm1 \n\ + paddw %%xmm6, %%xmm1 \n\ + psrlw $8, %%xmm1 \n\ + packuswb %%xmm1, %%xmm0 \n\ + movdqu %%xmm0, -16("EDI","ECX") \n\ + subl $16, %%ecx \n\ + jnz 0b \n\ + emms" + : /* no outputs */ + : "S" (src1), "d" (src2), "D" (dest), "c" (bytes & ~15)); + } + if (UNLIKELY(bytes & 15)) { + rescale(src1+(bytes & ~15), src2+(bytes & ~15), dest+(bytes & ~15), + bytes & 15, weight1, weight2); + } +} + +#endif /* HAVE_ASM_SSE2 */ + +/*************************************************************************/ +/*************************************************************************/ + +/* Initialization routine. 
*/
+
+int ac_rescale_init(int accel)
+{
+    rescale_ptr = rescale;
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
+    if (HAS_ACCEL(accel, AC_MMX))
+        rescale_ptr = rescale_mmx;
+#endif
+#if (defined(HAVE_ASM_MMXEXT) || defined(HAVE_ASM_SSE)) && defined(ARCH_X86)
+    if (HAS_ACCEL(accel, AC_MMXEXT) || HAS_ACCEL(accel, AC_SSE))
+        rescale_ptr = rescale_mmxext;
+#endif
+#if defined(HAVE_ASM_SSE2)
+    if (HAS_ACCEL(accel, AC_SSE2))
+        rescale_ptr = rescale_sse2;
+#endif
+
+    return 1;
+}
+
+/*************************************************************************/
+
+/*
+ * Local variables:
+ *   c-file-style: "stroustrup"
+ *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ *   indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
diff --git a/debian/transcode/transcode-1.1.7/aclib/rgb-yuv-conv.pl b/debian/transcode/transcode-1.1.7/aclib/rgb-yuv-conv.pl
new file mode 100755
index 00000000..a2b6257c
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/rgb-yuv-conv.pl
@@ -0,0 +1,48 @@
+#!/usr/bin/perl -w
+# Calculate conversion matrices for RGB<->YUV given Kb and Kr
+
+die "Usage: $0 Kb Kr [scale]\n" if @ARGV < 2;
+$scale = $ARGV[2] || 1;
+$Kb = $ARGV[0];
+$Kr = $ARGV[1];
+$Kg = 1 - $Kr - $Kb;
+$a11 = $Kr;
+$a12 = $Kg;
+$a13 = $Kb;
+$a21 = -$Kr/(1-$Kb)/2;
+$a22 = -$Kg/(1-$Kb)/2;
+$a23 = 1/2;
+$a31 = 1/2;
+$a32 = -$Kg/(1-$Kr)/2;
+$a33 = -$Kb/(1-$Kr)/2;
+print "Y [R] = ".($a11*$scale)."\n";
+print "Y [G] = ".($a12*$scale)."\n";
+print "Y [B] = ".($a13*$scale)."\n";
+print "Cb[R] = ".($a21*$scale)."\n";
+print "Cb[G] = ".($a22*$scale)."\n";
+print "Cb[B] = ".($a23*$scale)."\n";
+print "Cr[R] = ".($a31*$scale)."\n";
+print "Cr[G] = ".($a32*$scale)."\n";
+print "Cr[B] = ".($a33*$scale)."\n";
+$det = $a11*$a22*$a33 - $a11*$a23*$a32
+     + $a12*$a23*$a31 - $a12*$a21*$a33
+     + $a13*$a21*$a32 - $a13*$a22*$a31;
+$b11 = (1/$det)*($a22*$a33-$a23*$a32);
+$b12 = (1/$det)*($a13*$a32-$a12*$a33);
+$b13 = (1/$det)*($a12*$a23-$a13*$a22);
+$b21 = (1/$det)*($a23*$a31-$a21*$a33);
+$b22 = (1/$det)*($a11*$a33-$a13*$a31);
+$b23 = (1/$det)*($a13*$a21-$a11*$a23);
+$b31 = (1/$det)*($a21*$a32-$a22*$a31);
+$b32 = (1/$det)*($a12*$a31-$a11*$a32);
+$b33 = (1/$det)*($a11*$a22-$a12*$a21);
+map {$_ = 0 if abs($_) < 1e-10} ($b11,$b12,$b13,$b21,$b22,$b23,$b31,$b32,$b33);
+print "R[Y ] = ".($b11*$scale)."\n";
+print "R[Cb] = ".($b12*$scale)."\n";
+print "R[Cr] = ".($b13*$scale)."\n";
+print "G[Y ] = ".($b21*$scale)."\n";
+print "G[Cb] = ".($b22*$scale)."\n";
+print "G[Cr] = ".($b23*$scale)."\n";
+print "B[Y ] = ".($b31*$scale)."\n";
+print "B[Cb] = ".($b32*$scale)."\n";
+print "B[Cr] = ".($b33*$scale)."\n";
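
A minimal usage sketch of the rescale API added above, assuming only what the diff itself declares (ac_rescale_init() and ac_rescale() from ac.h); the buffer contents and sizes are made up for illustration. The weights are 16-bit fixed point, so two weights summing to 0x10000 give a weighted average, and passing 0 as the acceleration mask keeps the portable C path instead of the MMX/SSE routines; a real caller would pass the CPU feature mask obtained from the library's detection code, which is not shown in this hunk.

#include <stdio.h>
#include <stdint.h>
#include "ac.h"   /* declares ac_rescale_init() and ac_rescale() as in this diff */

int main(void)
{
    uint8_t a[16], b[16], out[16];
    int i;

    for (i = 0; i < 16; i++) {
        a[i] = 0;     /* e.g. a black row of pixels */
        b[i] = 200;   /* and a grey one */
    }

    /* 0 = no SIMD flags, so dispatch stays on the portable C routine */
    ac_rescale_init(0);

    /* weight1 + weight2 == 0x10000 (1.0), so this is a 50/50 blend:
     * out[i] = (a[i]*0x8000 + b[i]*0x8000 + 32768) >> 16 */
    ac_rescale(a, b, out, 16, 0x8000, 0x8000);

    printf("out[0] = %d\n", out[0]);   /* 100 */
    return 0;
}

The rgb-yuv-conv.pl helper is independent of this C API: run as, for example, ./rgb-yuv-conv.pl 0.114 0.299 it prints the BT.601 forward and inverse matrices, and an optional third argument scales the printed coefficients (e.g. 65536 to get 16-bit fixed-point values).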