/*
 * img_yuv_packed.c - YUV planar<->packed image format conversion routines
 * Written by Andrew Church
 *
 * This file is part of transcode, a video stream processing tool.
 * transcode is free software, distributable under the terms of the GNU
 * General Public License (version 2 or later).  See the file COPYING
 * for details.
 */

#include "ac.h"
#include "imgconvert.h"
#include "img_internal.h"

/*************************************************************************/
/*************************************************************************/

/* Standard C implementations */

/*************************************************************************/

/* Wrappers for UYVY and YVYU */
/* Note: we rely on YUY2<->{UYVY,YVYU} working for src==dest */
/* FIXME: when converting from UYVY/YVYU, src is destroyed! */

static int uyvy_yvyu_wrapper(uint8_t **src, ImageFormat srcfmt,
                             uint8_t **dest, ImageFormat destfmt,
                             int width, int height)
{
    if (srcfmt == IMG_UYVY || srcfmt == IMG_YVYU)
        return ac_imgconvert(src, srcfmt, src, IMG_YUY2, width, height)
            && ac_imgconvert(src, IMG_YUY2, dest, destfmt, width, height);
    else
        return ac_imgconvert(src, srcfmt, dest, IMG_YUY2, width, height)
            && ac_imgconvert(dest, IMG_YUY2, dest, destfmt, width, height);
}

static int yuv420p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YUV420P, dest, IMG_UYVY, width, height);
}

static int yuv420p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YUV420P, dest, IMG_YVYU, width, height);
}

static int yuv411p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YUV411P, dest, IMG_UYVY, width, height);
}

static int yuv411p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YUV411P, dest, IMG_YVYU, width, height);
}

static int yuv422p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YUV422P, dest, IMG_UYVY, width, height);
}

static int yuv422p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YUV422P, dest, IMG_YVYU, width, height);
}

static int yuv444p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YUV444P, dest, IMG_UYVY, width, height);
}

static int yuv444p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YUV444P, dest, IMG_YVYU, width, height);
}

static int uyvy_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV420P, width, height);
}

static int yvyu_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV420P, width, height);
}

static int uyvy_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV411P, width, height);
}

static int yvyu_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV411P, width, height);
}

static int uyvy_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV422P, width, height);
}

static int yvyu_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV422P, width, height);
}

static int uyvy_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV444P, width, height);
}

static int yvyu_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
    return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV444P, width, height);
}

/*************************************************************************/

static int yuv420p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
{
    int x, y;

    for (y = 0; y < (height & ~1); y++) {
        for (x = 0; x < (width & ~1); x += 2) {
            dest[0][(y*width+x)*2  ] = src[0][y*width+x];
            dest[0][(y*width+x)*2+1] = src[1][(y/2)*(width/2)+x/2];
            dest[0][(y*width+x)*2+2] = src[0][y*width+x+1];
            dest[0][(y*width+x)*2+3] = src[2][(y/2)*(width/2)+x/2];
        }
    }
    return 1;
}

static int yuv411p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < (width & ~1); x += 2) {
            dest[0][(y*width+x)*2  ] = src[0][y*width+x];
            dest[0][(y*width+x)*2+1] = src[1][y*(width/4)+x/4];
            dest[0][(y*width+x)*2+2] = src[0][y*width+x+1];
            dest[0][(y*width+x)*2+3] = src[2][y*(width/4)+x/4];
        }
    }
    return 1;
}

static int yuv422p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
{
    int i;

    for (i = 0; i < (width/2)*height; i++) {
        dest[0][i*4  ] = src[0][i*2];
        dest[0][i*4+1] = src[1][i];
        dest[0][i*4+2] = src[0][i*2+1];
        dest[0][i*4+3] = src[2][i];
    }
    return 1;
}

static int yuv444p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
{
    int i;

    for (i = 0; i < (width/2)*height; i++) {
        dest[0][i*4  ] = src[0][i*2];
        dest[0][i*4+1] = (src[1][i*2] + src[1][i*2+1]) / 2;
        dest[0][i*4+2] = src[0][i*2+1];
        dest[0][i*4+3] = (src[2][i*2] + src[2][i*2+1]) / 2;
    }
    return 1;
}

/*************************************************************************/

static int yuy2_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
    int x, y;

    for (y = 0; y < (height & ~1); y++) {
        for (x = 0; x < (width & ~1); x += 2) {
            dest[0][y*width+x  ] = src[0][(y*width+x)*2  ];
            dest[0][y*width+x+1] = src[0][(y*width+x)*2+2];
            if (y%2 == 0) {
                dest[1][(y/2)*(width/2)+x/2] = src[0][(y*width+x)*2+1];
                dest[2][(y/2)*(width/2)+x/2] = src[0][(y*width+x)*2+3];
            } else {
                dest[1][(y/2)*(width/2)+x/2] =
                    (dest[1][(y/2)*(width/2)+x/2]
                     + src[0][(y*width+x)*2+1] + 1) / 2;
                dest[2][(y/2)*(width/2)+x/2] =
                    (dest[2][(y/2)*(width/2)+x/2]
                     + src[0][(y*width+x)*2+3] + 1) / 2;
            }
        }
    }
    return 1;
}

static int yuy2_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < (width & ~3); x += 4) {
            dest[0][y*width+x  ] = src[0][(y*width+x)*2  ];
            dest[0][y*width+x+1] = src[0][(y*width+x)*2+2];
            dest[0][y*width+x+2] = src[0][(y*width+x)*2+4];
            dest[0][y*width+x+3] = src[0][(y*width+x)*2+6];
            dest[1][y*(width/4)+x/4] =
                (src[0][(y*width+x)*2+1] + src[0][(y*width+x)*2+5] + 1) / 2;
            dest[2][y*(width/4)+x/4] =
                (src[0][(y*width+x)*2+3] + src[0][(y*width+x)*2+7] + 1) / 2;
        }
    }
    return 1;
}

static int yuy2_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
    int i;

    for (i = 0; i < (width/2)*height; i++) {
        dest[0][i*2  ] = src[0][i*4  ];
        dest[1][i    ] = src[0][i*4+1];
        dest[0][i*2+1] = src[0][i*4+2];
        dest[2][i    ] = src[0][i*4+3];
    }
    return 1;
}

static int yuy2_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
    int i;

    for (i = 0; i < (width & ~1)*height; i += 2) {
        dest[0][i  ] = src[0][i*2  ];
        dest[1][i  ] = src[0][i*2+1];
        dest[1][i+1] = src[0][i*2+1];
        dest[0][i+1] = src[0][i*2+2];
        dest[2][i  ] = src[0][i*2+3];
        dest[2][i+1] = src[0][i*2+3];
    }
    return 1;
}

/*************************************************************************/
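
/* Y8 (grayscale) <-> packed conversions.  Going from Y8, the chroma bytes
 * are filled with the neutral value 128; going to Y8, the chroma bytes are
 * simply dropped. */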

static int y8_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
{
    int i;

    for (i = 0; i < width*height; i++) {
        dest[0][i*2  ] = src[0][i];
        dest[0][i*2+1] = 128;
    }
    return 1;
}

static int y8_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
{
    int i;

    for (i = 0; i < width*height; i++) {
        dest[0][i*2  ] = 128;
        dest[0][i*2+1] = src[0][i];
    }
    return 1;
}

static int yuy2_y8(uint8_t **src, uint8_t **dest, int width, int height)
{
    int i;

    for (i = 0; i < width*height; i++)
        dest[0][i] = src[0][i*2];
    return 1;
}

static int uyvy_y8(uint8_t **src, uint8_t **dest, int width, int height)
{
    int i;

    for (i = 0; i < width*height; i++)
        dest[0][i] = src[0][i*2+1];
    return 1;
}

/*************************************************************************/
/*************************************************************************/

#if defined(HAVE_ASM_SSE2)

/* SSE2 routines.  See comments in img_x86_common.h for why we don't bother
 * unrolling the loops. */

/* Common macros/data for x86 code */
#include "img_x86_common.h"

/* YUV420P (1 row) or YUV422P -> YUY2 (unit: 2 pixels) */
#define YUV42XP_YUY2 \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 8, \
    /* push_regs */ PUSH(EBX), \
    /* pop_regs  */ POP(EBX), \
    /* small_loop */ \
    "movb -1("EDX","ECX"), %%bh                                         \n\
    movb -1("ESI","ECX",2), %%bl                                        \n\
    shll $16, %%ebx                                                     \n\
    movb -1("EAX","ECX"), %%bh                                          \n\
    movb -2("ESI","ECX",2), %%bl                                        \n\
    movl %%ebx, -4("EDI","ECX",4)", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX",2), %%xmm0  # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\
    movq -8("EAX","ECX"), %%xmm2        # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\
    movq -8("EDX","ECX"), %%xmm3        # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\
    punpcklbw %%xmm3, %%xmm2            # XMM2: V7 U7 V6 ..... U1 V0 U0 \n\
    movdqa %%xmm0, %%xmm1               # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
    punpcklbw %%xmm2, %%xmm0            # XMM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    punpckhbw %%xmm2, %%xmm1            # XMM1: V7 YF U7 ..... Y9 U4 Y8 \n\
    movdqu %%xmm0, -32("EDI","ECX",4)                                   \n\
    movdqu %%xmm1, -16("EDI","ECX",4)", \
    /* emms */ "emms")

/* YUV411P -> YUY2 (unit: 4 pixels) */
#define YUV411P_YUY2 \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 4, \
    /* push_regs */ PUSH(EBX), \
    /* pop_regs  */ POP(EBX), \
    /* small_loop */ \
    "movb -1("EDX","ECX"), %%bh                                         \n\
    movb -1("ESI","ECX",4), %%bl                                        \n\
    shll $16, %%ebx                                                     \n\
    movb -1("EAX","ECX"), %%bh                                          \n\
    movb -2("ESI","ECX",4), %%bl                                        \n\
    movl %%ebx, -4("EDI","ECX",8)                                       \n\
    movb -1("EDX","ECX"), %%bh                                          \n\
    movb -3("ESI","ECX",4), %%bl                                        \n\
    shll $16, %%ebx                                                     \n\
    movb -1("EAX","ECX"), %%bh                                          \n\
    movb -4("ESI","ECX",4), %%bl                                        \n\
    movl %%ebx, -8("EDI","ECX",8)", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX",4), %%xmm0  # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\
    movd -4("EAX","ECX"), %%xmm2        # XMM2: U3 U2 U1 U0             \n\
    punpcklbw %%xmm2, %%xmm2            # XMM2: U3 U3 U2 U2 U1 U1 U0 U0 \n\
    movd -4("EDX","ECX"), %%xmm3        # XMM3: V3 V2 V1 V0             \n\
    punpcklbw %%xmm3, %%xmm3            # XMM3: V3 V3 V2 V2 V1 V1 V0 V0 \n\
    punpcklbw %%xmm3, %%xmm2            # XMM2: V3 U3 V3 ..... U0 V0 U0 \n\
    movdqa %%xmm0, %%xmm1               # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
    punpcklbw %%xmm2, %%xmm0            # XMM0: V1 Y7 U1 ..... Y1 U0 Y0 \n\
    punpckhbw %%xmm2, %%xmm1            # XMM1: V3 YF U3 ..... Y9 U2 Y8 \n\
    movdqu %%xmm0, -32("EDI","ECX",8)                                   \n\
    movdqu %%xmm1, -16("EDI","ECX",8)", \
    /* emms */ "emms")

/* YUV444P -> YUY2 (unit: 2 pixels) */
#define YUV444P_YUY2 \
    /* Load 0x00FF*8 into XMM7 for masking */ \
    "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 8, \
    /* push_regs */ PUSH2(EBX,EBP), \
    /* pop_regs  */ POP2(EBP,EBX), \
    /* small_loop */ \
    "movzbl -1("EDX","ECX",2), %%ebx                                    \n\
    movzbl -2("EDX","ECX",2), %%ebp                                     \n\
    addl %%ebp, %%ebx                                                   \n\
    shrl $1, %%ebx                                                      \n\
    movb %%bl, -1("EDI","ECX",4)                                        \n\
    movb -1("ESI","ECX",2), %%bl                                        \n\
    movb %%bl, -2("EDI","ECX",4)                                        \n\
    movzbl -1("EAX","ECX",2), %%ebx                                     \n\
    movzbl -2("EAX","ECX",2), %%ebp                                     \n\
    addl %%ebp, %%ebx                                                   \n\
    shrl $1, %%ebx                                                      \n\
    movb %%bl, -3("EDI","ECX",4)                                        \n\
    movb -2("ESI","ECX",2), %%bl                                        \n\
    movb %%bl, -4("EDI","ECX",4)", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX",2), %%xmm0  # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\
    movdqu -16("EAX","ECX",2), %%xmm2   # XMM2: UF UE UD ..... U2 U1 U0 \n\
    movdqu -16("EDX","ECX",2), %%xmm3   # XMM3: VF VE VD ..... V2 V1 V0 \n\
    movdqa %%xmm2, %%xmm4               # XMM4: UF UE UD ..... U2 U1 U0 \n\
    pand %%xmm7, %%xmm2                 # XMM2: -- UE -- ..... U2 -- U0 \n\
    psrlw $8, %%xmm4                    # XMM4: -- UF -- ..... U3 -- U1 \n\
    pavgw %%xmm4, %%xmm2                # XMM2: -- u7 -- ..... u1 -- u0 \n\
    movdqa %%xmm3, %%xmm5               # XMM5: VF VE VD ..... V2 V1 V0 \n\
    pand %%xmm7, %%xmm3                 # XMM3: -- VE -- ..... V2 -- V0 \n\
    psrlw $8, %%xmm5                    # XMM5: -- VF -- ..... V3 -- V1 \n\
    pavgw %%xmm5, %%xmm3                # XMM3: -- v7 -- ..... v1 -- v0 \n\
    psllw $8, %%xmm3                    # XMM3: v7 -- v6 ..... -- v0 -- \n\
    por %%xmm3, %%xmm2                  # XMM2: v7 u7 v6 ..... u1 v0 u0 \n\
    movdqa %%xmm0, %%xmm1               # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
    punpcklbw %%xmm2, %%xmm0            # XMM0: v3 Y7 u3 ..... Y1 u0 Y0 \n\
    punpckhbw %%xmm2, %%xmm1            # XMM1: v7 YF u7 ..... Y9 u4 Y8 \n\
    movdqu %%xmm0, -32("EDI","ECX",4)                                   \n\
    movdqu %%xmm1, -16("EDI","ECX",4)", \
    /* emms */ "emms")

/* YUY2 -> YUV420P (U row) (unit: 2 pixels) */
#define YUY2_YUV420P_U \
    /* Load 0x00FF*8 into XMM7 for masking */ \
    "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 4, \
    /* push_regs */ PUSH2(EBX,EBP), \
    /* pop_regs  */ POP2(EBP,EBX), \
    /* small_loop */ \
    "movb -4("ESI","ECX",4), %%bl                                       \n\
    movb %%bl, -2("EDI","ECX",2)                                        \n\
    movb -2("ESI","ECX",4), %%bl                                        \n\
    movb %%bl, -1("EDI","ECX",2)                                        \n\
    movzbl -3("ESI","ECX",4), %%ebx                                     \n\
    movzbl -3("EAX","ECX",4), %%ebp                                     \n\
    addl %%ebp, %%ebx                                                   \n\
    shrl $1, %%ebx                                                      \n\
    movb %%bl, -1("EDX","ECX")", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX",4), %%xmm0  # XMM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    movdqa %%xmm0, %%xmm1               # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    movdqu -16("EAX","ECX",4), %%xmm2   # XMM2: Vd Yh Ud ..... Yb Ua Ya \n\
    pand %%xmm7, %%xmm0                 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
    packuswb %%xmm0, %%xmm0             # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
    psrlw $8, %%xmm1                    # XMM1: -- V3 -- ..... V0 -- U0 \n\
    psrlw $8, %%xmm2                    # XMM2: -- Vd -- ..... Va -- Ua \n\
    pavgw %%xmm2, %%xmm1                # XMM1: -- v3 -- ..... v0 -- u0 \n\
    packuswb %%xmm1, %%xmm1             # XMM1: v3 u3 v2 u2 v1 u1 v0 u0 \n\
    pand %%xmm7, %%xmm1                 # XMM1: -- u3 -- u2 -- u1 -- u0 \n\
    packuswb %%xmm1, %%xmm1             # XMM1: u3 u2 u1 u0             \n\
    movq %%xmm0, -8("EDI","ECX",2)                                      \n\
    movd %%xmm1, -4("EDX","ECX")", \
    /* emms */ "emms")

/* YUY2 -> YUV420P (V row) (unit: 2 pixels) */
#define YUY2_YUV420P_V \
    /* Load 0x00FF*8 into XMM7 for masking */ \
    "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 4, \
    /* push_regs */ PUSH2(EBX,EBP), \
    /* pop_regs  */ POP2(EBP,EBX), \
    /* small_loop */ \
    "movb -4("ESI","ECX",4), %%bl                                       \n\
    movb %%bl, -2("EDI","ECX",2)                                        \n\
    movb -2("ESI","ECX",4), %%bl                                        \n\
    movb %%bl, -1("EDI","ECX",2)                                        \n\
    movzbl -1("ESI","ECX",4), %%ebx                                     \n\
    movzbl -1("EAX","ECX",4), %%ebp                                     \n\
    addl %%ebp, %%ebx                                                   \n\
    shrl $1, %%ebx                                                      \n\
    movb %%bl, -1("EDX","ECX")", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX",4), %%xmm0  # XMM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    movdqa %%xmm0, %%xmm1               # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    movdqu -16("EAX","ECX",4), %%xmm2   # XMM2: Vd Yh Ud ..... Yb Ua Ya \n\
    pand %%xmm7, %%xmm0                 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
    packuswb %%xmm0, %%xmm0             # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
    psrlw $8, %%xmm1                    # XMM1: -- V3 -- ..... V0 -- U0 \n\
    psrlw $8, %%xmm2                    # XMM2: -- Vd -- ..... Va -- Ua \n\
    pavgw %%xmm1, %%xmm2                # XMM2: -- v3 -- ..... v0 -- u0 \n\
    packuswb %%xmm2, %%xmm2             # XMM2: v3 u3 v2 u2 v1 u1 v0 u0 \n\
    psrlw $8, %%xmm2                    # XMM2: -- v3 -- v2 -- v1 -- v0 \n\
    packuswb %%xmm2, %%xmm2             # XMM2: v3 v2 v1 v0             \n\
    movq %%xmm0, -8("EDI","ECX",2)                                      \n\
    movd %%xmm2, -4("EDX","ECX")", \
    /* emms */ "emms")

/* YUY2 -> YUV411P (unit: 4 pixels) */
#define YUY2_YUV411P \
    /* Load 0x000..000FFFFFFFF into XMM6, 0x00FF*8 into XMM7 for masking */ \
    "pcmpeqd %%xmm6, %%xmm6; psrldq $12, %%xmm6;" \
    "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 2, \
    /* push_regs */ PUSH2(EBX,EBP), \
    /* pop_regs  */ POP2(EBP,EBX), \
    /* small_loop */ \
    "movb -8("ESI","ECX",8), %%bl                                       \n\
    movb %%bl, -4("EDI","ECX",4)                                        \n\
    movb -6("ESI","ECX",8), %%bl                                        \n\
    movb %%bl, -3("EDI","ECX",4)                                        \n\
    movb -4("ESI","ECX",8), %%bl                                        \n\
    movb %%bl, -2("EDI","ECX",4)                                        \n\
    movb -2("ESI","ECX",8), %%bl                                        \n\
    movb %%bl, -1("EDI","ECX",4)                                        \n\
    movzbl -7("ESI","ECX",8), %%ebx                                     \n\
    movzbl -3("ESI","ECX",8), %%ebp                                     \n\
    addl %%ebp, %%ebx                                                   \n\
    shrl $1, %%ebx                                                      \n\
    movb %%bl, -1("EAX","ECX")                                          \n\
    movzbl -5("ESI","ECX",8), %%ebx                                     \n\
    movzbl -1("ESI","ECX",8), %%ebp                                     \n\
    addl %%ebp, %%ebx                                                   \n\
    shrl $1, %%ebx                                                      \n\
    movb %%bl, -1("EDX","ECX")", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX",8), %%xmm0  # XMM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    movdqa %%xmm0, %%xmm1               # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    pand %%xmm7, %%xmm0                 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
    packuswb %%xmm0, %%xmm0             # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
    psrlw $8, %%xmm1                    # XMM1: -- V3 -- ..... V0 -- U0 \n\
    packuswb %%xmm1, %%xmm1             # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\
    movdqa %%xmm1, %%xmm2               # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\
    pand %%xmm7, %%xmm1                 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\
    psrlw $8, %%xmm2                    # XMM2: -- V3 -- V2 -- V1 -- V0 \n\
    packuswb %%xmm1, %%xmm1             # XMM1: U3 U2 U1 U0             \n\
    packuswb %%xmm2, %%xmm2             # XMM2: V3 V2 V1 V0             \n\
    pand %%xmm6, %%xmm1                 # XMM1: -- -- -- -- U3 U2 U1 U0 \n\
    psllq $32, %%xmm2                   # XMM2: V3 V2 V1 V0 -- -- -- -- \n\
    por %%xmm1, %%xmm2                  # XMM2: V3 V2 V1 V0 U3 U2 U1 U0 \n\
    movdqa %%xmm2, %%xmm1               # XMM1: V3 V2 V1 V0 U3 U2 U1 U0 \n\
    pand %%xmm7, %%xmm1                 # XMM1: -- V2 -- V0 -- U2 -- U0 \n\
    psrlw $8, %%xmm2                    # XMM2: -- V3 -- V1 -- U3 -- U1 \n\
    pavgw %%xmm2, %%xmm1                # XMM1: -- v1 -- v0 -- u1 -- u0 \n\
    packuswb %%xmm1, %%xmm1             # XMM1: v1 v0 u1 u0             \n\
    movq %%xmm0, -8("EDI","ECX",4)                                      \n\
    movd %%xmm1, %%ebx                                                  \n\
    movw %%bx, -2("EAX","ECX")                                          \n\
    shrl $16, %%ebx;                                                    \n\
    movw %%bx, -2("EDX","ECX")", \
    /* emms */ "emms")

/* YUY2 -> YUV422P (unit: 2 pixels) */
#define YUY2_YUV422P \
    /* Load 0x00FF*8 into XMM7 for masking */ \
    "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 4, \
    /* push_regs */ PUSH(EBX), \
    /* pop_regs  */ POP(EBX), \
    /* small_loop */ \
    "movb -4("ESI","ECX",4), %%bl                                       \n\
    movb %%bl, -2("EDI","ECX",2)                                        \n\
    movb -2("ESI","ECX",4), %%bl                                        \n\
    movb %%bl, -1("EDI","ECX",2)                                        \n\
    movb -3("ESI","ECX",4), %%bl                                        \n\
    movb %%bl, -1("EAX","ECX")                                          \n\
    movb -1("ESI","ECX",4), %%bl                                        \n\
    movb %%bl, -1("EDX","ECX")", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX",4), %%xmm0  # XMM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    movdqa %%xmm0, %%xmm1               # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    pand %%xmm7, %%xmm0                 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
    packuswb %%xmm0, %%xmm0             # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
    psrlw $8, %%xmm1                    # XMM1: -- V3 -- ..... V0 -- U0 \n\
    packuswb %%xmm1, %%xmm1             # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\
    movdqa %%xmm1, %%xmm2               # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\
    pand %%xmm7, %%xmm1                 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\
    psrlw $8, %%xmm2                    # XMM2: -- V3 -- V2 -- V1 -- V0 \n\
    packuswb %%xmm1, %%xmm1             # XMM1: U3 U2 U1 U0             \n\
    packuswb %%xmm2, %%xmm2             # XMM2: V3 V2 V1 V0             \n\
    movq %%xmm0, -8("EDI","ECX",2)                                      \n\
    movd %%xmm1, -4("EAX","ECX")                                        \n\
    movd %%xmm2, -4("EDX","ECX")", \
    /* emms */ "emms")

/* YUY2 -> YUV444P (unit: 2 pixels) */
#define YUY2_YUV444P \
    /* Load 0x00FF*8 into XMM7 for masking */ \
    "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 4, \
    /* push_regs */ PUSH(EBX), \
    /* pop_regs  */ POP(EBX), \
    /* small_loop */ \
    "movb -4("ESI","ECX",4), %%bl                                       \n\
    movb %%bl, -2("EDI","ECX",2)                                        \n\
    movb -2("ESI","ECX",4), %%bl                                        \n\
    movb %%bl, -1("EDI","ECX",2)                                        \n\
    movb -3("ESI","ECX",4), %%bl                                        \n\
    movb %%bl, -2("EAX","ECX",2)                                        \n\
    movb %%bl, -1("EAX","ECX",2)                                        \n\
    movb -1("ESI","ECX",4), %%bl                                        \n\
    movb %%bl, -2("EDX","ECX",2)                                        \n\
    movb %%bl, -1("EDX","ECX",2)", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX",4), %%xmm0  # XMM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    movdqa %%xmm0, %%xmm1               # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    pand %%xmm7, %%xmm0                 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
    packuswb %%xmm0, %%xmm0             # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
    psrlw $8, %%xmm1                    # XMM1: -- V3 -- ..... V0 -- U0 \n\
    packuswb %%xmm1, %%xmm1             # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\
    movdqa %%xmm1, %%xmm2               # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\
    pand %%xmm7, %%xmm1                 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\
    psrlw $8, %%xmm2                    # XMM2: -- V3 -- V2 -- V1 -- V0 \n\
    movdqa %%xmm1, %%xmm3               # XMM3: -- U3 -- U2 -- U1 -- U0 \n\
    psllw $8, %%xmm3                    # XMM3: U3 -- U2 -- U1 -- U0 -- \n\
    por %%xmm3, %%xmm1                  # XMM1: U3 U3 U2 U2 U1 U1 U0 U0 \n\
    movdqa %%xmm2, %%xmm3               # XMM3: -- V3 -- V2 -- V1 -- V0 \n\
    psllw $8, %%xmm3                    # XMM3: V3 -- V2 -- V1 -- V0 -- \n\
    por %%xmm3, %%xmm2                  # XMM2: V3 V3 V2 V2 V1 V1 V0 V0 \n\
    movq %%xmm0, -8("EDI","ECX",2)                                      \n\
    movq %%xmm1, -8("EAX","ECX",2)                                      \n\
    movq %%xmm2, -8("EDX","ECX",2)", \
    /* emms */ "emms")

/* Y8 -> YUY2/YVYU (unit: 1 pixel) */
#define Y8_YUY2 \
    /* Load 0x80*16 into XMM7 for interlacing U/V */ \
    "pcmpeqd %%xmm7, %%xmm7; psllw $7, %%xmm7; packsswb %%xmm7, %%xmm7;" \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 16, \
    /* push_regs */ PUSH(EBX), \
    /* pop_regs  */ POP(EBX), \
    /* small_loop */ \
    "movb -1("ESI","ECX"), %%al                                         \n\
    movb %%al, -2("EDI","ECX",2)                                        \n\
    movb $0x80, -1("EDI","ECX",2)", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX"), %%xmm0    # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\
    movdqa %%xmm0, %%xmm1               # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
    punpcklbw %%xmm7, %%xmm0            # XMM0: 80 Y7 80 ..... Y1 80 Y0 \n\
    movdqu %%xmm0, -32("EDI","ECX",2)                                   \n\
    punpckhbw %%xmm7, %%xmm1            # XMM1: 80 YF 80 ..... Y9 80 Y8 \n\
    movdqu %%xmm1, -16("EDI","ECX",2)", \
    /* emms */ "emms")

/* Y8 -> UYVY (unit: 1 pixel) */
#define Y8_UYVY \
    /* Load 0x80*16 into XMM7 for interlacing U/V */ \
    "pcmpeqd %%xmm7, %%xmm7; psllw $7, %%xmm7; packsswb %%xmm7, %%xmm7;" \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 16, \
    /* push_regs */ "", \
    /* pop_regs  */ "", \
    /* small_loop */ \
    "movb -1("ESI","ECX"), %%al                                         \n\
    movb %%al, -1("EDI","ECX",2)                                        \n\
    movb $0x80, -2("EDI","ECX",2)", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX"), %%xmm0    # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\
    movdqa %%xmm7, %%xmm1               # XMM1: 80 80 80 ..... 80 80 80 \n\
    punpcklbw %%xmm0, %%xmm1            # XMM1: Y7 80 Y6 ..... 80 Y0 80 \n\
    movdqu %%xmm1, -32("EDI","ECX",2)                                   \n\
    movdqa %%xmm7, %%xmm2               # XMM2: 80 80 80 ..... 80 80 80 \n\
    punpckhbw %%xmm0, %%xmm2            # XMM2: YF 80 YE ..... 80 Y8 80 \n\
    movdqu %%xmm2, -16("EDI","ECX",2)", \
    /* emms */ "emms")

/* YUY2/YVYU -> Y8 (unit: 1 pixel) */
#define YUY2_Y8 \
    /* Load 0x00FF*8 into XMM7 for masking */ \
    "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 8, \
    /* push_regs */ "", \
    /* pop_regs  */ "", \
    /* small_loop */ \
    "movb -2("ESI","ECX",2), %%al                                       \n\
    movb %%al, -1("EDI","ECX")", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX",2), %%xmm0  # XMM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
    pand %%xmm7, %%xmm0                 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
    packuswb %%xmm0, %%xmm0             # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
    movq %%xmm0, -8("EDI","ECX")", \
    /* emms */ "emms")

/* UYVY -> Y8 (unit: 1 pixel) */
#define UYVY_Y8 \
    SIMD_LOOP_WRAPPER( \
    /* blocksize */ 8, \
    /* push_regs */ "", \
    /* pop_regs  */ "", \
    /* small_loop */ \
    "movb -1("ESI","ECX",2), %%al                                       \n\
    movb %%al, -1("EDI","ECX")", \
    /* main_loop */ \
    "movdqu -16("ESI","ECX",2), %%xmm0  # XMM0: Y7 V3 Y6 ..... V0 Y0 U0 \n\
    psrlw $8, %%xmm0                    # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
    packuswb %%xmm0, %%xmm0             # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
    movq %%xmm0, -8("EDI","ECX")", \
    /* emms */ "emms")

/*************************************************************************/
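
/* Most of the C wrappers below hand the whole image to a single asm call
 * when the frame width is an exact multiple of the loop's pixel unit (so
 * rows are contiguous in every plane involved), and otherwise fall back to
 * one asm call per row.  The YUV420P conversions always work row by row
 * (or row pair by row pair), since each chroma row covers two luma rows. */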

static int yuv420p_yuy2_sse2(uint8_t **src, uint8_t **dest,
                             int width, int height)
{
    int y;

    for (y = 0; y < (height & ~1); y++) {
        int dummy;
        asm volatile(YUV42XP_YUY2
            : "=c" (dummy)  // Ensure GCC reloads ECX each time through
            : "S" (src[0]+y*width), "a" (src[1]+(y/2)*(width/2)),
              "d" (src[2]+(y/2)*(width/2)), "D" (dest[0]+y*width*2),
              "0" (width/2)
#ifdef ARCH_X86_64
            : FAKE_PUSH_REG
#endif
        );
    }
    return 1;
}

static int yuv411p_yuy2_sse2(uint8_t **src, uint8_t **dest,
                             int width, int height)
{
    if (!(width & 3)) {
        asm(YUV411P_YUY2
            : /* no outputs */
            : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]),
              "c" ((width/4)*height)
#ifdef ARCH_X86_64
            : FAKE_PUSH_REG
#endif
        );
    } else {
        int y;
        for (y = 0; y < height; y++) {
            int dummy;
            asm volatile(YUV411P_YUY2
                : "=c" (dummy)
                : "S" (src[0]+y*width), "a" (src[1]+y*(width/4)),
                  "d" (src[2]+y*(width/4)), "D" (dest[0]+y*width*2),
                  "0" (width/4)
#ifdef ARCH_X86_64
                : FAKE_PUSH_REG
#endif
            );
        }
    }
    return 1;
}

static int yuv422p_yuy2_sse2(uint8_t **src, uint8_t **dest,
                             int width, int height)
{
    if (!(width & 1)) {
        asm(YUV42XP_YUY2
            : /* no outputs */
            : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]),
              "c" ((width/2)*height)
#ifdef ARCH_X86_64
            : FAKE_PUSH_REG
#endif
        );
    } else {
        int y;
        for (y = 0; y < height; y++) {
            int dummy;
            asm volatile(YUV42XP_YUY2
                : "=c" (dummy)
                : "S" (src[0]+y*width), "a" (src[1]+y*(width/2)),
                  "d" (src[2]+y*(width/2)), "D" (dest[0]+y*width*2),
                  "0" (width/2)
#ifdef ARCH_X86_64
                : FAKE_PUSH_REG
#endif
            );
        }
    }
    return 1;
}

static int yuv444p_yuy2_sse2(uint8_t **src, uint8_t **dest,
                             int width, int height)
{
    if (!(width & 1)) {
        asm(YUV444P_YUY2
            : /* no outputs */
            : "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]),
              "c" ((width/2)*height)
#ifdef ARCH_X86_64
            : FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
        );
    } else {
        int y;
        for (y = 0; y < height; y++) {
            int dummy;
            /* U and V rows are full width in YUV444P */
            asm volatile(YUV444P_YUY2
                : "=c" (dummy)
                : "S" (src[0]+y*width), "a" (src[1]+y*width),
                  "d" (src[2]+y*width), "D" (dest[0]+y*width*2),
                  "0" (width/2)
#ifdef ARCH_X86_64
                : FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
            );
        }
    }
    return 1;
}

/*************************************************************************/

static int yuy2_yuv420p_sse2(uint8_t **src, uint8_t **dest,
                             int width, int height)
{
    int y;

    for (y = 0; y < (height & ~1); y += 2) {
        int dummy;
        asm volatile(YUY2_YUV420P_U
            : "=c" (dummy)
            : "S" (src[0]+y*width*2), "a" (src[0]+(y+1)*width*2),
              "D" (dest[0]+y*width), "d" (dest[1]+(y/2)*(width/2)),
              "0" (width/2)
#ifdef ARCH_X86_64
            : FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
        );
        asm volatile(YUY2_YUV420P_V
            : "=c" (dummy)
            : "S" (src[0]+(y+1)*width*2), "a" (src[0]+y*width*2),
              "D" (dest[0]+(y+1)*width), "d" (dest[2]+(y/2)*(width/2)),
              "0" (width/2)
#ifdef ARCH_X86_64
            : FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
        );
    }
    return 1;
}

static int yuy2_yuv411p_sse2(uint8_t **src, uint8_t **dest,
                             int width, int height)
{
    if (!(width & 3)) {
        asm(YUY2_YUV411P
            : /* no outputs */
            : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]),
              "c" ((width/4)*height)
#ifdef ARCH_X86_64
            : FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
        );
    } else {
        int y;
        for (y = 0; y < height; y++) {
            int dummy;
            asm volatile(YUY2_YUV411P
                : "=c" (dummy)
                : "S" (src[0]+y*width*2), "D" (dest[0]+y*width),
                  "a" (dest[1]+y*(width/4)), "d" (dest[2]+y*(width/4)),
                  "0" (width/4)
#ifdef ARCH_X86_64
                : FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
            );
        }
    }
    return 1;
}

static int yuy2_yuv422p_sse2(uint8_t **src, uint8_t **dest,
                             int width, int height)
{
    if (!(width & 1)) {
        asm(YUY2_YUV422P
            : /* no outputs */
            : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]),
              "c" ((width/2)*height)
#ifdef ARCH_X86_64
            : FAKE_PUSH_REG
#endif
        );
    } else {
        int y;
        for (y = 0; y < height; y++) {
            int dummy;
            asm volatile(YUY2_YUV422P
                : "=c" (dummy)
                : "S" (src[0]+y*width*2), "D" (dest[0]+y*width),
                  "a" (dest[1]+y*(width/2)), "d" (dest[2]+y*(width/2)),
                  "0" (width/2)
#ifdef ARCH_X86_64
                : FAKE_PUSH_REG
#endif
            );
        }
    }
    return 1;
}

static int yuy2_yuv444p_sse2(uint8_t **src, uint8_t **dest,
                             int width, int height)
{
    if (!(width & 1)) {
        asm(YUY2_YUV444P
            : /* no outputs */
            : "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]),
              "c" ((width/2)*height)
#ifdef ARCH_X86_64
            : FAKE_PUSH_REG
#endif
        );
    } else {
        int y;
        for (y = 0; y < height; y++) {
            int dummy;
            asm volatile(YUY2_YUV444P
                : "=c" (dummy)
                : "S" (src[0]+y*width*2), "D" (dest[0]+y*width),
                  "a" (dest[1]+y*width), "d" (dest[2]+y*width),
                  "0" (width/2)
#ifdef ARCH_X86_64
                : FAKE_PUSH_REG
#endif
            );
        }
    }
    return 1;
}

/*************************************************************************/

static int y8_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    asm(Y8_YUY2
        : /* no outputs */
        : "S" (src[0]), "D" (dest[0]), "c" (width*height)
        : "eax" COMMA_FAKE_PUSH_REG
    );
    return 1;
}

static int y8_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    asm(Y8_UYVY
        : /* no outputs */
        : "S" (src[0]), "D" (dest[0]), "c" (width*height)
        : "eax");
    return 1;
}

static int yuy2_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    asm(YUY2_Y8
        : /* no outputs */
        : "S" (src[0]), "D" (dest[0]), "c" (width*height)
        : "eax");
    return 1;
}

static int uyvy_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    asm(UYVY_Y8
        : /* no outputs */
        : "S" (src[0]), "D" (dest[0]), "c" (width*height)
        : "eax");
    return 1;
}

/*************************************************************************/

#endif  /* HAVE_ASM_SSE2 */

/*************************************************************************/
/*************************************************************************/

/* Initialization */
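
/* The conversions in this file are registered below so that ac_imgconvert()
 * can dispatch to them; callers never invoke the individual routines
 * directly.  A typical call might look like this (sketch -- the buffer
 * setup is only illustrative, but the plane-array convention matches what
 * the routines above expect):
 *
 *     uint8_t *srcp[3]  = { y_plane, u_plane, v_plane };   // YUV420P planes
 *     uint8_t *destp[1] = { yuy2_buf };                    // packed YUY2
 *     ac_imgconvert(srcp, IMG_YUV420P, destp, IMG_YUY2, width, height);
 */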

int ac_imgconvert_init_yuv_mixed(int accel)
{
    if (!register_conversion(IMG_YUV420P, IMG_YUY2,    yuv420p_yuy2)
        || !register_conversion(IMG_YUV411P, IMG_YUY2,    yuv411p_yuy2)
        || !register_conversion(IMG_YUV422P, IMG_YUY2,    yuv422p_yuy2)
        || !register_conversion(IMG_YUV444P, IMG_YUY2,    yuv444p_yuy2)
        || !register_conversion(IMG_Y8,      IMG_YUY2,    y8_yuy2)

        || !register_conversion(IMG_YUV420P, IMG_UYVY,    yuv420p_uyvy)
        || !register_conversion(IMG_YUV411P, IMG_UYVY,    yuv411p_uyvy)
        || !register_conversion(IMG_YUV422P, IMG_UYVY,    yuv422p_uyvy)
        || !register_conversion(IMG_YUV444P, IMG_UYVY,    yuv444p_uyvy)
        || !register_conversion(IMG_Y8,      IMG_UYVY,    y8_uyvy)

        || !register_conversion(IMG_YUV420P, IMG_YVYU,    yuv420p_yvyu)
        || !register_conversion(IMG_YUV411P, IMG_YVYU,    yuv411p_yvyu)
        || !register_conversion(IMG_YUV422P, IMG_YVYU,    yuv422p_yvyu)
        || !register_conversion(IMG_YUV444P, IMG_YVYU,    yuv444p_yvyu)
        || !register_conversion(IMG_Y8,      IMG_YVYU,    y8_yuy2)

        || !register_conversion(IMG_YUY2,    IMG_YUV420P, yuy2_yuv420p)
        || !register_conversion(IMG_YUY2,    IMG_YUV411P, yuy2_yuv411p)
        || !register_conversion(IMG_YUY2,    IMG_YUV422P, yuy2_yuv422p)
        || !register_conversion(IMG_YUY2,    IMG_YUV444P, yuy2_yuv444p)
        || !register_conversion(IMG_YUY2,    IMG_Y8,      yuy2_y8)

        || !register_conversion(IMG_UYVY,    IMG_YUV420P, uyvy_yuv420p)
        || !register_conversion(IMG_UYVY,    IMG_YUV411P, uyvy_yuv411p)
        || !register_conversion(IMG_UYVY,    IMG_YUV422P, uyvy_yuv422p)
        || !register_conversion(IMG_UYVY,    IMG_YUV444P, uyvy_yuv444p)
        || !register_conversion(IMG_UYVY,    IMG_Y8,      uyvy_y8)

        || !register_conversion(IMG_YVYU,    IMG_YUV420P, yvyu_yuv420p)
        || !register_conversion(IMG_YVYU,    IMG_YUV411P, yvyu_yuv411p)
        || !register_conversion(IMG_YVYU,    IMG_YUV422P, yvyu_yuv422p)
        || !register_conversion(IMG_YVYU,    IMG_YUV444P, yvyu_yuv444p)
        || !register_conversion(IMG_YVYU,    IMG_Y8,      yuy2_y8)
    ) {
        return 0;
    }

#if defined(HAVE_ASM_SSE2)
    if (accel & AC_SSE2) {
        if (!register_conversion(IMG_YUV420P, IMG_YUY2,    yuv420p_yuy2_sse2)
            || !register_conversion(IMG_YUV411P, IMG_YUY2,    yuv411p_yuy2_sse2)
            || !register_conversion(IMG_YUV422P, IMG_YUY2,    yuv422p_yuy2_sse2)
            || !register_conversion(IMG_YUV444P, IMG_YUY2,    yuv444p_yuy2_sse2)
            || !register_conversion(IMG_Y8,      IMG_YUY2,    y8_yuy2_sse2)
            || !register_conversion(IMG_Y8,      IMG_UYVY,    y8_uyvy_sse2)
            || !register_conversion(IMG_Y8,      IMG_YVYU,    y8_yuy2_sse2)

            || !register_conversion(IMG_YUY2,    IMG_YUV420P, yuy2_yuv420p_sse2)
            || !register_conversion(IMG_YUY2,    IMG_YUV411P, yuy2_yuv411p_sse2)
            || !register_conversion(IMG_YUY2,    IMG_YUV422P, yuy2_yuv422p_sse2)
            || !register_conversion(IMG_YUY2,    IMG_YUV444P, yuy2_yuv444p_sse2)
            || !register_conversion(IMG_YUY2,    IMG_Y8,      yuy2_y8_sse2)
            || !register_conversion(IMG_UYVY,    IMG_Y8,      uyvy_y8_sse2)
            || !register_conversion(IMG_YVYU,    IMG_Y8,      yuy2_y8_sse2)
        ) {
            return 0;
        }
    }
#endif  /* HAVE_ASM_SSE2 */

    return 1;
}

/*************************************************************************/

/*
 * Local variables:
 *   c-file-style: "stroustrup"
 *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
 *   indent-tabs-mode: nil
 * End:
 *
 * vim: expandtab shiftwidth=4:
 */