• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • Examples
  • File List
  • Globals

libswscale/x86/swscale_template.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
00003  *
00004  * This file is part of Libav.
00005  *
00006  * Libav is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * Libav is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with Libav; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00019  */
00020 /* Drop any earlier definitions; PREFETCH and MOVNTQ are redefined below according to COMPILE_TEMPLATE_MMX2. */
00021 #undef REAL_MOVNTQ
00022 #undef MOVNTQ
00023 #undef PREFETCH
00024 /* PREFETCH: prefetchnta in the MMX2 build, otherwise a hash-prefixed placeholder string. */
00025 #if COMPILE_TEMPLATE_MMX2
00026 #define PREFETCH "prefetchnta"
00027 #else
00028 #define PREFETCH  " # nop"
00029 #endif
00030 /* REAL_MOVNTQ(a,b): builds a store of a to b, using movntq in the MMX2 build and movq otherwise. */
00031 #if COMPILE_TEMPLATE_MMX2
00032 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
00033 #else
00034 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
00035 #endif
00036 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b) /* indirection: arguments are macro-expanded before REAL_MOVNTQ stringifies them */
00037 /* Opens the asm statement and runs the chroma pass: REG_a is cleared once, then from label 1 mm3 (U) and mm4 (V) start at VROUNDER and gain one pmulhw product per filter entry until a zero source pointer ends loop 2. */
00038 #define YSCALEYUV2PACKEDX_UV \
00039     __asm__ volatile(\
00040         "xor                   %%"REG_a", %%"REG_a"     \n\t"\
00041         ".p2align                      4                \n\t"\
00042         "nop                                            \n\t"\
00043         "1:                                             \n\t"\
00044         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
00045         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00046         "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
00047         "movq                      %%mm3, %%mm4         \n\t"\
00048         ".p2align                      4                \n\t"\
00049         "2:                                             \n\t"\
00050         "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
00051         "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
00052         "add                          %6, %%"REG_S"     \n\t" \
00053         "movq     (%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData, found at the U pointer plus operand %6 */\
00054         "add                         $16, %%"REG_d"     \n\t"\
00055         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00056         "pmulhw                    %%mm0, %%mm2         \n\t"\
00057         "pmulhw                    %%mm0, %%mm5         \n\t"\
00058         "paddw                     %%mm2, %%mm3         \n\t"\
00059         "paddw                     %%mm5, %%mm4         \n\t"\
00060         "test                  %%"REG_S", %%"REG_S"     \n\t"\
00061         " jnz                         2b                \n\t"\
00062 /* Same accumulation loop for the filter list at offset(%0): dst1 and dst2 start at VROUNDER and gain pmulhw(coeff, src1) and pmulhw(coeff, src2), where src1/src2 are two consecutive 8-byte loads indexed by REG_a scaled by 2. */
00063 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
00064     "lea                "offset"(%0), %%"REG_d"     \n\t"\
00065     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00066     "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
00067     "movq                    "#dst1", "#dst2"       \n\t"\
00068     ".p2align                      4                \n\t"\
00069     "2:                                             \n\t"\
00070     "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
00071     "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
00072     "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
00073     "add                         $16, %%"REG_d"            \n\t"\
00074     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00075     "pmulhw                 "#coeff", "#src1"       \n\t"\
00076     "pmulhw                 "#coeff", "#src2"       \n\t"\
00077     "paddw                   "#src1", "#dst1"       \n\t"\
00078     "paddw                   "#src2", "#dst2"       \n\t"\
00079     "test                  %%"REG_S", %%"REG_S"     \n\t"\
00080     " jnz                         2b                \n\t"\
00081 /* Chroma pass followed by the luma pass; on exit U is in mm3, V in mm4, Y1 in mm1 and Y2 in mm7. */
00082 #define YSCALEYUV2PACKEDX \
00083     YSCALEYUV2PACKEDX_UV \
00084     YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
00085 /* Closes the asm statement. Input operands: %0 = &c->redDither (base for the OFFSET displacements), %1-%3 = dummy, %4 = dest, %5 = dstW_reg, %6 = uv_off. Clobbers REG_a, REG_d and REG_S. */
00086 #define YSCALEYUV2PACKEDX_END                     \
00087         :: "r" (&c->redDither),                   \
00088             "m" (dummy), "m" (dummy), "m" (dummy),\
00089             "r" (dest), "m" (dstW_reg), "m"(uv_off) \
00090         : "%"REG_a, "%"REG_d, "%"REG_S            \
00091     );
00092 /* Opens the asm statement and runs the higher-precision chroma pass: filter entries are read in pairs (second pointer at APCK_PTR2, coefficients at APCK_COEF, stride APCK_SIZE), summed as 32-bit pmaddwd products, shifted right by 16, packed, offset by VROUNDER and stored to U_TEMP and V_TEMP. */
00093 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
00094     __asm__ volatile(\
00095         "xor %%"REG_a", %%"REG_a"                       \n\t"\
00096         ".p2align                      4                \n\t"\
00097         "nop                                            \n\t"\
00098         "1:                                             \n\t"\
00099         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
00100         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00101         "pxor                      %%mm4, %%mm4         \n\t"\
00102         "pxor                      %%mm5, %%mm5         \n\t"\
00103         "pxor                      %%mm6, %%mm6         \n\t"\
00104         "pxor                      %%mm7, %%mm7         \n\t"\
00105         ".p2align                      4                \n\t"\
00106         "2:                                             \n\t"\
00107         "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
00108         "add                          %6, %%"REG_S"      \n\t" \
00109         "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
00110         "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
00111         "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
00112         "movq                      %%mm0, %%mm3         \n\t"\
00113         "punpcklwd                 %%mm1, %%mm0         \n\t"\
00114         "punpckhwd                 %%mm1, %%mm3         \n\t"\
00115         "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
00116         "pmaddwd                   %%mm1, %%mm0         \n\t"\
00117         "pmaddwd                   %%mm1, %%mm3         \n\t"\
00118         "paddd                     %%mm0, %%mm4         \n\t"\
00119         "paddd                     %%mm3, %%mm5         \n\t"\
00120         "add                          %6, %%"REG_S"      \n\t" \
00121         "movq     (%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
00122         "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
00123         "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
00124         "test                  %%"REG_S", %%"REG_S"     \n\t"\
00125         "movq                      %%mm2, %%mm0         \n\t"\
00126         "punpcklwd                 %%mm3, %%mm2         \n\t"\
00127         "punpckhwd                 %%mm3, %%mm0         \n\t"\
00128         "pmaddwd                   %%mm1, %%mm2         \n\t"\
00129         "pmaddwd                   %%mm1, %%mm0         \n\t"\
00130         "paddd                     %%mm2, %%mm6         \n\t"\
00131         "paddd                     %%mm0, %%mm7         \n\t"\
00132         " jnz                         2b                \n\t"\
00133         "psrad                       $16, %%mm4         \n\t"\
00134         "psrad                       $16, %%mm5         \n\t"\
00135         "psrad                       $16, %%mm6         \n\t"\
00136         "psrad                       $16, %%mm7         \n\t"\
00137         "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
00138         "packssdw                  %%mm5, %%mm4         \n\t"\
00139         "packssdw                  %%mm7, %%mm6         \n\t"\
00140         "paddw                     %%mm0, %%mm4         \n\t"\
00141         "paddw                     %%mm0, %%mm6         \n\t"\
00142         "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
00143         "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
00144 /* Same pairwise pmaddwd scheme for the filter list at offset(%0): the first 8-byte group ends up in mm1 and the second in mm7, then U_TEMP is reloaded into mm3 and V_TEMP into mm4. */
00145 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
00146     "lea                "offset"(%0), %%"REG_d"     \n\t"\
00147     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00148     "pxor                      %%mm1, %%mm1         \n\t"\
00149     "pxor                      %%mm5, %%mm5         \n\t"\
00150     "pxor                      %%mm7, %%mm7         \n\t"\
00151     "pxor                      %%mm6, %%mm6         \n\t"\
00152     ".p2align                      4                \n\t"\
00153     "2:                                             \n\t"\
00154     "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
00155     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
00156     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
00157     "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
00158     "movq                      %%mm0, %%mm3         \n\t"\
00159     "punpcklwd                 %%mm4, %%mm0         \n\t"\
00160     "punpckhwd                 %%mm4, %%mm3         \n\t"\
00161     "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
00162     "pmaddwd                   %%mm4, %%mm0         \n\t"\
00163     "pmaddwd                   %%mm4, %%mm3         \n\t"\
00164     "paddd                     %%mm0, %%mm1         \n\t"\
00165     "paddd                     %%mm3, %%mm5         \n\t"\
00166     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
00167     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
00168     "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
00169     "test                  %%"REG_S", %%"REG_S"     \n\t"\
00170     "movq                      %%mm2, %%mm0         \n\t"\
00171     "punpcklwd                 %%mm3, %%mm2         \n\t"\
00172     "punpckhwd                 %%mm3, %%mm0         \n\t"\
00173     "pmaddwd                   %%mm4, %%mm2         \n\t"\
00174     "pmaddwd                   %%mm4, %%mm0         \n\t"\
00175     "paddd                     %%mm2, %%mm7         \n\t"\
00176     "paddd                     %%mm0, %%mm6         \n\t"\
00177     " jnz                         2b                \n\t"\
00178     "psrad                       $16, %%mm1         \n\t"\
00179     "psrad                       $16, %%mm5         \n\t"\
00180     "psrad                       $16, %%mm7         \n\t"\
00181     "psrad                       $16, %%mm6         \n\t"\
00182     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
00183     "packssdw                  %%mm5, %%mm1         \n\t"\
00184     "packssdw                  %%mm6, %%mm7         \n\t"\
00185     "paddw                     %%mm0, %%mm1         \n\t"\
00186     "paddw                     %%mm0, %%mm7         \n\t"\
00187     "movq               "U_TEMP"(%0), %%mm3         \n\t"\
00188     "movq               "V_TEMP"(%0), %%mm4         \n\t"\
00189 /* Accurate chroma pass followed by the accurate luma pass; on exit U is in mm3, V in mm4, Y1 in mm1 and Y2 in mm7. */
00190 #define YSCALEYUV2PACKEDX_ACCURATE \
00191     YSCALEYUV2PACKEDX_ACCURATE_UV \
00192     YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
00193 /* Converts Y (mm1, mm7), U (mm3) and V (mm4) to RGB using the offsets and coefficients stored relative to %0; on exit mm2 = B, mm4 = G and mm5 = R as packed unsigned bytes. */
00194 #define YSCALEYUV2RGBX \
00195     "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
00196     "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
00197     "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
00198     "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
00199     "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
00200     "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
00201     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00202     "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
00203     "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
00204     "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
00205     "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
00206     "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
00207     "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
00208     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00209     "paddw           %%mm3, %%mm4       \n\t"\
00210     "movq            %%mm2, %%mm0       \n\t"\
00211     "movq            %%mm5, %%mm6       \n\t"\
00212     "movq            %%mm4, %%mm3       \n\t"\
00213     "punpcklwd       %%mm2, %%mm2       \n\t"\
00214     "punpcklwd       %%mm5, %%mm5       \n\t"\
00215     "punpcklwd       %%mm4, %%mm4       \n\t"\
00216     "paddw           %%mm1, %%mm2       \n\t"\
00217     "paddw           %%mm1, %%mm5       \n\t"\
00218     "paddw           %%mm1, %%mm4       \n\t"\
00219     "punpckhwd       %%mm0, %%mm0       \n\t"\
00220     "punpckhwd       %%mm6, %%mm6       \n\t"\
00221     "punpckhwd       %%mm3, %%mm3       \n\t"\
00222     "paddw           %%mm7, %%mm0       \n\t"\
00223     "paddw           %%mm7, %%mm6       \n\t"\
00224     "paddw           %%mm7, %%mm3       \n\t"\
00225     /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 (suffix 1 = added to Y1 in mm1, 2 = added to Y2 in mm7) */\
00226     "packuswb        %%mm0, %%mm2       \n\t"\
00227     "packuswb        %%mm6, %%mm5       \n\t"\
00228     "packuswb        %%mm3, %%mm4       \n\t"\
00229 /* Interleaves the byte registers b, g, r, a into 8 four-byte pixels, stores 32 bytes at dst + index*4, adds 8 to index and jumps back to label 1 while index is below dstw (unsigned compare). q0, q2, q3 and t are scratch registers. */
00230 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
00231     "movq       "#b", "#q2"     \n\t" /* B */\
00232     "movq       "#r", "#t"      \n\t" /* R */\
00233     "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
00234     "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
00235     "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
00236     "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
00237     "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
00238     "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
00239     "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
00240     "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
00241     "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
00242     "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
00243 \
00244     MOVNTQ(   q0,   (dst, index, 4))\
00245     MOVNTQ(    b,  8(dst, index, 4))\
00246     MOVNTQ(   q2, 16(dst, index, 4))\
00247     MOVNTQ(   q3, 24(dst, index, 4))\
00248 \
00249     "add      $8, "#index"      \n\t"\
00250     "cmp "#dstw", "#index"      \n\t"\
00251     " jb      1b                \n\t"
00252 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
00253 /* Writes 32-bit pixels to dest using the ACCURATE filter macros. Only c, dest and dstW are referenced in C; the asm reaches everything else through displacements from &c->redDither. With CONFIG_SWSCALE_ALPHA and a non-NULL c->alpPixBuf the alpha list is filtered too, otherwise every alpha byte is 0xFF. */
00254 static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
00255                                    const int16_t **lumSrc, int lumFilterSize,
00256                                    const int16_t *chrFilter, const int16_t **chrUSrc,
00257                                    const int16_t **chrVSrc,
00258                                    int chrFilterSize, const int16_t **alpSrc,
00259                                    uint8_t *dest, int dstW, int dstY)
00260 {
00261     x86_reg dummy=0; /* placeholder for the unused asm operands %1-%3 */
00262     x86_reg dstW_reg = dstW; /* operand %5: loop bound compared against REG_a */
00263     x86_reg uv_off = c->uv_off_byte; /* operand %6: added to the U pointer to reach V */
00264 
00265     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
00266         YSCALEYUV2PACKEDX_ACCURATE
00267         YSCALEYUV2RGBX
00268         "movq                      %%mm2, "U_TEMP"(%0)  \n\t" /* park B: the alpha pass reuses mm2 */
00269         "movq                      %%mm4, "V_TEMP"(%0)  \n\t" /* park G: the alpha pass reuses mm4 */
00270         "movq                      %%mm5, "Y_TEMP"(%0)  \n\t" /* park R: the alpha pass reuses mm5 */
00271         YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
00272         "movq               "Y_TEMP"(%0), %%mm5         \n\t" /* R back in mm5; the macro above already reloaded B into mm3 and G into mm4 */
00273         "psraw                        $3, %%mm1         \n\t"
00274         "psraw                        $3, %%mm7         \n\t"
00275         "packuswb                  %%mm7, %%mm1         \n\t" /* mm1 = 8 alpha bytes */
00276         WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
00277         YSCALEYUV2PACKEDX_END
00278     } else {
00279         YSCALEYUV2PACKEDX_ACCURATE
00280         YSCALEYUV2RGBX
00281         "pcmpeqd %%mm7, %%mm7 \n\t" /* all bits set: alpha = 0xFF for every pixel */
00282         WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
00283         YSCALEYUV2PACKEDX_END
00284     }
00285 }
00286 /* Writes 32-bit pixels to dest using the pmulhw-based (non-ACCURATE) filter macros. Only c, dest and dstW are referenced in C. With CONFIG_SWSCALE_ALPHA and a non-NULL c->alpPixBuf the alpha list is filtered too, otherwise every alpha byte is 0xFF. */
00287 static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
00288                                 const int16_t **lumSrc, int lumFilterSize,
00289                                 const int16_t *chrFilter, const int16_t **chrUSrc,
00290                                 const int16_t **chrVSrc,
00291                                 int chrFilterSize, const int16_t **alpSrc,
00292                                 uint8_t *dest, int dstW, int dstY)
00293 {
00294     x86_reg dummy=0; /* placeholder for the unused asm operands %1-%3 */
00295     x86_reg dstW_reg = dstW; /* operand %5: loop bound compared against REG_a */
00296     x86_reg uv_off = c->uv_off_byte; /* operand %6: added to the U pointer to reach V */
00297 
00298     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
00299         YSCALEYUV2PACKEDX
00300         YSCALEYUV2RGBX
00301         YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) /* uses mm0/mm3/mm6/mm1/mm7 only, so B, G, R stay in mm2, mm4, mm5 */
00302         "psraw                        $3, %%mm1         \n\t"
00303         "psraw                        $3, %%mm7         \n\t"
00304         "packuswb                  %%mm7, %%mm1         \n\t" /* mm1 = 8 alpha bytes */
00305         WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
00306         YSCALEYUV2PACKEDX_END
00307     } else {
00308         YSCALEYUV2PACKEDX
00309         YSCALEYUV2RGBX
00310         "pcmpeqd %%mm7, %%mm7 \n\t" /* all bits set: alpha = 0xFF for every pixel */
00311         WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
00312         YSCALEYUV2PACKEDX_END
00313     }
00314 }
00315 /* Masks B (mm2), G (mm4) and R (mm5) with bF8/bFC/bF8, merges them into 16-bit pixels, stores 16 bytes at dst + index*2, adds 8 to index and jumps back to label 1 while index is below dstw. Expects mm7 = 0; overwrites mm1 and mm3. */
00316 #define REAL_WRITERGB16(dst, dstw, index) \
00317     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
00318     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
00319     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
00320     "psrlq           $3, %%mm2  \n\t"\
00321 \
00322     "movq         %%mm2, %%mm1  \n\t"\
00323     "movq         %%mm4, %%mm3  \n\t"\
00324 \
00325     "punpcklbw    %%mm7, %%mm3  \n\t"\
00326     "punpcklbw    %%mm5, %%mm2  \n\t"\
00327     "punpckhbw    %%mm7, %%mm4  \n\t"\
00328     "punpckhbw    %%mm5, %%mm1  \n\t"\
00329 \
00330     "psllq           $3, %%mm3  \n\t"\
00331     "psllq           $3, %%mm4  \n\t"\
00332 \
00333     "por          %%mm3, %%mm2  \n\t"\
00334     "por          %%mm4, %%mm1  \n\t"\
00335 \
00336     MOVNTQ(%%mm2,  (dst, index, 2))\
00337     MOVNTQ(%%mm1, 8(dst, index, 2))\
00338 \
00339     "add             $8, "#index"   \n\t"\
00340     "cmp        "#dstw", "#index"   \n\t"\
00341     " jb             1b             \n\t"
00342 #define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
00343 /* Writes 16-bit pixels to dest via WRITERGB16 using the ACCURATE filter macros; when DITHER1XBPP is defined the per-channel dither values stored relative to %0 are added with unsigned saturation first. Only c, dest and dstW are referenced in C. */
00344 static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
00345                                     const int16_t **lumSrc, int lumFilterSize,
00346                                     const int16_t *chrFilter, const int16_t **chrUSrc,
00347                                     const int16_t **chrVSrc,
00348                                     int chrFilterSize, const int16_t **alpSrc,
00349                                     uint8_t *dest, int dstW, int dstY)
00350 {
00351     x86_reg dummy=0; /* placeholder for the unused asm operands %1-%3 */
00352     x86_reg dstW_reg = dstW; /* operand %5: loop bound compared against REG_a */
00353     x86_reg uv_off = c->uv_off_byte; /* operand %6: added to the U pointer to reach V */
00354 
00355     YSCALEYUV2PACKEDX_ACCURATE
00356     YSCALEYUV2RGBX
00357     "pxor %%mm7, %%mm7 \n\t" /* WRITERGB16 unpacks against a zero mm7 */
00358     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00359 #ifdef DITHER1XBPP
00360     "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
00361     "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
00362     "paddusb "RED_DITHER"(%0), %%mm5\n\t"
00363 #endif
00364     WRITERGB16(%4, %5, %%REGa)
00365     YSCALEYUV2PACKEDX_END
00366 }
00367 /* Writes 16-bit pixels to dest via WRITERGB16 using the pmulhw-based (non-ACCURATE) filter macros; when DITHER1XBPP is defined the per-channel dither values stored relative to %0 are added with unsigned saturation first. Only c, dest and dstW are referenced in C. */
00368 static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
00369                                  const int16_t **lumSrc, int lumFilterSize,
00370                                  const int16_t *chrFilter, const int16_t **chrUSrc,
00371                                  const int16_t **chrVSrc,
00372                                  int chrFilterSize, const int16_t **alpSrc,
00373                                  uint8_t *dest, int dstW, int dstY)
00374 {
00375     x86_reg dummy=0; /* placeholder for the unused asm operands %1-%3 */
00376     x86_reg dstW_reg = dstW; /* operand %5: loop bound compared against REG_a */
00377     x86_reg uv_off = c->uv_off_byte; /* operand %6: added to the U pointer to reach V */
00378 
00379     YSCALEYUV2PACKEDX
00380     YSCALEYUV2RGBX
00381     "pxor %%mm7, %%mm7 \n\t" /* WRITERGB16 unpacks against a zero mm7 */
00382     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00383 #ifdef DITHER1XBPP
00384     "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
00385     "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
00386     "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
00387 #endif
00388     WRITERGB16(%4, %5, %%REGa)
00389     YSCALEYUV2PACKEDX_END
00390 }
00391 /* Masks B (mm2), G (mm4) and R (mm5) with bF8, shifts R right by one extra bit and merges the channels into 16-bit pixels, stores 16 bytes at dst + index*2, adds 8 to index and jumps back to label 1 while index is below dstw. Expects mm7 = 0; overwrites mm1 and mm3. */
00392 #define REAL_WRITERGB15(dst, dstw, index) \
00393     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
00394     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
00395     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
00396     "psrlq           $3, %%mm2  \n\t"\
00397     "psrlq           $1, %%mm5  \n\t"\
00398 \
00399     "movq         %%mm2, %%mm1  \n\t"\
00400     "movq         %%mm4, %%mm3  \n\t"\
00401 \
00402     "punpcklbw    %%mm7, %%mm3  \n\t"\
00403     "punpcklbw    %%mm5, %%mm2  \n\t"\
00404     "punpckhbw    %%mm7, %%mm4  \n\t"\
00405     "punpckhbw    %%mm5, %%mm1  \n\t"\
00406 \
00407     "psllq           $2, %%mm3  \n\t"\
00408     "psllq           $2, %%mm4  \n\t"\
00409 \
00410     "por          %%mm3, %%mm2  \n\t"\
00411     "por          %%mm4, %%mm1  \n\t"\
00412 \
00413     MOVNTQ(%%mm2,  (dst, index, 2))\
00414     MOVNTQ(%%mm1, 8(dst, index, 2))\
00415 \
00416     "add             $8, "#index"   \n\t"\
00417     "cmp        "#dstw", "#index"   \n\t"\
00418     " jb             1b             \n\t"
00419 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
00420 /* Writes 16-bit pixels to dest via WRITERGB15 using the ACCURATE filter macros; when DITHER1XBPP is defined the per-channel dither values stored relative to %0 are added with unsigned saturation first. Only c, dest and dstW are referenced in C. */
00421 static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
00422                                     const int16_t **lumSrc, int lumFilterSize,
00423                                     const int16_t *chrFilter, const int16_t **chrUSrc,
00424                                     const int16_t **chrVSrc,
00425                                     int chrFilterSize, const int16_t **alpSrc,
00426                                     uint8_t *dest, int dstW, int dstY)
00427 {
00428     x86_reg dummy=0; /* placeholder for the unused asm operands %1-%3 */
00429     x86_reg dstW_reg = dstW; /* operand %5: loop bound compared against REG_a */
00430     x86_reg uv_off = c->uv_off_byte; /* operand %6: added to the U pointer to reach V */
00431 
00432     YSCALEYUV2PACKEDX_ACCURATE
00433     YSCALEYUV2RGBX
00434     "pxor %%mm7, %%mm7 \n\t" /* WRITERGB15 unpacks against a zero mm7 */
00435     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00436 #ifdef DITHER1XBPP
00437     "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
00438     "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
00439     "paddusb "RED_DITHER"(%0), %%mm5\n\t"
00440 #endif
00441     WRITERGB15(%4, %5, %%REGa)
00442     YSCALEYUV2PACKEDX_END
00443 }
00444 /* Writes 16-bit pixels to dest via WRITERGB15 using the pmulhw-based (non-ACCURATE) filter macros; when DITHER1XBPP is defined the per-channel dither values stored relative to %0 are added with unsigned saturation first. Only c, dest and dstW are referenced in C. */
00445 static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
00446                                  const int16_t **lumSrc, int lumFilterSize,
00447                                  const int16_t *chrFilter, const int16_t **chrUSrc,
00448                                  const int16_t **chrVSrc,
00449                                  int chrFilterSize, const int16_t **alpSrc,
00450                                  uint8_t *dest, int dstW, int dstY)
00451 {
00452     x86_reg dummy=0; /* placeholder for the unused asm operands %1-%3 */
00453     x86_reg dstW_reg = dstW; /* operand %5: loop bound compared against REG_a */
00454     x86_reg uv_off = c->uv_off_byte; /* operand %6: added to the U pointer to reach V */
00455 
00456     YSCALEYUV2PACKEDX
00457     YSCALEYUV2RGBX
00458     "pxor %%mm7, %%mm7 \n\t" /* WRITERGB15 unpacks against a zero mm7 */
00459     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00460 #ifdef DITHER1XBPP
00461     "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
00462     "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
00463     "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
00464 #endif
00465     WRITERGB15(%4, %5, %%REGa)
00466     YSCALEYUV2PACKEDX_END
00467 }
00468 /* Plain MMX writer for 3-byte pixels: expands B (mm2), G (mm4) and R (mm5) to 0RGB dwords, squeezes out the padding bytes with shifts and ORs, stores 24 bytes at dst, then adds 24 to dst and 8 to index and jumps back to label 1 while index is below dstw. Expects mm7 = 0 on entry and overwrites mm0-mm7. */
00469 #define WRITEBGR24MMX(dst, dstw, index) \
00470     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00471     "movq      %%mm2, %%mm1     \n\t" /* B */\
00472     "movq      %%mm5, %%mm6     \n\t" /* R */\
00473     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
00474     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
00475     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
00476     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
00477     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
00478     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
00479     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
00480     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
00481     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
00482     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
00483 \
00484     "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
00485     "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
00486     "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
00487     "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
00488 \
00489     "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
00490     "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
00491     "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
00492     "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
00493 \
00494     "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
00495     "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
00496     "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
00497     "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
00498 \
00499     "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
00500     "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
00501     "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
00502     "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
00503     MOVNTQ(%%mm0, (dst))\
00504 \
00505     "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
00506     "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
00507     "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
00508     "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
00509     MOVNTQ(%%mm6, 8(dst))\
00510 \
00511     "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
00512     "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
00513     "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
00514     MOVNTQ(%%mm5, 16(dst))\
00515 \
00516     "add         $24, "#dst"    \n\t"\
00517 \
00518     "add          $8, "#index"  \n\t"\
00519     "cmp     "#dstw", "#index"  \n\t"\
00520     " jb          1b            \n\t"
00521 /* MMX2 writer for 3-byte pixels: picks bytes out of B (mm2), G (mm4) and R (mm5) with pshufw and the ff_M24A/ff_M24B/ff_M24C masks, stores 24 bytes at dst, then adds 24 to dst and 8 to index and jumps back to label 1 while index is below dstw. Overwrites mm0, mm1, mm3, mm4, mm6 and mm7. */
00522 #define WRITEBGR24MMX2(dst, dstw, index) \
00523     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00524     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
00525     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
00526     "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
00527     "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
00528     "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
00529 \
00530     "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
00531     "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
00532     "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
00533 \
00534     "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
00535     "por    %%mm1, %%mm6        \n\t"\
00536     "por    %%mm3, %%mm6        \n\t"\
00537     MOVNTQ(%%mm6, (dst))\
00538 \
00539     "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
00540     "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
00541     "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
00542     "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
00543 \
00544     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
00545     "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
00546     "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
00547 \
00548     "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
00549     "por    %%mm3, %%mm6        \n\t"\
00550     MOVNTQ(%%mm6, 8(dst))\
00551 \
00552     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
00553     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
00554     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
00555 \
00556     "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
00557     "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
00558     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
00559 \
00560     "por    %%mm1, %%mm3        \n\t"\
00561     "por    %%mm3, %%mm6        \n\t"\
00562     MOVNTQ(%%mm6, 16(dst))\
00563 \
00564     "add      $24, "#dst"       \n\t"\
00565 \
00566     "add       $8, "#index"     \n\t"\
00567     "cmp  "#dstw", "#index"     \n\t"\
00568     " jb       1b               \n\t"
00569 /* WRITEBGR24 maps to the pshufw-based MMX2 writer or to the plain MMX writer depending on COMPILE_TEMPLATE_MMX2. */
00570 #if COMPILE_TEMPLATE_MMX2
00571 #undef WRITEBGR24
00572 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
00573 #else
00574 #undef WRITEBGR24
00575 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
00576 #endif
00577 /* Writes 3-byte pixels to dest via WRITEBGR24 using the ACCURATE filter macros. The asm statement is closed inline rather than with YSCALEYUV2PACKEDX_END because REG_c is used as the running output pointer and must be listed as an extra clobber. Only c, dest and dstW are referenced in C. */
00578 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
00579                                    const int16_t **lumSrc, int lumFilterSize,
00580                                    const int16_t *chrFilter, const int16_t **chrUSrc,
00581                                    const int16_t **chrVSrc,
00582                                    int chrFilterSize, const int16_t **alpSrc,
00583                                    uint8_t *dest, int dstW, int dstY)
00584 {
00585     x86_reg dummy=0; /* placeholder for the unused asm operands %1-%3 */
00586     x86_reg dstW_reg = dstW; /* operand %5: loop bound compared against REG_a */
00587     x86_reg uv_off = c->uv_off_byte; /* operand %6: added to the U pointer to reach V */
00588 
00589     YSCALEYUV2PACKEDX_ACCURATE
00590     YSCALEYUV2RGBX
00591     "pxor %%mm7, %%mm7 \n\t"
00592     "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
00593     "add %4, %%"REG_c"                        \n\t" /* REG_c = dest + 3*REG_a */
00594     WRITEBGR24(%%REGc, %5, %%REGa)
00595     :: "r" (&c->redDither),
00596        "m" (dummy), "m" (dummy), "m" (dummy),
00597        "r" (dest), "m" (dstW_reg), "m"(uv_off)
00598     : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
00599     );
00600 }
00601 /* Writes 3-byte pixels to dest via WRITEBGR24 using the pmulhw-based (non-ACCURATE) filter macros. The asm statement is closed inline rather than with YSCALEYUV2PACKEDX_END because REG_c is used as the running output pointer and must be listed as an extra clobber. Only c, dest and dstW are referenced in C. */
00602 static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
00603                                 const int16_t **lumSrc, int lumFilterSize,
00604                                 const int16_t *chrFilter, const int16_t **chrUSrc,
00605                                 const int16_t **chrVSrc,
00606                                 int chrFilterSize, const int16_t **alpSrc,
00607                                 uint8_t *dest, int dstW, int dstY)
00608 {
00609     x86_reg dummy=0; /* placeholder for the unused asm operands %1-%3 */
00610     x86_reg dstW_reg = dstW; /* operand %5: loop bound compared against REG_a */
00611     x86_reg uv_off = c->uv_off_byte; /* operand %6: added to the U pointer to reach V */
00612 
00613     YSCALEYUV2PACKEDX
00614     YSCALEYUV2RGBX
00615     "pxor                    %%mm7, %%mm7       \n\t"
00616     "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
00617     "add                        %4, %%"REG_c"   \n\t" /* REG_c = dest + 3*REG_a */
00618     WRITEBGR24(%%REGc, %5, %%REGa)
00619     :: "r" (&c->redDither),
00620        "m" (dummy), "m" (dummy), "m" (dummy),
00621        "r" (dest),  "m" (dstW_reg), "m"(uv_off)
00622     : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
00623     );
00624 }
00625 /* Packs U (mm3), V (mm4) and Y (mm1, mm7) words to unsigned bytes, interleaves them in Y U Y V order, stores 16 bytes at dst + index*2, adds 8 to index and jumps back to label 1 while index is below dstw. */
00626 #define REAL_WRITEYUY2(dst, dstw, index) \
00627     "packuswb  %%mm3, %%mm3     \n\t"\
00628     "packuswb  %%mm4, %%mm4     \n\t"\
00629     "packuswb  %%mm7, %%mm1     \n\t"\
00630     "punpcklbw %%mm4, %%mm3     \n\t"\
00631     "movq      %%mm1, %%mm7     \n\t"\
00632     "punpcklbw %%mm3, %%mm1     \n\t"\
00633     "punpckhbw %%mm3, %%mm7     \n\t"\
00634 \
00635     MOVNTQ(%%mm1, (dst, index, 2))\
00636     MOVNTQ(%%mm7, 8(dst, index, 2))\
00637 \
00638     "add          $8, "#index"  \n\t"\
00639     "cmp     "#dstw", "#index"  \n\t"\
00640     " jb          1b            \n\t"
00641 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
00642 
/*
 * Multi-tap vertical scaler writing byte-interleaved output through
 * WRITEYUY2, built on YSCALEYUV2PACKEDX_ACCURATE (defined outside this
 * section; presumably the accurate-rounding variant of the vertical filter).
 *
 * WRITEYUY2 is invoked with dst = %4, dstw = %5 and REG_a as the index.
 * NOTE(review): dummy, dstW_reg and uv_off are not referenced in the visible
 * body; presumably YSCALEYUV2PACKEDX_END supplies the operand list that uses
 * them (compare the explicit operand list in yuv2bgr24_X) -- confirm there.
 */
00643 static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
00644                                      const int16_t **lumSrc, int lumFilterSize,
00645                                      const int16_t *chrFilter, const int16_t **chrUSrc,
00646                                      const int16_t **chrVSrc,
00647                                      int chrFilterSize, const int16_t **alpSrc,
00648                                      uint8_t *dest, int dstW, int dstY)
00649 {
00650     x86_reg dummy=0;
00651     x86_reg dstW_reg = dstW;
00652     x86_reg uv_off = c->uv_off_byte;
00653 
00654     YSCALEYUV2PACKEDX_ACCURATE
00655     /* scale down by 3 more bits the four registers WRITEYUY2 packs to bytes:
     * mm1/mm7 (even output bytes) and mm3/mm4 (odd output bytes) */
00656     "psraw $3, %%mm3    \n\t"
00657     "psraw $3, %%mm4    \n\t"
00658     "psraw $3, %%mm1    \n\t"
00659     "psraw $3, %%mm7    \n\t"
00660     WRITEYUY2(%4, %5, %%REGa)
00661     YSCALEYUV2PACKEDX_END
00662 }
00663 
/*
 * Multi-tap vertical scaler writing byte-interleaved output through
 * WRITEYUY2, built on YSCALEYUV2PACKEDX (defined outside this section).
 * Identical to yuv2yuyv422_X_ar except for the filter macro used.
 *
 * WRITEYUY2 is invoked with dst = %4, dstw = %5 and REG_a as the index.
 * NOTE(review): dummy, dstW_reg and uv_off are not referenced in the visible
 * body; presumably YSCALEYUV2PACKEDX_END supplies the operand list that uses
 * them (compare the explicit operand list in yuv2bgr24_X) -- confirm there.
 */
00664 static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
00665                                   const int16_t **lumSrc, int lumFilterSize,
00666                                   const int16_t *chrFilter, const int16_t **chrUSrc,
00667                                   const int16_t **chrVSrc,
00668                                   int chrFilterSize, const int16_t **alpSrc,
00669                                   uint8_t *dest, int dstW, int dstY)
00670 {
00671     x86_reg dummy=0;
00672     x86_reg dstW_reg = dstW;
00673     x86_reg uv_off = c->uv_off_byte;
00674 
00675     YSCALEYUV2PACKEDX
00676     /* scale down by 3 more bits the four registers WRITEYUY2 packs to bytes:
     * mm1/mm7 (even output bytes) and mm3/mm4 (odd output bytes) */
00677     "psraw $3, %%mm3    \n\t"
00678     "psraw $3, %%mm4    \n\t"
00679     "psraw $3, %%mm1    \n\t"
00680     "psraw $3, %%mm7    \n\t"
00681     WRITEYUY2(%4, %5, %%REGa)
00682     YSCALEYUV2PACKEDX_END
00683 }
00684 
/*
 * First stage of the two-line YUV->RGB loop (chroma part).
 *
 * Zeroes 'index', emits the aligned loop label "1", then for four chroma
 * samples blends the two chroma lines addressed by operands %2 and %3:
 *     blended = ((line0 - line1) * coeff >> 16) + (line1 >> 4)
 * with coeff read from CHR_MMX_FILTER_OFFSET+8(c). The second chroma plane is
 * read from the same base pointers at index + UV_OFF_BYTE(c); index is
 * restored afterwards. U_OFFSET/V_OFFSET are subtracted and the results are
 * multiplied by UG_COEFF/VG_COEFF.
 *
 * 'c' is the register used as base for all the *_OFFSET/*_COEFF accesses.
 * Register state on exit is given by the last comment in the macro.
 */
00685 #define REAL_YSCALEYUV2RGB_UV(index, c) \
00686     "xor            "#index", "#index"  \n\t"\
00687     ".p2align              4            \n\t"\
00688     "1:                                 \n\t"\
00689     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index]*/\
00690     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index]*/\
00691     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
00692     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[index+uv_off]*/\
00693     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[index+uv_off]*/\
00694     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
00695     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[index] - uvbuf1[index]*/\
00696     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[index+uv_off] - uvbuf1[index+uv_off]*/\
00697     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
00698     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[index] - uvbuf1[index])uvalpha1>>16*/\
00699     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[index+uv_off] - uvbuf1[index+uv_off])uvalpha1>>16*/\
00700     "psraw                $4, %%mm3     \n\t" /* uvbuf1[index] >>4*/\
00701     "psraw                $4, %%mm4     \n\t" /* uvbuf1[index+uv_off] >>4*/\
00702     "paddw             %%mm2, %%mm3     \n\t" /* blended first chroma plane */\
00703     "paddw             %%mm5, %%mm4     \n\t" /* blended second chroma plane */\
00704     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
00705     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
00706     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
00707     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
00708     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
00709     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
00710     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00711 
/*
 * Blend eight 16-bit samples of two lines addressed by b1 and b2:
 *     result = ((b1[i] - b2[i]) * coeff >> 16) + (b2[i] >> 4)
 * with coeff read from LUM_MMX_FILTER_OFFSET+8(c). Samples are read at
 * b + 2*index and b + 2*index + 8. Results: mm1 (first four), mm7 (last
 * four). mm0 and mm6 are used as scratch.
 * In this file it is used for the luma lines (via YSCALEYUV2RGB) and for the
 * alpha lines (yuv2rgb32_2).
 */
00712 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
00713     "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[index]*/\
00714     "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[index]*/\
00715     "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[index+4]*/\
00716     "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[index+4]*/\
00717     "psubw             %%mm1, %%mm0     \n\t" /* buf0[index] - buf1[index]*/\
00718     "psubw             %%mm7, %%mm6     \n\t" /* buf0[index+4] - buf1[index+4]*/\
00719     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[index] - buf1[index])yalpha1>>16*/\
00720     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[index+4] - buf1[index+4])yalpha1>>16*/\
00721     "psraw                $4, %%mm1     \n\t" /* buf1[index] >>4*/\
00722     "psraw                $4, %%mm7     \n\t" /* buf1[index+4] >>4*/\
00723     "paddw             %%mm0, %%mm1     \n\t" /* blended samples 0..3 */\
00724     "paddw             %%mm6, %%mm7     \n\t" /* blended samples 4..7 */\
00725 
/*
 * Final stage of the YUV->RGB conversion for eight pixels.
 *
 * Expects the register state left by REAL_YSCALEYUV2RGB_UV and
 * REAL_YSCALEYUV2RGB_YA: mm1/mm7 = luma words, mm2/mm5 = offset-corrected
 * chroma, mm3/mm4 = green contributions. Applies UB_COEFF, VR_COEFF,
 * Y_OFFSET and Y_COEFF (all read relative to register c), duplicates each
 * chroma-derived word for two adjacent luma samples (punpckl/hwd of a
 * register with itself), adds the luma and packs to unsigned saturated bytes.
 *
 * Result: mm2, mm4 and mm5 each hold eight bytes (per the in-macro comments:
 * B, G and R respectively). mm0, mm3 and mm6 are used as scratch.
 */
00726 #define REAL_YSCALEYUV2RGB_COEFF(c) \
00727     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
00728     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
00729     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
00730     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
00731     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
00732     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
00733     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00734     "paddw             %%mm3, %%mm4     \n\t"\
00735     "movq              %%mm2, %%mm0     \n\t"\
00736     "movq              %%mm5, %%mm6     \n\t"\
00737     "movq              %%mm4, %%mm3     \n\t"\
00738     "punpcklwd         %%mm2, %%mm2     \n\t"\
00739     "punpcklwd         %%mm5, %%mm5     \n\t"\
00740     "punpcklwd         %%mm4, %%mm4     \n\t"\
00741     "paddw             %%mm1, %%mm2     \n\t"\
00742     "paddw             %%mm1, %%mm5     \n\t"\
00743     "paddw             %%mm1, %%mm4     \n\t"\
00744     "punpckhwd         %%mm0, %%mm0     \n\t"\
00745     "punpckhwd         %%mm6, %%mm6     \n\t"\
00746     "punpckhwd         %%mm3, %%mm3     \n\t"\
00747     "paddw             %%mm7, %%mm0     \n\t"\
00748     "paddw             %%mm7, %%mm6     \n\t"\
00749     "paddw             %%mm7, %%mm3     \n\t"\
00750     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00751     "packuswb          %%mm0, %%mm2     \n\t"\
00752     "packuswb          %%mm6, %%mm5     \n\t"\
00753     "packuswb          %%mm3, %%mm4     \n\t"\
00754 
/* Argument-expanding wrapper around REAL_YSCALEYUV2RGB_YA. */
00755 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
00756 
/*
 * Complete two-line YUV->RGB loop head for eight pixels: chroma blend
 * (operands %2/%3), luma blend (operands %0/%1), then coefficient stage.
 * Opens the loop at local label "1"; the WRITE* macro that follows is
 * expected to close it.
 */
00757 #define YSCALEYUV2RGB(index, c) \
00758     REAL_YSCALEYUV2RGB_UV(index, c) \
00759     REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
00760     REAL_YSCALEYUV2RGB_COEFF(c)
00761 
/*
 * Two-line vertical blend to 32-bit packed output (WRITEBGR32).
 *
 * Three code paths:
 *  - alpha, x86-64: r8 is the loop index, abuf0/abuf1 are extra register
 *    operands (%6/%7) and are blended with YSCALEYUV2RGB_YA.
 *  - alpha, otherwise: abuf0/abuf1 are stashed in c->u_temp / c->v_temp and
 *    temporarily swapped into operands %0/%1 around YSCALEYUV2RGB_YA.
 *  - no alpha: mm7 is set to all ones and passed where the alpha paths pass
 *    the packed alpha register.
 *
 * Common operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither (base for every *_OFFSET/*_COEFF access; 8280(%5) is
 * the loop bound, see the "8280 == DSTW_OFFSET" note in yuv2bgr24_2).
 *
 * vbuf, dstW, yalpha, uvalpha and y are not referenced here; the second
 * chroma plane is read from ubuf0/ubuf1 + UV_OFF_BYTE and the blend
 * coefficients / width are read from memory relative to %5 -- presumably
 * filled in by the caller (verify).
 *
 * NOTE(review): in the non-x86-64 and no-alpha paths REG_b is saved to
 * ESP_OFFSET(%5) and REG_BP is pushed/popped by hand; no clobber list
 * (registers, MMX state or "memory") is declared for those statements.
 */
00765 static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
00766                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
00767                                 const int16_t *abuf[2], uint8_t *dest,
00768                                 int dstW, int yalpha, int uvalpha, int y)
00769 {
00770     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
00771                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
00772 
00773     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
00774         const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
00775 #if ARCH_X86_64
00776         __asm__ volatile(
00777             YSCALEYUV2RGB(%%r8, %5)
            /* blend the two alpha lines (%6/%7) into mm1/mm7 */
00778             YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
00779             "psraw                  $3, %%mm1       \n\t" /* blended alpha, samples 0..3, >>3 */
00780             "psraw                  $3, %%mm7       \n\t" /* blended alpha, samples 4..7, >>3 */
00781             "packuswb            %%mm7, %%mm1       \n\t"
00782             WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
00783             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
00784                "a" (&c->redDither),
00785                "r" (abuf0), "r" (abuf1)
00786             : "%r8"
00787         );
00788 #else
        /* NOTE(review): int16_t pointers are stored through a uint16_t**
         * cast; the asm reloads them via U_TEMP/V_TEMP relative to %5 */
00789         *(const uint16_t **)(&c->u_temp)=abuf0;
00790         *(const uint16_t **)(&c->v_temp)=abuf1;
00791         __asm__ volatile(
            /* save REG_b in the context, then use it as destination pointer */
00792             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
00793             "mov        %4, %%"REG_b"               \n\t"
            /* REG_BP becomes the loop index */
00794             "push %%"REG_BP"                        \n\t"
00795             YSCALEYUV2RGB(%%REGBP, %5)
            /* temporarily replace the luma pointers in %0/%1 by the alpha
             * pointers so YSCALEYUV2RGB_YA can be reused, then restore them */
00796             "push                   %0              \n\t"
00797             "push                   %1              \n\t"
00798             "mov          "U_TEMP"(%5), %0          \n\t"
00799             "mov          "V_TEMP"(%5), %1          \n\t"
00800             YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
00801             "psraw                  $3, %%mm1       \n\t" /* blended alpha, samples 0..3, >>3 */
00802             "psraw                  $3, %%mm7       \n\t" /* blended alpha, samples 4..7, >>3 */
00803             "packuswb            %%mm7, %%mm1       \n\t"
00804             "pop                    %1              \n\t"
00805             "pop                    %0              \n\t"
00806             WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
00807             "pop %%"REG_BP"                         \n\t"
00808             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
00809             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
00810                "a" (&c->redDither)
00811         );
00812 #endif
00813     } else {
00814         __asm__ volatile(
00815             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
00816             "mov        %4, %%"REG_b"               \n\t"
00817             "push %%"REG_BP"                        \n\t"
00818             YSCALEYUV2RGB(%%REGBP, %5)
            /* mm7 = all ones (constant alpha bytes) */
00819             "pcmpeqd %%mm7, %%mm7                   \n\t"
00820             WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
00821             "pop %%"REG_BP"                         \n\t"
00822             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
00823             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
00824                "a" (&c->redDither)
00825         );
00826     }
00827 }
00828 
/*
 * Two-line vertical blend to 24-bit packed output (WRITEBGR24).
 *
 * Operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither. REG_b is saved to ESP_OFFSET(%5) and used as the
 * destination pointer; REG_BP is pushed and used as the loop index.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced directly; the
 * loop bound and blend coefficients are read from memory relative to %5.
 * NOTE(review): no clobber list is declared for this asm statement.
 */
00829 static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
00830                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
00831                                 const int16_t *abuf[2], uint8_t *dest,
00832                                 int dstW, int yalpha, int uvalpha, int y)
00833 {
00834     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
00835                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
00836 
00837     //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
00838     __asm__ volatile(
00839         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
00840         "mov        %4, %%"REG_b"               \n\t"
00841         "push %%"REG_BP"                        \n\t"
00842         YSCALEYUV2RGB(%%REGBP, %5)
        /* WRITEBGR24 is entered with mm7 cleared */
00843         "pxor    %%mm7, %%mm7                   \n\t"
00844         WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
00845         "pop %%"REG_BP"                         \n\t"
00846         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
00847         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
00848            "a" (&c->redDither)
00849     );
00850 }
00851 
/*
 * Two-line vertical blend written through WRITERGB15.
 *
 * Same register protocol as yuv2bgr24_2: %0..%3 = buf0, buf1, ubuf0, ubuf1,
 * %4 = dest, %5 = &c->redDither; REG_b holds the destination pointer and
 * REG_BP the loop index, both saved/restored by hand.
 * When DITHER1XBPP is defined, the per-channel dither values stored at
 * BLUE_DITHER/GREEN_DITHER/RED_DITHER(%5) are added with unsigned
 * saturation before the pixels are written.
 * NOTE(review): no clobber list is declared for this asm statement.
 */
00852 static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
00853                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
00854                                  const int16_t *abuf[2], uint8_t *dest,
00855                                  int dstW, int yalpha, int uvalpha, int y)
00856 {
00857     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
00858                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
00859 
00860     //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
00861     __asm__ volatile(
00862         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
00863         "mov        %4, %%"REG_b"               \n\t"
00864         "push %%"REG_BP"                        \n\t"
00865         YSCALEYUV2RGB(%%REGBP, %5)
00866         "pxor    %%mm7, %%mm7                   \n\t"
00867         /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00868 #ifdef DITHER1XBPP
00869         "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
00870         "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
00871         "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
00872 #endif
00873         WRITERGB15(%%REGb, 8280(%5), %%REGBP)
00874         "pop %%"REG_BP"                         \n\t"
00875         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
00876         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
00877            "a" (&c->redDither)
00878     );
00879 }
00880 
/*
 * Two-line vertical blend written through WRITERGB16.
 *
 * Identical to yuv2rgb555_2 except for the writer macro: %0..%3 = buf0,
 * buf1, ubuf0, ubuf1, %4 = dest, %5 = &c->redDither; REG_b holds the
 * destination pointer and REG_BP the loop index, both saved/restored by
 * hand. With DITHER1XBPP defined the stored per-channel dither values are
 * added with unsigned saturation before writing.
 * NOTE(review): no clobber list is declared for this asm statement.
 */
00881 static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
00882                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
00883                                  const int16_t *abuf[2], uint8_t *dest,
00884                                  int dstW, int yalpha, int uvalpha, int y)
00885 {
00886     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
00887                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
00888 
00889     //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
00890     __asm__ volatile(
00891         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
00892         "mov        %4, %%"REG_b"               \n\t"
00893         "push %%"REG_BP"                        \n\t"
00894         YSCALEYUV2RGB(%%REGBP, %5)
00895         "pxor    %%mm7, %%mm7                   \n\t"
00896         /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00897 #ifdef DITHER1XBPP
00898         "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
00899         "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
00900         "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
00901 #endif
00902         WRITERGB16(%%REGb, 8280(%5), %%REGBP)
00903         "pop %%"REG_BP"                         \n\t"
00904         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
00905         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
00906            "a" (&c->redDither)
00907     );
00908 }
00909 
/*
 * Two-line vertical blend for packed YUV output (no colour conversion).
 *
 * Before the loop, the chroma and luma blend coefficients stored at
 * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c) are shifted right
 * by 3 and written back in place, i.e. the macro modifies the memory it
 * reads its coefficients from.
 *
 * Loop body (label "1"), per iteration:
 *   chroma (operands %2/%3, 4 samples at index and at index + UV_OFF_BYTE):
 *       ((line0 - line1) * coeff >> 16) + (line1 >> 7)  -> mm3, mm4
 *   luma (operands %0/%1, 8 samples at 2*index):
 *       ((line0 - line1) * coeff >> 16) + (line1 >> 7)  -> mm1, mm7
 * mm0, mm2, mm5 and mm6 are used as scratch. The loop is expected to be
 * closed by the writer macro that follows (e.g. WRITEYUY2).
 */
00910 #define REAL_YSCALEYUV2PACKED(index, c) \
00911     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
00912     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
00913     "psraw                $3, %%mm0                           \n\t"\
00914     "psraw                $3, %%mm1                           \n\t"\
00915     "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
00916     "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
00917     "xor            "#index", "#index"                        \n\t"\
00918     ".p2align              4            \n\t"\
00919     "1:                                 \n\t"\
00920     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index]*/\
00921     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index]*/\
00922     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
00923     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[index+uv_off]*/\
00924     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[index+uv_off]*/\
00925     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
00926     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[index] - uvbuf1[index]*/\
00927     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[index+uv_off] - uvbuf1[index+uv_off]*/\
00928     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
00929     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[index] - uvbuf1[index])uvalpha1>>16*/\
00930     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[index+uv_off] - uvbuf1[index+uv_off])uvalpha1>>16*/\
00931     "psraw                $7, %%mm3     \n\t" /* uvbuf1[index] >>7*/\
00932     "psraw                $7, %%mm4     \n\t" /* uvbuf1[index+uv_off] >>7*/\
00933     "paddw             %%mm2, %%mm3     \n\t" /* blended first chroma plane */\
00934     "paddw             %%mm5, %%mm4     \n\t" /* blended second chroma plane */\
00935     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[index]*/\
00936     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[index]*/\
00937     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[index+4]*/\
00938     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[index+4]*/\
00939     "psubw             %%mm1, %%mm0     \n\t" /* buf0[index] - buf1[index]*/\
00940     "psubw             %%mm7, %%mm6     \n\t" /* buf0[index+4] - buf1[index+4]*/\
00941     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[index] - buf1[index])yalpha1>>16*/\
00942     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[index+4] - buf1[index+4])yalpha1>>16*/\
00943     "psraw                $7, %%mm1     \n\t" /* buf1[index] >>7*/\
00944     "psraw                $7, %%mm7     \n\t" /* buf1[index+4] >>7*/\
00945     "paddw             %%mm0, %%mm1     \n\t" /* blended luma, samples 0..3 */\
00946     "paddw             %%mm6, %%mm7     \n\t" /* blended luma, samples 4..7 */\
00947 
/* Extra expansion level so that macro arguments are expanded before '#'. */
00948 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
00949 
/*
 * Two-line vertical blend written as byte-interleaved packed YUV through
 * WRITEYUY2.
 *
 * Operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither. REG_b is saved to ESP_OFFSET(%5) and used as the
 * destination pointer; REG_BP is pushed and used as the loop index.
 * Note that YSCALEYUV2PACKED rewrites the blend coefficients stored relative
 * to %5 (shifts them right by 3 in place) on every call.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced directly.
 * NOTE(review): no clobber list is declared for this asm statement.
 */
00950 static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
00951                                   const int16_t *ubuf[2], const int16_t *vbuf[2],
00952                                   const int16_t *abuf[2], uint8_t *dest,
00953                                   int dstW, int yalpha, int uvalpha, int y)
00954 {
00955     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
00956                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
00957 
00958     //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
00959     __asm__ volatile(
00960         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
00961         "mov %4, %%"REG_b"                        \n\t"
00962         "push %%"REG_BP"                        \n\t"
00963         YSCALEYUV2PACKED(%%REGBP, %5)
00964         WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
00965         "pop %%"REG_BP"                         \n\t"
00966         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
00967         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
00968            "a" (&c->redDither)
00969     );
00970 }
00971 
/*
 * Single-line YUV->RGB loop head for eight pixels (no vertical blending).
 *
 * Chroma is read only from operand %2 (at index and at index + UV_OFF_BYTE),
 * luma only from operand %0; operands %1 and %3 are not referenced. All
 * samples are shifted right by 4, then the same offset/coefficient sequence
 * as in REAL_YSCALEYUV2RGB_UV / REAL_YSCALEYUV2RGB_COEFF is applied.
 * Opens the loop at local label "1"; results are eight packed bytes each in
 * mm2, mm4 and mm5 (per the in-macro comments: B, G, R).
 */
00972 #define REAL_YSCALEYUV2RGB1(index, c) \
00973     "xor            "#index", "#index"  \n\t"\
00974     ".p2align              4            \n\t"\
00975     "1:                                 \n\t"\
00976     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[index]*/\
00977     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
00978     "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[index+uv_off]*/\
00979     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
00980     "psraw                $4, %%mm3     \n\t" /* uvbuf0[index] >>4*/\
00981     "psraw                $4, %%mm4     \n\t" /* uvbuf0[index+uv_off] >>4*/\
00982     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
00983     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
00984     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
00985     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
00986     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
00987     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
00988     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00989     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[index]*/\
00990     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[index+4]*/\
00991     "psraw                $4, %%mm1     \n\t" /* buf0[index] >>4*/\
00992     "psraw                $4, %%mm7     \n\t" /* buf0[index+4] >>4*/\
00993     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
00994     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
00995     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
00996     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
00997     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
00998     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
00999     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
01000     "paddw             %%mm3, %%mm4     \n\t"\
01001     "movq              %%mm2, %%mm0     \n\t"\
01002     "movq              %%mm5, %%mm6     \n\t"\
01003     "movq              %%mm4, %%mm3     \n\t"\
01004     "punpcklwd         %%mm2, %%mm2     \n\t"\
01005     "punpcklwd         %%mm5, %%mm5     \n\t"\
01006     "punpcklwd         %%mm4, %%mm4     \n\t"\
01007     "paddw             %%mm1, %%mm2     \n\t"\
01008     "paddw             %%mm1, %%mm5     \n\t"\
01009     "paddw             %%mm1, %%mm4     \n\t"\
01010     "punpckhwd         %%mm0, %%mm0     \n\t"\
01011     "punpckhwd         %%mm6, %%mm6     \n\t"\
01012     "punpckhwd         %%mm3, %%mm3     \n\t"\
01013     "paddw             %%mm7, %%mm0     \n\t"\
01014     "paddw             %%mm7, %%mm6     \n\t"\
01015     "paddw             %%mm7, %%mm3     \n\t"\
01016     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
01017     "packuswb          %%mm0, %%mm2     \n\t"\
01018     "packuswb          %%mm6, %%mm5     \n\t"\
01019     "packuswb          %%mm3, %%mm4     \n\t"\
01020 
/* Extra expansion level so that macro arguments are expanded before '#'. */
01021 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
01022 
/*
 * Single-luma-line YUV->RGB loop head with vertical chroma interpolation.
 *
 * Differs from REAL_YSCALEYUV2RGB1 only in the chroma load: the samples of
 * the two chroma lines (operands %2 and %3) are added and the sum is shifted
 * right logically by 5 (the sum of both lines >>5 instead of one line >>4).
 * The 16-bit addition can wrap, as the FIXME comments point out.
 * Luma is read only from operand %0; operand %1 is not referenced.
 */
01023 // do vertical chrominance interpolation
01024 #define REAL_YSCALEYUV2RGB1b(index, c) \
01025     "xor            "#index", "#index"  \n\t"\
01026     ".p2align              4            \n\t"\
01027     "1:                                 \n\t"\
01028     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index]*/\
01029     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index]*/\
01030     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01031     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[index+uv_off]*/\
01032     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[index+uv_off]*/\
01033     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01034     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[index] + uvbuf1[index]*/\
01035     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[index+uv_off] + uvbuf1[index+uv_off]*/\
01036     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
01037     "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
01038     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
01039     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
01040     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
01041     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
01042     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
01043     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
01044     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
01045     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[index]*/\
01046     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[index+4]*/\
01047     "psraw                $4, %%mm1     \n\t" /* buf0[index] >>4*/\
01048     "psraw                $4, %%mm7     \n\t" /* buf0[index+4] >>4*/\
01049     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
01050     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
01051     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
01052     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
01053     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
01054     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
01055     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
01056     "paddw             %%mm3, %%mm4     \n\t"\
01057     "movq              %%mm2, %%mm0     \n\t"\
01058     "movq              %%mm5, %%mm6     \n\t"\
01059     "movq              %%mm4, %%mm3     \n\t"\
01060     "punpcklwd         %%mm2, %%mm2     \n\t"\
01061     "punpcklwd         %%mm5, %%mm5     \n\t"\
01062     "punpcklwd         %%mm4, %%mm4     \n\t"\
01063     "paddw             %%mm1, %%mm2     \n\t"\
01064     "paddw             %%mm1, %%mm5     \n\t"\
01065     "paddw             %%mm1, %%mm4     \n\t"\
01066     "punpckhwd         %%mm0, %%mm0     \n\t"\
01067     "punpckhwd         %%mm6, %%mm6     \n\t"\
01068     "punpckhwd         %%mm3, %%mm3     \n\t"\
01069     "paddw             %%mm7, %%mm0     \n\t"\
01070     "paddw             %%mm7, %%mm6     \n\t"\
01071     "paddw             %%mm7, %%mm3     \n\t"\
01072     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
01073     "packuswb          %%mm0, %%mm2     \n\t"\
01074     "packuswb          %%mm6, %%mm5     \n\t"\
01075     "packuswb          %%mm3, %%mm4     \n\t"\
01076 
/* Extra expansion level so that macro arguments are expanded before '#'. */
01077 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
01078 
/*
 * Load eight 16-bit alpha samples from operand %1 at 2*index, shift each
 * right by 7 and pack them to eight unsigned saturated bytes in mm7.
 * mm1 is used as scratch. The caller must bind the alpha line to operand %1.
 */
01079 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
01080     "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
01081     "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
01082     "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
01083     "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
01084     "packuswb          %%mm1, %%mm7     \n\t"
/* Extra expansion level so that macro arguments are expanded before '#'. */
01085 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
01086 
/*
 * Single-luma-line conversion to 32-bit packed output (WRITEBGR32).
 *
 * uvalpha selects the chroma handling:
 *   uvalpha <  2048: YSCALEYUV2RGB1  -- chroma taken from ubuf0 only;
 *   uvalpha >= 2048: YSCALEYUV2RGB1b -- chroma of ubuf0 and ubuf1 combined.
 * Within each case, when CONFIG_SWSCALE_ALPHA is set and c->alpPixBuf is
 * non-NULL, abuf0 is bound to operand %1 and converted by
 * YSCALEYUV2RGB1_ALPHA into mm7; otherwise mm7 is set to all ones.
 *
 * Operands: %0 = buf0, %1 = abuf0 or buf1, %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither. REG_b is saved to ESP_OFFSET(%5) and used
 * as the destination pointer; REG_BP is pushed and used as the loop index.
 *
 * NOTE(review): the third buffer parameter is spelled 'bguf' here while the
 * *_2 functions call it 'vbuf'; it is unused in this body (the second chroma
 * plane is read from ubuf + UV_OFF_BYTE), so this looks like a typo.
 * NOTE(review): no clobber list is declared for these asm statements.
 */
01090 static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
01091                                 const int16_t *ubuf[2], const int16_t *bguf[2],
01092                                 const int16_t *abuf0, uint8_t *dest,
01093                                 int dstW, int uvalpha, int y)
01094 {
01095     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01096     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
01097 
01098     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
01099         if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
01100             __asm__ volatile(
01101                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01102                 "mov        %4, %%"REG_b"               \n\t"
01103                 "push %%"REG_BP"                        \n\t"
01104                 YSCALEYUV2RGB1(%%REGBP, %5)
01105                 YSCALEYUV2RGB1_ALPHA(%%REGBP)
01106                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01107                 "pop %%"REG_BP"                         \n\t"
01108                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                /* operand %1 is the alpha line here, not buf1 */
01109                 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01110                    "a" (&c->redDither)
01111             );
01112         } else {
01113             __asm__ volatile(
01114                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01115                 "mov        %4, %%"REG_b"               \n\t"
01116                 "push %%"REG_BP"                        \n\t"
01117                 YSCALEYUV2RGB1(%%REGBP, %5)
                /* mm7 = all ones (constant alpha bytes) */
01118                 "pcmpeqd %%mm7, %%mm7                   \n\t"
01119                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01120                 "pop %%"REG_BP"                         \n\t"
01121                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01122                 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01123                    "a" (&c->redDither)
01124             );
01125         }
01126     } else {
01127         if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
01128             __asm__ volatile(
01129                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01130                 "mov        %4, %%"REG_b"               \n\t"
01131                 "push %%"REG_BP"                        \n\t"
01132                 YSCALEYUV2RGB1b(%%REGBP, %5)
01133                 YSCALEYUV2RGB1_ALPHA(%%REGBP)
01134                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01135                 "pop %%"REG_BP"                         \n\t"
01136                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                /* operand %1 is the alpha line here, not buf1 */
01137                 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01138                    "a" (&c->redDither)
01139             );
01140         } else {
01141             __asm__ volatile(
01142                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01143                 "mov        %4, %%"REG_b"               \n\t"
01144                 "push %%"REG_BP"                        \n\t"
01145                 YSCALEYUV2RGB1b(%%REGBP, %5)
                /* mm7 = all ones (constant alpha bytes) */
01146                 "pcmpeqd %%mm7, %%mm7                   \n\t"
01147                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01148                 "pop %%"REG_BP"                         \n\t"
01149                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01150                 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01151                    "a" (&c->redDither)
01152             );
01153         }
01154     }
01155 }
01156 
/*
 * Single-luma-line conversion to 24-bit packed output (WRITEBGR24).
 *
 * uvalpha selects the chroma handling:
 *   uvalpha <  2048: YSCALEYUV2RGB1  -- chroma taken from ubuf0 only;
 *   uvalpha >= 2048: YSCALEYUV2RGB1b -- chroma of ubuf0 and ubuf1 combined.
 *
 * Operands: %0 = buf0, %1 = buf1 (alias of buf0, not read by the RGB1
 * macros), %2 = ubuf0, %3 = ubuf1, %4 = dest, %5 = &c->redDither.
 * REG_b is saved to ESP_OFFSET(%5) and used as the destination pointer;
 * REG_BP is pushed and used as the loop index.
 *
 * NOTE(review): parameter 'bguf' is unused and is presumably a typo for
 * 'vbuf' (the name used by the *_2 functions). abuf0, dstW and y are not
 * referenced either.
 * NOTE(review): no clobber list is declared for these asm statements.
 */
01157 static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
01158                                 const int16_t *ubuf[2], const int16_t *bguf[2],
01159                                 const int16_t *abuf0, uint8_t *dest,
01160                                 int dstW, int uvalpha, int y)
01161 {
01162     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01163     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
01164 
01165     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
01166         __asm__ volatile(
01167             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01168             "mov        %4, %%"REG_b"               \n\t"
01169             "push %%"REG_BP"                        \n\t"
01170             YSCALEYUV2RGB1(%%REGBP, %5)
01171             "pxor    %%mm7, %%mm7                   \n\t"
01172             WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
01173             "pop %%"REG_BP"                         \n\t"
01174             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01175             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01176                "a" (&c->redDither)
01177         );
01178     } else {
01179         __asm__ volatile(
01180             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01181             "mov        %4, %%"REG_b"               \n\t"
01182             "push %%"REG_BP"                        \n\t"
01183             YSCALEYUV2RGB1b(%%REGBP, %5)
01184             "pxor    %%mm7, %%mm7                   \n\t"
01185             WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
01186             "pop %%"REG_BP"                         \n\t"
01187             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01188             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01189                "a" (&c->redDither)
01190         );
01191     }
01192 }
01193 
/*
 * Single-luma-line conversion written through WRITERGB15.
 *
 * uvalpha selects the chroma handling:
 *   uvalpha <  2048: YSCALEYUV2RGB1  -- chroma taken from ubuf0 only;
 *   uvalpha >= 2048: YSCALEYUV2RGB1b -- chroma of ubuf0 and ubuf1 combined.
 * When DITHER1XBPP is defined, the per-channel dither values stored at
 * BLUE_DITHER/GREEN_DITHER/RED_DITHER(%5) are added with unsigned
 * saturation before the pixels are written.
 *
 * Operands: %0 = buf0, %1 = buf1 (alias of buf0, not read by the RGB1
 * macros), %2 = ubuf0, %3 = ubuf1, %4 = dest, %5 = &c->redDither.
 * REG_b is saved to ESP_OFFSET(%5) and used as the destination pointer;
 * REG_BP is pushed and used as the loop index.
 *
 * NOTE(review): parameter 'bguf' is unused and is presumably a typo for
 * 'vbuf' (the name used by the *_2 functions).
 * NOTE(review): no clobber list is declared for these asm statements.
 */
01194 static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
01195                                  const int16_t *ubuf[2], const int16_t *bguf[2],
01196                                  const int16_t *abuf0, uint8_t *dest,
01197                                  int dstW, int uvalpha, int y)
01198 {
01199     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01200     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
01201 
01202     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
01203         __asm__ volatile(
01204             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01205             "mov        %4, %%"REG_b"               \n\t"
01206             "push %%"REG_BP"                        \n\t"
01207             YSCALEYUV2RGB1(%%REGBP, %5)
01208             "pxor    %%mm7, %%mm7                   \n\t"
01209             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01210 #ifdef DITHER1XBPP
01211             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01212             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01213             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01214 #endif
01215             WRITERGB15(%%REGb, 8280(%5), %%REGBP)
01216             "pop %%"REG_BP"                         \n\t"
01217             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01218             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01219                "a" (&c->redDither)
01220         );
01221     } else {
01222         __asm__ volatile(
01223             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01224             "mov        %4, %%"REG_b"               \n\t"
01225             "push %%"REG_BP"                        \n\t"
01226             YSCALEYUV2RGB1b(%%REGBP, %5)
01227             "pxor    %%mm7, %%mm7                   \n\t"
01228             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01229 #ifdef DITHER1XBPP
01230             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01231             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01232             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01233 #endif
01234             WRITERGB15(%%REGb, 8280(%5), %%REGBP)
01235             "pop %%"REG_BP"                         \n\t"
01236             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01237             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01238                "a" (&c->redDither)
01239         );
01240     }
01241 }
01242 
/*
 * yuv2rgb565_1: MMX inline-asm output routine. It runs buf0 and the chroma
 * buffers ubuf[0]/ubuf[1] through YSCALEYUV2RGB1 or YSCALEYUV2RGB1b and hands
 * the resulting B/G/R registers to WRITERGB16, which stores at dest.
 *
 * c        context; &c->redDither is passed to the asm as operand %5 and is
 *          the base for ESP_OFFSET, the *_DITHER offsets and the 8280 offset
 * buf0     asm operand %0; buf1 is an alias of it and becomes operand %1
 * ubuf     ubuf[0] and ubuf[1] become asm operands %2 and %3
 * bguf     not referenced in this function (NOTE(review): the name looks like
 *          a typo for a second chroma parameter -- confirm against callers)
 * abuf0, dstW, y
 *          not referenced directly in this function
 * dest     operand %4, copied into REG_b inside the asm
 * uvalpha  selects the code path: values below 2048 use YSCALEYUV2RGB1,
 *          everything else uses YSCALEYUV2RGB1b
 *
 * When DITHER1XBPP is defined, the values at BLUE_DITHER/GREEN_DITHER/
 * RED_DITHER (relative to operand %5) are added with unsigned saturation
 * before the pixels are written.
 *
 * NOTE(review): both asm statements list inputs only (no outputs, no
 * clobbers); REG_b and REG_BP are saved and restored by hand instead.
 */
01243 static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
01244                                  const int16_t *ubuf[2], const int16_t *bguf[2],
01245                                  const int16_t *abuf0, uint8_t *dest,
01246                                  int dstW, int uvalpha, int y)
01247 {
01248     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01249     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
01250 
01251     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
01252         __asm__ volatile(
01253             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* park REG_b in the slot at ESP_OFFSET(%5) */
01254             "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
01255             "push %%"REG_BP"                        \n\t" /* REG_BP is handed to the macros below */
01256             YSCALEYUV2RGB1(%%REGBP, %5)
01257             "pxor    %%mm7, %%mm7                   \n\t"
01258             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01259 #ifdef DITHER1XBPP
01260             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01261             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01262             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01263 #endif
01264             WRITERGB16(%%REGb, 8280(%5), %%REGBP)
01265             "pop %%"REG_BP"                         \n\t" /* undo the push above */
01266             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore the parked REG_b */
01267             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01268                "a" (&c->redDither)
01269         );
01270     } else {
01271         __asm__ volatile(
01272             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* same save/restore framing as the branch above */
01273             "mov        %4, %%"REG_b"               \n\t"
01274             "push %%"REG_BP"                        \n\t"
01275             YSCALEYUV2RGB1b(%%REGBP, %5)
01276             "pxor    %%mm7, %%mm7                   \n\t"
01277             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01278 #ifdef DITHER1XBPP
01279             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01280             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01281             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01282 #endif
01283             WRITERGB16(%%REGb, 8280(%5), %%REGBP)
01284             "pop %%"REG_BP"                         \n\t"
01285             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01286             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01287                "a" (&c->redDither)
01288         );
01289     }
01290 }
01291 
/*
 * REAL_YSCALEYUV2PACKED1(index, c): asm text for the load stage of the
 * packed-output path that reads a single chroma buffer (operand %2 only).
 *
 *  - zeroes `index`, aligns, and opens local label "1:"
 *  - mm3 = 8 bytes of operand %2 at byte offset `index`
 *  - mm4 = 8 bytes of operand %2 at `index` plus the value stored at
 *          UV_OFF_BYTE(c); `index` is restored right after the load
 *  - mm1/mm7 = 16 bytes of operand %0 starting at byte offset 2*index
 *  - all four registers are arithmetically shifted right by 7 (per word)
 *
 * No branch back to "1:" appears in this macro; presumably it is supplied
 * by the WRITE* macro that follows it at the use sites.
 *
 * The last instruction line ends with a continuation backslash, so the
 * blank line after it belongs to the definition and must stay blank.
 *
 * YSCALEYUV2PACKED1 is a forwarding wrapper: the extra macro level lets the
 * arguments be expanded before REAL_... stringifies them with '#'.
 */
01292 #define REAL_YSCALEYUV2PACKED1(index, c) \
01293     "xor            "#index", "#index"  \n\t"\
01294     ".p2align              4            \n\t"\
01295     "1:                                 \n\t"\
01296     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[index] */\
01297     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01298     "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0 at index + value stored at UV_OFF_BYTE(c) */\
01299     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01300     "psraw                $7, %%mm3     \n\t" \
01301     "psraw                $7, %%mm4     \n\t" \
01302     "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0 at byte offset 2*index */\
01303     "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0 at byte offset 2*index + 8 */\
01304     "psraw                $7, %%mm1     \n\t" \
01305     "psraw                $7, %%mm7     \n\t" \
01306 
01307 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
01308 
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): asm text for the load stage of the
 * packed-output path that combines two chroma buffers (operands %2 and %3).
 *
 *  - zeroes `index`, aligns, and opens local label "1:"
 *  - mm2/mm3 = 8 bytes of operand %2 / %3 at byte offset `index`
 *  - mm5/mm4 = 8 bytes of operand %2 / %3 at `index` plus the value stored
 *              at UV_OFF_BYTE(c); `index` is restored right after the loads
 *  - mm3 = (mm2 + mm3) >> 8 and mm4 = (mm5 + mm4) >> 8, per 16-bit word,
 *          using a logical shift (psrlw)
 *  - mm1/mm7 = 16 bytes of operand %0 starting at byte offset 2*index,
 *              arithmetically shifted right by 7 (per word)
 *
 * No branch back to "1:" appears in this macro; presumably it is supplied
 * by the WRITE* macro that follows it at the use sites.
 *
 * YSCALEYUV2PACKED1b is a forwarding wrapper: the extra macro level lets the
 * arguments be expanded before REAL_... stringifies them with '#'.
 */
01309 #define REAL_YSCALEYUV2PACKED1b(index, c) \
01310     "xor "#index", "#index"             \n\t"\
01311     ".p2align              4            \n\t"\
01312     "1:                                 \n\t"\
01313     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index] */\
01314     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index] */\
01315     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01316     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0 at index + value stored at UV_OFF_BYTE(c) */\
01317     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1 at index + value stored at UV_OFF_BYTE(c) */\
01318     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01319     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0 + uvbuf1, first chroma block */\
01320     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0 + uvbuf1, block at the UV_OFF_BYTE offset */\
01321     "psrlw                $8, %%mm3     \n\t" \
01322     "psrlw                $8, %%mm4     \n\t" \
01323     "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0 at byte offset 2*index */\
01324     "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0 at byte offset 2*index + 8 */\
01325     "psraw                $7, %%mm1     \n\t" \
01326     "psraw                $7, %%mm7     \n\t"
01327 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
01328 
/*
 * yuv2yuyv422_1: MMX inline-asm output routine. It loads buf0 and the chroma
 * buffers with YSCALEYUV2PACKED1 or YSCALEYUV2PACKED1b and passes the
 * registers to WRITEYUY2, which stores at dest.
 *
 * c        context; &c->redDither is passed to the asm as operand %5 and is
 *          the base for ESP_OFFSET, UV_OFF_BYTE and the 8280 offset
 * buf0     asm operand %0; buf1 is an alias of it and becomes operand %1
 * ubuf     ubuf[0] and ubuf[1] become asm operands %2 and %3
 * bguf     not referenced in this function (NOTE(review): the name looks like
 *          a typo for a second chroma parameter -- confirm against callers)
 * abuf0, dstW, y
 *          not referenced directly in this function
 * dest     operand %4, copied into REG_b inside the asm
 * uvalpha  selects the code path: below 2048 only operand %2 (ubuf0) is read
 *          for chroma (YSCALEYUV2PACKED1); otherwise operands %2 and %3 are
 *          summed and shifted right by 8 (YSCALEYUV2PACKED1b)
 *
 * NOTE(review): both asm statements list inputs only (no outputs, no
 * clobbers); REG_b and REG_BP are saved and restored by hand instead.
 */
01329 static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
01330                                   const int16_t *ubuf[2], const int16_t *bguf[2],
01331                                   const int16_t *abuf0, uint8_t *dest,
01332                                   int dstW, int uvalpha, int y)
01333 {
01334     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01335     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
01336 
01337     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
01338         __asm__ volatile(
01339             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* park REG_b in the slot at ESP_OFFSET(%5) */
01340             "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
01341             "push %%"REG_BP"                        \n\t" /* REG_BP serves as the macro's index register */
01342             YSCALEYUV2PACKED1(%%REGBP, %5)
01343             WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
01344             "pop %%"REG_BP"                         \n\t" /* undo the push above */
01345             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore the parked REG_b */
01346             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01347                "a" (&c->redDither)
01348         );
01349     } else {
01350         __asm__ volatile(
01351             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* same save/restore framing as the branch above */
01352             "mov        %4, %%"REG_b"               \n\t"
01353             "push %%"REG_BP"                        \n\t"
01354             YSCALEYUV2PACKED1b(%%REGBP, %5)
01355             WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
01356             "pop %%"REG_BP"                         \n\t"
01357             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01358             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01359                "a" (&c->redDither)
01360         );
01361     }
01362 }
01363 
/*
 * bgr24ToY_mmx: MMX loop that turns packed 3-byte pixels at src into one
 * output byte per pixel at dst.
 *
 * dst        output buffer; the asm addresses it as (dst + width)[REG_a]
 *            with REG_a counting up from -width
 * src        input; advanced by 12 bytes per iteration (in/out operand %0)
 * width      number of output bytes requested
 * srcFormat  PIX_FMT_BGR24 selects ff_bgr24toY1Coeff/ff_bgr24toY2Coeff, any
 *            other value selects ff_rgb24toY1Coeff/ff_rgb24toY2Coeff
 *
 * Each iteration consumes 12 source bytes and stores 4 output bytes: the
 * byte pairs are widened to words, multiplied-and-added against mm5/mm6,
 * offset by ff_bgr24toYOffset, shifted right by 15 and packed with
 * saturation.
 *
 * NOTE(review):
 *  - The coefficients are loaded into mm5/mm6 by a separate asm statement;
 *    the main loop relies on those registers surviving in between.
 *  - The loop body runs before the counter is tested and steps by 4, so a
 *    width that is not a positive multiple of 4 still produces whole groups
 *    of 4 output bytes.
 *  - The main asm stores through operand %1 but declares only REG_a as
 *    clobbered (no "memory" clobber).
 */
01364 static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
01365                                                   int width, enum PixelFormat srcFormat)
01366 {
01367 
01368     if(srcFormat == PIX_FMT_BGR24) {
01369         __asm__ volatile(
01370             "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
01371             "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
01372             :
01373         );
01374     } else {
01375         __asm__ volatile(
01376             "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
01377             "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
01378             :
01379         );
01380     }
01381 
01382     __asm__ volatile(
01383         "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
01384         "mov                        %2, %%"REG_a"   \n\t" /* REG_a = -width */
01385         "pxor                    %%mm7, %%mm7       \n\t" /* mm7 = 0, used to widen bytes to words */
01386         "1:                                         \n\t"
01387         PREFETCH"               64(%0)              \n\t"
01388         "movd                     (%0), %%mm0       \n\t" /* source bytes 0..3 */
01389         "movd                    2(%0), %%mm1       \n\t" /* source bytes 2..5 */
01390         "movd                    6(%0), %%mm2       \n\t" /* source bytes 6..9 */
01391         "movd                    8(%0), %%mm3       \n\t" /* source bytes 8..11 */
01392         "add                       $12, %0          \n\t" /* advance src by 12 bytes */
01393         "punpcklbw               %%mm7, %%mm0       \n\t"
01394         "punpcklbw               %%mm7, %%mm1       \n\t"
01395         "punpcklbw               %%mm7, %%mm2       \n\t"
01396         "punpcklbw               %%mm7, %%mm3       \n\t"
01397         "pmaddwd                 %%mm5, %%mm0       \n\t"
01398         "pmaddwd                 %%mm6, %%mm1       \n\t"
01399         "pmaddwd                 %%mm5, %%mm2       \n\t"
01400         "pmaddwd                 %%mm6, %%mm3       \n\t"
01401         "paddd                   %%mm1, %%mm0       \n\t"
01402         "paddd                   %%mm3, %%mm2       \n\t"
01403         "paddd                   %%mm4, %%mm0       \n\t" /* add ff_bgr24toYOffset */
01404         "paddd                   %%mm4, %%mm2       \n\t"
01405         "psrad                     $15, %%mm0       \n\t"
01406         "psrad                     $15, %%mm2       \n\t"
01407         "packssdw                %%mm2, %%mm0       \n\t"
01408         "packuswb                %%mm0, %%mm0       \n\t"
01409         "movd                %%mm0, (%1, %%"REG_a") \n\t" /* store 4 bytes at (dst + width)[REG_a] */
01410         "add                        $4, %%"REG_a"   \n\t"
01411         " js                        1b              \n\t" /* loop while REG_a is still negative */
01412     : "+r" (src)
01413     : "r" (dst+width), "g" ((x86_reg)-width)
01414     : "%"REG_a
01415     );
01416 }
01417 
/*
 * bgr24ToY: forwards to bgr24ToY_mmx with srcFormat fixed to PIX_FMT_BGR24.
 * The `unused` parameter is ignored.
 */
01418 static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src,
01419                              int width, uint32_t *unused)
01420 {
01421     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
01422 }
01423 
/*
 * rgb24ToY: forwards to bgr24ToY_mmx with srcFormat fixed to PIX_FMT_RGB24,
 * which makes the helper load the ff_rgb24toY*Coeff constants.
 * The `unused` parameter is ignored.
 */
01424 static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src,
01425                              int width, uint32_t *unused)
01426 {
01427     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
01428 }
01429 
/*
 * bgr24ToUV_mmx: MMX loop that turns packed 3-byte pixels at src into one
 * byte per pixel in each of dstU and dstV.
 *
 * dstU, dstV  output buffers; addressed as (dstX + width)[REG_a] with REG_a
 *             counting up from -width
 * src         input; advanced by 12 bytes per iteration (in/out operand %0)
 * width       number of output bytes requested per plane
 * srcFormat   picks the coefficient table: ff_bgr24toUV[1] when it equals
 *             PIX_FMT_RGB24, ff_bgr24toUV[0] otherwise
 *
 * The table (operand %4) is read as four 8-byte entries: offsets 0 and 8
 * feed the value stored to dstU, offsets 16 and 24 (the latter cached in
 * mm6) feed the value stored to dstV. Results are offset by
 * ff_bgr24toUVOffset, shifted right by 15 and packed with saturation.
 *
 * NOTE(review):
 *  - The loop body runs before the counter is tested and steps by 4, so a
 *    width that is not a positive multiple of 4 still produces whole groups
 *    of 4 output bytes per plane.
 *  - The asm stores through operands %1 and %2 but declares only REG_a as
 *    clobbered (no "memory" clobber).
 */
01430 static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV,
01431                                                    const uint8_t *src, int width,
01432                                                    enum PixelFormat srcFormat)
01433 {
01434     __asm__ volatile(
01435         "movq                    24(%4), %%mm6       \n\t" /* cache the 4th table entry */
01436         "mov                        %3, %%"REG_a"   \n\t" /* REG_a = -width */
01437         "pxor                    %%mm7, %%mm7       \n\t" /* mm7 = 0, used to widen bytes to words */
01438         "1:                                         \n\t"
01439         PREFETCH"               64(%0)              \n\t"
01440         "movd                     (%0), %%mm0       \n\t" /* source bytes 0..3 */
01441         "movd                    2(%0), %%mm1       \n\t" /* source bytes 2..5 */
01442         "punpcklbw               %%mm7, %%mm0       \n\t"
01443         "punpcklbw               %%mm7, %%mm1       \n\t"
01444         "movq                    %%mm0, %%mm2       \n\t"
01445         "movq                    %%mm1, %%mm3       \n\t"
01446         "pmaddwd                  (%4), %%mm0       \n\t"
01447         "pmaddwd                 8(%4), %%mm1       \n\t"
01448         "pmaddwd                16(%4), %%mm2       \n\t"
01449         "pmaddwd                 %%mm6, %%mm3       \n\t"
01450         "paddd                   %%mm1, %%mm0       \n\t"
01451         "paddd                   %%mm3, %%mm2       \n\t"
01452 
01453         "movd                    6(%0), %%mm1       \n\t" /* source bytes 6..9 */
01454         "movd                    8(%0), %%mm3       \n\t" /* source bytes 8..11 */
01455         "add                       $12, %0          \n\t" /* advance src by 12 bytes */
01456         "punpcklbw               %%mm7, %%mm1       \n\t"
01457         "punpcklbw               %%mm7, %%mm3       \n\t"
01458         "movq                    %%mm1, %%mm4       \n\t"
01459         "movq                    %%mm3, %%mm5       \n\t"
01460         "pmaddwd                  (%4), %%mm1       \n\t"
01461         "pmaddwd                 8(%4), %%mm3       \n\t"
01462         "pmaddwd                16(%4), %%mm4       \n\t"
01463         "pmaddwd                 %%mm6, %%mm5       \n\t"
01464         "paddd                   %%mm3, %%mm1       \n\t"
01465         "paddd                   %%mm5, %%mm4       \n\t"
01466 
01467         "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t" /* reloaded on every iteration */
01468         "paddd                   %%mm3, %%mm0       \n\t"
01469         "paddd                   %%mm3, %%mm2       \n\t"
01470         "paddd                   %%mm3, %%mm1       \n\t"
01471         "paddd                   %%mm3, %%mm4       \n\t"
01472         "psrad                     $15, %%mm0       \n\t"
01473         "psrad                     $15, %%mm2       \n\t"
01474         "psrad                     $15, %%mm1       \n\t"
01475         "psrad                     $15, %%mm4       \n\t"
01476         "packssdw                %%mm1, %%mm0       \n\t"
01477         "packssdw                %%mm4, %%mm2       \n\t"
01478         "packuswb                %%mm0, %%mm0       \n\t"
01479         "packuswb                %%mm2, %%mm2       \n\t"
01480         "movd                %%mm0, (%1, %%"REG_a") \n\t" /* 4 bytes to (dstU + width)[REG_a] */
01481         "movd                %%mm2, (%2, %%"REG_a") \n\t" /* 4 bytes to (dstV + width)[REG_a] */
01482         "add                        $4, %%"REG_a"   \n\t"
01483         " js                        1b              \n\t" /* loop while REG_a is still negative */
01484     : "+r" (src)
01485     : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
01486     : "%"REG_a
01487     );
01488 }
01489 
/*
 * bgr24ToUV: forwards to bgr24ToUV_mmx with srcFormat fixed to
 * PIX_FMT_BGR24.
 *
 * Only src1 is converted; src2 must be the same pointer. That precondition
 * is asserted before any output is written, matching rgb24ToUV.
 * The `unused` parameter is ignored.
 */
01490 static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV,
01491                               const uint8_t *src1, const uint8_t *src2,
01492                               int width, uint32_t *unused)
01493 {
01494     assert(src1 == src2);
01495     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
01496 }
01497 
/*
 * rgb24ToUV: forwards to bgr24ToUV_mmx with srcFormat fixed to
 * PIX_FMT_RGB24, which selects the second coefficient table.
 *
 * Only src1 is converted; src2 is asserted to be the same pointer.
 * The `unused` parameter is ignored.
 */
01498 static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV,
01499                               const uint8_t *src1, const uint8_t *src2,
01500                               int width, uint32_t *unused)
01501 {
01502     assert(src1==src2);
01503     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
01504 }
01505 
01506 #if COMPILE_TEMPLATE_MMX2
/*
 * hyscale_fast: horizontal scaling of one line by calling the code block
 * stored in c->lumMmx2FilterCode, followed by a C fix-up of the right edge.
 *
 * c         context; supplies hLumFilterPos, hLumFilter and
 *           lumMmx2FilterCode
 * dst       output line of int16_t samples
 * dstWidth  number of output samples
 * src       input line of bytes
 * srcW      number of input samples
 * xInc      horizontal step; the edge fix-up treats it as 16.16 fixed point
 *           ((i*xInc)>>16 is compared with a source index)
 *
 * Register set-up before the calls: REG_c = src, REG_D = dst,
 * REG_d = filter, REG_b = filterPos, REG_a = 0, mm7 = 0.
 *
 * CALL_MMX2_FILTER_CODE (defined inside this asm statement and used again
 * by hcscale_fast) loads esi from (REG_b), calls the code pointed to by
 * operand %4, then advances REG_c by the 32-bit value at (REG_b, REG_a),
 * advances REG_D by REG_a and clears REG_a. It is expanded 8 times.
 *
 * Saved state:
 *  - with PIC, REG_b is stored in ebxsave and restored afterwards, and is
 *    therefore left out of the clobber list;
 *  - on x86-64 the word at -8(%rsp) is copied to retsave and written back
 *    afterwards; `call` pushes its return address into that slot
 *    (presumably this protects data the compiler keeps below the stack
 *    pointer -- confirm).
 *  The operand index of each save slot depends on which of the two
 *  conditions is active (%5/%6 with PIC, %5 for retsave without PIC).
 *
 * After the asm, every output index i (from dstWidth-1 downwards) whose
 * source position (i*xInc)>>16 is at or beyond srcW-1 is overwritten with
 * src[srcW-1]*128.
 *
 * NOTE(review): the asm writes through dst but declares no "memory"
 * clobber.
 */
01507 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
01508                                  int dstWidth, const uint8_t *src,
01509                                  int srcW, int xInc)
01510 {
01511     int16_t *filterPos = c->hLumFilterPos;
01512     int16_t *filter    = c->hLumFilter;
01513     void    *mmx2FilterCode= c->lumMmx2FilterCode;
01514     int i;
01515 #if defined(PIC)
01516     uint64_t ebxsave;
01517 #endif
01518 #if ARCH_X86_64
01519     uint64_t retsave;
01520 #endif
01521 
01522     __asm__ volatile(
01523 #if defined(PIC)
01524         "mov               %%"REG_b", %5        \n\t" /* ebxsave = REG_b */
01525 #if ARCH_X86_64
01526         "mov               -8(%%rsp), %%"REG_a" \n\t"
01527         "mov               %%"REG_a", %6        \n\t" /* retsave = word at -8(%rsp) */
01528 #endif
01529 #else
01530 #if ARCH_X86_64
01531         "mov               -8(%%rsp), %%"REG_a" \n\t"
01532         "mov               %%"REG_a", %5        \n\t" /* retsave = word at -8(%rsp) */
01533 #endif
01534 #endif
01535         "pxor                  %%mm7, %%mm7     \n\t"
01536         "mov                      %0, %%"REG_c" \n\t" /* REG_c = src */
01537         "mov                      %1, %%"REG_D" \n\t" /* REG_D = dst */
01538         "mov                      %2, %%"REG_d" \n\t" /* REG_d = filter */
01539         "mov                      %3, %%"REG_b" \n\t" /* REG_b = filterPos */
01540         "xor               %%"REG_a", %%"REG_a" \n\t" // i
01541         PREFETCH"        (%%"REG_c")            \n\t"
01542         PREFETCH"      32(%%"REG_c")            \n\t"
01543         PREFETCH"      64(%%"REG_c")            \n\t"
01544 
01545 #if ARCH_X86_64
01546 #define CALL_MMX2_FILTER_CODE \
01547         "movl            (%%"REG_b"), %%esi     \n\t"\
01548         "call                    *%4            \n\t"\
01549         "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
01550         "add               %%"REG_S", %%"REG_c" \n\t"\
01551         "add               %%"REG_a", %%"REG_D" \n\t"\
01552         "xor               %%"REG_a", %%"REG_a" \n\t"\
01553 
01554 #else
01555 #define CALL_MMX2_FILTER_CODE \
01556         "movl (%%"REG_b"), %%esi        \n\t"\
01557         "call         *%4                       \n\t"\
01558         "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
01559         "add               %%"REG_a", %%"REG_D" \n\t"\
01560         "xor               %%"REG_a", %%"REG_a" \n\t"\
01561 
01562 #endif /* ARCH_X86_64 */
01563 
01564         CALL_MMX2_FILTER_CODE
01565         CALL_MMX2_FILTER_CODE
01566         CALL_MMX2_FILTER_CODE
01567         CALL_MMX2_FILTER_CODE
01568         CALL_MMX2_FILTER_CODE
01569         CALL_MMX2_FILTER_CODE
01570         CALL_MMX2_FILTER_CODE
01571         CALL_MMX2_FILTER_CODE
01572 
01573 #if defined(PIC)
01574         "mov                      %5, %%"REG_b" \n\t" /* restore REG_b from ebxsave */
01575 #if ARCH_X86_64
01576         "mov                      %6, %%"REG_a" \n\t"
01577         "mov               %%"REG_a", -8(%%rsp) \n\t" /* put the saved word back */
01578 #endif
01579 #else
01580 #if ARCH_X86_64
01581         "mov                      %5, %%"REG_a" \n\t"
01582         "mov               %%"REG_a", -8(%%rsp) \n\t" /* put the saved word back */
01583 #endif
01584 #endif
01585         :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
01586            "m" (mmx2FilterCode)
01587 #if defined(PIC)
01588           ,"m" (ebxsave)
01589 #endif
01590 #if ARCH_X86_64
01591           ,"m"(retsave)
01592 #endif
01593         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
01594 #if !defined(PIC)
01595          ,"%"REG_b
01596 #endif
01597     );
01598 
01599     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) /* right edge: replicate the last source sample */
01600         dst[i] = src[srcW-1]*128;
01601 }
01602 
/*
 * hcscale_fast: horizontal scaling of two lines (src1 -> dst1, then
 * src2 -> dst2) by calling the code block stored in c->chrMmx2FilterCode,
 * followed by a C fix-up of the right edge of both outputs.
 *
 * c           context; supplies hChrFilterPos, hChrFilter and
 *             chrMmx2FilterCode
 * dst1, dst2  output lines of int16_t samples
 * dstWidth    number of output samples per line
 * src1, src2  input lines of bytes
 * srcW        number of input samples
 * xInc        horizontal step; the edge fix-up treats it as 16.16 fixed
 *             point ((i*xInc)>>16 is compared with a source index)
 *
 * The asm sets REG_c = src1, REG_D = dst1, REG_d = filter,
 * REG_b = filterPos, REG_a = 0 and expands CALL_MMX2_FILTER_CODE (defined
 * in hyscale_fast) 4 times; it then reloads REG_c = src2 and REG_D = dst2,
 * clears REG_a and expands the macro 4 more times. REG_d and REG_b are not
 * reloaded for the second pass.
 *
 * Saved state:
 *  - with PIC, REG_b is stored in ebxsave and restored afterwards, and is
 *    therefore left out of the clobber list;
 *  - on x86-64 the word at -8(%rsp) is copied to retsave and written back
 *    afterwards; `call` pushes its return address into that slot
 *    (presumably this protects data the compiler keeps below the stack
 *    pointer -- confirm).
 *  The operand index of each save slot depends on which of the two
 *  conditions is active (%7/%8 with PIC, %7 for retsave without PIC).
 *
 * After the asm, every output index i (from dstWidth-1 downwards) whose
 * source position (i*xInc)>>16 is at or beyond srcW-1 is overwritten with
 * the last source sample times 128, in both outputs.
 *
 * NOTE(review): the asm writes through dst1/dst2 but declares no "memory"
 * clobber.
 */
01603 static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
01604                                  int dstWidth, const uint8_t *src1,
01605                                  const uint8_t *src2, int srcW, int xInc)
01606 {
01607     int16_t *filterPos = c->hChrFilterPos;
01608     int16_t *filter    = c->hChrFilter;
01609     void    *mmx2FilterCode= c->chrMmx2FilterCode;
01610     int i;
01611 #if defined(PIC)
01612     DECLARE_ALIGNED(8, uint64_t, ebxsave);
01613 #endif
01614 #if ARCH_X86_64
01615     DECLARE_ALIGNED(8, uint64_t, retsave);
01616 #endif
01617 
01618     __asm__ volatile(
01619 #if defined(PIC)
01620         "mov          %%"REG_b", %7         \n\t" /* ebxsave = REG_b */
01621 #if ARCH_X86_64
01622         "mov          -8(%%rsp), %%"REG_a"  \n\t"
01623         "mov          %%"REG_a", %8         \n\t" /* retsave = word at -8(%rsp) */
01624 #endif
01625 #else
01626 #if ARCH_X86_64
01627         "mov          -8(%%rsp), %%"REG_a"  \n\t"
01628         "mov          %%"REG_a", %7         \n\t" /* retsave = word at -8(%rsp) */
01629 #endif
01630 #endif
01631         "pxor             %%mm7, %%mm7      \n\t"
01632         "mov                 %0, %%"REG_c"  \n\t" /* REG_c = src1 */
01633         "mov                 %1, %%"REG_D"  \n\t" /* REG_D = dst1 */
01634         "mov                 %2, %%"REG_d"  \n\t" /* REG_d = filter */
01635         "mov                 %3, %%"REG_b"  \n\t" /* REG_b = filterPos */
01636         "xor          %%"REG_a", %%"REG_a"  \n\t" // i
01637         PREFETCH"   (%%"REG_c")             \n\t"
01638         PREFETCH" 32(%%"REG_c")             \n\t"
01639         PREFETCH" 64(%%"REG_c")             \n\t"
01640 
01641         CALL_MMX2_FILTER_CODE
01642         CALL_MMX2_FILTER_CODE
01643         CALL_MMX2_FILTER_CODE
01644         CALL_MMX2_FILTER_CODE
01645         "xor          %%"REG_a", %%"REG_a"  \n\t" // i
01646         "mov                 %5, %%"REG_c"  \n\t" // src
01647         "mov                 %6, %%"REG_D"  \n\t" // buf2
01648         PREFETCH"   (%%"REG_c")             \n\t"
01649         PREFETCH" 32(%%"REG_c")             \n\t"
01650         PREFETCH" 64(%%"REG_c")             \n\t"
01651 
01652         CALL_MMX2_FILTER_CODE
01653         CALL_MMX2_FILTER_CODE
01654         CALL_MMX2_FILTER_CODE
01655         CALL_MMX2_FILTER_CODE
01656 
01657 #if defined(PIC)
01658         "mov %7, %%"REG_b"    \n\t" /* restore REG_b from ebxsave */
01659 #if ARCH_X86_64
01660         "mov                 %8, %%"REG_a"  \n\t"
01661         "mov          %%"REG_a", -8(%%rsp)  \n\t" /* put the saved word back */
01662 #endif
01663 #else
01664 #if ARCH_X86_64
01665         "mov                 %7, %%"REG_a"  \n\t"
01666         "mov          %%"REG_a", -8(%%rsp)  \n\t" /* put the saved word back */
01667 #endif
01668 #endif
01669         :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
01670            "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
01671 #if defined(PIC)
01672           ,"m" (ebxsave)
01673 #endif
01674 #if ARCH_X86_64
01675           ,"m"(retsave)
01676 #endif
01677         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
01678 #if !defined(PIC)
01679          ,"%"REG_b
01680 #endif
01681     );
01682 
01683     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { /* right edge: replicate the last source samples */
01684         dst1[i] = src1[srcW-1]*128;
01685         dst2[i] = src2[srcW-1]*128;
01686     }
01687 }
01688 #endif /* COMPILE_TEMPLATE_MMX2 */
01689 
/*
 * sws_init_swScale: installs this template's routines into the function
 * pointers of the context. Pointers for cases not listed below are left
 * untouched, except hyscale_fast/hcscale_fast which may be reset to NULL.
 *
 * 1. Packed output (skipped for 16-bit, 9/10-bit, NV12 and NV21
 *    destinations, and whenever SWS_FULL_CHR_H_INT is set):
 *      - yuv2packedX is set only when SWS_BITEXACT is clear; the "_ar"
 *        variants are chosen when SWS_ACCURATE_RND is set;
 *      - yuv2packed1 / yuv2packed2 are set regardless of SWS_BITEXACT.
 *    Handled destination formats: RGB32, BGR24, RGB555, RGB565, YUYV422.
 * 2. Fast horizontal scalers (only when srcBpc == 8 and dstBpc <= 10):
 *    in an MMX2 build with SWS_FAST_BILINEAR set and canMMX2BeUsed true,
 *    hyscale_fast/hcscale_fast point at the routines in this file;
 *    otherwise both are set to NULL.
 * 3. Input conversion for BGR24/RGB24 sources: lumToYV12 is always set;
 *    chrToYV12 is set only when chrSrcHSubSample is zero.
 */
01690 static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
01691 {
01692     enum PixelFormat srcFormat = c->srcFormat,
01693                      dstFormat = c->dstFormat;
01694 
01695     if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
01696         dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
01697         if (!(c->flags & SWS_BITEXACT)) {
01698             if (c->flags & SWS_ACCURATE_RND) {
01699                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
01700                     switch (c->dstFormat) {
01701                     case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
01702                     case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
01703                     case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
01704                     case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
01705                     case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
01706                     default: break;
01707                     }
01708                 }
01709             } else {
01710                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
01711                     switch (c->dstFormat) {
01712                     case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
01713                     case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
01714                     case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
01715                     case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
01716                     case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
01717                     default: break;
01718                     }
01719                 }
01720             }
01721         }
01722         if (!(c->flags & SWS_FULL_CHR_H_INT)) { /* not gated on SWS_BITEXACT, unlike yuv2packedX above */
01723             switch (c->dstFormat) {
01724             case PIX_FMT_RGB32:
01725                 c->yuv2packed1 = RENAME(yuv2rgb32_1);
01726                 c->yuv2packed2 = RENAME(yuv2rgb32_2);
01727                 break;
01728             case PIX_FMT_BGR24:
01729                 c->yuv2packed1 = RENAME(yuv2bgr24_1);
01730                 c->yuv2packed2 = RENAME(yuv2bgr24_2);
01731                 break;
01732             case PIX_FMT_RGB555:
01733                 c->yuv2packed1 = RENAME(yuv2rgb555_1);
01734                 c->yuv2packed2 = RENAME(yuv2rgb555_2);
01735                 break;
01736             case PIX_FMT_RGB565:
01737                 c->yuv2packed1 = RENAME(yuv2rgb565_1);
01738                 c->yuv2packed2 = RENAME(yuv2rgb565_2);
01739                 break;
01740             case PIX_FMT_YUYV422:
01741                 c->yuv2packed1 = RENAME(yuv2yuyv422_1);
01742                 c->yuv2packed2 = RENAME(yuv2yuyv422_2);
01743                 break;
01744             default:
01745                 break;
01746             }
01747         }
01748     }
01749 
01750     if (c->srcBpc == 8 && c->dstBpc <= 10) {
01751     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
01752 #if COMPILE_TEMPLATE_MMX2
01753     if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
01754     {
01755         c->hyscale_fast = RENAME(hyscale_fast);
01756         c->hcscale_fast = RENAME(hcscale_fast);
01757     } else {
01758 #endif /* COMPILE_TEMPLATE_MMX2 */
01759         c->hyscale_fast = NULL; /* in a non-MMX2 build this assignment is unconditional */
01760         c->hcscale_fast = NULL;
01761 #if COMPILE_TEMPLATE_MMX2
01762     }
01763 #endif /* COMPILE_TEMPLATE_MMX2 */
01764     }
01765 
01766     if (!c->chrSrcHSubSample) {
01767         switch(srcFormat) {
01768         case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
01769         case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
01770         default: break;
01771         }
01772     }
01773 
01774     switch (srcFormat) {
01775     case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
01776     case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
01777     default: break;
01778     }
01779 }
Generated on Sat Mar 17 2012 12:57:58 for Libav by doxygen 1.7.1