/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil_mmx.h"

/***********************************/
/* motion compensation */
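/* H.264 quarter-pel interpolation is built on the standard 6-tap filter
 * (1, -5, 20, 20, -5, 1): for six samples A..F the half-pel value is
 * (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5, clipped to 8 bits.
 * The macros below compute 20*(C+D) - 5*(B+E) as (4*(C+D) - (B+E)) * 5
 * (shift, subtract, pmullw by ff_pw_5) while rotating the six input
 * rows through the SIMD register set. */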
#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
    "mov"#q" "#C", "#T" \n\t"\
    "mov"#d" (%0), "#F" \n\t"\
    "paddw "#D", "#T" \n\t"\
    "psllw $2, "#T" \n\t"\
    "psubw "#B", "#T" \n\t"\
    "psubw "#E", "#T" \n\t"\
    "punpcklbw "#Z", "#F" \n\t"\
    "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
    "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
    "add %2, %0 \n\t"\
    "paddw "#F", "#A" \n\t"\
    "paddw "#A", "#T" \n\t"\
    "psraw $5, "#T" \n\t"\
    "packuswb "#T", "#T" \n\t"\
    OP(T, (%1), A, d)\
    "add %3, %1 \n\t"
#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
    "mov"#q" "#C", "#T" \n\t"\
    "mov"#d" (%0), "#F" \n\t"\
    "paddw "#D", "#T" \n\t"\
    "psllw $2, "#T" \n\t"\
    "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
    "psubw "#B", "#T" \n\t"\
    "psubw "#E", "#T" \n\t"\
    "punpcklbw "#Z", "#F" \n\t"\
    "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
    "paddw "#F", "#A" \n\t"\
    "add %2, %0 \n\t"\
    "paddw "#A", "#T" \n\t"\
    "mov"#q" "#T", "#OF"(%1) \n\t"

#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
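
/* QPEL_H264(OPNAME, OP, MMX) expands to the whole family of MMX-register
 * lowpass primitives for one instruction-set flavor: 4/8/16-wide
 * horizontal, vertical and 2D (hv) filters, plus the _l2 variants that
 * average the filtered result with a second prediction. OPNAME is put_
 * or avg_, and OP is the matching store macro. */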
#define QPEL_H264(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=4;\
\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "1: \n\t"\
        "movd -1(%0), %%mm1 \n\t"\
        "movd (%0), %%mm2 \n\t"\
        "movd 1(%0), %%mm3 \n\t"\
        "movd 2(%0), %%mm0 \n\t"\
        "punpcklbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpcklbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "paddw %%mm0, %%mm1 \n\t"\
        "paddw %%mm3, %%mm2 \n\t"\
        "movd -2(%0), %%mm0 \n\t"\
        "movd 3(%0), %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpcklbw %%mm7, %%mm3 \n\t"\
        "paddw %%mm3, %%mm0 \n\t"\
        "psllw $2, %%mm2 \n\t"\
        "psubw %%mm1, %%mm2 \n\t"\
        "pmullw %%mm4, %%mm2 \n\t"\
        "paddw %%mm5, %%mm0 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        OP(%%mm0, (%1),%%mm6, d)\
        "add %3, %0 \n\t"\
        "add %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=4;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq %0, %%mm4 \n\t"\
        "movq %1, %%mm5 \n\t"\
        :: "m"(ff_pw_5), "m"(ff_pw_16)\
    );\
    do{\
        __asm__ volatile(\
            "movd -1(%0), %%mm1 \n\t"\
            "movd (%0), %%mm2 \n\t"\
            "movd 1(%0), %%mm3 \n\t"\
            "movd 2(%0), %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "paddw %%mm0, %%mm1 \n\t"\
            "paddw %%mm3, %%mm2 \n\t"\
            "movd -2(%0), %%mm0 \n\t"\
            "movd 3(%0), %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "paddw %%mm3, %%mm0 \n\t"\
            "psllw $2, %%mm2 \n\t"\
            "psubw %%mm1, %%mm2 \n\t"\
            "pmullw %%mm4, %%mm2 \n\t"\
            "paddw %%mm5, %%mm0 \n\t"\
            "paddw %%mm2, %%mm0 \n\t"\
            "movd (%2), %%mm3 \n\t"\
            "psraw $5, %%mm0 \n\t"\
            "packuswb %%mm0, %%mm0 \n\t"\
            PAVGB" %%mm3, %%mm0 \n\t"\
            OP(%%mm0, (%1),%%mm6, d)\
            "add %4, %0 \n\t"\
            "add %4, %1 \n\t"\
            "add %3, %2 \n\t"\
            : "+a"(src), "+c"(dst), "+d"(src2)\
            : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }while(--h);\
}\
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    src -= 2*srcStride;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movd (%0), %%mm0 \n\t"\
        "add %2, %0 \n\t"\
        "movd (%0), %%mm1 \n\t"\
        "add %2, %0 \n\t"\
        "movd (%0), %%mm2 \n\t"\
        "add %2, %0 \n\t"\
        "movd (%0), %%mm3 \n\t"\
        "add %2, %0 \n\t"\
        "movd (%0), %%mm4 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpcklbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpcklbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int h=4;\
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7 \n\t"\
            "movd (%0), %%mm0 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm1 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm2 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm3 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm4 \n\t"\
            "add %2, %0 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
\
            : "+a"(src)\
            : "c"(tmp), "S"((x86_reg)srcStride)\
            : "memory"\
        );\
        tmp += 4;\
        src += 4 - 9*srcStride;\
    }\
    tmp -= 3*4;\
    __asm__ volatile(\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "paddw 10(%0), %%mm0 \n\t"\
        "movq 2(%0), %%mm1 \n\t"\
        "paddw 8(%0), %%mm1 \n\t"\
        "movq 4(%0), %%mm2 \n\t"\
        "paddw 6(%0), %%mm2 \n\t"\
        "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
        "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
        "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
        "paddsw %%mm2, %%mm0 \n\t"\
        "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
        "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
        "psraw $6, %%mm0 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        OP(%%mm0, (%1),%%mm7, d)\
        "add $24, %0 \n\t"\
        "add %3, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(tmp), "+c"(dst), "+g"(h)\
        : "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq 1(%0), %%mm2 \n\t"\
        "movq %%mm0, %%mm1 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "paddw %%mm3, %%mm1 \n\t"\
        "psllw $2, %%mm0 \n\t"\
        "psllw $2, %%mm1 \n\t"\
        "movq -1(%0), %%mm2 \n\t"\
        "movq 2(%0), %%mm4 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "movq %%mm4, %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm4, %%mm2 \n\t"\
        "paddw %%mm3, %%mm5 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm5, %%mm1 \n\t"\
        "pmullw %%mm6, %%mm0 \n\t"\
        "pmullw %%mm6, %%mm1 \n\t"\
        "movd -2(%0), %%mm2 \n\t"\
        "movd 7(%0), %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpcklbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm3, %%mm2 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "paddw %%mm5, %%mm2 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "paddw %%mm4, %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "packuswb %%mm1, %%mm0 \n\t"\
        OP(%%mm0, (%1),%%mm5, q)\
        "add %3, %0 \n\t"\
        "add %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq 1(%0), %%mm2 \n\t"\
        "movq %%mm0, %%mm1 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "paddw %%mm3, %%mm1 \n\t"\
        "psllw $2, %%mm0 \n\t"\
        "psllw $2, %%mm1 \n\t"\
        "movq -1(%0), %%mm2 \n\t"\
        "movq 2(%0), %%mm4 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "movq %%mm4, %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm4, %%mm2 \n\t"\
        "paddw %%mm3, %%mm5 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm5, %%mm1 \n\t"\
        "pmullw %%mm6, %%mm0 \n\t"\
        "pmullw %%mm6, %%mm1 \n\t"\
        "movd -2(%0), %%mm2 \n\t"\
        "movd 7(%0), %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpcklbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm3, %%mm2 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "paddw %%mm5, %%mm2 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "paddw %%mm4, %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "movq (%2), %%mm4 \n\t"\
        "packuswb %%mm1, %%mm0 \n\t"\
        PAVGB" %%mm4, %%mm0 \n\t"\
        OP(%%mm0, (%1),%%mm5, q)\
        "add %5, %0 \n\t"\
        "add %5, %1 \n\t"\
        "add %4, %2 \n\t"\
        "decl %3 \n\t"\
        "jg 1b \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int w= 2;\
    src -= 2*srcStride;\
\
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7 \n\t"\
            "movd (%0), %%mm0 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm1 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm2 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm3 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm4 \n\t"\
            "add %2, %0 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
            QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            "cmpl $16, %4 \n\t"\
            "jne 2f \n\t"\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
            QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            "2: \n\t"\
\
            : "+a"(src), "+c"(dst)\
            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "g"(h)\
            : "memory"\
        );\
        src += 4-(h+5)*srcStride;\
        dst += 4-h*dstStride;\
    }\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
    int w = (size+8)>>2;\
    src -= 2*srcStride+2;\
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7 \n\t"\
            "movd (%0), %%mm0 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm1 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm2 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm3 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm4 \n\t"\
            "add %2, %0 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
            "cmpl $16, %3 \n\t"\
            "jne 2f \n\t"\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
            "2: \n\t"\
            : "+a"(src)\
            : "c"(tmp), "S"((x86_reg)srcStride), "g"(size)\
            : "memory"\
        );\
        tmp += 4;\
        src += 4 - (size+5)*srcStride;\
    }\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int w = size>>4;\
    do{\
        int h = size;\
        __asm__ volatile(\
            "1: \n\t"\
            "movq (%0), %%mm0 \n\t"\
            "movq 8(%0), %%mm3 \n\t"\
            "movq 2(%0), %%mm1 \n\t"\
            "movq 10(%0), %%mm4 \n\t"\
            "paddw %%mm4, %%mm0 \n\t"\
            "paddw %%mm3, %%mm1 \n\t"\
            "paddw 18(%0), %%mm3 \n\t"\
            "paddw 16(%0), %%mm4 \n\t"\
            "movq 4(%0), %%mm2 \n\t"\
            "movq 12(%0), %%mm5 \n\t"\
            "paddw 6(%0), %%mm2 \n\t"\
            "paddw 14(%0), %%mm5 \n\t"\
            "psubw %%mm1, %%mm0 \n\t"\
            "psubw %%mm4, %%mm3 \n\t"\
            "psraw $2, %%mm0 \n\t"\
            "psraw $2, %%mm3 \n\t"\
            "psubw %%mm1, %%mm0 \n\t"\
            "psubw %%mm4, %%mm3 \n\t"\
            "paddsw %%mm2, %%mm0 \n\t"\
            "paddsw %%mm5, %%mm3 \n\t"\
            "psraw $2, %%mm0 \n\t"\
            "psraw $2, %%mm3 \n\t"\
            "paddw %%mm2, %%mm0 \n\t"\
            "paddw %%mm5, %%mm3 \n\t"\
            "psraw $6, %%mm0 \n\t"\
            "psraw $6, %%mm3 \n\t"\
            "packuswb %%mm3, %%mm0 \n\t"\
            OP(%%mm0, (%1),%%mm7, q)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            " jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
        tmp += 8 - size*24;\
        dst += 8 - size*dstStride;\
    }while(w--);\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    __asm__ volatile(\
        "movq (%1), %%mm0 \n\t"\
        "movq 24(%1), %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        "packuswb %%mm1, %%mm1 \n\t"\
        PAVGB" (%0), %%mm0 \n\t"\
        PAVGB" (%0,%3), %%mm1 \n\t"\
        OP(%%mm0, (%2), %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        "lea (%0,%3,2), %0 \n\t"\
        "lea (%2,%4,2), %2 \n\t"\
        "movq 48(%1), %%mm0 \n\t"\
        "movq 72(%1), %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        "packuswb %%mm1, %%mm1 \n\t"\
        PAVGB" (%0), %%mm0 \n\t"\
        PAVGB" (%0,%3), %%mm1 \n\t"\
        OP(%%mm0, (%2), %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        :"+a"(src8), "+c"(src16), "+d"(dst)\
        :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
        :"memory");\
}\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    do{\
        __asm__ volatile(\
            "movq (%1), %%mm0 \n\t"\
            "movq 8(%1), %%mm1 \n\t"\
            "movq 48(%1), %%mm2 \n\t"\
            "movq 8+48(%1), %%mm3 \n\t"\
            "psraw $5, %%mm0 \n\t"\
            "psraw $5, %%mm1 \n\t"\
            "psraw $5, %%mm2 \n\t"\
            "psraw $5, %%mm3 \n\t"\
            "packuswb %%mm1, %%mm0 \n\t"\
            "packuswb %%mm3, %%mm2 \n\t"\
            PAVGB" (%0), %%mm0 \n\t"\
            PAVGB" (%0,%3), %%mm2 \n\t"\
            OP(%%mm0, (%2), %%mm5, q)\
            OP(%%mm2, (%2,%4), %%mm5, q)\
            ::"a"(src8), "c"(src16), "d"(dst),\
              "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
            :"memory");\
        src8 += 2L*src8Stride;\
        src16 += 48;\
        dst += 2L*dstStride;\
    }while(h-=2);\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\

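/* On x86-64 enough XMM registers (xmm8-xmm15) are available to run the
 * 16-wide horizontal+average filter in a single SSSE3 loop; on 32-bit
 * x86 it is synthesized from two 8-wide calls instead. */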
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=16;\
    __asm__ volatile(\
        "pxor %%xmm15, %%xmm15 \n\t"\
        "movdqa %6, %%xmm14 \n\t"\
        "movdqa %7, %%xmm13 \n\t"\
        "1: \n\t"\
        "lddqu 6(%0), %%xmm1 \n\t"\
        "lddqu -2(%0), %%xmm7 \n\t"\
        "movdqa %%xmm1, %%xmm0 \n\t"\
        "punpckhbw %%xmm15, %%xmm1 \n\t"\
        "punpcklbw %%xmm15, %%xmm0 \n\t"\
        "punpcklbw %%xmm15, %%xmm7 \n\t"\
        "movdqa %%xmm1, %%xmm2 \n\t"\
        "movdqa %%xmm0, %%xmm6 \n\t"\
        "movdqa %%xmm1, %%xmm3 \n\t"\
        "movdqa %%xmm0, %%xmm8 \n\t"\
        "movdqa %%xmm1, %%xmm4 \n\t"\
        "movdqa %%xmm0, %%xmm9 \n\t"\
        "movdqa %%xmm0, %%xmm12 \n\t"\
        "movdqa %%xmm1, %%xmm11 \n\t"\
        "palignr $10,%%xmm0, %%xmm11\n\t"\
        "palignr $10,%%xmm7, %%xmm12\n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $2, %%xmm7, %%xmm9 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $4, %%xmm7, %%xmm8 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $6, %%xmm7, %%xmm6 \n\t"\
        "paddw %%xmm0 ,%%xmm11 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $8, %%xmm7, %%xmm0 \n\t"\
        "paddw %%xmm12,%%xmm7 \n\t"\
        "paddw %%xmm3, %%xmm2 \n\t"\
        "paddw %%xmm8, %%xmm6 \n\t"\
        "paddw %%xmm4, %%xmm1 \n\t"\
        "paddw %%xmm9, %%xmm0 \n\t"\
        "psllw $2, %%xmm2 \n\t"\
        "psllw $2, %%xmm6 \n\t"\
        "psubw %%xmm1, %%xmm2 \n\t"\
        "psubw %%xmm0, %%xmm6 \n\t"\
        "paddw %%xmm13,%%xmm11 \n\t"\
        "paddw %%xmm13,%%xmm7 \n\t"\
        "pmullw %%xmm14,%%xmm2 \n\t"\
        "pmullw %%xmm14,%%xmm6 \n\t"\
        "lddqu (%2), %%xmm3 \n\t"\
        "paddw %%xmm11,%%xmm2 \n\t"\
        "paddw %%xmm7, %%xmm6 \n\t"\
        "psraw $5, %%xmm2 \n\t"\
        "psraw $5, %%xmm6 \n\t"\
        "packuswb %%xmm2,%%xmm6 \n\t"\
        "pavgb %%xmm3, %%xmm6 \n\t"\
        OP(%%xmm6, (%1), %%xmm4, dqa)\
        "add %5, %0 \n\t"\
        "add %5, %1 \n\t"\
        "add %4, %2 \n\t"\
        "decl %3 \n\t"\
        "jg 1b \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
          "m"(ff_pw_5), "m"(ff_pw_16)\
        : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , \
                       "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , \
                       "%xmm8" , "%xmm9" , "%xmm10", "%xmm11", \
                       "%xmm12", "%xmm13", "%xmm14", "%xmm15",)\
          "memory"\
    );\
}
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7 \n\t"\
        "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
        "1: \n\t"\
        "lddqu -2(%0), %%xmm1 \n\t"\
        "movdqa %%xmm1, %%xmm0 \n\t"\
        "punpckhbw %%xmm7, %%xmm1 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "movdqa %%xmm1, %%xmm2 \n\t"\
        "movdqa %%xmm1, %%xmm3 \n\t"\
        "movdqa %%xmm1, %%xmm4 \n\t"\
        "movdqa %%xmm1, %%xmm5 \n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $10,%%xmm0, %%xmm5 \n\t"\
        "paddw %%xmm5, %%xmm0 \n\t"\
        "paddw %%xmm3, %%xmm2 \n\t"\
        "paddw %%xmm4, %%xmm1 \n\t"\
        "psllw $2, %%xmm2 \n\t"\
        "movq (%2), %%xmm3 \n\t"\
        "psubw %%xmm1, %%xmm2 \n\t"\
        "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
        "pmullw %%xmm6, %%xmm2 \n\t"\
        "paddw %%xmm0, %%xmm2 \n\t"\
        "psraw $5, %%xmm2 \n\t"\
        "packuswb %%xmm2, %%xmm2 \n\t"\
        "pavgb %%xmm3, %%xmm2 \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %5, %0 \n\t"\
        "add %5, %1 \n\t"\
        "add %4, %2 \n\t"\
        "decl %3 \n\t"\
        "jg 1b \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
          "memory"\
    );\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7 \n\t"\
        "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
        "1: \n\t"\
        "lddqu -2(%0), %%xmm1 \n\t"\
        "movdqa %%xmm1, %%xmm0 \n\t"\
        "punpckhbw %%xmm7, %%xmm1 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "movdqa %%xmm1, %%xmm2 \n\t"\
        "movdqa %%xmm1, %%xmm3 \n\t"\
        "movdqa %%xmm1, %%xmm4 \n\t"\
        "movdqa %%xmm1, %%xmm5 \n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $10,%%xmm0, %%xmm5 \n\t"\
        "paddw %%xmm5, %%xmm0 \n\t"\
        "paddw %%xmm3, %%xmm2 \n\t"\
        "paddw %%xmm4, %%xmm1 \n\t"\
        "psllw $2, %%xmm2 \n\t"\
        "psubw %%xmm1, %%xmm2 \n\t"\
        "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
        "pmullw %%xmm6, %%xmm2 \n\t"\
        "paddw %%xmm0, %%xmm2 \n\t"\
        "psraw $5, %%xmm2 \n\t"\
        "packuswb %%xmm2, %%xmm2 \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %3, %0 \n\t"\
        "add %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
          "memory"\
    );\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7 \n\t"\
        "movq (%0), %%xmm0 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm1 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm2 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm3 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm4 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "punpcklbw %%xmm7, %%xmm1 \n\t"\
        "punpcklbw %%xmm7, %%xmm2 \n\t"\
        "punpcklbw %%xmm7, %%xmm3 \n\t"\
        "punpcklbw %%xmm7, %%xmm4 \n\t"\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        "cmpl $16, %4 \n\t"\
        "jne 2f \n\t"\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        "2: \n\t"\
\
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "g"(h)\
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
          "memory"\
    );\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
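
/* The 2D (hv) filters run in two passes: hv1 applies the vertical 6-tap
 * filter and stores unrounded 16-bit intermediates into tmp (rows are
 * 48 bytes = 24 int16_t apart, wide enough for 16 outputs plus the five
 * extra filter taps), then hv2 applies the horizontal filter over tmp
 * and scales the result back down to 8-bit pixels. */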
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        __asm__ volatile(
            "pxor %%xmm7, %%xmm7 \n\t"
            "movq (%0), %%xmm0 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm1 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm2 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm3 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm4 \n\t"
            "add %2, %0 \n\t"
            "punpcklbw %%xmm7, %%xmm0 \n\t"
            "punpcklbw %%xmm7, %%xmm1 \n\t"
            "punpcklbw %%xmm7, %%xmm2 \n\t"
            "punpcklbw %%xmm7, %%xmm3 \n\t"
            "punpcklbw %%xmm7, %%xmm4 \n\t"
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
            "cmpl $16, %3 \n\t"
            "jne 2f \n\t"
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
            "2: \n\t"
            : "+a"(src)
            : "c"(tmp), "S"((x86_reg)srcStride), "g"(size)
            : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                           "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
              "memory"
        );
        tmp += 8;
        src += 8 - (size+5)*srcStride;
    }
}
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int h = size;\
    if(size == 16){\
        __asm__ volatile(\
            "1: \n\t"\
            "movdqa 32(%0), %%xmm4 \n\t"\
            "movdqa 16(%0), %%xmm5 \n\t"\
            "movdqa (%0), %%xmm7 \n\t"\
            "movdqa %%xmm4, %%xmm3 \n\t"\
            "movdqa %%xmm4, %%xmm2 \n\t"\
            "movdqa %%xmm4, %%xmm1 \n\t"\
            "movdqa %%xmm4, %%xmm0 \n\t"\
            "palignr $10, %%xmm5, %%xmm0 \n\t"\
            "palignr $8, %%xmm5, %%xmm1 \n\t"\
            "palignr $6, %%xmm5, %%xmm2 \n\t"\
            "palignr $4, %%xmm5, %%xmm3 \n\t"\
            "palignr $2, %%xmm5, %%xmm4 \n\t"\
            "paddw %%xmm5, %%xmm0 \n\t"\
            "paddw %%xmm4, %%xmm1 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            "movdqa %%xmm5, %%xmm6 \n\t"\
            "movdqa %%xmm5, %%xmm4 \n\t"\
            "movdqa %%xmm5, %%xmm3 \n\t"\
            "palignr $8, %%xmm7, %%xmm4 \n\t"\
            "palignr $2, %%xmm7, %%xmm6 \n\t"\
            "palignr $10, %%xmm7, %%xmm3 \n\t"\
            "paddw %%xmm6, %%xmm4 \n\t"\
            "movdqa %%xmm5, %%xmm6 \n\t"\
            "palignr $6, %%xmm7, %%xmm5 \n\t"\
            "palignr $4, %%xmm7, %%xmm6 \n\t"\
            "paddw %%xmm7, %%xmm3 \n\t"\
            "paddw %%xmm6, %%xmm5 \n\t"\
\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psubw %%xmm4, %%xmm3 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psraw $2, %%xmm3 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psubw %%xmm4, %%xmm3 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "paddw %%xmm5, %%xmm3 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psraw $2, %%xmm3 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "paddw %%xmm5, %%xmm3 \n\t"\
            "psraw $6, %%xmm0 \n\t"\
            "psraw $6, %%xmm3 \n\t"\
            "packuswb %%xmm0, %%xmm3 \n\t"\
            OP(%%xmm3, (%1), %%xmm7, dqa)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            " jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                           "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
              "memory"\
        );\
    }else{\
        __asm__ volatile(\
            "1: \n\t"\
            "movdqa 16(%0), %%xmm1 \n\t"\
            "movdqa (%0), %%xmm0 \n\t"\
            "movdqa %%xmm1, %%xmm2 \n\t"\
            "movdqa %%xmm1, %%xmm3 \n\t"\
            "movdqa %%xmm1, %%xmm4 \n\t"\
            "movdqa %%xmm1, %%xmm5 \n\t"\
            "palignr $10, %%xmm0, %%xmm5 \n\t"\
            "palignr $8, %%xmm0, %%xmm4 \n\t"\
            "palignr $6, %%xmm0, %%xmm3 \n\t"\
            "palignr $4, %%xmm0, %%xmm2 \n\t"\
            "palignr $2, %%xmm0, %%xmm1 \n\t"\
            "paddw %%xmm5, %%xmm0 \n\t"\
            "paddw %%xmm4, %%xmm1 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "psraw $6, %%xmm0 \n\t"\
            "packuswb %%xmm0, %%xmm0 \n\t"\
            OP(%%xmm0, (%1), %%xmm7, q)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            " jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                           "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
              "memory"\
        );\
    }\
}
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

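/* Where no separate SSE2/SSSE3 implementation exists, the wider names
 * simply alias the MMX2 versions. */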
#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
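
/* H264_MC expands the public mcXY entry points for one block SIZE. In
 * h264_qpel<SIZE>_mcXY, X and Y are the horizontal and vertical
 * quarter-pel offsets (0..3): mc00 is a plain copy, mc20/mc02/mc22 are
 * the horizontal, vertical and 2D half-pel filters, and the remaining
 * positions average two simpler predictions. */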
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_sse2(dst, src, stride, 16);
}

#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

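/* Instantiate everything. PAVGB selects the byte-averaging instruction
 * for the flavor being compiled (pavgusb on 3DNow!, pavgb on MMX2 and
 * later); the AVG_*_OP store macros average the filtered result with
 * the existing destination, while PUT_OP just stores it. */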
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"

#define PAVGB "pavgusb"
QPEL_H264(put_, PUT_OP, 3dnow)
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_, PUT_OP, mmx2)
QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
#if HAVE_SSSE3
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
#endif
#undef PAVGB

H264_MC_4816(3dnow)
H264_MC_4816(mmx2)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif